# scikit-learn outlier removal

In [270]:
import pandas as pd;
import numpy as np;
# import matplotlib.pyplot as plt
# from typing import List, Any, Callable
# import calendar
from functools import reduce
# import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Render every float with exactly one decimal place when pandas displays data.
pd.set_option('display.float_format', '{:.1f}'.format)

In [271]:
### CUSTOM OUTLIER REMOVAL TRANSFORMERS ###
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.exceptions import NotFittedError

## OUTLIER BASE CLASS
class OutlierRemover(BaseEstimator, TransformerMixin):
    """Base transformer that drops rows lying outside per-column bounds.

    Subclasses compute the bounds in their own ``fit`` by setting
    ``self._range`` to a mapping ``{column: (lower, upper)}`` and then calling
    ``super().fit``. ``transform`` keeps only the rows whose value is within
    ``[lower, upper]`` (both ends inclusive) for every selected column.

    Parameters
    ----------
    columns : str, sequence of str or None, default=None
        Columns to check. ``None`` or an empty sequence means "every column
        of the frame passed in".

    Notes
    -----
    ``transform`` returns fewer rows than it receives and has no access to the
    labels, so callers must re-align ``y`` themselves (for example via the
    index of the returned frame, which is preserved).
    """

    def __init__(self, columns=None):
        self.columns = columns
        self.fitted = False

    def fit(self, X, y=None):
        """Mark the transformer as fitted and return ``self``."""
        self.fitted = True
        return self

    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """Return the rows of ``X`` that are in range for all selected columns.

        Raises
        ------
        NotFittedError
            If ``fit`` has not been called yet.
        """
        if not getattr(self, "fitted", False):
            raise NotFittedError(
                f"This {type(self).__name__} instance is not fitted yet. "
                "Call 'fit' before using this transformer."
            )
        # Start from an explicit all-True mask so the result is always a valid
        # boolean indexer, even when there is no column to check.
        keep = pd.Series(True, index=X.index)
        for column in self._cols(X):
            keep &= self._append_filter(X, column)
        # Boolean indexing already returns a new frame; no up-front copy needed.
        return X.loc[keep]

    def _cols(self, X: pd.DataFrame) -> list:
        """Resolve the configured columns against ``X`` as a plain list."""
        columns = self.columns
        if columns is None:
            return list(X.columns)
        if isinstance(columns, str):
            # A bare string would otherwise be iterated character by character.
            return [columns]
        # list() makes the emptiness test safe for arrays and pandas Index
        # objects, whose truth value is ambiguous.
        columns = list(columns)
        return columns or list(X.columns)

    def _append_filter(self, X: pd.DataFrame, f: str) -> pd.Series:
        """Boolean mask of the rows whose ``f`` value is within the fitted bounds.

        Missing values fail both comparisons and are therefore masked out.
        """
        lower, upper = self._range[f]
        return (X[f] >= lower) & (X[f] <= upper)

## REMOVE OUTLIERS USING A CUSTOM RANGE (min, max)    
class MinMaxOutlierRemover(OutlierRemover):
    """Drop rows whose value falls outside one fixed ``(min, max)`` interval.

    Parameters
    ----------
    range : tuple (lower, upper), default=(0, 1)
        Inclusive bounds applied to every selected column.
    columns : list of str or None, default=None
        Columns to check; handled by the base class.
    """

    def __init__(self, range=(0, 1), columns=None):
        # Store the argument under its own name: BaseEstimator.get_params()
        # (used by clone(), repr and the model-selection tools) looks up each
        # constructor argument as an attribute of the same name.
        self.range = range
        super().__init__(columns)

    def fit(self, X, y=None):
        """Assign the configured interval to every selected column of ``X``."""
        lower, upper = self.range
        # ``_range`` (leading underscore) is the per-column lookup read by the
        # base class; it is distinct from the ``range`` hyper-parameter.
        self._range = {f: (lower, upper) for f in self._cols(X)}
        return super().fit(X, y)

## REMOVE OUTLIERS USING IQR
class IQROutlierRemover(OutlierRemover):
    """Drop rows lying more than ``factor`` IQRs outside the quartiles.

    Parameters
    ----------
    factor : float, default=1.5
        Multiplier applied to the inter-quartile range.
    columns : list of str or None, default=None
        Columns to check; handled by the base class.
    """

    def __init__(self, factor=1.5, columns=None):
        # Store the argument under its own name: BaseEstimator.get_params()
        # (used by clone(), repr and the model-selection tools) looks up each
        # constructor argument as an attribute of the same name.
        self.factor = factor
        super().__init__(columns)

    def _find_range(self, X: pd.DataFrame, f: str):
        """Return ``(Q1 - factor * IQR, Q3 + factor * IQR)`` for column ``f``."""
        q1 = X[f].quantile(0.25)
        q3 = X[f].quantile(0.75)
        spread = self.factor * (q3 - q1)
        return (q1 - spread, q3 + spread)

    def fit(self, X, y=None):
        """Compute the accepted interval of every selected column from ``X``."""
        self._range = {f: self._find_range(X, f) for f in self._cols(X)}
        return super().fit(X, y)


## REMOVE OUTLIERS USING Z-SCORE
class ZScoreOutlierRemover(OutlierRemover):
    """Drop rows lying more than ``factor`` standard deviations from the mean.

    Parameters
    ----------
    factor : float, default=3
        Number of standard deviations accepted on each side of the mean.
    columns : list of str or None, default=None
        Columns to check; handled by the base class.
    """

    def __init__(self, factor=3, columns=None):
        # Store the argument under its own name: BaseEstimator.get_params()
        # (used by clone(), repr and the model-selection tools) looks up each
        # constructor argument as an attribute of the same name.
        self.factor = factor
        super().__init__(columns)

    def _find_range(self, X: pd.DataFrame, f: str):
        """Return ``(mean - factor * std, mean + factor * std)`` for column ``f``."""
        center = X[f].mean()
        spread = X[f].std() * self.factor
        return (center - spread, center + spread)

    def fit(self, X, y=None):
        """Compute the accepted interval of every selected column from ``X``."""
        self._range = {f: self._find_range(X, f) for f in self._cols(X)}
        return super().fit(X, y)

## TRANSFORMER TO MERGE AND GROUP VALUES
class MergeValues(BaseEstimator, TransformerMixin):
    """Stateless transformer that collapses several values into one category.

    Parameters
    ----------
    labels : list
        Values to look for.
    category
        Replacement written wherever one of ``labels`` is found.
    columns : list of str or None, default=None
        Columns to rewrite; a falsy value means every column of the frame.
    """

    def __init__(self, labels, category, columns=None):
        self.labels = labels
        self.category = category
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X: pd.DataFrame, y=None):
        """Return a copy of ``X`` with ``labels`` replaced by ``category``."""
        targets = self.columns or list(X.columns)
        # Per-column mappings restrict the replacement to the target columns.
        search = dict.fromkeys(targets, self.labels)
        substitute = dict.fromkeys(targets, self.category)
        return X.copy().replace(to_replace=search, value=substitute)


In [273]:
# Column names supplied by hand because they are passed to read_csv via `names`.
headers = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'ft_income',
]
# Fields are separated by a comma plus a space; '?' is read as a missing value.
df = pd.read_csv(
    '../../datasets/adults.csv',
    names=headers,
    sep=', ',
    engine='python',
    na_values=['?'],
)

# Only the last expression of a cell is displayed, i.e. the per-column NaN count.
df.head(5)
df.isna().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
ft_income            0
dtype: int64

In [274]:
# Separate the target column from the predictors.
y = df['ft_income']
X = df.drop(columns='ft_income')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Pipeline steps are accumulated in this list.
steps = []

In [275]:
# Register the two row-filtering steps, in the order they should run.
steps += [
    (
        'IQR outlier removal',
        IQROutlierRemover(factor=1.5, columns=["fnlwgt", "hours-per-week"]),
    ),
    (
        'Threshold outlier removal',
        MinMaxOutlierRemover(range=(0, 30000), columns=["capital-gain"]),
    ),
]

In [276]:
# ColumnTransformer entry (name, transformer, columns): standardise the listed
# numeric columns.
continues_tf = [
    ('Continues features', StandardScaler(), ['fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']),
]

### 3. Categorical features

In [277]:
# ColumnTransformer entries (name, transformer, columns) for categorical data.
categorical_tf = [
    (
        'Impute missing workclass',
        SimpleImputer(strategy='most_frequent'),
        ["workclass", "occupation", "native-country"]
    ),
    (
        # A ColumnTransformer runs each entry independently on the original
        # columns and concatenates the results. Two separate entries for
        # "education" would therefore emit two half-merged copies of the
        # column, so both merges are chained in one Pipeline instead.
        'Merge school dropouts',
        Pipeline(steps=[
            (
                'Merge high-school dropouts',
                MergeValues(labels=['9th','10th','11th'], category="HS-dropout"),
            ),
            (
                'Merge middle-school dropouts',
                MergeValues(labels=['7th-8th','5th-6th','1st-4th'], category="MS-dropout"),
            ),
        ]),
        ["education"]
    )
]

In [278]:
# Apply the row filters directly to the training features and report how many
# rows survive.
print(f"before: {X_train.shape[0]} rows")
transformerA = ZScoreOutlierRemover(factor=3, columns=["fnlwgt", "hours-per-week"])
X_train = transformerA.fit_transform(X_train)
transformerB = MinMaxOutlierRemover(range=(0, 30000), columns=["capital-gain"])
X_train = transformerB.fit_transform(X_train)

# The removers drop rows from X_train only. Re-select the labels through the
# surviving index so that features and target stay aligned row for row.
y_train = y_train.loc[X_train.index]
assert len(y_train) == len(X_train), "X_train and y_train are out of sync"

print(f"after: {X_train.shape[0]} rows")

before: 26048 rows
after: 25295 rows


In [279]:
# Final step: scale the continuous columns, impute/merge the categorical ones
# and pass every remaining column through untouched.
column_transformer = ColumnTransformer(
    transformers=continues_tf + categorical_tf,
    remainder='passthrough',
)
steps.append(('Scaling data', column_transformer))

pipeline = Pipeline(steps=steps, verbose=True)

# Row counts are printed on both sides of the fit because the outlier steps
# drop rows.
print(f"before: {X_train.shape[0]} rows")
X_train = pipeline.fit_transform(X_train)
print(f"after: {X_train.shape[0]} rows")
X_train


before: 25295 rows
[Pipeline]  (step 1 of 3) Processing IQR outlier removal, total=   0.0s
[Pipeline]  (step 2 of 3) Processing Threshold outlier removal, total=   0.0s
[Pipeline] ...... (step 3 of 3) Processing Scaling data, total=   0.0s
after: 18145 rows


array([[0.20820734714084438, -0.24009247651848103, -0.2215890038225421,
        ..., 'Not-in-family', 'White', 'Female'],
       [-1.0945354178269502, -0.24009247651848103, 4.412662356511648,
        ..., 'Husband', 'White', 'Male'],
       [0.26483008591392454, -0.24009247651848103, -0.2215890038225421,
        ..., 'Not-in-family', 'White', 'Male'],
       ...,
       [1.742828156433489, -0.24009247651848103, -0.2215890038225421,
        ..., 'Husband', 'White', 'Male'],
       [0.45274854927203, -0.24009247651848103, -0.2215890038225421, ...,
        'Husband', 'White', 'Male'],
       [2.088767903945792, -0.24009247651848103, -0.2215890038225421,
        ..., 'Husband', 'White', 'Male']], dtype=object)