# sk_learn Outlier removal

In [449]:
import pandas as pd;
import numpy as np;
from functools import reduce
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

pd.set_option('display.float_format', lambda x: '%.1f' % x)

In [450]:
### CUSTOM OUTLIER REMOVAL TRANSFORMERS ###
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.exceptions import NotFittedError

## OUTLIER BASE CLASS
class OutlierRemover(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):        
        self.columns = columns  
        self.fitted = False

    def fit(self, X, y=None): 
        self.fitted = True
        return self

    def transform(self, X: pd.DataFrame,y=None):
        if not self.fitted: raise NotFittedError()
        filters = reduce(lambda filter, d: filter & self._append_filter(X, d), self._cols(X), True) 
        return X.copy()[filters]
    
    def _cols(self, X: pd.DataFrame): return self.columns or list(X.columns)

    def _append_filter(self, X: pd.DataFrame, f: str) -> bool:
        min, max = self._range[f]
        return (X[f] >= min) & (X[f] <= max)

## REMOVE OUTLIERS USING A CUSTOM RANGE (min, max)    
class MinMaxOutlierRemover(OutlierRemover):
    def __init__(self,range=(0,1), columns=None):
        self._threshold = range
        super().__init__(columns)
    
    def fit(self, X, y=None):
        self._range = {f: self._threshold for f in self._cols(X)}
        return super().fit(X, y)

## REMOVE OUTLIERS USING IQR
class IQROutlierRemover(OutlierRemover):
    def __init__(self,factor=1.5, columns=None):
        self._factor = factor
        super().__init__(columns)
    
    def _find_range(self, X: pd.DataFrame, f: str):
        Q1 = X[f].quantile(0.25)
        Q3 = X[f].quantile(0.75)
        IQR = Q3 - Q1
        return(Q1 - self._factor * IQR, Q3 + self._factor * IQR)

    def fit(self, X, y=None):
        self._range = {f: self._find_range(X,f) for f in self._cols(X)}
        return super().fit(X, y)


## REMOVE OUTLIERS USING Z-SCORE
class ZScoreOutlierRemover(OutlierRemover):
    def __init__(self, factor=3, columns=None):
        self._factor = factor
        super().__init__(columns)
    
    def _find_range(self, X: pd.DataFrame, f: str):
        mean = X[f].mean()
        sd = X[f].std()
        return (mean - sd * self._factor, mean + sd * self._factor)

    def fit(self, X, y=None):
        self._range = {f: self._find_range(X,f) for f in self._cols(X)}
        return super().fit(X, y)

## TRANSFORMATOR TO MERGE AND GROUP VALUES
class MergeValues(BaseEstimator, TransformerMixin):
    def __init__(self, labels, category, columns=None):        
        self._from = labels  
        self._to = category  
        self.columns = columns  
        self._fitted = False

    def _cols(self, X: pd.DataFrame): return self.columns or list(X.columns)

    def fit(self, X, y=None): 
        self._to = [self._to] if not isinstance(self._to, list) else self._to
        self._from = [self._from] if len(self._from) < 1 or not isinstance(self._from[0], list) else self._from
        print(self._from)
        print(self._to)
        if(len(self._from) != len(self._to)): raise ValueError("labels size does not match category size")
        self._fitted = True
        return self

    def transform(self,X: pd.DataFrame,y=None):
        if not self._fitted: raise NotFittedError()
        mapc = lambda tags, i: {c: tags[i] for c in self._cols(X)}
        return reduce(
            lambda D, i: D.replace(to_replace=mapc(self._from, i), value=mapc(self._to, i)),
            range(len(self._to)), X
        )
 


## Using the pipeline to clean datasets

In [451]:
headers = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'ft_income'];
df = pd.read_csv('../../datasets/adults.csv', engine='python', sep=', ', names=headers, na_values=['?'])

df.head(5)
df.isna().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
ft_income            0
dtype: int64

In [452]:
X = df.drop('ft_income',axis=1)
y = df['ft_income']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42 ) 
steps = []

In [453]:
steps.extend([
    ('IQR outlier removal', IQROutlierRemover(factor=1.5, columns=["fnlwgt", "hours-per-week"])),
    ('Threshold outlier removal', MinMaxOutlierRemover(range=(0, 30000), columns=["capital-gain"]))
])

In [454]:
continues_tf = [
    # (
    #     'Continues features', 
    #     StandardScaler(), 
    #     ['fnlwgt','capital-gain', 'capital-loss', 'hours-per-week']
    # )   
]

### 1. Categorial features

In [455]:
categorical_tf = [
    (
        'Impute missing workclass', 
        SimpleImputer(strategy='most_frequent'), 
        ["workclass", "occupation", "native-country"]
    ),
    (
        'Merge high-school dropouts',
        MergeValues(labels=['9th','10th','11th'], category="HS-dropout"),
        ["education"]
    ),
    # (
    #     'Merge middle-school dropouts',
    #     MergeValues(labels=['7th-8th','5th-6th','1st-4th'], category="MS-dropout"),
    #     ["education"]
    # )
]

In [456]:
transformer = MergeValues(
    labels=[['7th-8th','5th-6th','1st-4th'], ['9th','10th','11th','12th']], 
    category=["MS-dropout", "HS-dropout"], 
    columns=["education"]
)
X_train = transformer.fit_transform(X_train)

# print(f"before: {X_train.shape[0]} rows")
# transformerA = ZScoreOutlierRemover(factor=3, columns=["fnlwgt", "hours-per-week"])
# X_train = transformerA.fit_transform(X_train)
# transformerB = MinMaxOutlierRemover(range=(0, 30000), columns=["capital-gain"])
# X_train = transformerB.fit_transform(X_train)
# print(f"after: {X_train.shape[0]} rows")

imputed = pd.DataFrame(X_train, columns=X_train.columns)
imputed["education"].unique()

[['7th-8th', '5th-6th', '1st-4th'], ['9th', '10th', '11th', '12th']]
['MS-dropout', 'HS-dropout']


array(['Bachelors', 'Assoc-voc', 'HS-dropout', 'Some-college', 'HS-grad',
       'Prof-school', 'Assoc-acdm', 'Masters', 'MS-dropout', 'Doctorate',
       'Preschool'], dtype=object)

In [457]:
# steps.append((
#     'Scaling data',
#     ColumnTransformer( transformers=continues_tf + categorical_tf, remainder='passthrough')
# ))

# pipeline = Pipeline(steps=steps, verbose=True)

# print(f"before: {X_train.shape[0]} rows")
# clean_data = pipeline.fit_transform(X_train)
# print(f"after: {clean_data.shape[0]} rows")
# print(X_train.shape)
# print(clean_data.shape)
# pd.DataFrame(clean_data, columns=X_train.columns)
