In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

In [2]:
# Toy frame for trying out the outlier logic: 999 in col1 and -10 in col3
# sit far away from the other values in their columns.
test = pd.DataFrame(
    {
        'col1': [100, 200, 300, 999],
        'col2': [0, 0, 1, 2],
        'col3': [-10, 0, 1, 2],
    }
)
test

Unnamed: 0,col1,col2,col3
0,100,0,-10
1,200,0,0
2,300,1,1
3,999,2,2


In [3]:
def outlier_removal(X, factor=1.5):
    """Replace IQR outliers with NaN, column by column.

    For every column the first and third quartiles (q1, q3) are computed and
    any value below ``q1 - factor * iqr`` or above ``q3 + factor * iqr``
    (``iqr = q3 - q1``) is replaced by NaN.

    Parameters
    ----------
    X : array-like or DataFrame
        Two-dimensional data; it is wrapped in ``pd.DataFrame`` and copied,
        so the caller's object is never modified.
    factor : float, default 1.5
        Multiplier applied to the inter-quartile range to build the fences.
        ``0`` keeps only the values lying between q1 and q3 (inclusive).

    Returns
    -------
    pandas.DataFrame
        Same shape and labels as ``X`` with outliers replaced by NaN.
        Columns that receive a NaN are promoted to a dtype that can hold it.
    """
    X = pd.DataFrame(X).copy()
    # Flags are collected positionally so duplicate or non-string column
    # labels cannot cause mis-alignment.
    outlier_mask = np.zeros(X.shape, dtype=bool)
    for i in range(X.shape[1]):
        column = X.iloc[:, i]
        q1 = column.quantile(0.25)
        q3 = column.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - (factor * iqr)
        upper_bound = q3 + (factor * iqr)
        outlier_mask[:, i] = ((column < lower_bound) | (column > upper_bound)).to_numpy()
    # mask() upcasts integer columns when it has to insert NaN; writing NaN
    # into an integer column through .iloc is deprecated in recent pandas.
    return X.mask(outlier_mask)

# Wrap outlier_removal in a FunctionTransformer so it can be used like any
# other scikit-learn transformer; the IQR factor is fixed at 1.5 here.
outlier_remover = FunctionTransformer(
    func=outlier_removal,
    kw_args={'factor': 1.5},
)

In [4]:
# Apply the transformer to the toy frame and display the result;
# values outside the IQR fences come back as NaN.
test_cleaned = outlier_remover.fit_transform(test)
test_cleaned

Unnamed: 0,col1,col2,col3
0,100.0,0.0,
1,200.0,0.0,0.0
2,300.0,1.0,1.0
3,,2.0,2.0


Now create the data pipeline.

In [5]:
# Iris feature matrix as a DataFrame with explicit column names.
iris_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
data = pd.DataFrame(load_iris()['data'], columns=iris_columns)
data.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [8]:
# ColumnTransformer that applies the outlier remover to every column of `data`.
# Each entry needs a transformer object exposing fit/transform, so the
# FunctionTransformer wrapper `outlier_remover` is passed here rather than the
# bare `outlier_removal` function (which would fail as soon as `ct` is fitted).
ct = ColumnTransformer(
    transformers=[('outlier_remover', outlier_remover, list(range(data.shape[1])))],
    remainder='passthrough',
)

In [14]:
# Rebuild the transformer and the ColumnTransformer used by the pipeline.
# kw_args carries an explicit default factor so the transformer can also be
# fitted on its own; a grid search over `kw_args` still overrides it.
outlier_remover = FunctionTransformer(outlier_removal, kw_args={'factor': 1.5})
ct = ColumnTransformer(
    transformers=[('outlier_remover', outlier_remover, list(range(data.shape[1])))],
    remainder='passthrough',
)

In [18]:
# Features and target for the search; both are copies of the loaded data.
X = data.copy()
y = load_iris()['target'].copy()

# Outlier removal -> imputation of the NaNs it introduces -> classifier.
pipeline = Pipeline(
    steps=[
        ['outlier_removal', ct],
        ['imputer', SimpleImputer()],
        ['regressor', LogisticRegression(max_iter=1000)],
    ]
)

# The function's arguments are reachable only through `kw_args`, so every
# candidate is a complete kwargs dict rather than a bare value.
factor_grid = [{'factor': factor} for factor in range(5)]
param_grid = {
    'outlier_removal__outlier_remover__kw_args': factor_grid,
    'imputer__strategy': ['mean', 'median', 'most_frequent'],
    'regressor__C': [0.01, 0.1, 1, 10, 100],
}

gs = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
)
gs.fit(X, y)

gs.best_params_

{'imputer__strategy': 'mean',
 'outlier_removal__outlier_remover__kw_args': {'factor': 1},
 'regressor__C': 1}

A major difference in method 2 is that we need to tune the `kw_args` hyperparameter as a whole, unlike with other transformers (including the one discussed in method 1). In the code snippet above, we tuned `kw_args` using the list of values [{'factor':0}, {'factor':1}, {'factor':2}, {'factor':3}, {'factor':4}]. This can make it difficult to tune multiple hyperparameters of a custom transformer, because every candidate has to be a complete dictionary, so all combinations of the function's arguments must be enumerated by hand.