# Scikit-Transformers : Pipelines

## Imports 

Import warnings and disable warnings for this notebook.

In [1]:
import warnings
# Blanket filter: with no category given it applies to the base Warning class,
# so every warning raised in this kernel process is suppressed.
warnings.filterwarnings("ignore")
# DeprecationWarning and FutureWarning are subclasses of Warning, so the
# blanket filter above already covers them; they are listed explicitly only
# to make the intent visible.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

Import the data libraries

In [2]:
import numpy as np
import pandas as pd

Import Scikit learn libraries

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.impute import KNNImputer, SimpleImputer

Import the Titanic dataset loader and the transformers from Scikit-Transformers

In [4]:
try:
    # Preferred: the public import path.
    from sktransf import get_titanic
except ImportError as e:
    # Only import failures are handled here (missing package, or a version
    # that does not export get_titanic at top level). Any other exception is
    # a genuine error and is allowed to propagate instead of being hidden.
    print(e)
    print("Please install the package using the following command")
    print("pip install scikit-transformers")
    # Fallback: the private module path. This still raises if the package is
    # not installed at all, which stops the notebook at the right place.
    from sktransf._get_titanic import get_titanic

In [5]:
from sktransf import LogColumnTransformer, DropUniqueColumnTransformer, BoolColumnTransformer

Opt in to the future pandas behavior of not silently downcasting, which also avoids the related `FutureWarning`

In [6]:
pd.set_option("future.no_silent_downcasting", True)

## Data

Get the data from the [Kaggle](https://www.kaggle.com/c/titanic/data) Titanic dataset.

In [7]:
X, y = get_titanic()

Display X

In [8]:
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
0,3,22.0,1,0,7.2500
1,1,38.0,1,0,71.2833
2,3,26.0,0,0,7.9250
3,1,35.0,1,0,53.1000
4,3,35.0,0,0,8.0500
...,...,...,...,...,...
886,2,27.0,0,0,13.0000
887,1,19.0,0,0,30.0000
888,3,,1,2,23.4500
889,1,26.0,0,0,30.0000


Display y

In [9]:
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

Add a two-valued column (`bool_column`) and a constant column holding a single unique value (`unique_column`) to the data

In [10]:
# Two-valued column drawn at random from "A"/"B". A locally seeded generator
# is used so that the column (and every score computed from it later) is
# identical after Restart & Run All, and so that re-running this cell alone
# always produces the same values.
X["bool_column"] = np.random.default_rng(42).choice(["A", "B"], size=X.shape[0])
# Constant column: the same value in every row.
X["unique_column"] = "dummy"
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,bool_column,unique_column
0,3,22.0,1,0,7.2500,A,dummy
1,1,38.0,1,0,71.2833,B,dummy
2,3,26.0,0,0,7.9250,B,dummy
3,1,35.0,1,0,53.1000,A,dummy
4,3,35.0,0,0,8.0500,B,dummy
...,...,...,...,...,...,...,...
886,2,27.0,0,0,13.0000,B,dummy
887,1,19.0,0,0,30.0000,B,dummy
888,3,,1,2,23.4500,B,dummy
889,1,26.0,0,0,30.0000,A,dummy


Fill the missing `Age` values with the column mean (a simple imputation) on a copy of `X`, leaving `X` itself unchanged

In [11]:
# Work on a copy so the raw X keeps its missing values.
_X = X.copy()
# Replace missing ages with the mean of the observed ages.
age_mean = _X["Age"].mean()
_X["Age"] = _X["Age"].fillna(age_mean)
_X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,bool_column,unique_column
0,3,22.000000,1,0,7.2500,A,dummy
1,1,38.000000,1,0,71.2833,B,dummy
2,3,26.000000,0,0,7.9250,B,dummy
3,1,35.000000,1,0,53.1000,A,dummy
4,3,35.000000,0,0,8.0500,B,dummy
...,...,...,...,...,...,...,...
886,2,27.000000,0,0,13.0000,B,dummy
887,1,19.000000,0,0,30.0000,B,dummy
888,3,29.699118,1,2,23.4500,B,dummy
889,1,26.000000,0,0,30.0000,A,dummy


## Using a Scikit-Learn Pipeline

Create a pipeline with the following steps:

In [12]:
# Full modelling pipeline: three sktransf transformers, an optional scaler and
# the classifier. force_df_out=True is passed to each sktransf step
# (presumably so they return DataFrames rather than arrays — confirm in the
# sktransf documentation).
pipeline = Pipeline(
    [
        # NOTE(review): in the transformed output further down, unique_column
        # is gone, bool_column is 0/1 and SibSp/Fare look log-transformed;
        # that is the assumed role of the three steps below — confirm.
        ("unique", DropUniqueColumnTransformer(force_df_out=True)),
        ("bool", BoolColumnTransformer(force_df_out=True)),
        ("logger", LogColumnTransformer(force_df_out=True)),
        ("scaler", StandardScaler()),
        # Fixed random_state so that repeated fits (and the grid-search
        # ranking built on them) are reproducible across runs.
        ("estimator", RandomForestClassifier(random_state=42)),
    ]
)

pipeline

Create a preprocessor by slicing off the last two steps of the pipeline (the scaler and the estimator):

In [13]:
preprocessor = pipeline[:-2]
preprocessor

Check X before transformation

In [14]:
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,bool_column,unique_column
0,3,22.0,1,0,7.2500,A,dummy
1,1,38.0,1,0,71.2833,B,dummy
2,3,26.0,0,0,7.9250,B,dummy
3,1,35.0,1,0,53.1000,A,dummy
4,3,35.0,0,0,8.0500,B,dummy
...,...,...,...,...,...,...,...
886,2,27.0,0,0,13.0000,B,dummy
887,1,19.0,0,0,30.0000,B,dummy
888,3,,1,2,23.4500,B,dummy
889,1,26.0,0,0,30.0000,A,dummy


Apply the preprocessor to the imputed data `_X` and check the transformed columns

In [15]:
# Fit the preprocessing steps on the mean-imputed frame and wrap the result
# in a DataFrame for display.
tmp = pd.DataFrame(preprocessor.fit_transform(_X))
tmp

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,bool_column
0,3,22.000000,0.693147,0,2.110213,0
1,1,38.000000,0.693147,0,4.280593,1
2,3,26.000000,0.000000,0,2.188856,1
3,1,35.000000,0.693147,0,3.990834,0
4,3,35.000000,0.000000,0,2.202765,1
...,...,...,...,...,...,...
886,2,27.000000,0.000000,0,2.639057,1
887,1,19.000000,0.000000,0,3.433987,1
888,3,29.699118,0.693147,2,3.196630,1
889,1,26.000000,0.000000,0,3.433987,0


## Using a Grid Search CV

Create the param grid for the GridSearchCV

In [16]:
# Search space: 5 thresholds x 2 scaler options x 3 forest sizes = 30 candidates.
# "<step>__<param>" sets a parameter of the named pipeline step, while a bare
# step name ("scaler") replaces the whole step; "passthrough" skips it.
param_grid = dict(
    logger__threshold=[0.5, 1, 1.5, 3, 3.5],
    scaler=[StandardScaler(), "passthrough"],
    estimator__n_estimators=[100, 200, 300],
)

In [17]:
# Exhaustive search over param_grid, scored with 5-fold cross-validation.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,  # use all available cores
    refit=True,  # refit the best candidate on the full data after the search
    return_train_score=True,  # keep train scores to compare against test scores
    verbose=0,
)

In [18]:
X

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,bool_column,unique_column
0,3,22.0,1,0,7.2500,A,dummy
1,1,38.0,1,0,71.2833,B,dummy
2,3,26.0,0,0,7.9250,B,dummy
3,1,35.0,1,0,53.1000,A,dummy
4,3,35.0,0,0,8.0500,B,dummy
...,...,...,...,...,...,...,...
886,2,27.0,0,0,13.0000,B,dummy
887,1,19.0,0,0,30.0000,B,dummy
888,3,,1,2,23.4500,B,dummy
889,1,26.0,0,0,30.0000,A,dummy


In [19]:
# Fit on the raw X (Age still contains NaN), not on the mean-imputed _X.
# NOTE(review): the pipeline has no imputation step — confirm that every step
# is meant to receive NaN values.
grid.fit(X, y)
grid

## Analyzing the results

Uncomment the following line to install the scikit-res library

In [20]:
# !pip install scikit-res

In [21]:
from skres import SkRes

In [22]:
# Wrap the fitted grid search in SkRes and show the first rows of its result table.
# NOTE(review): in the saved output the rows appear ordered by decreasing
# mean_test_score — confirm that SkRes sorts them this way.
res = SkRes(grid)
res.head()

Unnamed: 0,mean_fit_time,mean_score_time,param_estimator__n_estimators,param_logger__threshold,param_scaler,params,mean_test_score,std_test_score,mean_train_score,std_train_score
3,0.3632,0.0267,100,1.0,passthrough,"{'estimator__n_estimators': 100, 'logger__thre...",0.733,0.0535,0.8272,0.0109
20,0.9179,0.044,300,0.5,StandardScaler(),"{'estimator__n_estimators': 300, 'logger__thre...",0.7319,0.0591,0.8288,0.0094
29,0.7104,0.0401,300,3.5,passthrough,"{'estimator__n_estimators': 300, 'logger__thre...",0.7319,0.0532,0.8308,0.0085
5,0.3004,0.02,100,1.5,passthrough,"{'estimator__n_estimators': 100, 'logger__thre...",0.7319,0.0458,0.8224,0.0113
11,0.6183,0.0346,200,0.5,passthrough,"{'estimator__n_estimators': 200, 'logger__thre...",0.7296,0.0567,0.8311,0.007
