In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import chi2,SelectKBest
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import MinMaxScaler


In [41]:
df = pd.read_csv('titanic.csv')
df.drop(columns= ['PassengerId','Name','Ticket','Cabin'], inplace= True)
df.sample(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
400,1,1,female,30.0,0,0,164.8667,S
354,1,3,female,0.17,1,2,20.575,S
20,0,1,male,55.0,1,0,59.4,C
225,1,3,female,,0,2,15.2458,C
18,1,3,female,27.0,1,0,7.925,S


In [42]:
df.isnull().sum()

Survived     0
Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64

In [43]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['Survived']),df['Survived'],test_size=0.2)
X_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
41,1,male,,0,0,26.5500,S
76,3,male,,0,0,8.0500,S
342,3,male,,1,9,69.5500,S
141,1,female,33.0,0,0,151.5500,S
226,3,male,23.0,0,0,7.7958,S
...,...,...,...,...,...,...,...
0,3,male,34.5,0,0,7.8292,Q
388,3,male,21.0,0,0,7.7500,Q
312,3,male,,0,0,7.5750,S
409,3,female,3.0,1,1,13.7750,S


In [44]:
#Column Transformer
trf1 = ColumnTransformer([
    ('impute_age',SimpleImputer(),[2]), # 2 is column no.
    ('impute_fare',SimpleImputer(strategy= 'most_frequent'),[5])
],remainder= 'passthrough')

In [45]:
# One Hot encoder
trf2 = ColumnTransformer([
   ('ohe_sex_embarked',OneHotEncoder(sparse_output= False, handle_unknown= 'ignore'),[1,6]) 
],remainder= 'passthrough')


In [46]:
# Scaling
trf3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,10)) # 1 to 10 means 10 columns would be made after splitting 
])

In [47]:
# Feature Selection
trf4 = SelectKBest(score_func= chi2, k=8) # 8 means 8 best features would be selected from 10

In [48]:
# Decision tree
trf5 = DecisionTreeClassifier()

Pipelining

In [49]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5)
])

In [50]:
# Alternate syntax:
#pipe = make_pipeline(trf1,trf2,trf3,trf4,trf5)

In [51]:
#train
pipe.fit(X_train,y_train)

Explore the Pipeline:

In [52]:
pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_fare',
                                  SimpleImputer(strategy='most_frequent'),
                                  [5])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 'trf3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'trf4': SelectKBest(k=8, score_func=<function chi2 at 0x13ebb7380>),
 'trf5': DecisionTreeClassifier()}

In [53]:
# display pipeline 
from sklearn import set_config
set_config(display= 'diagram')

In [55]:
# predict
y_pred = pipe.predict(X_test)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [56]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.6190476190476191

In [57]:
# GridSearch using the pipeline
params = {
    'trf5__max_depth':[1,2,3,4,5,None]
}

In [59]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe,params,cv=5,scoring='accuracy')
grid.fit(X_train,y_train)

In [60]:
# Exporting the pipeline
import pickle
pickle.dump(pipe,open('pipe.pkl','wb'))