<p>A pipeline chains together multiple steps so that the output of each step is used as the input to the next step.</p>
<p>Pipelines make it easy to apply the same preprocessing to both the training and the test data.</p>

# Training model without pipeline

In [None]:
# Loading dataset
import numpy as np
import pandas as pd

# Read the training CSV from the working directory and preview its first two rows
csv_path = 'train.csv'
df = pd.read_csv(csv_path)
df.head(n=2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [None]:
# Dropping not needed columns.
# Rebind `df` rather than mutating it in place, and ignore labels that are already
# gone, so that re-running this cell does not raise a KeyError.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')

In [None]:
# Count the missing entries in every column
df.isna().sum()

Unnamed: 0,0
Survived,0
Pclass,0
Sex,0
Age,177
SibSp,0
Parch,0
Fare,0
Embarked,2


In [None]:
# Train test split
from sklearn.model_selection import train_test_split
# Features are every column except the target; the target is the 'Survived' column
X = df.drop('Survived', axis=1)
Y = df['Survived']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=2, test_size=0.2
)

In [None]:
# Handling missing values of age and embarked
from sklearn.impute import SimpleImputer

# Age is filled with the column mean, Embarked with its most frequent value
si_age = SimpleImputer(strategy='mean')
si_embarked = SimpleImputer(strategy='most_frequent')

# Learn the fill values from the training split only ...
si_age.fit(X_train[['Age']])
si_embarked.fit(X_train[['Embarked']])

# ... then apply the same fill values to both splits
X_train_age, X_test_age = (
    si_age.transform(part[['Age']]) for part in (X_train, X_test)
)
X_train_embarked, X_test_embarked = (
    si_embarked.transform(part[['Embarked']]) for part in (X_train, X_test)
)

In [None]:
# Applying one hot encoding on sex and Embarked
from sklearn.preprocessing import OneHotEncoder

# Fit one dense encoder per categorical column on the training split only.
# Categories unseen during fit are encoded as all zeros (handle_unknown='ignore').
ohe_sex = OneHotEncoder(handle_unknown='ignore', sparse_output=False).fit(X_train[['Sex']])
ohe_embarked = OneHotEncoder(handle_unknown='ignore', sparse_output=False).fit(X_train_embarked)

# Sex is encoded straight from the raw frames (it has no missing values)
X_train_sex = ohe_sex.transform(X_train[['Sex']])
X_test_sex = ohe_sex.transform(X_test[['Sex']])

# NOTE: these two lines rebind X_train_embarked / X_test_embarked from the imputed
# labels to their one-hot encoding, so re-run the imputation cell before re-running
# this one.
X_train_embarked = ohe_embarked.transform(X_train_embarked)
X_test_embarked = ohe_embarked.transform(X_test_embarked)

In [None]:
# Remove the raw columns that now exist separately in imputed / encoded form
transformed_cols = ['Sex', 'Age', 'Embarked']
X_train_rem = X_train.drop(transformed_cols, axis=1)
X_test_rem = X_test.drop(transformed_cols, axis=1)

In [None]:
# Stack the pieces side by side: untouched columns, imputed age, encoded sex,
# encoded embarked. The same order must be used for any later prediction input.
X_train_transformed = np.hstack((X_train_rem, X_train_age, X_train_sex, X_train_embarked))
X_test_transformed = np.hstack((X_test_rem, X_test_age, X_test_sex, X_test_embarked))

In [None]:
# Applying decision tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Seed the tree: scikit-learn permutes the candidate features at each split, so an
# unseeded classifier can report a different accuracy on every run.
clf = DecisionTreeClassifier(random_state=2)
clf.fit(X_train_transformed, Y_train)

# Score the fitted tree on the held-out split
y_pred = clf.predict(X_test_transformed)
accuracy_score(Y_test, y_pred)

0.776536312849162

In [None]:
# Exporting sex and embarked OHE and decision tree model
# NOTE: the fitted imputers (si_age, si_embarked) are not exported here, so inputs
# at prediction time must not contain missing values.
import pickle
import os

# Create the models directory if it doesn't exist (no-op when it is already there)
os.makedirs('models', exist_ok=True)

# A context manager closes each handle, so every pickle is fully flushed to disk
for artifact, path in [
    (ohe_sex, 'models/ohe_sex.pkl'),
    (ohe_embarked, 'models/ohe_embarked.pkl'),
    (clf, 'models/clf.pkl'),
]:
    with open(path, 'wb') as fh:
        pickle.dump(artifact, fh)

## Predict without pipeline

In [None]:
# Loading models
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
# Context managers make sure each file handle is closed after reading.
with open('models/ohe_sex.pkl', 'rb') as fh:
    ohe_sex = pickle.load(fh)
with open('models/ohe_embarked.pkl', 'rb') as fh:
    ohe_embarked = pickle.load(fh)
with open('models/clf.pkl', 'rb') as fh:
    clf = pickle.load(fh)

In [None]:
# One passenger as a single-row object array, in the feature order of X:
# Pclass, Sex, Age, SibSp, Parch, Fare, Embarked
test_input = np.array([[2, 'male', 31.0, 0, 0, 10.5, 'S']], dtype=object)
test_input

array([[2, 'male', 31.0, 0, 0, 10.5, 'S']], dtype=object)

In [None]:
# One-hot encode the Sex value; indexing with a list keeps the (1, 1) 2-D shape
test_input_sex = ohe_sex.transform(test_input[:, [1]])
test_input_sex



array([[0., 1.]])

In [None]:
# One-hot encode the Embarked value (last column); list indexing keeps it 2-D
test_input_embarked = ohe_embarked.transform(test_input[:, [-1]])
test_input_embarked

array([[0., 0., 1.]])

In [None]:
# Assemble the row in the same order used for training:
# untouched columns (indices 0, 3, 4, 5), then age, encoded sex, encoded embarked
test_input_age = test_input[:, [2]]
test_input_rem = test_input[:, [0, 3, 4, 5]]
test_input_transformed = np.hstack(
    (test_input_rem, test_input_age, test_input_sex, test_input_embarked)
)

In [None]:
# Predict the `Survived` label for the single assembled input row
clf.predict(test_input_transformed)

array([0])

# Training model with pipeline

In [None]:
# Loading dataset
import numpy as np
import pandas as pd

# Reload the raw training CSV so the pipeline section starts from untouched data
csv_path = 'train.csv'
df = pd.read_csv(csv_path)
df.head(n=2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [None]:
# Dropping not needed columns.
# Rebind `df` rather than mutating it in place, and ignore labels that are already
# gone, so that re-running this cell does not raise a KeyError.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')

<p>Our pipeline:</p>
<p>Data -> Missing values -> OHE -> Scaling -> Feature selection (top 10) -> Decision tree -> Output</p>

In [None]:
# Train test split
from sklearn.model_selection import train_test_split
# Features are every column except the target; the target is the 'Survived' column
X = df.drop('Survived', axis=1)
Y = df['Survived']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=2, test_size=0.2
)

In [None]:
# Imputation transformer (pass index of col. instead of col_name, since the op is numpy array)
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Input column order: Pclass(0), Sex(1), Age(2), SibSp(3), Parch(4), Fare(5), Embarked(6).
# ColumnTransformer emits the listed transformers' outputs first and the passthrough
# remainder last, so the output order becomes:
# Age(0), Embarked(1), Pclass(2), Sex(3), SibSp(4), Parch(5), Fare(6).
age_step = ('impute_age', SimpleImputer(strategy='mean'), [2])
embarked_step = ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6])

trf1 = ColumnTransformer(
    transformers=[age_step, embarked_step],
    remainder='passthrough',
)

In [None]:
# One hot encoding transformer
from sklearn.preprocessing import OneHotEncoder

# The imputation step reorders the columns: its outputs come first and the
# passthrough remainder follows, giving
# Age(0), Embarked(1), Pclass(2), Sex(3), SibSp(4), Parch(5), Fare(6).
# The categorical columns to encode are therefore at positions 1 (Embarked) and
# 3 (Sex). Using the original positions [1, 6] would one-hot encode Fare instead
# of Sex.
trf2 = ColumnTransformer([
    ('ohe_sex_embarked', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [1, 3])
], remainder='passthrough')

In [None]:
# Scaling transformer
# Expected width after the encoding step: 7 input features - 2 categorical columns
# + 2 (Sex one-hot) + 3 (Embarked one-hot) = 10 columns, i.e. positions 0..9,
# which is exactly what slice(0, 10) selects.
# No `remainder` is given, so ColumnTransformer drops any column outside the slice.
from sklearn.preprocessing import MinMaxScaler

trf3 = ColumnTransformer([
    ('scale', MinMaxScaler(), slice(0, 10))
])

In [None]:
# Feature selection: keep the 10 columns with the highest chi-squared score
# against the target. If the scaling step emits exactly 10 columns, this step
# keeps all of them; lower k to actually prune features.
from sklearn.feature_selection import SelectKBest, chi2

trf4 = SelectKBest(chi2, k=10)

In [None]:
# Final estimator (only instantiated here; it is trained when the pipeline is fit)
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so repeated runs of the pipeline give the same scores
trf5 = DecisionTreeClassifier(random_state=2)

In [None]:
# Create Pipeline
# Pipeline -> requires naming of steps
from sklearn.pipeline import Pipeline, make_pipeline

# Pair each explicit step name with its transformer / estimator, in execution order.
# These names are what named_steps[...] and '<step>__<param>' grid keys refer to.
step_names = ['trf1', 'trf2', 'trf3', 'trf4', 'trf5']
step_objects = [trf1, trf2, trf3, trf4, trf5]

pipe = Pipeline(steps=list(zip(step_names, step_objects)))

In [None]:
# make_pipeline -> doesn't require naming of steps; names are generated from the
# lowercased class names instead (not 'trf1' ... 'trf5').
# It is bound to a separate variable on purpose: rebinding `pipe` here would break
# the later cells that look up pipe.named_steps['trf1'] and tune 'trf5__max_depth'.
# NOTE: this pipeline shares the same step objects as `pipe` and is shown only as
# an alternative construction; it is not fitted.
pipe_auto = make_pipeline(trf1, trf2, trf3, trf4, trf5)

In [None]:
# Fit every step in order on the training split
pipe.fit(X=X_train, y=Y_train)

# Note: if the pipeline had only imputation, OHE and scaling (no final estimator),
# you would call pipe.fit_transform(X_train) instead.

In [None]:
# List the steps in our pipeline, keyed by step name
pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 'trf3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 11, None))]),
 'trf4': SelectKBest(score_func=<function chi2 at 0x7c71626d8f40>),
 'trf5': DecisionTreeClassifier()}

In [None]:
# List the fitted transformers in trf1 as (name, transformer, columns) tuples;
# the passthrough columns show up as a final 'remainder' entry
pipe.named_steps['trf1'].transformers_

[('impute_age', SimpleImputer(), [2]),
 ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6]),
 ('remainder',
  FunctionTransformer(accept_sparse=True, check_inverse=False,
                      feature_names_out='one-to-one'),
  [0, 1, 3, 4, 5])]

In [None]:
# SimpleImputer object of impute_embarked: entry 1 of transformers_, and element 1
# of that (name, transformer, columns) tuple
pipe.named_steps['trf1'].transformers_[1][1]

In [None]:
# Get the mode learned by impute_embarked (the value used to fill missing Embarked)
pipe.named_steps['trf1'].transformers_[1][1].statistics_

array(['S'], dtype=object)

In [None]:
# Get the mean learned by impute_age (the value used to fill missing Age)
pipe.named_steps['trf1'].transformers_[0][1].statistics_

array([29.78590426])

In [None]:
# Predict on the held-out split; the pipeline applies every fitted step to X_test
# before the final estimator predicts
y_pred = pipe.predict(X_test)

In [None]:
# Accuracy check
from sklearn.metrics import accuracy_score
# Fraction of held-out rows whose predicted label matches the true label
accuracy_score(y_true=Y_test, y_pred=y_pred)

0.6368715083798883

In [None]:
# Cross Validation using pipeline
from sklearn.model_selection import cross_val_score
# Each of the 5 folds refits a clone of the whole pipeline, so the preprocessing
# is learned from that fold's training part only
cv_scores = cross_val_score(pipe, X_train, Y_train, scoring='accuracy', cv=5)
cv_scores.mean()

np.float64(0.6151876292721363)

In [None]:
# GridSearchCV using pipeline
from sklearn.model_selection import GridSearchCV

# '<step name>__<parameter>' addresses a parameter of one pipeline step;
# here: max_depth of the step named 'trf5'
params = dict(trf5__max_depth=[1, 2, 3, 4, 5, None])

grid = GridSearchCV(estimator=pipe, param_grid=params, scoring='accuracy', cv=5)
grid.fit(X_train, Y_train)

In [None]:
# Best mean cross-validated accuracy found by the grid search
grid.best_score_

np.float64(0.6151876292721363)

In [None]:
# Parameter combination that achieved the best cross-validated score
grid.best_params_

{'trf5__max_depth': 1}

In [None]:
# Exporting the pipeline
# NOTE: this saves `pipe` as fitted above, not the tuned `grid.best_estimator_`.
import os
import pickle

# Make sure the target directory exists even if the earlier export cell was skipped
os.makedirs('models', exist_ok=True)

# The context manager closes the handle so the pickle is fully flushed to disk
with open('models/pipe.pkl', 'wb') as fh:
    pickle.dump(pipe, fh)

## Predict with pipeline

In [None]:
# Loading pipeline
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
# The context manager makes sure the file handle is closed after reading.
with open('models/pipe.pkl', 'rb') as fh:
    pipe = pickle.load(fh)

In [None]:
# One passenger as a single-row object array, in the feature order of X:
# Pclass, Sex, Age, SibSp, Parch, Fare, Embarked
test_input = np.array([[2, 'male', 31.0, 0, 0, 10.5, 'S']], dtype=object)
test_input

array([[2, 'male', 31.0, 0, 0, 10.5, 'S']], dtype=object)

In [None]:
# Predict output: the raw row goes straight in, and the loaded pipeline runs all of
# its preprocessing steps before the final estimator predicts
pipe.predict(test_input)



array([0])