In [389]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier

In [390]:
df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [391]:
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [392]:
y = df.Survived
X = df.drop("Survived", axis=1)

In [393]:
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [394]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [395]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
140,3,female,,0,2,15.2458,C
439,2,male,31.0,0,0,10.5,S
817,2,male,31.0,1,1,37.0042,C
378,3,male,20.0,0,0,4.0125,C
491,3,male,21.0,0,0,7.25,S


In [396]:
# # Define categorical and numerical cols
# categorical_cols = ["Age", "Embarked"]
# numerical_cols = ['Age', 'Fare', 'Pclass', 'SibSp', 'Parch']

In [397]:
# # imputation transformer, fills in the missing values and sends the new columns for the next steps
# inputation_transformer = ColumnTransformer(transformers=[
#     ('meanImpute', SimpleImputer(), [2]),
#     ('most_frequent', SimpleImputer(strategy='most_frequent'), [6])
# ], remainder='passthrough')

In [398]:
# # OHE transformer, transforms the categorical columns into numerical columns and sends the columns for the next process
# ohe_transformer = ColumnTransformer(transformers=[
#     ('ohe_sex_embarked', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [1, 6])
# ], remainder='passthrough')

In [399]:
# # Scaler transformer, scales all the columns. denoted by (0, 11) since there are 11 columns in our df now.
# scaled_transformer = ColumnTransformer(transformers=[
#     ('scaling', MinMaxScaler(), slice(0, 10))
# ], remainder='passthrough')

In [400]:
# feature selection
# feature_selection = SelectKBest(score_func=chi2, k=8)

In [401]:
# initialize the model
# model = DecisionTreeClassifier()

In [402]:
# create a pipeline, connect all the transformers

In [403]:
# pipeline = Pipeline([
#     ('trnf1', inputation_transformer),
#     ('trnf2', ohe_transformer),
#     ('trnf3', scaled_transformer),
#     ('trnf4', feature_selection),
#     ('trnf5', model)
# ])

In [404]:
# imputation transformer
trf1 = ColumnTransformer([
    ('impute_age', SimpleImputer(), [2]),
    ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6])
], remainder='passthrough')
 
# one hot encoding
trf2 = ColumnTransformer([
    ('ohe_sex_embarked', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [1, 6])
], remainder='passthrough')

# Scaling
trf3 = ColumnTransformer([
    ('scale', MinMaxScaler(), slice(0,10))
])

# Feature selection
trf4 = SelectKBest(score_func=chi2, k=8)

# train the model
trf5 = DecisionTreeClassifier()

pipe = Pipeline([
    ('trf1', trf1),
    ('trf2', trf2),
    ('trf3', trf3),
    ('trf4', trf4),
    ('trf5', trf5)
])

In [405]:
pipe.fit(X_train, y_train)

In [406]:
pred = pipe.predict(X_test)

In [407]:
pred.shape

(179,)

In [408]:
accuracy = accuracy_score(y_test, pred)
accuracy

0.6759776536312849

In [409]:
cross_val = cross_val_score(pipe, X_train, y_train, cv=6, scoring='accuracy').mean()

In [410]:
cross_val

0.6250415420405451

In [411]:
pipe.named_steps['trf1'].transformers_[0][1].statistics_

array([29.74518389])

How this works: pipe.named_steps gives list of all transformers, by using ['trnf'] we select which transformer we want and then apply transformers steps which returns all the transformers inside that list. Then [0] refers to the first item in the list and inside the [1] selects the second element and .statistics_ returns the stats. In this case, it was the SimpleImputer so it gave a mean of 29.745... .

In [413]:
pipe.named_steps['trf2'].transformers_[0][1]

<h4 style='color:red'>Pipeline V/S make_pipeline</h4>

<h5>Pipeline requires naming of the pipeline steps, for ex: 'trnf1', 'trnf2' but make_pipeline doesn't, making the code less short. (Similar logic applies to ColumnTransformer V/S make_column_transformer)</h5>

In [416]:
# pipe = make_pipeline(inputation_transformer, ohe_transformer, scaled_transformer, feature_selection, model)

import pickle

In [446]:
import pickle
pickle.dump(pipe, open('pipe.pkl', 'wb'))