In [1]:
# Importing the libraries

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import pickle

import warnings
warnings.filterwarnings('ignore')

In [2]:
# importing dataset

df = pd.read_csv('datasets/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Dropping the unnecessary columns

df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [5]:
# Checking for missing values

df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

### Plan for pipleline:

- `Impute` missing values in the columns `Age` and `Embarked` using one `ColumnTransformer`.
- Then we will use the output of the previous transformation as input in the next `ColumnTransformer` to do `OneHotEncode` the columns `Sex` and `Embarked`.
- Then again we will use the output of the second transformation as input in the 3rd `ColumnTransformer` to do `scaling` so all the columns come under same range.
- Then we will do `Feature Selection` to select the best 8 features.
- Then we will create a `DecisionTree` model and train it.

#### Doing train test split

In [6]:
# Creating independent and dependent variables

X = df.drop(columns=['Survived'], axis=1)
y = df['Survived']

In [7]:
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((712, 7), (179, 7))

In [8]:
X_train.head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S


#### Creating 1st transformer

- **It is a good practice to use index of the column instead of column names when creating pipelines**.

In [9]:
# imputation transformer
# here we used the index values for the columns like 'Age' is represented by 2
# we didnot use the column names because the result we get from this transformation is a numpy array 
# and numpy arrays don't have column names
# For 'Age' we use 'mean' to fill the missing values
# For 'Embarked' we will use the 'most_frequent' value appeared in the column i.e. the 'S' value


trf1 = ColumnTransformer([
    ('impute_age', SimpleImputer(), [2]),
    ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6])
], remainder='passthrough')

#### Creating 2nd transformer

- **Here we will use the outputs of the 1st transformer as inputs**

In [10]:
# one hot encoding
# Here we will do one hot encoding for columns 'Sex' and 'Embarked'
# Here we didnot use the parameter drop=first as we are using Decision Trees


trf2 = ColumnTransformer([
    ('ohe_sex_embarked', OneHotEncoder(sparse=False, handle_unknown='ignore'), [1,6])
], remainder='passthrough')

#### Creating 3rd transformer

- **Here we will scale the data**
- Here we are using **MinMaxScaler** because we are doing **Feature Selection**.

In [11]:
# Scaling all the columns
# slice() apply tansformation to all the columns inside the range passed in it
# Here we are using (0,10) as we have 10 columns after the OHE step. 
# Here last value of range is excluded.


trf3 = ColumnTransformer([
    ('scale', MinMaxScaler(), slice(0,10))
])

#### Now doing `Feature Selection`

- Here we dont need to use any **ColumnTransformer**
- Instead we need to use the function **`SelectKBest()`** and need to pass a parameter of **`score_func`**

In [12]:
# Feature selection
# Here k=5 means we will use the top 8 features

fs = SelectKBest(score_func=chi2, k=8)

#### Now creating the model 

In [13]:
dt_clf = DecisionTreeClassifier()

#### Creating the `pipelines`

- Here we need to create a list tuples where the dirst value in the tuple will be the transformation name, and the second value will be the object of that transformation.

In [14]:
# Creating the pipeline object

pipe = Pipeline([
    ('Transformer_1', trf1),
    ('Transformer_2', trf2),
    ('Transformer_3', trf3),
    ('Feature_selection', fs),
    ('model', dt_clf)
])

In [15]:
# Checking the object

pipe

##### We can also do the same using the `make_pipeline`
##### It is more  easier to use, but the `Pipeline` allows us to customize the names of the transformers when creating the pipeline, so later it is a bit easier to identify them to use for various reasons.
##### The same logic applies in case of `ColumnTransformer` vs `make_column_transformer`

In [16]:
# Doing the same using make_pipeline

pipe1 = make_pipeline(trf1, trf2, trf3, fs, dt_clf)
pipe1

In [17]:
# now training the model
# Here we are doing 'fit' and not 'fit_transform' with the training data 
# as we have already created the model in pipeline.
# If the model was not created in the pipeline then we had to do the 'fit_transform'
# So if the model training is not done through the pipeline then we need to use the 'fit_transform'

pipe.fit(X_train,y_train)

#### Some options of the pipe object we created

- After the pipeline object is created, if we press tab after `.` and select `named_steps` then we can see all the steps in the pipeline we created.

In [18]:
# Checking the path followed by the pipeline we created

pipe.named_steps

{'Transformer_1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'Transformer_2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse=False),
                                  [1, 6])]),
 'Transformer_3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'Feature_selection': SelectKBest(k=8, score_func=<function chi2 at 0x00000230932EFAF0>),
 'model': DecisionTreeClassifier()}

In [19]:
# Checking what are the transformers we used in transformer1

pipe.named_steps['Transformer_1'].transformers_

[('impute_age', SimpleImputer(), [2]),
 ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6]),
 ('remainder', 'passthrough', [0, 1, 3, 4, 5])]

In [20]:
# finding the mean age we passed using the simpleimputer

pipe.named_steps['Transformer_1'].transformers_[0][1].statistics_

array([29.49884615])

In [21]:
# now doing Prediction

y_pred = pipe.predict(X_test)

In [22]:
# Checking accuracy of the model

from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the model without tuning is: {(accuracy)*100:.2f}%")

Accuracy of the model without tuning is: 62.57%


**Notes**

- Here we see lower accuracy due to we had droped some columns through **Feature Selection**.

### Cross validate the pipeline

In [23]:
# cross validation using cross_val_score

from sklearn.model_selection import cross_val_score

cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy').mean()

0.6391214419383433

In [24]:
pipe.get_feature_names_out

<bound method Pipeline.get_feature_names_out of Pipeline(steps=[('Transformer_1',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('impute_age', SimpleImputer(),
                                                  [2]),
                                                 ('impute_embarked',
                                                  SimpleImputer(strategy='most_frequent'),
                                                  [6])])),
                ('Transformer_2',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ohe_sex_embarked',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  [1, 6])])),
                ('Transformer_3',
                 ColumnTransformer(transformers=[('scale', MinMaxScaler(),

### Doing hyper parameter tuning using `Gridsearchcv`

In [25]:
# Remember here in pipeline we need to pass the modelname along with '__' with the parameter name
# As here model name is 'model' that we give when we create the pipeline object and parameter is 'max_depth'

params = {
    'model__max_depth':[1, 2, 3, 4, 5, None]
}

In [26]:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

In [27]:
# getting the best parameters

grid.best_params_

{'model__max_depth': 2}

- After that we can create a best fit model and save it for future use.
- Here we don't need to pass all the other objects separately as the pipe is already have all of them.

In [28]:
# Exporting the model

try:
    pickle.dump(pipe, open('models/pipe_model.pkl', 'wb'))

except Exception as err:
    print("Error is: ", err)
else:
    print("Model saved succesfully.")

Model saved succesfully.


### Now trying with new user input data

In [29]:
# loading the model

pipe = pickle.load(open('models/pipe_model.pkl','rb'))

In [30]:
# User input

test_input = np.array([1, 'male', 45.5, 0, 0, 28.5, 'S'], dtype=object).reshape(1,7)
test_input

array([[1, 'male', 45.5, 0, 0, 28.5, 'S']], dtype=object)

In [31]:
pipe.predict(test_input)

array([0], dtype=int64)

**Notes:**

- So here we can see that we need to do very less coding.