In [29]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer, make_column_transformer #to create transformers
from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier

from sklearn import set_config

In [30]:
df = pd.read_csv('./train.csv')
df.drop(['Name', 'Ticket', 'PassengerId', 'Cabin'], axis = 1, inplace = True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [31]:
df.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [33]:
x = df.drop(['Survived'], axis = 1)
y = df.Survived
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)



# column transformer
        
        without transformers, we will have to apply encoders, scalers, etc one by one manually. but we caneasily automate this process using the column transformers.

In [34]:
transformer = ColumnTransformer( transformers=[
    ('simple imputer', SimpleImputer(), ['Age']),
    ('one hot encoding', OneHotEncoder(drop = 'first', sparse_output = False), ['Sex', 'Embarked'])
],
    remainder='passthrough'
)

# ColumnTransformer() -> receives 2 values. first is list of transformers and second is remainder.
        # 'transformers' is a list of tuples where each tuple contains transformer name, instance of function, list of column names.

        # 'remainder' parameter can receive 2 values, 'passthrough' or 'drop'. this parameter specifies what happens to the other columns that are not included in the column transformer. 
            # if remainder = passthrough, nothing happens to the other columns
            # if remainder = drop, other columns are dropped.


# make_Column_transformer() -> this is also used to create transformers. here we dont include transformer name. example : 
"""
            transformer = make_Column_transformer(transformers = [
            (OneHotEncoder(), [2, 3, 4]),
            (MinMaxScaler(), [1, 6])
            ],
            remainder = 'passthrough')
""" 

# NOTE : here we have used column names while creating transformers. but this is not good practice because while using transformers in pipelines, functions like OneHotEncoder() returns numpy array as output so in pipeline the output of one tramsformer is passed to the next transformer so it will raise error if column names are used. so, it is best practice to just use index values.


"\n            transformer = make_Column_transformer(transformers = [\n            (OneHotEncoder(), [2, 3, 4]),\n            (MinMaxScaler(), [1, 6])\n            ],\n            remainder = 'passthrough')\n"

In [35]:
xtrain = transformer.fit_transform(x_train)
xtest = transformer.transform(x_test)

# pipelines

        pipelines are used to chain multiple steps together.
        its easy for us to apply same preprocessing for both train and test datasets.

In [36]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [37]:
# we will create a pipeline that will do the following things:
    # -> handle missing values in Age and Embarked columns
    # -> then perform one hot encoding on Sex and Embarked columns.
    # -> perform min-max scaling
    # -> then select best features using SelectKBest()
    # -> then train decision tree model.

In [38]:
# splitting the data
x = df.drop('Survived', axis = 1)
y = df.Survived
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=1)

In [39]:
xtrain.head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
301,3,male,,2,0,23.25,Q
309,1,female,30.0,0,0,56.9292,C


In [40]:
# transformer to handle missing values using simple imputer.

trf1 = ColumnTransformer(transformers=[
    ('imputing age', SimpleImputer(), [2]),
    ('imputing embarked', SimpleImputer(strategy = 'most_frequent'), [6])
],
remainder='passthrough')

In [41]:
# transformer to perform one hot encoding on Sex and embarked Columns

trf2 = ColumnTransformer(transformers= [
    ('one hot encoding sex and embarked', OneHotEncoder(drop='first', sparse_output=False), [1, 3])
],
remainder='passthrough')

In [42]:
#  transformer to perform min max scaling

trf3 = ColumnTransformer(transformers=[
    ('min max scaling', MinMaxScaler(), slice(0, 9))
],
remainder='passthrough')

In [43]:
# selecting top 6 columns out of 8

trf4 = SelectKBest(score_func=chi2, k=6)

In [44]:
# decision tree model

trf5 = DecisionTreeClassifier()

In [45]:
# creating a pipeline

pipeline = Pipeline([
    ('trf1', trf1),
    ('trf2', trf2),
    ('trf3', trf3),
    ('trf4', trf4),
    ('trf5', trf5),
])

# make_pipeline() : 
        # you can also create a pipeline using make_pipeline().
        # the difference is that we don't include name of the transformer here.

        # example : 
            # pipeline = make_pipeline(trf1, trf2, trf3, trf4, trf5))

In [46]:
set_config(display = 'diagram')  # to see the pipeline
pipeline

In [47]:
pipeline.fit(x_train, y_train)

# the functions like fit() used here depends on the last transformer of the pipeline. in our pipeline, the last transformer is DecisionTreeClassifier() which has fit(), predict(), etc functions so we can use pipeline.fit(), pipeline.perdict(), etc. 
# if our pipeline had last transformer which uses StandardScaler(), then our pipeline would have fit(), transform() and fit_pransform().

In [48]:
pipeline.named_steps #returns all the transformers inside the pipeline.

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('imputing age', SimpleImputer(), [2]),
                                 ('imputing embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('one hot encoding sex and embarked',
                                  OneHotEncoder(drop='first',
                                                sparse_output=False),
                                  [1, 3])]),
 'trf3': ColumnTransformer(remainder='passthrough',
                   transformers=[('min max scaling', MinMaxScaler(),
                                  slice(0, 9, None))]),
 'trf4': SelectKBest(k=6, score_func=<function chi2 at 0x000001A026F86020>),
 'trf5': DecisionTreeClassifier()}

In [49]:
pipeline.named_steps['trf1'].transformers_[1][1] # returns the model used in the transformer. 2ns function in out transformer 'trf1' is SimpleImputer(stratergy = 'most_frequent'). we can see what value did SimpleImputer used to replace Nan values with using 'statistics_' parameter.

In [50]:
pipeline.named_steps['trf1'].transformers_[1][1].statistics_

array(['S'], dtype=object)

In [51]:
y_pred = pipeline.predict(x_test)
print('accuracy score = ',accuracy_score(y_test, y_pred))
print('cross validation score = ', cross_val_score(pipeline, x_train, y_train, cv = 5, scoring = 'accuracy').mean()) # cross_val_score() returns the accuracy of our model by predicting multiple times. 'cv' parameter determines how many times it will predict. 'scoring' parameter speciifies what metrics to use.

accuracy score =  0.7877094972067039
cross validation score =  0.7753077908007485


### hyperparameter tuning using pipelines

In [52]:
# when you use hyperparameters with pipelines, you have to specify each of the parameters in this format : 
        # transformer-name__parameter-name : parameter-values
params = {
    'trf5__max_depth' : [1, 2, 3, 4, 5, None]
}

gcv = GridSearchCV(pipeline, param_grid = params, cv = 5, scoring = 'accuracy')

best_model = gcv.fit(x_train, y_train)

gcv.best_params_


{'trf5__max_depth': 3}

In [53]:
ypred = best_model.predict(x_test)

In [54]:
best_model.score(x_test, y_test)

0.8044692737430168

In [55]:
gcv.best_params_

{'trf5__max_depth': 3}

In [56]:
# updating the parameters in the transformer

pipeline.set_params(**gcv.best_params_)

pipeline.fit(x_train, y_train)
y_pred = pipeline.predict(x_test)
accuracy_score(y_test, y_pred)

0.8044692737430168