#### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings(action = 'ignore')

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression

#### Importing the Dataset

In [2]:
dataset = pd.read_csv('Titanic.csv')
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
dataset.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
dataset.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)

In [5]:
x = dataset.drop(columns=['Survived'])
y = dataset['Survived']

#### Splitting the data into train and test data

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=7)

In [7]:
x_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
205,3,female,2.0,0,1,10.4625,S
718,3,male,,0,0,15.5,Q
835,1,female,39.0,1,1,83.1583,C
851,3,male,74.0,0,0,7.775,S
773,3,male,,0,0,7.225,C


#### Using Column Transformer for Libraries

In [8]:
numeric_features = ["Age", "Fare"]

In [9]:
transformer1 = ColumnTransformer([
    ('imputer_numerical', SimpleImputer(strategy = 'median'), [2]),
    ('imputer_categorical', SimpleImputer(strategy = 'most_frequent'), [6])
], remainder = 'passthrough')

In [10]:
transformer2 = ColumnTransformer([
    ('scaler', MinMaxScaler(), slice(0,10))
])

In [11]:
transformer3 = ColumnTransformer([
    ('encoding', OneHotEncoder(sparse=False,handle_unknown='ignore'), [1, 6])
], remainder = 'passthrough')

In [12]:
transformer4 = LogisticRegression()

#### Creating Pipeline to pass all Transformers

In [13]:
pipe = Pipeline([
    ('trf1',transformer1),
    ('trf2',transformer3),
    ('trf3',transformer2),
    ('trf4',transformer4)
])

In [14]:
pipe.fit(x_train,y_train)

Pipeline(steps=[('trf1',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('imputer_numerical',
                                                  SimpleImputer(strategy='median'),
                                                  [2]),
                                                 ('imputer_categorical',
                                                  SimpleImputer(strategy='most_frequent'),
                                                  [6])])),
                ('trf2',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('encoding',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  [1, 6])])),
                ('trf3',
                 ColumnTransformer(transformers=[('scaler', MinMaxScaler(),
   

#### Visualising Pipeline flow

In [15]:
from sklearn import set_config
set_config(display='diagram')
pipe

In [16]:
y_pred = pipe.predict(x_test)

#### Calculating accuracy score before using Grid Search CV

In [17]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.6145251396648045

#### Using Grid Search CV to test and use best parameters

In [18]:
param_grid = {
    'trf1__imputer_numerical__strategy': ['mean', 'median'],
    'trf1__imputer_categorical__strategy': ['most_frequent', 'constant'],
    'trf4__C': [0.1, 1.0, 10, 100]
}

grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring= 'accuracy')
grid_search.fit(x_train, y_train)

In [19]:
grid_search.best_score_

0.6419284940411701

In [20]:
grid_search.best_params_

{'trf1__imputer_categorical__strategy': 'most_frequent',
 'trf1__imputer_numerical__strategy': 'mean',
 'trf4__C': 0.1}

#### Listing all the accuracy scores of different parameters

In [21]:
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results = cv_results.sort_values("mean_test_score", ascending=False)
cv_results[['param_trf4__C','param_trf1__imputer_numerical__strategy','param_trf1__imputer_categorical__strategy','mean_test_score']]

Unnamed: 0,param_trf4__C,param_trf1__imputer_numerical__strategy,param_trf1__imputer_categorical__strategy,mean_test_score
0,0.1,mean,most_frequent,0.641928
1,1.0,mean,most_frequent,0.641928
4,0.1,median,most_frequent,0.641928
5,1.0,median,most_frequent,0.641928
8,0.1,mean,constant,0.641928
9,1.0,mean,constant,0.641928
12,0.1,median,constant,0.641928
13,1.0,median,constant,0.641928
2,10.0,mean,most_frequent,0.64052
3,100.0,mean,most_frequent,0.64052
