In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv("titanic_train.csv")

In [3]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
df.drop(columns=["PassengerId","Name","Ticket","Cabin"], axis = 1, inplace=True)

In [5]:
df.sample(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
493,0,1,male,71.0,0,0,49.5042,C
550,1,1,male,17.0,0,2,110.8833,C
809,1,1,female,33.0,1,0,53.1,S
769,0,3,male,32.0,0,0,8.3625,S
437,1,2,female,24.0,2,3,18.75,S


In [6]:
X = df.drop("Survived", axis = 1)
y = df["Survived"]

In [7]:
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.2500,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.9250,S
3,1,female,35.0,1,0,53.1000,S
4,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,S
887,1,female,19.0,0,0,30.0000,S
888,3,female,,1,2,23.4500,S
889,1,male,26.0,0,0,30.0000,C


In [8]:
X_train,X_test,y_train,y_test   = train_test_split(X,y,test_size=0.2, random_state=2)

In [9]:
numerical_features = ["Age","Fare"]
categorical_features = ["Embarked","Sex"]

In [10]:
scaler = StandardScaler()
ohe = OneHotEncoder(handle_unknown="ignore")

In [11]:
numerical_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median")),("scaling",scaler)])
categorical_transformer = Pipeline(steps=[("imputer",SimpleImputer(strategy="most_frequent")),("encoding", ohe)])

In [12]:
preprocessor = ColumnTransformer(transformers=[("numerical", numerical_transformer,numerical_features),
                                              ("categorical",categorical_transformer,categorical_features)])

In [13]:
clf = Pipeline(steps=[("preprocessing",preprocessor),("LR", LogisticRegression())])

In [14]:
from sklearn import set_config

In [15]:
set_config(display="diagram")
clf

In [16]:
clf.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preprocessing', 'LR', 'preprocessing__n_jobs', 'preprocessing__remainder', 'preprocessing__sparse_threshold', 'preprocessing__transformer_weights', 'preprocessing__transformers', 'preprocessing__verbose', 'preprocessing__numerical', 'preprocessing__categorical', 'preprocessing__numerical__memory', 'preprocessing__numerical__steps', 'preprocessing__numerical__verbose', 'preprocessing__numerical__imputer', 'preprocessing__numerical__scaling', 'preprocessing__numerical__imputer__add_indicator', 'preprocessing__numerical__imputer__copy', 'preprocessing__numerical__imputer__fill_value', 'preprocessing__numerical__imputer__missing_values', 'preprocessing__numerical__imputer__strategy', 'preprocessing__numerical__imputer__verbose', 'preprocessing__numerical__scaling__copy', 'preprocessing__numerical__scaling__with_mean', 'preprocessing__numerical__scaling__with_std', 'preprocessing__categorical__memory', 'preprocessing__categorical__steps', 'preproces

In [20]:
parm_grid = {"preprocessing__numerical__imputer__strategy":["mean","median"],
             "preprocessing__categorical__imputer__strategy":["most_frequent","constant"],
             "LR__C":[1.0, 10, 100]}

In [21]:
grid_search = GridSearchCV(clf,parm_grid,cv = 10)

In [22]:
grid_search.fit(X_train,y_train)

In [23]:
print(f"Best params:")
print(grid_search.best_params_)

Best params:
{'LR__C': 1.0, 'preprocessing__categorical__imputer__strategy': 'most_frequent', 'preprocessing__numerical__imputer__strategy': 'mean'}


In [24]:
print(f"Internal CV score: {grid_search.best_score_:.3f}")

Internal CV score: 0.788


In [26]:
grid_search.get_params().keys()

dict_keys(['cv', 'error_score', 'estimator__memory', 'estimator__steps', 'estimator__verbose', 'estimator__preprocessing', 'estimator__LR', 'estimator__preprocessing__n_jobs', 'estimator__preprocessing__remainder', 'estimator__preprocessing__sparse_threshold', 'estimator__preprocessing__transformer_weights', 'estimator__preprocessing__transformers', 'estimator__preprocessing__verbose', 'estimator__preprocessing__numerical', 'estimator__preprocessing__categorical', 'estimator__preprocessing__numerical__memory', 'estimator__preprocessing__numerical__steps', 'estimator__preprocessing__numerical__verbose', 'estimator__preprocessing__numerical__imputer', 'estimator__preprocessing__numerical__scaling', 'estimator__preprocessing__numerical__imputer__add_indicator', 'estimator__preprocessing__numerical__imputer__copy', 'estimator__preprocessing__numerical__imputer__fill_value', 'estimator__preprocessing__numerical__imputer__missing_values', 'estimator__preprocessing__numerical__imputer__strate

In [31]:
import pandas as pd

cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results = cv_results.sort_values("mean_test_score", ascending=False)
cv_results[['param_LR__C','param_preprocessing__categorical__imputer__strategy','param_preprocessing__numerical__imputer__strategy','mean_test_score']]

Unnamed: 0,param_LR__C,param_preprocessing__categorical__imputer__strategy,param_preprocessing__numerical__imputer__strategy,mean_test_score
0,1,most_frequent,mean,0.787852
1,1,most_frequent,median,0.787852
2,1,constant,mean,0.787852
3,1,constant,median,0.787852
4,10,most_frequent,mean,0.787852
5,10,most_frequent,median,0.787852
6,10,constant,mean,0.787852
7,10,constant,median,0.787852
8,100,most_frequent,mean,0.787852
9,100,most_frequent,median,0.787852


In [30]:
grid_search.cv_results_

{'mean_fit_time': array([0.0308744 , 0.03297584, 0.03290198, 0.03117993, 0.03034165,
        0.03060524, 0.03207486, 0.03150403, 0.02663722, 0.02610569,
        0.02919741, 0.03034379]),
 'std_fit_time': array([0.0066195 , 0.00404976, 0.00375656, 0.0010015 , 0.00075228,
        0.00102709, 0.00199391, 0.00246816, 0.00116946, 0.00072771,
        0.00060419, 0.00047448]),
 'mean_score_time': array([0.00980484, 0.01135664, 0.01063426, 0.00952706, 0.00982699,
        0.00949824, 0.00951657, 0.00943809, 0.00979605, 0.00957301,
        0.00913923, 0.0095221 ]),
 'std_score_time': array([0.00045914, 0.00167465, 0.00253562, 0.000294  , 0.00041867,
        0.00039173, 0.00029825, 0.00034086, 0.00053081, 0.00033062,
        0.00012207, 0.00025166]),
 'param_LR__C': masked_array(data=[1.0, 1.0, 1.0, 1.0, 10, 10, 10, 10, 100, 100, 100, 100],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False],
        fill_value='?',
         