In [27]:
import pandas as pd 
import numpy as np 

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline 
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler, OneHotEncoder 
from sklearn.linear_model import LogisticRegression 

In [28]:
# Read the raw passenger table from disk and preview its first rows.
df = pd.read_csv(filepath_or_buffer='titanic_5000_missing.csv')
df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3.0,female,,0.0,2.0,67.368005,S
1,1,3.0,male,56.001493,1.0,5.0,68.666394,S
2,0,3.0,female,17.279829,0.0,1.0,136.228014,Q
3,0,3.0,female,35.857144,,1.0,108.507546,S
4,0,3.0,,6.135062,0.0,1.0,80.157715,C


In [29]:
# Share of missing values per column, expressed as a percentage.
df.isna().mean().mul(100)

Survived     0.00
Pclass       9.72
Sex          9.72
Age         10.28
SibSp        9.96
Parch        9.60
Fare        10.38
Embarked    10.06
dtype: float64

In [30]:
# 'Survived' is the prediction target; every other column is a feature.
y = df['Survived']
x = df.drop(columns='Survived')

In [31]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=2,
)

In [32]:
# Quick look at the training features after the split.
x_train.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
2416,2.0,male,62.315997,5.0,2.0,199.242458,Q
2417,2.0,female,39.426319,,3.0,,C
2513,3.0,male,21.178834,1.0,,124.210588,C
1698,,female,8.963769,1.0,0.0,106.511171,Q
3322,2.0,female,35.363436,4.0,5.0,157.126101,Q


In [40]:
# Column groups handled by each preprocessing branch.
numerical_col = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
categorical_features = ['Sex', 'Embarked']

# Numeric branch: fill gaps with the column median, then standardize.
# The step name 'imputer' is addressed by the grid-search keys, so keep it stable.
numerical_Transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown='ignore' is set on the encoder for unseen categories.
categorical_Transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

In [41]:
# Route each column group through its own branch. The branch names
# 'num' and 'cat' are part of the parameter paths used in the grid search.
preprocessor = ColumnTransformer([
    ('num', numerical_Transformer, numerical_col),
    ('cat', categorical_Transformer, categorical_features),
])

In [42]:
# End-to-end model: preprocessing followed by a logistic-regression classifier.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

In [43]:
# Candidate settings explored by the grid search. Each key is a
# '<step>__<sub-step>__<parameter>' path into the `clf` pipeline.
num_impute_strategies = ['mean', 'median']
cat_impute_strategies = ['most_frequent', 'constant']
inverse_reg_strengths = [0.1, 1.0, 100]

param_grid = {
    'preprocessor__num__imputer__strategy': num_impute_strategies,
    'preprocessor__cat__imputer__strategy': cat_impute_strategies,
    'classifier__C': inverse_reg_strengths,
}

In [44]:
# Search over param_grid with 10-fold cross-validation on the training split.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=10,
    n_jobs=-1,
)
grid_search.fit(x_train, y_train)

# Report the winning parameter combination.
print("Best Params:", grid_search.best_params_, sep="\n")

Best Params:
{'classifier__C': 100, 'preprocessor__cat__imputer__strategy': 'constant', 'preprocessor__num__imputer__strategy': 'mean'}
