In [2]:
import os
import gc
import joblib
import numpy as np
import pandas as pd
import mlflow
import shap
import catboost as cat
import xgboost as xgb
import featuretools as ft
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
from typing import Tuple, Callable
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder 
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.impute import KNNImputer

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [3]:
# --- Column names of the Titanic dataset ---------------------------------
PASSENGER_ID = "PassengerId"
SURVIVED = "Survived"  # used as the target column in the cells below
PCLASS = "Pclass"
NAME = "Name"
SEX = "Sex"
AGE = "Age"
SIBSP = "SibSp"
PARCH = "Parch"
TICKET = "Ticket"
FARE = "Fare"
CABIN = "Cabin"
EMBARKED = "Embarked"

# Columns that are run through the categorical encoders.
CAT_FEATURES = [
    PCLASS,
    SEX,
    TICKET,
    CABIN,
    EMBARKED,
]
# NOTE(review): presumably featuretools transform primitive names; not
# referenced in the visible cells -- confirm before relying on them.
TRANS_PRIMITIVES = ["multiply_numeric", "divide_numeric"]

# --- Placeholder values ---------------------------------------------------
# DEFAULT_NAN_NAME is what the categorical encoders substitute for missing
# values; the other placeholders are not referenced in the visible cells.
DEFAULT_OTHER_NAME = "Other"
DEFAULT_NAN_NAME = "NaN"
DEFAULT_OTHER_FOR_NUMERIC = -99999
DEFAULT_NAN_FOR_NUMERIC = -88888

# --- File locations (relative to the notebook's working directory) --------
TRAIN_FILE = "data/train.csv"
TEST_FILE = "data/test.csv"
REPORT_PATH = "reports/titanic_report.html"

In [40]:
# Load the raw training set from the CSV path configured in TRAIN_FILE.
train = pd.read_csv(TRAIN_FILE)

In [41]:
# Preview the first rows of the freshly loaded training frame.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
def fill_numeric_knn(df: pd.DataFrame, imputer_params: dict = None) -> np.ndarray:
    """Impute missing values in ``df`` with scikit-learn's ``KNNImputer``.

    ``df`` is handed to the imputer unchanged, so it is expected to contain
    only numeric columns (the notebook encodes categoricals beforehand).

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose missing values should be imputed. It is not modified.
    imputer_params : dict, optional
        Keyword arguments forwarded to ``KNNImputer``. When omitted, the
        defaults are ``n_neighbors=5``, ``metric="nan_euclidean"`` and
        ``weights="uniform"``.

    Returns
    -------
    np.ndarray
        Whatever ``KNNImputer.fit_transform`` returns -- a plain NumPy array
        under scikit-learn's default output configuration, so column labels
        are lost and must be restored by the caller.
    """
    # A fresh dict per call: a dict literal in the signature would be one
    # shared object across all calls.
    if imputer_params is None:
        imputer_params = {"n_neighbors": 5, "metric": "nan_euclidean", "weights": "uniform"}
    knn_imputer = KNNImputer(**imputer_params)
    transformed_df = knn_imputer.fit_transform(df)
    return transformed_df

# with Label Encoder    
def fill_categorical_label(dataset: pd.DataFrame, column_name: str, fillna=True):
    """Label-encode one column of ``dataset`` in place.

    Parameters
    ----------
    dataset : pd.DataFrame
        Frame to modify; ``dataset[column_name]`` is overwritten with
        integer codes.
    column_name : str
        Name of the column to encode.
    fillna : bool, default True
        When true, missing values are first replaced by ``DEFAULT_NAN_NAME``
        so that they receive their own code.

    Returns
    -------
    LabelEncoder
        The fitted encoder, so the same mapping can be reused or inverted.
    """
    if fillna:
        # Assign the filled column back explicitly. Calling
        # ``dataset[col].fillna(..., inplace=True)`` operates on a selected
        # column, which is not guaranteed to write through to ``dataset``
        # (it does not under pandas Copy-on-Write).
        dataset[column_name] = dataset[column_name].fillna(value=DEFAULT_NAN_NAME)
    label_encoder = LabelEncoder()
    # fit_transform learns the sorted set of distinct values and returns the
    # code of every row in one step.
    dataset[column_name] = label_encoder.fit_transform(dataset[column_name])
    return label_encoder

#Target Encoder
def fill_categorical_target(dataset: pd.DataFrame, column_name: str, target_column: str, fillna=True):
    """Target-encode one column of ``dataset`` in place.

    Parameters
    ----------
    dataset : pd.DataFrame
        Frame to modify; must contain both ``column_name`` and
        ``target_column``. ``dataset[column_name]`` is overwritten with the
        encoder's output.
    column_name : str
        Name of the column to encode.
    target_column : str
        Name of the column holding the target the encoding is fitted on.
    fillna : bool, default True
        When true, missing values are first replaced by ``DEFAULT_NAN_NAME``.

    Returns
    -------
    TargetEncoder
        The encoder fitted on ``dataset[column_name]``.

    Notes
    -----
    The encoder is fitted on every row of ``dataset``, so the encoded values
    carry target information from all rows passed in.
    NOTE(review): the notebook output shows ``Pclass`` unchanged after this
    call, presumably because ``TargetEncoder`` only encodes non-numeric
    columns by default -- confirm against the category_encoders docs.
    """
    if fillna:
        # Assign the filled column back explicitly. Calling
        # ``dataset[col].fillna(..., inplace=True)`` operates on a selected
        # column, which is not guaranteed to write through to ``dataset``
        # (it does not under pandas Copy-on-Write).
        dataset[column_name] = dataset[column_name].fillna(value=DEFAULT_NAN_NAME)
    target_encoder = TargetEncoder()
    dataset[column_name] = target_encoder.fit_transform(dataset[column_name], dataset[target_column])
    return target_encoder
    

In [42]:
# Remove the identifier and free-text name columns. Rebinding with
# errors="ignore" (instead of an in-place drop) keeps this cell re-runnable:
# a second execution no longer raises KeyError for already-removed columns.
train = train.drop(columns=[PASSENGER_ID, NAME], errors="ignore")
y_train = train[SURVIVED]

# Variant 1: categorical columns replaced by integer label codes.
label_encoded_train = train.copy()
for cat_feature in CAT_FEATURES:
    fill_categorical_label(label_encoded_train, cat_feature)

# Variant 2: categorical columns replaced by target (Survived) encodings.
# The target column is still present in the copy because the encoder needs it.
target_encoded_train = train.copy()
for cat_feature in CAT_FEATURES:
    fill_categorical_target(target_encoded_train, cat_feature, SURVIVED)

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.
is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead


In [43]:
# Count the remaining missing values per column after target encoding.
target_encoded_train.isna().sum()

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin         0
Embarked      0
dtype: int64

In [47]:
# Fill Age
# Remove the target before imputing so it is not used as a neighbour feature.
# Rebinding with errors="ignore" (instead of an in-place drop) keeps this cell
# re-runnable: a second execution no longer raises KeyError for "Survived".
label_encoded_train = label_encoded_train.drop(columns=SURVIVED, errors="ignore")
target_encoded_train = target_encoded_train.drop(columns=SURVIVED, errors="ignore")
# The imputer output is wrapped in a new frame; column labels are restored in
# the following cell.
filled_label_train = pd.DataFrame(fill_numeric_knn(label_encoded_train))
filled_target_train = pd.DataFrame(fill_numeric_knn(target_encoded_train))

In [48]:
# Put the original column labels back on both imputed frames.
(filled_label_train.columns,
 filled_target_train.columns) = (label_encoded_train.columns,
                                 target_encoded_train.columns)

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [49]:
# Preview the label-encoded frame after KNN imputation.
filled_label_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,2.0,1.0,22.0,1.0,0.0,523.0,7.25,146.0,3.0
1,0.0,0.0,38.0,1.0,0.0,596.0,71.2833,81.0,0.0
2,2.0,0.0,26.0,0.0,0.0,669.0,7.925,146.0,3.0
3,0.0,0.0,35.0,1.0,0.0,49.0,53.1,55.0,3.0
4,2.0,1.0,35.0,0.0,0.0,472.0,8.05,146.0,3.0


In [50]:
# Preview the target-encoded frame after KNN imputation.
filled_target_train.head()

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3.0,0.188908,22.0,1.0,0.0,0.383838,7.25,0.299854,0.336957
1,1.0,0.742038,38.0,1.0,0.0,0.383838,71.2833,0.383838,0.553571
2,3.0,0.742038,26.0,0.0,0.0,0.383838,7.925,0.299854,0.336957
3,1.0,0.742038,35.0,1.0,0.0,0.468759,53.1,0.468759,0.336957
4,3.0,0.188908,35.0,0.0,0.0,0.383838,8.05,0.299854,0.336957


In [51]:
# Check how many missing values remain per column in the label-encoded frame after imputation.
filled_label_train.isna().sum()

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Cabin       0
Embarked    0
dtype: int64

In [52]:
# Check how many missing values remain per column in the target-encoded frame after imputation.
filled_target_train.isna().sum()

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Cabin       0
Embarked    0
dtype: int64

In [64]:
# Hyperparameters for the RandomForestClassifier used in the cross-validation
# cells below.
# NOTE(review): no random_state is set, so scores differ between runs.
rfc_params = {
    "n_estimators": 10,
    "criterion": "gini",
    "min_samples_split": 10,
    # "sqrt" is what "auto" meant for classifiers; "auto" was deprecated in
    # scikit-learn 1.1 and removed in 1.3, where it raises an error.
    "max_features": "sqrt",
    "bootstrap": True,
    "n_jobs": -1
}


In [65]:
# Unfitted estimator built from rfc_params; it is passed to cross_validate in the cells below.
rfc = RandomForestClassifier(**rfc_params)

In [66]:
# 10-fold cross-validation of the random forest on the label-encoded,
# KNN-imputed features; the fitted estimator of every fold is kept.
rf_l_scores = cross_validate(
    rfc,
    filled_label_train,
    y_train,
    cv=10,
    scoring=["accuracy", "f1_macro", "roc_auc"],
    n_jobs=-1,
    return_estimator=True,
)
rf_l_scores

{'fit_time': array([0.05567384, 0.04786849, 0.05680037, 0.04956961, 0.03287244,
        0.0403049 , 0.03659844, 0.03997612, 0.0352037 , 0.02583218]),
 'score_time': array([0.04133511, 0.01616096, 0.04299593, 0.01564693, 0.02611113,
        0.03371024, 0.02151513, 0.01861072, 0.01524115, 0.01119876]),
 'estimator': (RandomForestClassifier(min_samples_split=10, n_estimators=10, n_jobs=-1),
  RandomForestClassifier(min_samples_split=10, n_estimators=10, n_jobs=-1),
  RandomForestClassifier(min_samples_split=10, n_estimators=10, n_jobs=-1),
  RandomForestClassifier(min_samples_split=10, n_estimators=10, n_jobs=-1),
  RandomForestClassifier(min_samples_split=10, n_estimators=10, n_jobs=-1),
  RandomForestClassifier(min_samples_split=10, n_estimators=10, n_jobs=-1),
  RandomForestClassifier(min_samples_split=10, n_estimators=10, n_jobs=-1),
  RandomForestClassifier(min_samples_split=10, n_estimators=10, n_jobs=-1),
  RandomForestClassifier(min_samples_split=10, n_estimators=10, n_jobs=-1),
 

In [67]:
# Best single-fold accuracy for the label-encoded features.
# NOTE(review): max() reports only the best of the 10 folds, which is an
# optimistic figure; the mean over folds is the usual cross-validation summary.
max(rf_l_scores["test_accuracy"])

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


0.9101123595505618

In [68]:
# 10-fold cross-validation of the random forest on the target-encoded,
# KNN-imputed features; the fitted estimator of every fold is kept.
rf_t_scores = cross_validate(
    rfc,
    filled_target_train,
    y_train,
    cv=10,
    scoring=["accuracy", "f1_macro", "roc_auc"],
    n_jobs=-1,
    return_estimator=True,
)
rf_t_scores

{'fit_time': array([0.03745079, 0.03970098, 0.05195355, 0.05174041, 0.03328133,
        0.05555534, 0.03371572, 0.02499628, 0.02647972, 0.02964497]),
 'score_time': array([0.04384398, 0.04224443, 0.01964116, 0.0238266 , 0.02516294,
        0.02430964, 0.01710153, 0.02826524, 0.01366591, 0.01242161]),
 'estimator': (RandomForestClassifier(min_samples_split=10, n_estimators=10, n_jobs=-1),
  RandomForestClassifier(min_samples_split=10, n_estimators=10, n_jobs=-1),
  RandomForestClassifier(min_samples_split=10, n_estimators=10, n_jobs=-1),
  RandomForestClassifier(min_samples_split=10, n_estimators=10, n_jobs=-1),
  RandomForestClassifier(min_samples_split=10, n_estimators=10, n_jobs=-1),
  RandomForestClassifier(min_samples_split=10, n_estimators=10, n_jobs=-1),
  RandomForestClassifier(min_samples_split=10, n_estimators=10, n_jobs=-1),
  RandomForestClassifier(min_samples_split=10, n_estimators=10, n_jobs=-1),
  RandomForestClassifier(min_samples_split=10, n_estimators=10, n_jobs=-1),
 

In [69]:
# Best single-fold accuracy for the target-encoded features.
# NOTE(review): max() reports only the best of the 10 folds, which is an
# optimistic figure; the mean over folds is the usual cross-validation summary.
max(rf_t_scores["test_accuracy"])

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


0.9438202247191011