In [10]:
import os
import gc
import joblib
import numpy as np
import pandas as pd
import mlflow
import shap
import catboost as cat
import xgboost as xgb
import featuretools as ft
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
from typing import Tuple, Callable
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder 
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.impute import KNNImputer
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import faiss

In [2]:
# Features
PASSENGER_ID  = 'PassengerId'
SURVIVED = 'Survived'
PCLASS = 'Pclass'
NAME = 'Name'
SEX = 'Sex'
AGE = 'Age'
SIBSP = 'SibSp'
PARCH = 'Parch'
TICKET = 'Ticket'
FARE = 'Fare'
CABIN = 'Cabin'
EMBARKED = 'Embarked'
INCOME_LEVEL = "income_level"
CAT_FEATURES = [PCLASS, SEX, TICKET, CABIN, EMBARKED]
TRANS_PRIMITIVES = ['multiply_numeric', 'divide_numeric']

DEFAULT_OTHER_NAME = 'Other'
DEFAULT_NAN_NAME = 'NaN'
DEFAULT_OTHER_FOR_NUMERIC = -99999
DEFAULT_NAN_FOR_NUMERIC = -88888
TRAIN_FILE = "data/train.csv"
TEST_FILE = "data/test.csv"
REPORT_PATH = 'reports/titanic_report.html'

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [3]:
train = pd.read_csv(TRAIN_FILE)
y_train = train[SURVIVED]

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [4]:
def fill_numeric_knn(df: pd.DataFrame, imputer_params:dict = {"n_neighbors": 5, "metric": "nan_euclidean", "weights": "uniform"})->pd.DataFrame:
    knn_imputer = KNNImputer(**imputer_params)
    transformed_df = knn_imputer.fit_transform(df)
    return transformed_df

# with Label Encoder    
def fill_categorical_label(dataset: pd.DataFrame, column_name: str, fillna=True):
    if fillna:
        dataset[column_name].fillna(value=DEFAULT_NAN_NAME, inplace=True)
    label_encoder = LabelEncoder().fit(np.sort(dataset[column_name].unique()))
    dataset[column_name] = dataset[column_name].map(
        dict(zip(dataset[column_name].unique(), label_encoder.transform(dataset[column_name].unique()))))
    return label_encoder

#Target Encoder
def fill_categorical_target(dataset: pd.DataFrame, column_name: str, target_column: str, fillna=True):
    if fillna:
        dataset[column_name].fillna(value=DEFAULT_NAN_NAME, inplace=True)
    target_encoder = TargetEncoder()
    dataset[column_name] = target_encoder.fit_transform(dataset[column_name], dataset[target_column])
    return target_encoder

In [5]:
train.drop(columns=[PASSENGER_ID, NAME], inplace=True)

label_encoded_train = train.copy()
for cat_feature in CAT_FEATURES:
    fill_categorical_label(label_encoded_train, cat_feature)
    
target_encoded_train = train.copy()
for cat_feature in CAT_FEATURES:
    fill_categorical_target(target_encoded_train, cat_feature, SURVIVED)

is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead


In [6]:
label_encoded_train.drop(SURVIVED, inplace=True, axis=1)
target_encoded_train.drop(SURVIVED, inplace=True, axis=1)
filled_label_train = pd.DataFrame(fill_numeric_knn(label_encoded_train))
filled_target_train = pd.DataFrame(fill_numeric_knn(target_encoded_train))
filled_label_train.columns = label_encoded_train.columns
filled_target_train.columns = target_encoded_train.columns

`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


# Function for adding additional features with FAISS

In [107]:
def add_faiss(df: pd.DataFrame, columns: list, k_neighbors: int=3, nprobe: int=2, stats: list=["std", "var", "median"])->pd.DataFrame:
    n_dimensions = df[columns].shape[1]
    quantizer = faiss.IndexFlatL2(n_dimensions)
    index = faiss.IndexIVFFlat(quantizer, n_dimensions, k_neighbors, faiss.METRIC_L2)
    df_arr = df[columns].to_numpy().astype(np.float32)
    df_arr = df_arr.copy(order="C") # if you don't do this you get an exception
    print(df_arr.shape)
    index.train(df_arr)
    index.add(df_arr)
    # counting stats
    distances, indices = index.search(df_arr, k_neighbors)
    out_columns = []
    # mean
    mean_column = []
    for i in indices:
            vals = df_arr[i]
            mean = np.mean(vals, axis=0)
            mean_column.append(mean)
    resulting_set = np.array(mean_column)
    out_columns.extend(["mean_" + col_name for col_name in columns])
    
    # std
    if "std" in stats:
        std_column = []
        for i in indices:
            vals = df_arr[i]
            std = np.std(vals, axis=0) # std of a all neighbor features
            std_column.append(std)
        std_column = np.array(std_column)
        resulting_set = np.hstack((resulting_set, std_column))
        out_columns.extend(["std_" + col_name for col_name in columns])
    out_df = pd.DataFrame(resulting_set)
    out_df.columns = out_columns
    return out_df
    
    
    

In [108]:
faissed = add_faiss(filled_label_train, columns=[FARE, SEX])

(891, 2)


In [109]:
faissed.shape

(891, 4)

In [110]:
faissed.describe()

Unnamed: 0,mean_Fare,mean_Sex,std_Fare,std_Sex
count,891.0,891.0,891.0,891.0
mean,32.205997,0.643098,0.104489,0.02857
std,49.719093,0.465078,0.510082,0.112543
min,0.0,0.0,0.0,0.0
25%,7.9104,0.0,0.0,0.0
50%,14.455566,1.0,0.0,0.0
75%,31.137501,1.0,0.033376,0.0
max,512.329224,1.0,7.002338,0.471405


# Train XGBoost with a new dataset

In [111]:
xgb_params = {
    'random_state': 42,
    'tree_method': 'hist',
    'grow_policy': 'lossguide',
    'n_estimators': 10,
    'eta': 0.02,
    'max_depth': 3,
    'min_child_weight': 1,
    'reg_lambda': 1,
    'max_bin': 120,
    'subsample': 0.9
}


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.


In [112]:
xgbc = xgb.XGBClassifier(**xgb_params)

In [113]:
xgb_t_scores = cross_validate(xgbc, faissed, y_train, cv=10, scoring=["accuracy", "f1_macro", "roc_auc"], n_jobs=-1, return_estimator=True)
xgb_t_scores

{'fit_time': array([16.150455  , 15.67064691, 16.31616044, 16.30536842, 16.43218303,
        16.65528512, 16.38747907, 16.71162415, 10.48739409, 10.09708929]),
 'score_time': array([0.31923294, 0.29114628, 0.28732944, 0.29925466, 0.28313756,
        0.3672688 , 0.32321453, 0.2830627 , 0.19316578, 0.00441194]),
 'estimator': (XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bynode=1, colsample_bytree=1, eta=0.02, gamma=0,
                gpu_id=-1, grow_policy='lossguide', importance_type='gain',
                interaction_constraints='', learning_rate=0.0199999996,
                max_bin=120, max_delta_step=0, max_depth=3, min_child_weight=1,
                missing=nan, monotone_constraints='()', n_estimators=10, n_jobs=4,
                num_parallel_tree=1, random_state=42, reg_alpha=0, reg_lambda=1,
                scale_pos_weight=1, subsample=0.9, tree_method='hist',
                validate_parameters=1, verbosity=None),
  XGBClass