In [32]:
import dill
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from bank_churn_functions.estimators import FeatureBuilder

## Prepare data

In [33]:
y_col = "Exited"

x_cols_by_type = {
    "cat": ('Geography', 'Gender'),
    "num": ('CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts'),
    "bool": ('HasCrCard', 'IsActiveMember', 'EstimatedSalary'),
    "text": ('Surname',)
}

x_cols = [col for col_ls in x_cols_by_type.values() for col in col_ls]

feature_builder_config = {
    "BalanceToEstimatedSalary": lambda x: x['Balance'] / (x['EstimatedSalary'] + 1.0),
    "CreditScoreToBalance":     lambda x: x['CreditScore'] / (x['Balance'] + 1.0),
    "BalanceToAge":             lambda x: x['Balance'] / x['Age'],
    "CreditScoreToAge":         lambda x: x['CreditScore'] / x['Age'],
    "NumOfProductsToTenure":    lambda x: x['NumOfProducts'] / (x['Tenure'] + 1.0),
    "AgeEntered":               lambda x: x['Age'] - x['Tenure']
}

In [34]:
df = pd.read_csv("data/train.csv")

x = df[x_cols]
y = df[y_col]

In [35]:
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=420, shuffle=True, stratify=y)

## Build a Baseline model

In [36]:
col_transformer = ColumnTransformer(
    transformers=[
        ('feature_build', FeatureBuilder(config=feature_builder_config), x_cols),
        ('cat_transform', OneHotEncoder(drop='if_binary'), x_cols_by_type['cat']),
        ('num_transform', StandardScaler(), x_cols_by_type['num']),
        ('bool_transform', 'passthrough', x_cols_by_type['bool'])
        ],
    remainder='drop'
    )

pipeline = Pipeline([
    ("col_transform", col_transformer),
    ("model", XGBClassifier())
])

pipeline

In [37]:
pipeline.fit(x_train, y_train)
pipeline.score(x_valid, y_valid)

0.8649074438755415

## Compare models performance

In [38]:
pipeline.get_params();

In [39]:
# Compare tree-based models

search_regressors = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    XGBClassifier(),
    CatBoostClassifier(verbose=0)
    ]

param_grid = {
    'model': search_regressors
    }

grid_search = GridSearchCV(
    estimator=pipeline, 
    param_grid=param_grid,
    scoring='roc_auc'
    )

grid_search

In [40]:
# fit and check best hyperparameters

grid_search.fit(x_train, y_train)
grid_search.best_params_

{'model': <catboost.core.CatBoostClassifier at 0x7f00bdff6290>}

In [41]:
y_pred = grid_search.predict_proba(x_valid)[:, 1]

In [42]:
roc_auc_score(y_valid, y_pred)

0.890071870183214

In [43]:
models_comparison = pd.DataFrame(grid_search.cv_results_). \
    sort_values('rank_test_score'). \
    reset_index(drop=True)

models_comparison

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,9.854471,0.729025,0.099116,0.002585,<catboost.core.CatBoostClassifier object at 0x...,{'model': <catboost.core.CatBoostClassifier ob...,0.88587,0.891364,0.886349,0.881803,0.892539,0.887585,0.003917,1
1,0.338558,0.019765,0.037057,0.001434,"XGBClassifier(base_score=None, booster=None, c...","{'model': XGBClassifier(base_score=None, boost...",0.882474,0.888457,0.883646,0.878275,0.88836,0.884242,0.003842,2
2,18.46297,0.232538,0.452478,0.011155,RandomForestClassifier(),{'model': RandomForestClassifier()},0.873477,0.876514,0.871818,0.867399,0.877327,0.873307,0.003566,3
3,1.272601,0.018262,0.029599,0.003288,DecisionTreeClassifier(),{'model': DecisionTreeClassifier()},0.70195,0.711854,0.702658,0.702639,0.702245,0.704269,0.003802,4


In [44]:
print(f"""
The best model: {models_comparison.loc[0, "param_model"]}
Score CV Mean:  {models_comparison.loc[0, "mean_test_score"]}
Score CV Std:   {models_comparison.loc[0, "std_test_score"]}
""")


The best model: <catboost.core.CatBoostClassifier object at 0x7f00bdff6290>
Score CV Mean:  0.8875849279532577
Score CV Std:   0.003917374931317561



Conclusions:
- Following the default settings CatBoost model returns the best score.
- XGBoost scores second with results very close to CatBoost.

In [45]:
pd.DataFrame({
    "feature": grid_search.best_estimator_.get_params()['col_transform'].get_feature_names_out(),
    "importance": grid_search.best_estimator_.get_params()['model'].get_feature_importance()
    }).sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
14,num_transform__NumOfProducts,32.203393
11,num_transform__Age,15.699922
16,bool_transform__IsActiveMember,8.01106
13,num_transform__Balance,5.642112
17,bool_transform__EstimatedSalary,4.94704
10,num_transform__CreditScore,4.50564
7,cat_transform__Geography_Germany,4.312792
1,feature_build__CreditScoreToBalance,3.883989
3,feature_build__CreditScoreToAge,3.840952
5,feature_build__AgeEntered,3.700275


In [46]:
save_a1_model = False

if save_a1_model:

    TIME_LABEL = datetime.now().strftime("_%m%d_%H%M")

    with open(f"models/best{TIME_LABEL}", "wb") as file:
        dill.dump(grid_search.best_estimator_, file)

## Approach 2: Utilizing Surname embedding

In [89]:
from bank_churn_functions.estimators import TopNWordMoversEmbedder
from gensim import corpora

wmd_col = ["Surname",]

surnames = pd.concat([
    pd.read_csv("data/train.csv", usecols=['Surname']), 
    pd.read_csv("data/test.csv", usecols=['Surname'])
    ], ignore_index=True).Surname

corpus = corpora.Dictionary(
    [[surname,] for surname in surnames]
    )

In [90]:
EMBED_V_LEN = 100
W2V_V_LEN = 100
PHRASE_LEN = 2

col_transformer_wmd = ColumnTransformer(
    transformers=[
        ('feature_build', FeatureBuilder(config=feature_builder_config), x_cols),
        ('cat_transform', OneHotEncoder(drop='if_binary'), x_cols_by_type['cat']),
        ('num_transform', StandardScaler(), x_cols_by_type['num']),
        ('bool_transform', 'passthrough', x_cols_by_type['bool']),
        ('text_embed', TopNWordMoversEmbedder(corpus, EMBED_V_LEN, W2V_V_LEN, PHRASE_LEN), wmd_col[0])
        ],
    remainder='drop'
    )

pipeline_wmd = Pipeline([
    ("col_transform", col_transformer_wmd),
    ("model", CatBoostClassifier(verbose=0))
])

In [91]:
# hyperparameter training:
# do not run: takes ~8h

run_hp_wmd = False

if run_hp_wmd:

    param_grid_wmd = {
        'col_transform__text_embed__embed_v_len': [50,100,200,300],
        'col_transform__text_embed__w2v_v_len': [50,100,200,300],
        'col_transform__text_embed__phrase_len': [2,3]
    }

    grid_search_wmd = GridSearchCV(
        pipeline_wmd,
        param_grid_wmd,
        scoring="roc_auc"
        )

    grid_search_wmd.fit(x_train, y_train)
    grid_search.best_params_

    # Best params:
    # {
    #  'col_transform__text_embed__embed_v_len': 300,
    #  'col_transform__text_embed__phrase_len': 3,
    #  'col_transform__text_embed__w2v_v_len': 300
    #  }

    # Runner up params: 
    # {
    #  'col_transform__text_embed__embed_v_len': 200,
    #  'col_transform__text_embed__phrase_len': 2,
    #  'col_transform__text_embed__w2v_v_len': 200
    #  }

    # Best results on Kaggle: 
    # {
    #  'col_transform__text_embed__embed_v_len': 100,
    #  'col_transform__text_embed__phrase_len': 2,
    #  'col_transform__text_embed__w2v_v_len': 100
    #  }

    roc_auc_score(y_valid, grid_search_wmd.predict_proba(x_valid)[:, 1])
    
    TIME_LABEL = datetime.now().strftime("_%m%d_%H%M")

    with open(f"models/best{TIME_LABEL}", "wb") as file:
        dill.dump(grid_search_wmd.best_estimator_, file)

### Training on full dataset

In [92]:
pipeline_wmd.fit(x, y)

In [93]:
TIME_LABEL = datetime.now().strftime("_%m%d_%H%M")

with open(f"models/best{TIME_LABEL}", "wb") as file:
    dill.dump(pipeline_wmd, file)

In [94]:
pipeline_wmd

In [100]:
pd.DataFrame({
    "feature": pipeline_wmd.get_params()['col_transform'].get_feature_names_out(),
    "importance": pipeline_wmd.get_params()['model'].get_feature_importance()
    }).sort_values('importance', ascending=False)[0:25]

Unnamed: 0,feature,importance
14,num_transform__NumOfProducts,28.253513
11,num_transform__Age,12.197984
16,bool_transform__IsActiveMember,6.868989
7,cat_transform__Geography_Germany,3.97251
0,feature_build__BalanceToEstimatedSalary,2.953564
10,num_transform__CreditScore,2.812089
13,num_transform__Balance,2.614723
9,cat_transform__Gender_Male,2.605363
1,feature_build__CreditScoreToBalance,2.589375
3,feature_build__CreditScoreToAge,2.536238
