In [1]:
import dill
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from bank_churn_functions.estimators import FeatureBuilder

## Prepare data

In [2]:
y_col = "Exited"

x_cols_by_type = {
    "cat": ('Geography', 'Gender'),
    "num": ('CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts'),
    "bool": ('HasCrCard', 'IsActiveMember', 'EstimatedSalary')
}

x_cols = [col for col_ls in x_cols_by_type.values() for col in col_ls]

feature_builder_config = {
    "BalanceToEstimatedSalary": lambda x: x['Balance'] / (x['EstimatedSalary'] + 1.0),
    "CreditScoreToBalance":     lambda x: x['CreditScore'] / (x['Balance'] + 1.0),
    "BalanceToAge":             lambda x: x['Balance'] / x['Age'],
    "CreditScoreToAge":         lambda x: x['CreditScore'] / x['Age'],
    "NumOfProductsToTenure":    lambda x: x['NumOfProducts'] / (x['Tenure'] + 1.0),
    "AgeEntered":               lambda x: x['Age'] - x['Tenure']
}

In [3]:
df = pd.read_csv("data/train.csv")

x = df[x_cols]
y = df[y_col]

In [4]:
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=420, shuffle=True, stratify=y)

## Build a Baseline model

In [5]:
col_transformer = ColumnTransformer(
    transformers=[
        ('feature_build', FeatureBuilder(config=feature_builder_config), x_cols),
        ('cat_transform', OneHotEncoder(drop='if_binary'), x_cols_by_type['cat']),
        ('num_transform', StandardScaler(), x_cols_by_type['num']),
        ('bool_transform', 'passthrough', x_cols_by_type['bool'])
        ],
    remainder='drop'
    )

pipeline = Pipeline([
    ("col_transform", col_transformer),
    ("model", XGBClassifier())
])

pipeline

In [6]:
pipeline.fit(x_train, y_train)
pipeline.score(x_valid, y_valid)

0.8649074438755415

## Compare models performance

In [7]:
pipeline.get_params();

In [8]:
# Compare tree-based models

search_regressors = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    XGBClassifier(),
    CatBoostClassifier(verbose=0)
    ]

param_grid = {
    'model': search_regressors
    }

grid_search = GridSearchCV(
    estimator=pipeline, 
    param_grid=param_grid,
    # error_score=np.NaN,
    scoring='roc_auc'
    )

grid_search

In [9]:
# fit and check best hyperparameters

grid_search.fit(x_train, y_train)
grid_search.best_params_

{'model': <catboost.core.CatBoostClassifier at 0x7fa3e28eb4c0>}

In [32]:
y_pred = grid_search.predict_proba(x_valid)[:, 1]

In [11]:
grid_search.score(x_valid, y_valid)

0.890071870183214

In [12]:
models_comparison = pd.DataFrame(grid_search.cv_results_). \
    sort_values('rank_test_score'). \
    reset_index(drop=True)

models_comparison

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,9.729959,0.265297,0.097925,0.003053,<catboost.core.CatBoostClassifier object at 0x...,{'model': <catboost.core.CatBoostClassifier ob...,0.88587,0.891364,0.886349,0.881803,0.892539,0.887585,0.003917,1
1,0.336259,0.018556,0.035269,0.000725,"XGBClassifier(base_score=None, booster=None, c...","{'model': XGBClassifier(base_score=None, boost...",0.882474,0.888457,0.883646,0.878275,0.88836,0.884242,0.003842,2
2,18.97365,0.302154,0.450159,0.003635,RandomForestClassifier(),{'model': RandomForestClassifier()},0.874205,0.877716,0.872138,0.866723,0.876769,0.87351,0.003918,3
3,1.304254,0.063711,0.027036,0.000642,DecisionTreeClassifier(),{'model': DecisionTreeClassifier()},0.702706,0.711352,0.702599,0.702408,0.700852,0.703983,0.003745,4


In [13]:
print(f"""
The best model: {models_comparison.loc[0, "param_model"]}
Score CV Mean:  {models_comparison.loc[0, "mean_test_score"]}
Score CV Std:   {models_comparison.loc[0, "std_test_score"]}
""")


The best model: <catboost.core.CatBoostClassifier object at 0x7fa3e28eb4c0>
Score CV Mean:  0.8875849279532577
Score CV Std:   0.003917374931317561



Conclusions:
- Following the default settings CatBoost model returns the best score.
- XGBoost scores second with results very close to CatBoost.
- CatBoost provides possibility to integrate with tensorflow and more flexibility for feeding character variables, so this is the space I will explore next.

## Inspect the best model

In [14]:
pd.DataFrame({
    "feature": grid_search.best_estimator_.get_params()['col_transform'].get_feature_names_out(),
    "importance": grid_search.best_estimator_.get_params()['model'].get_feature_importance()
    }).sort_values('importance', ascending=False)

Unnamed: 0,feature,importance
14,num_transform__NumOfProducts,32.203393
11,num_transform__Age,15.699922
16,bool_transform__IsActiveMember,8.01106
13,num_transform__Balance,5.642112
17,bool_transform__EstimatedSalary,4.94704
10,num_transform__CreditScore,4.50564
7,cat_transform__Geography_Germany,4.312792
1,feature_build__CreditScoreToBalance,3.883989
3,feature_build__CreditScoreToAge,3.840952
5,feature_build__AgeEntered,3.700275


In [26]:
TIME_LABEL = datetime.now().strftime("_%m%d_%H%M")

with open(f"models/best{TIME_LABEL}", "wb") as file:
    dill.dump(grid_search.best_estimator_, file)