In [2]:
%matplotlib inline
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.pipeline import Pipeline
import xgboost as xgb

import custom_helpers as ch

# load data with helper function
data = ch.load_data('../data/train_month_3_with_target.csv')

  from pandas import MultiIndex, Int64Index


------------------------------------------------------------
loading data...
transforming dates...
cast types into bool, object, categorical...
data loaded and casted
------------------------------------------------------------


# Preprocessing

In [9]:
y = data.target
X = data.drop(columns = ['target'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, stratify = y, random_state=42)

num_col = X_train.select_dtypes(include = 'number', exclude = 'bool').columns
cat_col = X_train.select_dtypes(include = 'category').columns
bool_col = X_train.select_dtypes(include = 'bool').columns
date_col = X_train.select_dtypes(include = 'datetime64').columns
obj_col = X_train.select_dtypes(include = 'object').columns

# found during exploratory phase
colinear_features = ['bal_insurance_23',
     'bal_insurance_21',
     'bal_savings_account_starter',
     'has_homebanking',
     'customer_since_bank_years',
     'cap_life_insurance_decreasing_cap',
     'has_mortgage_loan',
     'has_fire_car_other_insurance',
     'bal_pension_saving',
     'bal_personal_loan']


numeric_transformer = Pipeline(steps = [
    ('impute',SimpleImputer(missing_values=np.nan, strategy='median'))
])

categorical_transformer = OneHotEncoder(handle_unknown="ignore")


preprocessor = ColumnTransformer(
    transformers=[
        ('drop_ID','drop',obj_col),
        ('drop_dates','drop',date_col),
        ('drop_colinear', 'drop', colinear_features),
        ('cat',categorical_transformer,cat_col),
        ('num',numeric_transformer,num_col)
    ],
    remainder = "passthrough"
)

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# First Pass without Parameter Tuning

In [20]:
#classifier
xg = xgb.XGBClassifier(objective='binary:logistic', use_label_encoder=False,seed = 42)

clf = xg.fit(X_train, y_train,
             eval_set = [(X_test, y_test)], 
             eval_metric = 'auc',
             early_stopping_rounds = 10,
             verbose = True)

ch.evaluate(clf, X_test, y_test)

[0]	validation_0-auc:0.67979
[1]	validation_0-auc:0.70375
[2]	validation_0-auc:0.70044
[3]	validation_0-auc:0.70399
[4]	validation_0-auc:0.71125
[5]	validation_0-auc:0.71134
[6]	validation_0-auc:0.71019
[7]	validation_0-auc:0.70876
[8]	validation_0-auc:0.71080
[9]	validation_0-auc:0.71162
[10]	validation_0-auc:0.71096
[11]	validation_0-auc:0.71148
[12]	validation_0-auc:0.71083
[13]	validation_0-auc:0.71065
[14]	validation_0-auc:0.71025
[15]	validation_0-auc:0.71026
[16]	validation_0-auc:0.71074
[17]	validation_0-auc:0.70985
[18]	validation_0-auc:0.70984
[19]	validation_0-auc:0.70981
------------------------------------------------------------
Performance Over Whole Set
------------------------------------------------------------
               precision    recall  f1-score   support

Did not Churn       0.97      1.00      0.98     30892
        Churn       0.00      0.00      0.00       957

     accuracy                           0.97     31849
    macro avg       0.48      0.50     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
              subsample=1, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [4]:
#pipeline
pipe = Pipeline(
    steps=[("preprocessor", preprocessor),
           ("classifer", rf)]
)


gridscorer = ch.gridscorer() # custom scorer (precision@250)

search = GridSearchCV(pipe, param_grid, scoring = gridscorer, n_jobs=-2)
search.fit(X_train, y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=26.000):
{'balancer': 'passthrough', 'forest__class_weight': 'balanced'}


### Fit RF with best parameters

In [5]:
clf = search.best_estimator_.fit(X_train, y_train)
#clf = search.best_estimator_

# make prediction on test
y_pred_test = clf.predict(X_test)
y_pred_test_probs = clf.predict_proba(X_test)

ch.evaluate(clf, X_test, y_test)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('drop_ID', 'drop',
                                                  Index(['client_id', 'customer_postal_code'], dtype='object')),
                                                 ('drop_dates', 'drop',
                                                  Index(['customer_since_all', 'customer_since_bank', 'customer_birth_date'], dtype='object')),
                                                 ('drop_colinear', 'drop',
                                                  ['bal_insurance_23',
                                                   'bal_insurance_21',
                                                   'bal_savin...
       'bal_pension_saving', 'bal_savings_account',
       'bal_savings_account_starter', 'bal_current_account_starter',
       'visits_distinct_so', 'visits_distinct_so_areas',
       'customer_since_all_years', 'customer_sinc

In [6]:
# # run on submission data
# X_sub = ch.load_data('../data/test_month_3.csv')

# # make prediction on test
# y_pred_sub = clf.predict(X_sub)
# y_pred_test_sub = clf.predict_proba(X_sub)
# y_pred_test_sub_pos = y_pred_test_sub[:,1]

# df = pd.DataFrame({'ID': X_sub.client_id,'PROB':y_pred_test_sub_pos})
# today = dt.datetime.today()
# df.to_csv(f'../output/rf_{today.month}{today.day}.csv', index = False)