In [1]:
%matplotlib inline
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, KBinsDiscretizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.pipeline import Pipeline

import custom_helpers as ch

# np.random.seed(0)

In [2]:
# load data with helper function
data = ch.load_data('../data/train_month_3_with_target.csv')
print(data.info())

------------------------------------------------------------
loading data...
transforming dates...
cast types into bool, object, categorical...
data loaded and casted
------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63697 entries, 0 to 63696
Data columns (total 43 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   client_id                          63697 non-null  object        
 1   homebanking_active                 63697 non-null  bool          
 2   has_homebanking                    63697 non-null  bool          
 3   has_insurance_21                   63697 non-null  bool          
 4   has_insurance_23                   63697 non-null  bool          
 5   has_life_insurance_fixed_cap       63697 non-null  bool          
 6   has_life_insurance_decreasing_cap  63697 non-null  bool          
 7   has_fire_car_other_

# Preprocessing

In [3]:
y = data.target
X = data.drop(columns = ['target'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, stratify = y, random_state=42)

num_col = X_train.select_dtypes(include = 'number', exclude = 'bool').columns
cat_col = X_train.select_dtypes(include = 'category').columns
bool_col = X_train.select_dtypes(include = 'bool').columns
date_col = X_train.select_dtypes(include = 'datetime64').columns
obj_col = X_train.select_dtypes(include = 'object').columns

# found during exploratory phase
colinear_features = ['bal_insurance_23',
     'bal_insurance_21',
     'bal_savings_account_starter',
     'has_homebanking',
     'customer_since_bank_years',
     'cap_life_insurance_decreasing_cap',
     'has_mortgage_loan',
     'has_fire_car_other_insurance',
     'bal_pension_saving',
     'bal_personal_loan']


numeric_transformer = Pipeline(steps = [
    ('impute',SimpleImputer(missing_values=np.nan, strategy='median'))
])

categorical_transformer = OneHotEncoder(handle_unknown="ignore")


preprocessor = ColumnTransformer(
    transformers=[
        ('drop_ID','drop',obj_col),
        ('drop_dates','drop',date_col),
        ('drop_colinear', 'drop', colinear_features),
        ('cat',categorical_transformer,cat_col),
        ('num',numeric_transformer,num_col)
    ],
    remainder = "passthrough"
)

# Model Selection With Grid Search

In [5]:
# grid search for params
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

#balance classes 50:50
smt = SMOTE(random_state=42)

#classifier
rf = RandomForestClassifier(max_depth = 5, n_estimators = 200, random_state=0)

#pipeline
pipe = Pipeline(
    steps=[("preprocessor", preprocessor),
           ("balancer", smt), 
           ("forest", rf)]
)

param_grid = [
    {
        'balancer': [smt],
    },
    {
        'balancer':['passthrough'],
        'forest__class_weight':['balanced', 'balanced_subsample'],
        'forest__max_depth': [5,8,None]
    }
]

gridscorer = ch.gridscorer() # custom scorer (precision@250)

search = GridSearchCV(pipe, param_grid, scoring = gridscorer, n_jobs=-2)
search.fit(X_train, y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)


Best parameter (CV score=26.000):
{'balancer': 'passthrough', 'forest__class_weight': 'balanced', 'forest__max_depth': 5}


### Fit RF with best parameters

In [6]:
clf = search.best_estimator_.fit(X_train, y_train)
#clf = search.best_estimator_

# make prediction on test
y_pred_test = clf.predict(X_test)
y_pred_test_probs = clf.predict_proba(X_test)

ch.evaluate(clf, X_test, y_test)

------------------------------------------------------------
Performance Over Whole Set
------------------------------------------------------------
               precision    recall  f1-score   support

Did not Churn       0.98      0.69      0.81     30892
        Churn       0.06      0.64      0.11       957

     accuracy                           0.69     31849
    macro avg       0.52      0.67      0.46     31849
 weighted avg       0.96      0.69      0.79     31849

------------------------------------------------------------
AUC: 0.67
No. of TP (precision@250): 42
------------------------------------------------------------


In [None]:
# # run on submission data
# X_sub = ch.load_data('../data/test_month_3.csv')

# # make prediction on test
# y_pred_sub = clf.predict(X_sub)
# y_pred_test_sub = clf.predict_proba(X_sub)
# y_pred_test_sub_pos = y_pred_test_sub[:,1]

# df = pd.DataFrame({'ID': X_sub.client_id,'PROB':y_pred_test_sub_pos})
# today = dt.datetime.today()
# df.to_csv(f'../output/rf_{today.month}{today.day}.csv', index = False)