In [2]:
from   category_encoders          import *
import numpy as np
import pandas as pd
import imblearn
from   imblearn.pipeline          import make_pipeline 
from   sklearn.compose            import *
from   sklearn.ensemble           import RandomForestClassifier, ExtraTreesClassifier, IsolationForest, GradientBoostingClassifier
from   sklearn.experimental       import enable_iterative_imputer
from   sklearn.impute             import *
from   sklearn.linear_model       import LogisticRegression, PassiveAggressiveClassifier, RidgeClassifier, SGDClassifier
from   sklearn.metrics            import balanced_accuracy_score 
from   sklearn.pipeline           import Pipeline
from   sklearn.model_selection    import train_test_split
from   sklearn.preprocessing      import *
from   sklearn.tree               import DecisionTreeClassifier
from   sklearn.metrics            import *
from   sklearn.decomposition      import PCA
from   sklearn.base               import BaseEstimator
from   sklearn.model_selection    import RandomizedSearchCV
from   sklearn.neighbors          import KNeighborsClassifier
from   sklearn.naive_bayes        import GaussianNB
from   sklearn.svm                import SVC
import warnings
warnings.filterwarnings('ignore')

## Load data

In [3]:
df = pd.read_csv("BankChurners.csv")

drop_columns = ['CLIENTNUM',
                'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
                'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2']

data_raw = df.drop(drop_columns,errors='ignore',axis=1)

## Check if imbalance

In [4]:
data_raw['Attrition_Flag'].value_counts()

Existing Customer    8500
Attrited Customer    1627
Name: Attrition_Flag, dtype: int64

## Define our target - churned or not

In [5]:
codes = {'Existing Customer':0, 'Attrited Customer':1}
data_raw['Attrition_Flag'] = data_raw['Attrition_Flag'].map(codes)

y = data_raw['Attrition_Flag']
X = data_raw.drop('Attrition_Flag',errors='ignore',axis=1)

## Helper class

In [6]:
class DummyEstimator(BaseEstimator):
    "Pass through class, methods are present but do nothing."
    def fit(self): pass
    def score(self): pass

## Data preprocessing 

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

categorical_columns = ['Gender','Education_Level','Income_Category',
                       'Marital_Status', 'Card_Category']

bin_categorical_columns = ['Customer_Age','Dependent_count',
                           'Total_Relationship_Count','Months_Inactive_12_mon','Contacts_Count_12_mon',
                           'Months_on_book']

numeric_columns = ['Credit_Limit', 'Total_Revolving_Bal','Avg_Open_To_Buy','Total_Amt_Chng_Q4_Q1',
                   'Total_Trans_Amt','Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1','Avg_Utilization_Ratio']

numeric_pipe = Pipeline([('scaler', StandardScaler()),
                          ('imputer', SimpleImputer(strategy='median', add_indicator=True))])

categorical_pipe = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value='missing')), 
                                  ('ohe', OneHotEncoder(handle_unknown='ignore'))])

bin_categorical_pipe = Pipeline([('imputer', SimpleImputer(strategy='median')), 
                                 ('bins', KBinsDiscretizer(n_bins=4, encode='onehot'))])

preprocessing = ColumnTransformer([('categorical', categorical_pipe,  categorical_columns),
                                   ('bin_categorical', bin_categorical_pipe, bin_categorical_columns),
                                   ('continuous',  numeric_pipe, numeric_columns)])


## Random Search

In [10]:
pipe_dt = Pipeline([('scl', StandardScaler()),
                    ('clf', DummyEstimator())])

search_space = [{'clf': [LogisticRegression()], # LogisticRegression
                 'clf__penalty': ['l1', 'l2'],
                 'clf__C': np.logspace(0, 4, 6,10)},
                
                {'clf': [RandomForestClassifier()],  # RandomForest
                 'clf__criterion': ['gini', 'entropy'],
                 'clf__max_depth': [2, 3, 4, 5, 6],
                 'clf__min_samples_leaf': [1, 2, 3, 4],
                 'clf__n_estimators': [100, 150, 200]},
                
                {'clf': [ExtraTreesClassifier()],  # ExtraTrees
                 'clf__criterion': ['gini', 'entropy'],
                 'clf__max_depth': [2, 3, 4, 5, 6],
                 'clf__min_samples_leaf': [1, 2, 3, 4],
                 'clf__max_features': ['auto', 'sqrt', 'log2']},
                
                {'clf': [GaussianNB()],  # GaussianNB
                 'clf__var_smoothing': np.logspace(0,-9, num=100)},
                
                {'clf': [KNeighborsClassifier()], # KNeighbors
                 'clf__n_neighbors': [2,5,10],
                 'clf__weights': ['uniform', 'distance'],
                 'clf__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']},
                {'clf': [SVC()], # SVC
                 'clf__C': np.logspace(0, 4, 6,10),
                 'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']},
                {'clf': [GradientBoostingClassifier()], #XGB
                 'clf__n_estimators': [20,50,100,150]}]

clf_algos_rand = RandomizedSearchCV(estimator=pipe_dt, 
                                    param_distributions=search_space, 
                                    n_iter=25,
                                    cv=5, 
                                    n_jobs=-1,
                                    verbose=10)

## Pipeline - Model training, PCA and SMOTE

In [8]:
pipe = make_pipeline(preprocessing, 
                     PCA(n_components=10),
                     imblearn.over_sampling.SMOTE(sampling_strategy='auto',
                                                  k_neighbors=15,
                                                  n_jobs=-1),
                     clf_algos_rand)


pipe.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('ohe',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Gender', 'Education_Level',
                                                   'Income_Category',
                                                   'Marital_Status',
                                                   'Card_Category']),
                                                 ('bin_categorical',
                                                  Pipeline(steps=[('im

## Valuation metric 

In [142]:
y_test   = y_test.values.ravel()
y_pred   = pipe.predict(X_test)
c_report_test = classification_report(y_test, y_pred)
print(c_report_test)

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1700
           1       0.88      0.92      0.90       326

    accuracy                           0.97      2026
   macro avg       0.93      0.95      0.94      2026
weighted avg       0.97      0.97      0.97      2026



In [16]:
probs = pipe.predict_proba(X_test)
probs = probs[:,1]
auc = roc_auc_score(y_test, probs)
print(" - AUC score: ",'{:.3f}'.format(auc))

 - AUC score:  0.934
