In [51]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [184]:
data = pd.read_csv('Churn_Modelling.csv')

In [185]:
data.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

Dropping the unwanted columns such as RowNumber and Surname.

In [186]:
data.drop(['RowNumber', 'Surname'], axis=1, inplace=True)

In [187]:
data.columns
print(data.shape)

(10000, 12)


Converting the categorical variable to numerical variables using get_dummies function of pandas.

In [188]:
data_encoded = pd.get_dummies(data)
print(data_encoded.shape)

(10000, 15)


In [189]:
data_encoded.columns

Index(['CustomerId', 'CreditScore', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited', 'Geography_France', 'Geography_Germany', 'Geography_Spain',
       'Gender_Female', 'Gender_Male'],
      dtype='object')

In [190]:
data_encoded.drop(['Gender_Male', 'Geography_Spain'], axis=1, inplace=True)

In [191]:
data_encoded.columns

Index(['CustomerId', 'CreditScore', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited', 'Geography_France', 'Geography_Germany', 'Gender_Female'],
      dtype='object')

separating the dependent/target variable and  feature labels.

In [192]:
y = data_encoded.Exited
print(type(y))

<class 'pandas.core.series.Series'>


In [193]:
data_encoded.drop(['Exited'], axis=1, inplace=True)
print(data_encoded.shape)
print(type(data_encoded))

(10000, 12)
<class 'pandas.core.frame.DataFrame'>


In [194]:
data_encoded.columns

Index(['CustomerId', 'CreditScore', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Geography_France', 'Geography_Germany', 'Gender_Female'],
      dtype='object')

In [195]:
data_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
CustomerId           10000 non-null int64
CreditScore          10000 non-null int64
Age                  10000 non-null int64
Tenure               10000 non-null int64
Balance              10000 non-null float64
NumOfProducts        10000 non-null int64
HasCrCard            10000 non-null int64
IsActiveMember       10000 non-null int64
EstimatedSalary      10000 non-null float64
Geography_France     10000 non-null uint8
Geography_Germany    10000 non-null uint8
Gender_Female        10000 non-null uint8
dtypes: float64(2), int64(7), uint8(3)
memory usage: 732.5 KB


Using DecisionTree Classifier. I am starting with decision trees because data doesn't need to be scaled for decision trees. Decision trees also give us feature importance and generally perform well in decision making tasks

In [196]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

Dividing the data into train, test and validation set. 10 percent data is assigned to each test and validation set. training set consist of 80 percent of the data.

In [197]:
X_t, X_test, y_t, y_test = train_test_split(data_encoded, y, test_size=0.10, random_state=1)

In [198]:
X_train, X_val, y_train, y_val = train_test_split(X_t, y_t, test_size=0.10, random_state=1)

While creating an instance of DecisionTree Classifier, the classa-weight parameter is seet to 'balanced'. This is to address the issue of class imbalance. 

In [199]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
Dt = DecisionTreeClassifier(class_weight='balanced', random_state=1)

In [200]:
cv_results = cross_val_score(Dt, X_train, y_train, cv=10)

In [201]:
from sklearn.metrics import fbeta_score

In [202]:
print(cv_results)
Dt.fit(X_train,y_train)
y_pred_val = Dt.predict(X_val)

[0.79654747 0.77654321 0.81111111 0.79012346 0.78518519 0.81111111
 0.7691358  0.79753086 0.79753086 0.80098888]


The highest accuracy provided by Decision Tree classifier is 81 percent. Also the F-beta score and ROC-AUC score of the model on validation set is 0.44 and 0.65 respectively.
So I will perform hyperparameter tuning and try to create a more accurate version of this model.

In [204]:
fscore = fbeta_score(y_val, y_pred_val, beta=1)
print('F-beta score for Decision tree Model:', fscore)
roc_score = roc_auc_score(y_val, y_pred_val)
print("roc-auc score on validation set:",roc_score )

F-beta score for Decision tree Model: 0.4432989690721649
roc-auc score on validation set: 0.6506142506142507


In [121]:
Dt.get_params

<bound method BaseEstimator.get_params of DecisionTreeClassifier(class_weight='balanced', criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')>

I am going to tune the hyperparameters -'max_depth', 'min_samples_leaf', 'max_features' and 'criterion' using GridSearchCV 

In [206]:
params = {'max_depth':[15,16,17,18,19,20,21,22],
         'min_samples_leaf':[1,2,3,4,5],
         'max_features':[1,2,3,4,5,6,7,8,9,10,11,12],
         'criterion':['gini','entropy']}
grid_dt = GridSearchCV(Dt, param_grid=params, scoring='accuracy', cv=10, n_jobs=-1)

In [208]:
grid_dt.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight='balanced', criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': [15, 16, 17, 18, 19, 20, 21, 22], 'min_samples_leaf': [1, 2, 3, 4, 5], 'max_features': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

The best parameters after tuning are 'criterion': 'gini', 'max_depth': 19, 'max_features': 6, 'min_samples_leaf': 1. Now I will select the best model and fit to the training data and check the FBeta score and ROC-AUC score on validation set.

In [210]:
print(grid_dt.best_params_)

{'criterion': 'gini', 'max_depth': 19, 'max_features': 6, 'min_samples_leaf': 1}


In [211]:
print(grid_dt.best_score_)

0.8048148148148148


In [212]:
best_model = grid_dt.best_estimator_
best_model.fit(X_train,y_train)

DecisionTreeClassifier(class_weight='balanced', criterion='gini',
            max_depth=19, max_features=6, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')

In [213]:
pred_y_val_best = best_model.predict(X_val)
print(fbeta_score(y_val, pred_y_val_best, beta=1))


0.4456521739130435


In [214]:
from sklearn.metrics import recall_score
print(recall_score(y_val, pred_y_val_best))
from sklearn.metrics import accuracy_score

0.44324324324324327


In [215]:
acc =  accuracy_score(y_val,pred_y_val_best)
print("accuracy of Decision tree on validation set", acc)

accuracy of Decision tree on validation set 0.7733333333333333


In [216]:
from sklearn.metrics import mean_squared_error as MSE
print((MSE(y_val, pred_y_val_best))**(1/2))

0.4760952285695233


In [141]:
from sklearn.ensemble import AdaBoostClassifier
dt = DecisionTreeClassifier(max_depth=5)
adb = AdaBoostClassifier(base_estimator= dt, n_estimators=100)
adb.fit(X_train,y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=1.0, n_estimators=100, random_state=None)

In [142]:
y_pred_proba_val = adb.predict_proba(X_val)[:,1]
roc_auc_score_adb = roc_auc_score(y_val, y_pred_proba_val)
print(roc_auc_score_adb)

0.7080929880929883


In [143]:
y_pred_adb = adb.predict(X_val)

In [145]:
print(fbeta_score(y_val, y_pred_adb, beta=0.5))

0.468187274909964


In [148]:
param_ada = {'n_estimators':[100,150,200,250,300]}
ada = AdaBoostClassifier()
grid_ab =  GridSearchCV(ada, param_grid=param_ada, cv=10, n_jobs=-1)
grid_ab.fit(X_train,y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [100, 150, 200, 250, 300]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [150]:
print(grid_ab.best_params_)
print(grid_ab.best_score_)

{'n_estimators': 100}
0.855925925925926


In [151]:
model_ada = grid_ab.best_estimator_
model_ada.fit(X_train, y_train)


AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=100, random_state=None)

In [152]:
y_ada_prob=model_ada.predict_proba(X_val)[:,1]
y_ada = model_ada.predict(X_val)

In [153]:
print(roc_auc_score(y_val, y_ada_prob))
print(fbeta_score(y_val, y_ada, beta=1))

0.8359402759402759
0.5389610389610389


In [154]:
print(recall_score(y_val, y_ada))

0.4486486486486487


In [155]:
from sklearn.ensemble import GradientBoostingClassifier
sgbt = GradientBoostingClassifier(max_depth=6, subsample=0.8, max_features=0.2, n_estimators=100, random_state=1)

In [156]:
sgbt.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=6,
              max_features=0.2, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=1,
              subsample=0.8, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [159]:
sgbt_y = sgbt.predict(X_val)
sgby_y_prob = sgbt.predict_proba(X_val)[:,1]

In [160]:
print(fbeta_score(y_val, sgbt_y, beta=1))
print(roc_auc_score(y_val, sgby_y_prob))

0.575079872204473
0.8615309015309016


In [161]:
sgbt.get_params

<bound method BaseEstimator.get_params of GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=6,
              max_features=0.2, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=1,
              subsample=0.8, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)>

In [164]:
params_sgbt = {'max_depth':[1,2,3,4,5,6,7,8,9,10],
              'n_estimators':[100,150,200,250,300]}
sgb = GradientBoostingClassifier(random_state=2)
grid_sgbt = GridSearchCV(sgb, param_grid=params_sgbt,cv=10, n_jobs=-1)

In [168]:
y_test_pred = sgbt.predict(X_test)
sgbt_y_prob = sgbt.predict_proba(X_test)[:,1]

In [175]:
print(fbeta_score(y_test, y_test_pred, beta=1))
print(roc_auc_score(y_test, sgbt_y_prob))
print((MSE(y_test, y_test_pred))**(1/2))

0.5823529411764706
0.8724704016722828
0.37682887362833545


In [170]:
print(y_test_pred)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0
 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0
 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 1 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 