In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score

In [3]:
df= pd.read_csv('data/Churn_Modelling.csv')

In [4]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [5]:
#Drop some of the irrelevant features
df = df.drop(labels= ['RowNumber','CustomerId', 'Surname'], axis=1)

In [6]:
#one_hot code the features : Geography & Gender
geography = pd.get_dummies(df['Geography'], drop_first=True)
gender = pd.get_dummies(df['Gender'], drop_first= True)

In [7]:
# add these 2 new features to dataframe and drop the old features
df= pd.concat([df,geography,gender], axis=1)

In [8]:
churn_df= df.drop(labels=['Geography', 'Gender'], axis=1)

In [9]:
churn_df.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Germany,Spain,Male
0,619,42,2,0.0,1,1,1,101348.88,1,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,1,0
2,502,42,8,159660.8,3,1,0,113931.57,1,0,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,1,0


In [10]:
X= churn_df.drop('Exited', axis=1)
y= churn_df['Exited']

In [11]:
X_train, X_test,y_train, y_test= train_test_split(X,y, test_size=0.25, random_state= 9)

In [12]:
#lets build the model

model = XGBClassifier()
model.fit(X_train, y_train)
y_pred= model.predict(X_test)
y_train_pred= model.predict(X_train)

In [13]:
print('Train Accuracy :', metrics.accuracy_score(y_train, y_train_pred ))
print('Test Accuracy :', metrics.accuracy_score(y_pred, y_test))

Train Accuracy : 0.9569333333333333
Test Accuracy : 0.8508


In [14]:
print(metrics.classification_report(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.94      0.91      1985
           1       0.68      0.51      0.59       515

    accuracy                           0.85      2500
   macro avg       0.78      0.73      0.75      2500
weighted avg       0.84      0.85      0.84      2500

[[1862  123]
 [ 250  265]]


# Hyper-Parameter Tuning using RandomizedSearch Cross-Validation

The above Model Over_Fitted Badly. we can over-come this situation by performing Hyper-Parameter Tuning using CV techniques.

In [15]:
from sklearn.model_selection import RandomizedSearchCV

In [16]:
params= {
      'learning_rate'    : [0.01, 0.05, 0.10, 0.15, 0.2, 0.25 ],
    'max_depth'        : [2, 3, 4, 5, 6,7],
    'min_child_weight' : [1, 1.3, 1.5, 1.7, 1.9],
    'gamma'            : [0.1,0.2,0.3,0.4],
    'n_estimators'     : [50,80,100,120]
}

In [17]:
RSearch_cv= RandomizedSearchCV(model, param_distributions= params, n_iter= 5, scoring= 'roc_auc', n_jobs= -1, cv=5,verbose= 7)

In [18]:
RSearch_cv.fit(X,y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           callbacks=None, colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1,
                                           early_stopping_rounds=None,
                                           enable_categorical=False,
                                           eval_metric=None, gamma=0, gpu_id=-1,
                                           grow_policy='depthwise',
                                           importance_type=None,
                                           interaction_constraints='',
                                           learning_rate=0.300000012,
                                           max_bin=256,...
                                           monotone_constraints='()',
                                           n_estimators=100, n_jobs=

In [19]:
RSearch_cv.best_params_

{'n_estimators': 120,
 'min_child_weight': 1.7,
 'max_depth': 3,
 'learning_rate': 0.15,
 'gamma': 0.2}

In [20]:
#I am passing the all the parameters list those given by RandomizedSearchCV
model = XGBClassifier(base_score=0.5, booster='gbtree',
                                           callbacks=None, colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1,
                                           early_stopping_rounds=None,
                                           enable_categorical=False,
                                           eval_metric=None, gamma=0, gpu_id=-1,
                                           grow_policy='depthwise',
                                           importance_type=None,
                                           interaction_constraints='',
                                           learning_rate=0.300000012,
                                           max_bin=256,
                                           n_estimators=100, n_jobs=0,
                                           num_parallel_tree=1,
                                           predictor='auto', random_state=0,
                                           reg_alpha=0, reg_lambda=1)

In [22]:
score= cross_val_score(model, X,y)

In [23]:
score

array([0.8565, 0.853 , 0.8535, 0.8675, 0.843 ])

In [24]:
score.mean()

0.8547

# Lets apply the params those gave by RandomizedSearchCV in the model and I'm calling it as generalised model

In [25]:
gen_model = XGBClassifier(n_estimators= 80,
                          min_child_weight= 1.3,
                          max_depth = 5,
                          learning_rate = 0.05,
                          gamma = 0.3)

In [26]:
gen_model.fit(X_train, y_train)
y_pred= gen_model.predict(X_test)
y_train_pred= gen_model.predict(X_train)

In [27]:
print('Train Accuracy :', metrics.accuracy_score(y_train, y_train_pred ))
print('Test Accuracy :', metrics.accuracy_score(y_pred, y_test))

Train Accuracy : 0.8741333333333333
Test Accuracy : 0.8624
