In [2]:
# importing libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## 1. Balanced Dataset

### 1. Loading the data

In [3]:
#read the data
path = "../input/predicting-churn-for-bank-customers/Churn_Modelling.csv"
df = pd.read_csv(path)
df.head()

In [4]:
df.info()

In [5]:
df.isnull().sum()

In [6]:
#drop columsn not needed for prediction
df = df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)
df.head()

### 2.EDA

In [7]:
#plotting a pie chart
retained, exited = df["Exited"].value_counts()[0], df["Exited"].value_counts()[1]
explode = (0, 0.1)

fig1, ax1 = plt.subplots(figsize=(9, 7))
ax1.pie([retained, exited], explode=explode, labels=["retained", "exited"], autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')
plt.title("Proportion of customer churned and retained", size = 20)
plt.show()

From above pie chart,we can see that around 20% of customers had churned i.e exited and 80% retained.This shows that our dataset is a little imbalanced so we have to predict customer churn with a good accuracy as this 20% customers are of more interest to the bank

#### Feature EDA plots

In [8]:
#geography
sns.countplot(x='Geography', hue = 'Exited', data = df)
plt.title("Countplot-Geography Column")

In [9]:
#gender
sns.countplot(x='Gender', hue = 'Exited', data = df)
plt.title("Countplot-Gender Column")

In [10]:
#HasCrCard
sns.countplot(x='HasCrCard', hue = 'Exited', data = df)
plt.title("Countplot-HasCrCard Column")

In [11]:
#IsActiveMember
sns.countplot(x='IsActiveMember', hue = 'Exited', data = df)
plt.title("Countplot-IsActiveMember Column")

From the above countplots we can infer that-  
1.Total umber of customers who retained is highest from France and those who exited are highest from Germany,which means the bank needs to focus more on customers from Germany followed by France so that they don't churn.  
2. The proportion of female customers churning is greater than that of male customers.  
3. Suprisingly,coustomers who had credit card churned more which can be a coincidence.  
4. As usual,the inactive members churned more.  

In [12]:
# Relations based on the continuous data attributes
fig, axes = plt.subplots(3, 2, figsize=(20, 12))
sns.boxplot(y='CreditScore',x = 'Exited', hue = 'Exited',data = df, ax=axes[0][0]).set_title('Boxplot- Credit Score Column')
sns.boxplot(y='Age',x = 'Exited', hue = 'Exited',data = df , ax=axes[0][1]).set_title('Boxplot- Age Column')
sns.boxplot(y='Tenure',x = 'Exited', hue = 'Exited',data = df, ax=axes[1][0])#.set_title('Boxplot- Tenure Column')
sns.boxplot(y='Balance',x = 'Exited', hue = 'Exited',data = df, ax=axes[1][1])#.set_title('Boxplot- Balance Column')
sns.boxplot(y='NumOfProducts',x = 'Exited', hue = 'Exited',data = df, ax=axes[2][0])#.set_title('Boxplot- NumProducts Column')
sns.boxplot(y='EstimatedSalary',x = 'Exited', hue = 'Exited',data = df, ax=axes[2][1])#.set_title('Boxplot- EstimatedSalary Column')

From the above boxplots we can infer that-  
-- There is no significant difference in Credit score,estimated salary and number of products they possess between customers who churned and who don't.  
-- The older customers are churning more than the young ones which indicates that the bank need to focus on older customers more.  
-- Customers with tenure period with bank either too less or too more tends to churn more.   
-- Customers who churned generally have more bank balance which is a bad indications as it will lead to capital deficiency in the bank.  

### 3. Feature Engineering

In [13]:
# 1st Attribute - Balance Salary Ratio
df["BalanceSalaryRatio"] = df["Balance"]/df["EstimatedSalary"]
sns.boxplot(y = "BalanceSalaryRatio", x = "Exited", hue = "Exited", data=df)
plt.ylim(-1, 5)
plt.show()

In [14]:
#  2nd Attribute-Tenure By Age
df['TenureByAge'] = df.Tenure/(df.Age)
sns.boxplot(y='TenureByAge',x = 'Exited', hue = 'Exited',data = df)
plt.ylim(-0.2, 0.7)
plt.show()

In [15]:
# 3rd Attribute- Credit Score Given Age
df['CreditScoreGivenAge'] = df.CreditScore/(df.Age)
sns.boxplot(y='CreditScoreGivenAge',x = 'Exited', hue = 'Exited',data = df)
plt.show()

In [16]:
df.shape

### 4. Data Preparation for the Model fitting

In [17]:
# Arranging columns by data type for easier manipulation

continuous_vars = ['CreditScore',  'Age', 'Tenure', 'Balance','NumOfProducts', 'EstimatedSalary', 'BalanceSalaryRatio',
                   'TenureByAge','CreditScoreGivenAge']
categorical_vars = ['HasCrCard', 'IsActiveMember','Geography', 'Gender']
label = ["Exited"]

#### Correlation Matrix for continuous attributes

In [18]:
sns.set()
sns.set(font_scale = 1.25)
sns.heatmap(df[continuous_vars].corr(), annot = True,fmt = ".1f")
plt.show()

##### Almost all of the columns aren't related with other columns except the features we have generated.

### Dealing with categorical features

In [19]:
# Changing values of column HasCrCard and IsActiveMember from 0 to -1 so that they will influence negatively to the model instead of no effect.
df.loc[df.HasCrCard == 0, 'HasCrCard'] = -1
df.loc[df.IsActiveMember == 0, 'IsActiveMember'] = -1

In [20]:
print("Unique categories in column Gender", df["Gender"].nunique())
print("Unique categories in column Geography", df["Geography"].nunique())

In [21]:
from sklearn.preprocessing import LabelEncoder 

le1 = LabelEncoder() 
le2 = LabelEncoder() 

df['Gender']= le1.fit_transform(df['Gender']) 
df['Geography']= le2.fit_transform(df['Geography']) 


#Get the mappings: Ascending order
print("Mappings for Gender is:", le1.classes_)
print("Mappings for Geography is:", le2.classes_)

In [22]:
df.head()

In [23]:
df1 = pd.get_dummies(data=df, columns=['Gender','Geography'])
df1.columns

In [24]:
df1.head()

#### Scaling the continuous attributes using MinMaxScaler

In [25]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df1[continuous_vars] = scaler.fit_transform(df1[continuous_vars])

In [26]:
df1.head()

#### 5.Model fitting and selection

In [27]:
# Support functions
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from scipy.stats import uniform

# Fit models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Scoring functions
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [28]:
#Get the features and the labels: Stratified sampling is preferred here
X = df1.drop('Exited',axis='columns')
y = df1['Exited']


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,stratify=y, random_state=5)

In [29]:
#Get the distribution of labels in y_train
y_train.value_counts()

In [30]:
#Get the distribution of labels in y_test
y_test.value_counts()

### Figuring out the importance of features in our dataset

In [31]:
features_label = X_train.columns
forest = RandomForestClassifier (n_estimators = 1000, random_state = 0, n_jobs = -1)
forest.fit(X_train, y_train)
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
for i in range(X.shape[1]):
    print ("%2d) %-*s %f" % (i + 1, 30, features_label[i], importances[indices[i]]))

In [32]:
# Visualization of the Feature importances
plt.title('Feature Importances')
plt.bar(range(X_train.shape[1]), importances[indices], color = "green", align = "center")
plt.xticks(range(X_train.shape[1]), features_label, rotation = 90)
plt.show()

### Grid Search for the best model & model parameters

In [33]:
### getting the best suited parameters from the dataset
def best_model(model):
    print(model.best_score_)    
    print(model.best_params_)
    print(model.best_estimator_)


def get_auc_scores(y_actual, method,method2):
    auc_score = roc_auc_score(y_actual, method); 
    fpr_df, tpr_df, _ = roc_curve(y_actual, method2); 
    return (auc_score, fpr_df, tpr_df)

#### 1: Primal Logistic Regression

In [44]:
# ####1: Primal logistic regression

# # Fit primal logistic regression
# param_grid = {'C': [0.1,0.5,1,10,50,100], 'max_iter': [250], 'fit_intercept':[True],'intercept_scaling':[1],
#               'penalty':['l2'], 'tol':[0.0001,0.00001,0.000001]}
# log_primal_Grid = GridSearchCV(LogisticRegression(solver='lbfgs'),param_grid, cv=10, refit=True, verbose=0)
# log_primal_Grid.fit(X_train, y_train)
# best_model(log_primal_Grid)

In [34]:
# Fitting the best primal logistic regression
log_primal = LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,intercept_scaling=1, max_iter=250, multi_class='auto',n_jobs=None, 
                                penalty='l2', random_state=None, solver='lbfgs',tol=1e-05, verbose=0, warm_start=False)
log_primal.fit(X_train,y_train)

#### 2: Poly Logistic regression

In [45]:
# ####2:  Fit logistic regression with pol 2 kernel
# param_grid = {'C': [0.1,10,50], 'max_iter': [300,500], 'fit_intercept':[True],'intercept_scaling':[1],'penalty':['l2'],
#               'tol':[0.0001,0.000001]}
# poly2 = PolynomialFeatures(degree=2)
# X_train_pol2 = poly2.fit_transform(X_train)
# log_pol2_Grid = GridSearchCV(LogisticRegression(solver = 'liblinear'),param_grid, cv=5, refit=True, verbose=0)
# log_pol2_Grid.fit(X_train_pol2,y_train)
# best_model(log_pol2_Grid)

In [35]:
# Fitting the best model basis CV

poly2 = PolynomialFeatures(degree=2)
df_train_pol2 = poly2.fit_transform(X_train)
log_pol2 = LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,intercept_scaling=1, max_iter=300, multi_class='auto', n_jobs=None, 
                              penalty='l2', random_state=None, solver='liblinear',tol=0.0001, verbose=0, warm_start=False)
log_pol2.fit(df_train_pol2,y_train)

#### 3: SVM w/ RBF Kernel

In [48]:
# # Fit SVM with RBF Kernel
# param_grid = {'C': [0.5,100,150], 'gamma': [0.1,0.01,0.001],'probability':[True],'kernel': ['rbf']}
# SVM_grid = GridSearchCV(SVC(), param_grid, cv=10, refit=True, verbose=0)
# SVM_grid.fit(X_train, y_train)
# best_model(SVM_grid)

In [36]:
# Fit SVM with RBF Kernel
SVM_RBF = SVC(C=100, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf', max_iter=-1, probability=True, 
              random_state=None, shrinking=True,tol=0.001, verbose=False)
SVM_RBF.fit(X_train,y_train)

### 4: SVM w/ Pol Kernel

In [51]:
# # Fit SVM with pol kernel
# param_grid = {'C': [0.5,1,10,50,100], 'gamma': [0.1,0.01,0.001],'probability':[True],'kernel': ['poly'],'degree':[2,3] }
# SVM_grid = GridSearchCV(SVC(), param_grid, cv=3, refit=True, verbose=0)
# SVM_grid.fit(X_train, y_train)
# best_model(SVM_grid)

In [37]:
# Fit SVM with Pol Kernel
SVM_POL = SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,  decision_function_shape='ovr', degree=2, gamma=0.1, kernel='poly',  max_iter=-1,
              probability=True, random_state=None, shrinking=True, tol=0.001, verbose=False)
SVM_POL.fit(X_train,y_train)

### 5. RF

In [None]:
# # Fit random forest classifier
# param_grid = {'max_depth': [3, 5, 6, 7, 8], 'max_features': [2,4,6,7,8,9],'n_estimators':[50,100],'min_samples_split': [3, 5, 6, 7]}
# RanFor_grid = GridSearchCV(RandomForestClassifier(), param_grid, cv=3, refit=True, verbose=0)
# RanFor_grid.fit(X_train, y_train)
# best_model(RanFor_grid)

In [38]:
# Fit Random Forest classifier
RF = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',max_depth=8, max_features=7, max_leaf_nodes=None,min_impurity_decrease=0.0,
                            min_impurity_split=None,min_samples_leaf=1, min_samples_split=3,min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
                            oob_score=False, random_state=None, verbose=0,warm_start=False)
RF.fit(X_train,y_train)

### 6. XGBoost

In [None]:
# # Fit Extreme Gradient boosting classifier
# param_grid = {'max_depth': [5,6,7,8], 'gamma': [0.01,0.001,0.001],'min_child_weight':[1,5,10], 'learning_rate': [0.05,0.1, 0.2, 0.3], 'n_estimators':[5,10,20,100]}
# xgb_grid = GridSearchCV(XGBClassifier(), param_grid, cv=3, refit=True, verbose=0)
# xgb_grid.fit(X_train, y_train)
# best_model(xgb_grid)

In [39]:
# Fit Extreme Gradient Boost Classifier
XGB = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,colsample_bytree=1, gamma=0.01, learning_rate=0.1, max_delta_step=0,max_depth=5,
                    min_child_weight=1, n_estimators=100,n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,reg_alpha=0, 
                    reg_lambda=1, scale_pos_weight=1, seed=None,  subsample=1, use_label_encoder =False)
XGB.fit(X_train,y_train)

In [40]:
#Getting the best model among these classifers

print("Normal logistic regression", classification_report(y_train, log_primal.predict(X_train)))
print("2Poly kernel logistic regression", classification_report(y_train,  log_pol2.predict(df_train_pol2)))
print("SVM RBF Kernel", classification_report(y_train,  SVM_RBF.predict(X_train)))
print("SVM with polynomial kernel" ,classification_report(y_train,  SVM_POL.predict(X_train)))
print("Random Forest Classifier", classification_report(y_train,  RF.predict(X_train)))
print("XG Boost", classification_report(y_train,  XGB.predict(X_train)))

##### As can be seen from the metrics above, XGBoost Classifier works the best for our case. Thus, a better strategy would be to further improvise on the output of XGBoost

In [41]:
#Test Data performance

print("Normal logistic regression", classification_report(y_test, log_primal.predict(X_test)))
print("2Poly kernel logistic regression", classification_report(y_test,  log_pol2.predict(poly2.fit_transform(X_test))))
print("SVM RBF Kernel", classification_report(y_test,  SVM_RBF.predict(X_test)))
print("SVM with polynomial kernel" ,classification_report(y_test,  SVM_POL.predict(X_test)))
print("Random Forest Classifier", classification_report(y_test,  RF.predict(X_test)))
print("XG Boost", classification_report(y_test,  XGB.predict(X_test)))

## Part2: Balancing the dataset

#### Removing the imbalance of our dataset by SMOTE oversampling technique

In [42]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='minority')
X_sm, y_sm = smote.fit_resample(X, y)

y_sm.value_counts()

In [43]:
#Now, that we have same number of datalabels in category X and y

X_trainS, X_testS, y_trainS, y_testS = train_test_split(X_sm, y_sm, test_size=0.2, random_state=15, stratify=y_sm)

In [44]:
y_trainS.value_counts()

In [66]:
import warnings
warnings.filterwarnings("ignore")

In [68]:
#Grid_CV on the balanced data for XGBoost

# param_grid = {'max_depth': [5,6,7,8], 'gamma': [0.01,0.001,0.001],'min_child_weight':[1,5,10], 'learning_rate': [0.05,0.1, 0.2, 0.3], 'n_estimators':[5,10,20,100]}
# xgb_grid = GridSearchCV(XGBClassifier(use_label_encoder=False), param_grid, cv=3, refit=True, verbose=0)
# xgb_grid.fit(X_train, y_train)
# best_model(xgb_grid)

In [45]:
#Refitting the XGB classifier

XGB2 = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,colsample_bytree=1, gamma=0.01, learning_rate=0.2, max_delta_step=0,max_depth=7,
                    min_child_weight=1, n_estimators=100,n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,reg_alpha=0, 
                    reg_lambda=1, scale_pos_weight=1, seed=None,  subsample=1, use_label_encoder=False)
XGB2.fit(X_trainS,y_trainS)

In [70]:
#get the classification report for the training dataset

print(classification_report(y_train,  XGB2.predict(X_train)))

In [49]:
#get the classification report for the training dataset

print(classification_report(y_testS,  XGB2.predict(X_testS)))

#### Precision and Recall values on the original test_data

In [50]:
#getting the classification report for test dataset

print(classification_report(y_test,  XGB2.predict(X_test)))

In [130]:
### Dumping model weights for pickl file

# import joblib 

# XBG_classifier = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,colsample_bytree=1, gamma=0.01, learning_rate=0.2, max_delta_step=0,max_depth=7,
#                     min_child_weight=1, missing=None, n_estimators=100,n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,reg_alpha=0, 
#                     reg_lambda=1, scale_pos_weight=1, seed=None,  subsample=1)


# Save the model as a pickle in a file 
# joblib.dump(XBG_classifier, 'XGB_smote_churnclassifier.pkl') 
  
# Load the model from the file 
# XGB_churn_classify = joblib.load('XGB_smote_churnclassifier.pkl')  
  
# Use the loaded model to make predictions 
# XGB_from_joblib.predict(X_test) 

### Conclusion
We can see that by balancing the dataset has increased our overall testing data accuracy to 91% from 87% on the SMOTE balanced datset. On the original test dataset, the accuracy is 94% with a F-1 score of 0.84 for my churn class