
#Question.
Predicting Beta Bank customer churning
#Background.
Customers of Beta bank are churning their accounts every month. The bank is focused on customer retention rather than new customer aquisition. We need to predict whether a customer will leave the bank soon.

In [14]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split




In [15]:
#Data Exploration
churn_data = pd.read_csv('Churn.csv')
churn_data.sample()
churn_data.head()
churn_data.tail()
churn_data.shape
#data types, missing values,unique values.
churn_data.dtypes #all variable are expected data type
churn_data.isnull().sum()
churn_data.nunique()

RowNumber          10000
CustomerId         10000
Surname             2932
CreditScore          460
Geography              3
Gender                 2
Age                   70
Tenure                11
Balance             6382
NumOfProducts          4
HasCrCard              2
IsActiveMember         2
EstimatedSalary     9999
Exited                 2
dtype: int64

In [16]:
#Data preparation1;
#standardize column names
churn_data.columns = churn_data.columns.str.lower().str.strip()
churn_data['tenure'].unique()


array([ 2.,  1.,  8.,  7.,  4.,  6.,  3., 10.,  5.,  9.,  0., nan])

In [17]:
#Data preparation2;
#replacing the missing value with mean of the tenure, close to 10% with missing value
churn_data.loc[churn_data['tenure'].isnull(), 'tenure'] = float(churn_data['tenure'].mean())

In [18]:
#Data preparation3;
#drop columns 
churn_data.drop(columns=['rownumber', 'customerid', 'surname'], inplace=True)

# Data modelling
 1. Create and train
 2. validate 
 3. test the model.

In [19]:
churn_data['geography'].unique()
churn_data.head()

Unnamed: 0,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited
0,619,France,Female,42,2.0,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8.0,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1.0,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.1,0


In [20]:

churn_data = pd.get_dummies(churn_data, drop_first=True)
churn_data.head()

Unnamed: 0,creditscore,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited,geography_Germany,geography_Spain,gender_Male
0,619,42,2.0,0.0,1,1,1,101348.88,1,0,0,0
1,608,41,1.0,83807.86,1,0,1,112542.58,0,0,1,0
2,502,42,8.0,159660.8,3,1,0,113931.57,1,0,0,0
3,699,39,1.0,0.0,2,0,0,93826.63,0,0,0,0
4,850,43,2.0,125510.82,1,1,1,79084.1,0,0,1,0


In [21]:
#Construct a train dataset
train_df = churn_data.copy()

#create a training and validation dataset from the dataset
train_df, valid_df = train_test_split(train_df, test_size=0.25, random_state=1234)
print(train_df.shape)
print(valid_df.shape)

(7500, 12)
(2500, 12)


In [22]:
#create features and target for both train and test
features_train = train_df.drop(columns=['exited'])
target_train = train_df['exited']
features_valid = valid_df.drop(columns=['exited'])
target_valid = valid_df['exited']

#create a model for Decision Trees, Random Forest and Logistic Regression
#model for Decision Trees, declare and find the ideal depth for the tree
for d in range(1, 11, 1):
  tree_model = DecisionTreeClassifier(random_state=1234, max_depth=d)
  tree_model.fit(features_train, target_train)  #train the model
  #check for accuracy
  print(f'Decision tree has accuracy of: {tree_model.score(features_train, target_train)} for depth of: {d}')

#declare model for random forest and find the best n_estimator value
for n in range(1,20,1):
  forest_model = RandomForestClassifier(class_weight='balanced' ,random_state=1234, n_estimators=n)
  forest_model.fit(features_train, target_train)
  print(f'Random forest has accuracy of: {forest_model.score(features_train, target_train)} for n={n}')

#declare a model for logistic regression
log_model = LogisticRegression(random_state=1234, solver='liblinear')
log_model.fit(features_train, target_train)
print(f'UNBalanced: logistic regression has accuracy of: {log_model.score(features_train, target_train)}')
log_model = LogisticRegression(class_weight='balanced', random_state=1234, solver='liblinear')
log_model.fit(features_train, target_train)
print(f'Balanced: logistic regression has accuracy of: {log_model.score(features_train, target_train)}')

#wieghting the classes and comparing the accuracy
forest_model = RandomForestClassifier(random_state=1234, n_estimators=11)
forest_model.fit(features_train, target_train)
print(f'UNBalanced: Random forest has accuracy of: {forest_model.score(features_train, target_train)}')

forest_model = RandomForestClassifier(class_weight='balanced' ,random_state=1234, n_estimators=11)
forest_model.fit(features_train, target_train)
print(f'Balanced: Random forest has accuracy of: {forest_model.score(features_train, target_train)}')




Decision tree has accuracy of: 0.796 for depth of: 1
Decision tree has accuracy of: 0.828 for depth of: 2
Decision tree has accuracy of: 0.8418666666666667 for depth of: 3
Decision tree has accuracy of: 0.8522666666666666 for depth of: 4
Decision tree has accuracy of: 0.8584 for depth of: 5
Decision tree has accuracy of: 0.8650666666666667 for depth of: 6
Decision tree has accuracy of: 0.8733333333333333 for depth of: 7
Decision tree has accuracy of: 0.8817333333333334 for depth of: 8
Decision tree has accuracy of: 0.8921333333333333 for depth of: 9
Decision tree has accuracy of: 0.9021333333333333 for depth of: 10
Random forest has accuracy of: 0.9273333333333333 for n=1
Random forest has accuracy of: 0.9329333333333333 for n=2
Random forest has accuracy of: 0.9686666666666667 for n=3
Random forest has accuracy of: 0.9604 for n=4
Random forest has accuracy of: 0.9784 for n=5
Random forest has accuracy of: 0.972 for n=6
Random forest has accuracy of: 0.9861333333333333 for n=7
Random f

#Findings
Based on above accuracy for Decision tree, logistic regression and Random forest, my recommendation would be random forest which depicts a high prediction accuracy. 