Import required libraries.

In [379]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

In [380]:
data = pd.read_csv('datasets/Churn.csv')

In [381]:
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2.0,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8.0,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1.0,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.1,0


In [382]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           9091 non-null   float64
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(3), int64(8), object(3)
memory usage: 1.1+ MB


### Feature Preparation

I will drop some columns that are not useful in training the models.

In [383]:
data.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1, inplace=True)

I will fill in the missing values in `Tenure` with 0, because these customers may not have fixed deposits thus the period of maturation cannot be determined.

In [384]:
data['Tenure'].fillna(0, inplace=True)

I will use One-Hot Encoding to transform categorical features into numerical features.

In [385]:
pd.get_dummies(data['Geography']).head()

Unnamed: 0,France,Germany,Spain
0,1,0,0
1,0,0,1
2,1,0,0
3,1,0,0
4,0,0,1


In [386]:
pd.get_dummies(data['Gender']).head()

Unnamed: 0,Female,Male
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0


In [388]:
data = pd.get_dummies(data, drop_first=True)

In [389]:
data.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,619,42,2.0,0.0,1,1,1,101348.88,1,0,0,0
1,608,41,1.0,83807.86,1,0,1,112542.58,0,0,1,0
2,502,42,8.0,159660.8,3,1,0,113931.57,1,0,0,0
3,699,39,1.0,0.0,2,0,0,93826.63,0,0,0,0
4,850,43,2.0,125510.82,1,1,1,79084.1,0,0,1,0


In [390]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CreditScore        10000 non-null  int64  
 1   Age                10000 non-null  int64  
 2   Tenure             10000 non-null  float64
 3   Balance            10000 non-null  float64
 4   NumOfProducts      10000 non-null  int64  
 5   HasCrCard          10000 non-null  int64  
 6   IsActiveMember     10000 non-null  int64  
 7   EstimatedSalary    10000 non-null  float64
 8   Exited             10000 non-null  int64  
 9   Geography_Germany  10000 non-null  uint8  
 10  Geography_Spain    10000 non-null  uint8  
 11  Gender_Male        10000 non-null  uint8  
dtypes: float64(3), int64(6), uint8(3)
memory usage: 732.5 KB


In [392]:
data.describe()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,650.5288,38.9218,4.5434,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037,0.2509,0.2477,0.5457
std,96.653299,10.487806,3.111573,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769,0.433553,0.431698,0.497932
min,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0,0.0,0.0,0.0
25%,584.0,32.0,2.0,0.0,1.0,0.0,0.0,51002.11,0.0,0.0,0.0,0.0
50%,652.0,37.0,4.0,97198.54,1.0,1.0,1.0,100193.915,0.0,0.0,0.0,1.0
75%,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0,1.0,0.0,1.0
max,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0,1.0,1.0,1.0


Some features have different scales; the magnitudes of values and dispersion are higher.

With feature scaling, I will standardize the data.

In [393]:
numeric = ['CreditScore', 'Age', 'Balance', 'EstimatedSalary']

scaler = StandardScaler()
data[numeric] = pd.DataFrame(scaler.fit_transform(data[numeric]))

In [394]:
data.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_Germany,Geography_Spain,Gender_Male
0,-0.326221,0.293517,2.0,-1.225848,1,1,1,0.021886,1,0,0,0
1,-0.440036,0.198164,1.0,0.11735,1,0,1,0.216534,0,0,1,0
2,-1.536794,0.293517,8.0,1.333053,3,1,0,0.240687,1,0,0,0
3,0.501521,0.007457,1.0,-1.225848,2,0,0,-0.108918,0,0,0,0
4,2.063884,0.388871,2.0,0.785728,1,1,1,-0.365276,0,0,1,0


Split the data into training, validation and test set, in 60-20-20 proportions, using the test_train_split function.

In [395]:
features = data.drop('Exited', axis=1)
target = data['Exited']

In [396]:
features_train, features_test, target_train, target_test = train_test_split(features, 
                                                                            target, test_size=0.2, random_state=12345)

features_train, features_valid, target_train, target_valid = train_test_split(features_train, target_train, 
                                                                              test_size=0.25, random_state=12345) 

### Balance of the classes:

For a class to be balanced, the accuracy of the class should be equal to the accuracy of the constant model.

I will create a constant model that predicts 0 for any observation.
Then I will find the accuracy of the constant model.

In [397]:
target_pred_constant = pd.Series(1, index=target.index)

print(accuracy_score(target, target_pred_constant))

0.2037


In [398]:
target.mean()

0.2037

In [399]:
target.value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

**Since the accuracy is far from 1, this means the classes are imbalanced, and this imbalance will affect how the model is trained.**

1. Decision Tree Classifier 

In [400]:
#a loop for max_depth from 1 to 5
for i in range(1,11):
    model_dt = DecisionTreeClassifier(random_state=12345, max_depth=i)
    model_dt.fit(features_train, target_train)
    dt_valid_predictions = model_dt.predict(features_valid)
    f1_score_dt = f1_score(target_valid, dt_valid_predictions)
    print('max_depth = ', i , ':', f1_score_dt)

max_depth =  1 : 0.0
max_depth =  2 : 0.5037037037037037
max_depth =  3 : 0.39382239382239387
max_depth =  4 : 0.4318181818181818
max_depth =  5 : 0.5457364341085271
max_depth =  6 : 0.5087719298245614
max_depth =  7 : 0.5497630331753556
max_depth =  8 : 0.5353846153846155
max_depth =  9 : 0.5349182763744428
max_depth =  10 : 0.5323741007194244


**The highest F1 score of 55%, is achieved at a max_depth of 7.**

2. Random Forest Classifier

In [401]:
#a loop for number of estimators from 1 to 10:
for i in range(10, 21):
    model_rf = RandomForestClassifier(random_state=12345, n_estimators=100, max_depth=i)
    model_rf.fit(features_train, target_train)
    rf_valid_predictions = model_rf.predict(features_valid)
    f1_rf = f1_score(target_valid, rf_valid_predictions)
    print(i, f1_rf)

10 0.5572139303482587
11 0.5540983606557377
12 0.5548387096774193
13 0.5617977528089887
14 0.5573248407643312
15 0.5599999999999999
16 0.5555555555555556
17 0.5637795275590551
18 0.5587301587301587
19 0.5583596214511041
20 0.5521669341894061
21 0.5548387096774193
22 0.5636942675159236
23 0.5623003194888179
24 0.5645933014354066
25 0.5568000000000001
26 0.5668789808917197
27 0.5668789808917197
28 0.5668789808917197
29 0.5668789808917197
30 0.5668789808917197


**The highest F1 score is 56% at 100 trees and a depth of 13.**

In [402]:
model_lr = LogisticRegression(random_state=12345, solver='liblinear')
model_lr.fit(features_train, target_train)
lr_valid_predictions = model_lr.predict(features_valid)
f1_lr = f1_score(target_valid, lr_valid_predictions)
print(f1_lr)

0.3050847457627119


**Logistic Regression has the least F1 score of 30%.**

### Fixing the class imbalance to improve model quality
1. Upsampling:
    - increasing the sample size
    - increases thenumber ofobservations by duplicating the rarer class observations several times.
    
The `upsample` function will:
- Split the training sample by class
- Determine the class with fewer observations 
- Duplicate the rarer class observations several times
- Create a new training sample based on the data obtained
- Shuffle the data


In [403]:
def upsample(features, target, repeat):
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]

    features_upsampled = pd.concat([features_zeros] + [features_ones] * repeat)
    target_upsampled = pd.concat([target_zeros] + [target_ones] * repeat)
    
    features_upsampled, target_upsampled = shuffle(
        features_upsampled, target_upsampled, random_state=12345)
    
    return features_upsampled, target_upsampled

features_upsampled, target_upsampled = upsample(features_train, target_train, 4)

In [404]:
target_upsampled.value_counts()

1    4876
0    4781
Name: Exited, dtype: int64

The classes are now balanced, and can be used to train the models.

In [405]:
model_dt = DecisionTreeClassifier(random_state=12345, max_depth=7)
model_dt.fit(features_upsampled, target_upsampled)
dt_valid_predictions = model_dt.predict(features_valid)
f1_score_dt = f1_score(target_valid, dt_valid_predictions)
print(f1_score_dt)

0.5537525354969574


In [406]:
model_rf = RandomForestClassifier(random_state=12345, n_estimators=100, max_depth=12)
model_rf.fit(features_upsampled, target_upsampled)
predicted_valid = model_rf.predict(features_valid)
print("F1:", f1_score(target_valid, predicted_valid))

F1: 0.5928659286592864


In [407]:
model_lr = LogisticRegression(random_state=12345, solver='liblinear')
model_lr.fit(features_upsampled, target_upsampled)
lr_valid_predictions = model_lr.predict(features_valid)
f1_lr = f1_score(target_valid, lr_valid_predictions)
print(f1_lr)

0.4792219274977895


### The F1 score of the models have improved significantly after upsampling.

2. Downsampling:
    - decreases the number of observations by randomly dropping the majority class observations.
    
The `downsample` function will perform the same as the `upsample`. The only difference is that instead of duplicating the rarer observations, `downsample` will randomly drop a fraction of the majority class observations.

In [408]:
def downsample(features, target, fraction):
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]

    features_downsampled = pd.concat(
        [features_zeros.sample(frac=fraction, random_state=12345)] + [features_ones])
    target_downsampled = pd.concat(
        [target_zeros.sample(frac=fraction, random_state=12345)] + [target_ones])
    
    features_downsampled, target_downsampled = shuffle(
        features_downsampled, target_downsampled, random_state=12345)
    
    return features_downsampled, target_downsampled

features_downsampled, target_downsampled = downsample(features_train, target_train, 0.25)

In [409]:
target_downsampled.value_counts()

1    1219
0    1195
Name: Exited, dtype: int64

In [410]:
model_dt = DecisionTreeClassifier(random_state=12345, max_depth=7)
model_dt.fit(features_downsampled, target_downsampled)
dt_valid_predictions = model_dt.predict(features_valid)
f1_score_dt = f1_score(target_valid, dt_valid_predictions)
print(f1_score_dt)

0.5459662288930582


In [411]:
model_rf = RandomForestClassifier(random_state=12345, n_estimators=100, max_depth=12)
model_rf.fit(features_downsampled, target_downsampled)
predicted_valid = model_rf.predict(features_valid)
print("F1:", f1_score(target_valid, predicted_valid))

F1: 0.5574712643678161


In [412]:
model_lr = LogisticRegression(random_state=12345, solver='liblinear')
model_lr.fit(features_downsampled, target_downsampled)
lr_valid_predictions = model_lr.predict(features_valid)
f1_lr = f1_score(target_valid, lr_valid_predictions)
print(f1_lr)

0.4761904761904761


### The downsampled F1 score has improved, in comparison to the imbalanced models, but it is lower than the upsampled models.

### Testing the selected model.

In [413]:
model_rf = RandomForestClassifier(random_state=12345, n_estimators=100, max_depth=12)
model_rf.fit(features_upsampled, target_upsampled)
predicted_test = model_rf.predict(features_test)
f1_test = f1_score(target_test, predicted_test)
print(f1_test)

0.6334106728538283


### Calculating the AU-ROC

AUC-ROC is the area under the curve of the ROC. It is an evaluation metric with values in the range from 0 to 1.

The higher the ROC curve, the greater the TPR value and the better the model's quality.
   - Therefore, the higher the AUC-ROC, the better the model's quality.

In [414]:
probabilities_test = model_rf.predict_proba(features_test)
probabilities_one_test = probabilities_test[:, 1]
auc_roc = roc_auc_score(target_test, probabilities_one_test)
print(auc_roc)

0.8606222391617324


**F1 is 0.63, while AUC-ROC is 0.86**

**Since F1 is the harmonic mean of *recall* and *precision* , and it is closer to 1, this means that the quality of the model in identifying true positives is good.**

**AUC-ROC is also closer to 1. This means that the model has better quality.**

**The balanced Random Forest Classifier with 100 trees and depth of 12 can be used to predict whether a customer will leave the bank soon, because of it's good quality.**