In [73]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from sklearn.metrics import f1_score

In [39]:
data = pd.read_csv('datasets/Churn.csv')

In [40]:
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2.0,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8.0,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1.0,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.1,0
5,6,15574012,Chu,645,Spain,Male,44,8.0,113755.78,2,1,0,149756.71,1
6,7,15592531,Bartlett,822,France,Male,50,7.0,0.0,2,1,1,10062.8,0
7,8,15656148,Obinna,376,Germany,Female,29,4.0,115046.74,4,1,0,119346.88,1
8,9,15792365,He,501,France,Male,44,4.0,142051.07,2,0,1,74940.5,0
9,10,15592389,H?,684,France,Male,27,2.0,134603.88,1,1,1,71725.73,0


In [41]:
data['Gender'].value_counts()

Male      5457
Female    4543
Name: Gender, dtype: int64

In [42]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           9091 non-null   float64
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(3), int64(8), object(3)
memory usage: 1.1+ MB


I will fill in the missing values in `Tenure` with 0, because these customers may not have fixed deposits thus the period of maturation cannot be determined.

In [43]:
data['Tenure'].fillna(0, inplace=True)

In [44]:
data.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,4.5434,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,3.111573,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,2.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,4.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


For feature preparation, I will use Ordinal Encoding to encode textual categories with numbers.

I will use the improrted OrdinalEncoder from sklearn.preprocessing.

In [45]:
encoder = OrdinalEncoder() 
data = pd.DataFrame(encoder.fit_transform(data), columns=data.columns)

In [49]:
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0.0,2736.0,1115.0,228.0,0.0,0.0,24.0,2.0,0.0,0.0,1.0,1.0,5068.0,1.0
1,1.0,3258.0,1177.0,217.0,2.0,0.0,23.0,1.0,743.0,0.0,0.0,1.0,5639.0,0.0
2,2.0,2104.0,2040.0,111.0,0.0,0.0,24.0,8.0,5793.0,2.0,1.0,0.0,5707.0,1.0
3,3.0,5435.0,289.0,308.0,0.0,0.0,21.0,1.0,0.0,1.0,0.0,0.0,4704.0,0.0
4,4.0,6899.0,1822.0,459.0,2.0,0.0,25.0,2.0,3696.0,0.0,1.0,1.0,3925.0,0.0


Split the data into training, validation and test set, in 60-20-20 proportions, using the test_train_split function.

In [47]:
features = data.drop('Exited', axis=1)
target = data['Exited']

In [48]:
features_train, features_test, target_train, target_test = train_test_split(features, 
                                                                            target, test_size=0.2, random_state=12345)

features_train, features_valid, target_train, target_valid = train_test_split(features_train, target_train, 
                                                                              test_size=0.25, random_state=12345) 
# 0.25 x 0.8 = 0.2

### Training models without taking into account the imbalance:
1. Decision Tree Classifier 

In [78]:
#a loop for max_depth from 1 to 5
for i in range(1,11):
    model_dt = DecisionTreeClassifier(random_state=12345, max_depth=i)
    model_dt.fit(features_train, target_train)
    dt_valid_predictions = model_dt.predict(features_valid)
    f1_score_dt = f1_score(target_valid, dt_valid_predictions)
    print('max_depth = ', i , ':', f1_score_dt)

max_depth =  1 : 0.0
max_depth =  2 : 0.5037037037037037
max_depth =  3 : 0.39382239382239387
max_depth =  4 : 0.4336448598130841
max_depth =  5 : 0.5238879736408567
max_depth =  6 : 0.5067114093959731
max_depth =  7 : 0.5213270142180094
max_depth =  8 : 0.5421133231240428
max_depth =  9 : 0.5407854984894259
max_depth =  10 : 0.5198237885462555


**The highest f1 score of 54.2%, is achieved at a max_depth of 8.**

2. Random Forest Classifier

In [83]:
#a loop for number of estimators from 1 to 10:
for i in range(20, 31):
    model_rf = RandomForestClassifier(random_state=12345, n_estimators=30, max_depth=i)
    model_rf.fit(features_train, target_train)
    rf_valid_predictions = model_rf.predict(features_valid)
    f1_rf = f1_score(target_valid, rf_valid_predictions)
    print(i, f1_rf)

20 0.542763157894737
21 0.5409836065573771
22 0.5278688524590164
23 0.555921052631579
24 0.5264900662251656
25 0.5294117647058824
26 0.5294117647058824
27 0.5294117647058824
28 0.5294117647058824
29 0.5294117647058824
30 0.5294117647058824


**The highest f1 score is 55.5% at 30 trees and a depth of 23.**

In [84]:
model_lr = LogisticRegression(random_state=12345, solver='liblinear')
model_lr.fit(features_train, target_train)
lr_valid_predictions = model_lr.predict(features_valid)
f1_lr = f1_score(target_valid, lr_valid_predictions)
print(f1_lr)

0.29411764705882354


**Logistic Regression has the least f1 score of 29%.**

### Fixing the class imbalance to improve model quality
1. Upsampling:
    - increasing the sample size
    - increases thenumber ofobservations by duplicating the rarer class observations several times .

In [70]:
def upsample(features, target, repeat):
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]

    features_upsampled = pd.concat([features_zeros] + [features_ones] * repeat)
    target_upsampled = pd.concat([target_zeros] + [target_ones] * repeat)
    
    features_upsampled, target_upsampled = shuffle(
        features_upsampled, target_upsampled, random_state=12345)
    
    return features_upsampled, target_upsampled

features_upsampled, target_upsampled = upsample(features_train, target_train, 10)

In [86]:
model_dt = DecisionTreeClassifier(random_state=12345, max_depth=8)
model_dt.fit(features_upsampled, target_upsampled)
dt_valid_predictions = model_dt.predict(features_valid)
f1_score_dt = f1_score(target_valid, dt_valid_predictions)
print(f1_score_dt)

0.43994413407821226


In [87]:
model_rf = RandomForestClassifier(random_state=12345, n_estimators=30, max_depth=23)
model_rf.fit(features_upsampled, target_upsampled)
predicted_valid = model_rf.predict(features_valid)
print("F1:", f1_score(target_valid, predicted_valid))

F1: 0.5582089552238807


In [88]:
model_lr.fit(features_upsampled, target_upsampled)
lr_valid_predictions = model_lr.predict(features_valid)
f1_lr = f1_score(target_valid, lr_valid_predictions)
print(f1_lr)

0.39230358097274187


2. Downsampling:
    - decreases thenumber of observations by randomly dropping the majority class observations.

In [75]:
def downsample(features, target, fraction):
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]

    features_downsampled = pd.concat(
        [features_zeros.sample(frac=fraction, random_state=12345)] + [features_ones])
    target_downsampled = pd.concat(
        [target_zeros.sample(frac=fraction, random_state=12345)] + [target_ones])
    
    features_downsampled, target_downsampled = shuffle(
        features_downsampled, target_downsampled, random_state=12345)
    
    return features_downsampled, target_downsampled

features_downsampled, target_downsampled = downsample(features_train, target_train, 0.1)

In [90]:
model_dt = DecisionTreeClassifier(random_state=12345, max_depth=8)
model_dt.fit(features_downsampled, target_downsampled)
dt_valid_predictions = model_dt.predict(features_valid)
f1_score_dt = f1_score(target_valid, dt_valid_predictions)
print(f1_score_dt)

0.44267726971504306


In [91]:
model_rf = RandomForestClassifier(random_state=12345, n_estimators=30, max_depth=23)
model_rf.fit(features_downsampled, target_downsampled)
predicted_valid = model_rf.predict(features_valid)
print("F1:", f1_score(target_valid, predicted_valid))

F1: 0.4553054662379421


In [92]:
model_lr.fit(features_downsampled, target_downsampled)
lr_valid_predictions = model_lr.predict(features_valid)
f1_lr = f1_score(target_valid, lr_valid_predictions)
print(f1_lr)

0.3914209115281502
