In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from sklearn.dummy import DummyClassifier
import warnings
from pandas.core.common import SettingWithCopyWarning

In [2]:
# Suppress deprecation, future, and SettingWithCopy warnings since we're instructed to use an older sklearn library
for _noisy_category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_noisy_category)
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

## Import and Explore data:

In [3]:
# Import data from csv file
try:
    df_churn = pd.read_csv('./Churn.csv')
except FileNotFoundError:
    # Report the problem, then re-raise: carrying on would only fail on the next
    # line with a misleading NameError because df_churn was never assigned.
    print("File(s) not found, please check file path(s) are correct")
    raise
df_churn.head(5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2.0,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8.0,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1.0,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.1,0


In [4]:
# Summarize column dtypes and non-null counts to spot missing values
df_churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           9091 non-null   float64
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(3), int64(8), object(3)
memory usage: 1.1+ MB


The Tenure column is missing about 9% of its values (909 of 10,000 rows), so we need to decide how best to handle those rows.

In [5]:
# Remove the RowNumber column (a running row counter in the preview above)
df_churn = df_churn.drop(columns=['RowNumber'])

In [6]:
# Remove the CustomerId identifier column
df_churn = df_churn.drop(columns=['CustomerId'])

In [7]:
# Remove the Surname column
df_churn = df_churn.drop(columns=['Surname'])

In [8]:
# Normalize all column names to lowercase
df_churn.columns = df_churn.columns.str.lower()

In [9]:
# Count fully duplicated rows (identical values in every remaining column)
df_churn.duplicated().sum()

0

In [10]:
# Fill the missing tenure values with 0
# NOTE(review): this makes "unknown tenure" indistinguishable from a real tenure of 0 - confirm this imputation is intended
df_churn['tenure'] = df_churn['tenure'].fillna(value=0)

In [11]:
# Convert tenure from float64 to int64 (possible now that the nulls are filled)
df_churn['tenure'] = df_churn['tenure'].astype(np.int64)

In [12]:
# Re-check dtypes and non-null counts after dropping columns and filling tenure
df_churn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   creditscore      10000 non-null  int64  
 1   geography        10000 non-null  object 
 2   gender           10000 non-null  object 
 3   age              10000 non-null  int64  
 4   tenure           10000 non-null  int64  
 5   balance          10000 non-null  float64
 6   numofproducts    10000 non-null  int64  
 7   hascrcard        10000 non-null  int64  
 8   isactivemember   10000 non-null  int64  
 9   estimatedsalary  10000 non-null  float64
 10  exited           10000 non-null  int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 859.5+ KB


In [13]:
# Check class balance or imbalance: count rows per value of the 'exited' target
df_churn['exited'].value_counts()

0    7963
1    2037
Name: exited, dtype: int64

There's imbalance in the exited column where only 20% of the rows are for customers who left the bank and 80% are for current customers

## Preprocessing and Splitting Data:

In [14]:
# Use one-hot encoding to transform object columns to numeric for use in classification models.
# drop_first=True omits the first category of each encoded column, so k categories become
# k-1 indicator columns (the omitted category is implied when all indicators are 0).
df_churn_ohe = pd.get_dummies(df_churn, drop_first=True)

# Display the encoded frame
df_churn_ohe

Unnamed: 0,creditscore,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited,geography_Germany,geography_Spain,gender_Male
0,619,42,2,0.00,1,1,1,101348.88,1,0,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,1,0
2,502,42,8,159660.80,3,1,0,113931.57,1,0,0,0
3,699,39,1,0.00,2,0,0,93826.63,0,0,0,0
4,850,43,2,125510.82,1,1,1,79084.10,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,39,5,0.00,2,1,0,96270.64,0,0,0,1
9996,516,35,10,57369.61,1,1,1,101699.77,0,0,0,1
9997,709,36,7,0.00,1,0,1,42085.58,1,0,0,0
9998,772,42,3,75075.31,2,1,0,92888.52,1,1,0,1


In [15]:
# Separate the target column from the predictors
target = df_churn_ohe['exited']
features = df_churn_ohe.drop(columns=['exited'])

# First split: 60% training, 40% held out for validation & testing
features_train, features_valid_test, target_train, target_valid_test = train_test_split(
    features,
    target,
    test_size=0.4,
    random_state=12345,
)

# Second split: halve the held-out part, so validation and test are each 20% of the original dataset
features_valid, features_test, target_valid, target_test = train_test_split(
    features_valid_test,
    target_valid_test,
    test_size=0.5,
    random_state=12345,
)

In [16]:
# Scale numerical columns using sklearn StandardScaler
numeric_cols = ['age', 'tenure','creditscore', 'balance', 'estimatedsalary']

# The frames returned by train_test_split are subsets of another DataFrame, and pandas
# flags column assignment on them with SettingWithCopyWarning. Take explicit copies so
# the assignments below write to independent frames.
features_train = features_train.copy()
features_valid = features_valid.copy()
features_test = features_test.copy()

# Fit on the training split only, so validation/test statistics do not leak into the scaler
scaler = StandardScaler()
scaler.fit(features_train[numeric_cols])

# NOTE: this cell overwrites its inputs; re-run the split cell before re-running it,
# otherwise already-scaled data is scaled again.
features_train[numeric_cols] = scaler.transform(features_train[numeric_cols])
features_valid[numeric_cols] = scaler.transform(features_valid[numeric_cols])
features_test[numeric_cols] = scaler.transform(features_test[numeric_cols])

## Model Training with Imbalanced Classes:

### Logistic Regression Model:

In [17]:
# Baseline: logistic regression trained on the original (imbalanced) training set
model = LogisticRegression(solver='liblinear', random_state=12345)
model.fit(features_train, target_train)

# Positive-class probabilities feed AUC-ROC; hard predictions feed F1 and accuracy
probabilities_valid = model.predict_proba(features_valid)
probabilities_one_valid = probabilities_valid[:, 1]
predicted_valid = model.predict(features_valid)
auc_roc = roc_auc_score(y_true=target_valid, y_score=probabilities_one_valid)

print('F1 Score =', f1_score(y_true=target_valid, y_pred=predicted_valid))
print('AUC-ROC =', auc_roc)
print('Accuracy Score =', accuracy_score(y_true=target_valid, y_pred=predicted_valid))

F1 Score = 0.33389544688026984
AUC-ROC = 0.7585879420998192
Accuracy Score = 0.8025


In [18]:
# Sanity check: a constant classifier that always predicts the most frequent training class
dummy_majority_clf = DummyClassifier(strategy='most_frequent').fit(features_train, target_train)
majority_predictions = dummy_majority_clf.predict(features_valid)

print('F1 Score =', f1_score(y_true=target_valid, y_pred=majority_predictions))
# The hard class labels are passed as the score for AUC-ROC here
print('AUC-ROC =', roc_auc_score(y_true=target_valid, y_score=majority_predictions))
print('Accuracy Score =', accuracy_score(y_true=target_valid, y_pred=majority_predictions))

F1 Score = 0.0
AUC-ROC = 0.5
Accuracy Score = 0.791


The logistic regression model is better than the constant dummy model; however, its F1 score is low and below the target of 0.59.

### Decision Tree Model:

In [19]:
# Baseline decision tree (default hyperparameters) on the imbalanced training set
model = DecisionTreeClassifier(random_state=12345).fit(features_train, target_train)

# Score the validation set: hard labels for F1/accuracy, class-1 probabilities for AUC-ROC
predicted_valid = model.predict(features_valid)
probabilities_valid = model.predict_proba(features_valid)
probabilities_one_valid = probabilities_valid[:, 1]
auc_roc = roc_auc_score(y_true=target_valid, y_score=probabilities_one_valid)

print('F1 Score =', f1_score(y_true=target_valid, y_pred=predicted_valid))
print('AUC-ROC =', auc_roc)
print('Accuracy Score =', accuracy_score(y_true=target_valid, y_pred=predicted_valid))

F1 Score = 0.47990255785627284
AUC-ROC = 0.6705384741015855
Accuracy Score = 0.7865


Decision Tree model has a much better F1 score than logistic regression but a lower AUC-ROC due to overfitting. Model accuracy slightly worse than constant model

### Random Forest Model:

In [20]:
# Baseline random forest with 5 trees on the imbalanced training set
model = RandomForestClassifier(n_estimators=5, random_state=12345)
model.fit(features_train, target_train)

# Class-1 probabilities are the second column of predict_proba
probabilities_valid = model.predict_proba(features_valid)
probabilities_one_valid = probabilities_valid[:, 1]
predicted_valid = model.predict(features_valid)
auc_roc = roc_auc_score(y_true=target_valid, y_score=probabilities_one_valid)

print('F1 Score =', f1_score(y_true=target_valid, y_pred=predicted_valid))
print('AUC-ROC =', auc_roc)
print('Accuracy Score =', accuracy_score(y_true=target_valid, y_pred=predicted_valid))

F1 Score = 0.544943820224719
AUC-ROC = 0.7915068745879179
Accuracy Score = 0.838


Random Forest model has a much better F1 score and AUC-ROC score than logistic regression and Decision Tree and it also has noticeably better accuracy compared to the constant model

## Model Training with Balanced Classes:

### Logistic Regression Model:

In [21]:
# Balance classes via the class_weight parameter of LogisticRegression
model = LogisticRegression(
    class_weight='balanced',
    solver='liblinear',
    random_state=12345,
).fit(features_train, target_train)

# Evaluate on the validation set
predicted_valid = model.predict(features_valid)
probabilities_valid = model.predict_proba(features_valid)
probabilities_one_valid = probabilities_valid[:, 1]
auc_roc = roc_auc_score(y_true=target_valid, y_score=probabilities_one_valid)

print('F1 Score =', f1_score(y_true=target_valid, y_pred=predicted_valid))
print('AUC-ROC =', auc_roc)
print('Accuracy Score =', accuracy_score(y_true=target_valid, y_pred=predicted_valid))

F1 Score = 0.4888888888888888
AUC-ROC = 0.7635631718072333
Accuracy Score = 0.701


In [22]:
# upsample function
def upsample(features, target, repeat, random_state=12345):
    """Oversample the positive class by repeating its rows, then shuffle.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature rows, index-aligned with ``target``.
    target : pandas.Series
        Labels; rows where the value is 1 are repeated, rows where it is 0
        are kept once.
    repeat : int
        How many copies of the positive-class rows appear in the result.
        With ``repeat=1`` the data is only shuffled.
    random_state : int, optional
        Seed forwarded to ``sklearn.utils.shuffle``. Defaults to 12345, the
        value that was previously hard-coded, so existing calls behave the same.

    Returns
    -------
    tuple
        ``(features_upsampled, target_upsampled)``, both shuffled with the
        same permutation so rows and labels stay aligned.
    """
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]

    # One copy of the negatives followed by `repeat` copies of the positives
    features_upsampled = pd.concat([features_zeros] + [features_ones] * repeat)
    target_upsampled = pd.concat([target_zeros] + [target_ones] * repeat)

    # Shuffle so the repeated positives are not grouped at the end
    features_upsampled, target_upsampled = shuffle(
        features_upsampled, target_upsampled, random_state=random_state
    )

    return features_upsampled, target_upsampled

In [23]:
# Balance classes using upsampling: repeat the positive class 1 to 9 times and compare F1
for factor in range(1, 10):
    features_upsampled, target_upsampled = upsample(features_train, target_train, factor)

    # Plain (unweighted) logistic regression, so the only balancing comes from upsampling
    model = LogisticRegression(solver='liblinear', random_state=12345)
    model.fit(features_upsampled, target_upsampled)
    predicted_valid = model.predict(features_valid)

    print('Upsampling Factor:', factor, 'F1:', f1_score(y_true=target_valid, y_pred=predicted_valid))


Upsampling Factor: 1 F1: 0.33389544688026984
Upsampling Factor: 2 F1: 0.4672435105067985
Upsampling Factor: 3 F1: 0.5014985014985014
Upsampling Factor: 4 F1: 0.4888888888888888
Upsampling Factor: 5 F1: 0.483969465648855
Upsampling Factor: 6 F1: 0.4688796680497925
Upsampling Factor: 7 F1: 0.4544875875238701
Upsampling Factor: 8 F1: 0.4409826243259437
Upsampling Factor: 9 F1: 0.4297994269340974


Upsampling (best F1 ≈ 0.50 at a factor of 3) and class-weight adjustment (F1 ≈ 0.49) gave similar improvements in F1 score, but both are still below the minimum target of 0.59

In [24]:
# Custom threshold: sweep decision thresholds for the class-weighted logistic regression
model = LogisticRegression(class_weight='balanced', solver='liblinear', random_state=12345)
model.fit(features_train, target_train)

probabilities_valid = model.predict_proba(features_valid)
probabilities_one_valid = probabilities_valid[:, 1]

for threshold in np.arange(0, 1, 0.02):
    # Predict the positive class only when its probability exceeds the threshold
    predicted_valid = probabilities_one_valid > threshold
    print('Threshold:', threshold, 'F1:', f1_score(y_true=target_valid, y_pred=predicted_valid))

Threshold: 0.0 F1: 0.3457402812241522
Threshold: 0.02 F1: 0.3457402812241522
Threshold: 0.04 F1: 0.3458833264377327
Threshold: 0.06 F1: 0.34717607973421927
Threshold: 0.08 F1: 0.3496445002091176
Threshold: 0.1 F1: 0.3546881629189648
Threshold: 0.12 F1: 0.3593952483801296
Threshold: 0.14 F1: 0.36807095343680707
Threshold: 0.16 F1: 0.3747731397459165
Threshold: 0.18 F1: 0.3797585886722377
Threshold: 0.2 F1: 0.38755980861244016
Threshold: 0.22 F1: 0.396235760277365
Threshold: 0.24 F1: 0.402246043899949
Threshold: 0.26 F1: 0.4083989501312337
Threshold: 0.28 F1: 0.4154013015184382
Threshold: 0.3 F1: 0.4225824482951369
Threshold: 0.32 F1: 0.4316463059918557
Threshold: 0.34 F1: 0.4401683704149129
Threshold: 0.36 F1: 0.45226130653266333
Threshold: 0.38 F1: 0.462853385930309
Threshold: 0.4 F1: 0.47010309278350515
Threshold: 0.42 F1: 0.474964234620887
Threshold: 0.44 F1: 0.4875846501128669
Threshold: 0.46 F1: 0.48503937007874015
Threshold: 0.48 F1: 0.48519736842105265
Threshold: 0.5 F1: 0.488888

Minimal improvement in F1 score from 0.488 to 0.5 from increasing threshold to 0.62 for Logistic Regression

### Decision Tree Model:

In [25]:
# Decision tree with balanced class weights
model = DecisionTreeClassifier(class_weight='balanced', random_state=12345)
model.fit(features_train, target_train)

# Validation-set predictions and class-1 probabilities
probabilities_valid = model.predict_proba(features_valid)
probabilities_one_valid = probabilities_valid[:, 1]
predicted_valid = model.predict(features_valid)
auc_roc = roc_auc_score(y_true=target_valid, y_score=probabilities_one_valid)

print('F1 Score =', f1_score(y_true=target_valid, y_pred=predicted_valid))
print('AUC-ROC =', auc_roc)
print('Accuracy Score =', accuracy_score(y_true=target_valid, y_pred=predicted_valid))

F1 Score = 0.45622688039457454
AUC-ROC = 0.6555522958643591
Accuracy Score = 0.7795


Barely any change compared to Decision Tree model trained on imbalanced classes

In [26]:
# Custom threshold: sweep decision thresholds for the model fitted in the previous cell
# (the class-weighted decision tree when the notebook is run in order)
probabilities_valid = model.predict_proba(features_valid)
probabilities_one_valid = probabilities_valid[:, 1]

for threshold in np.arange(0, 0.46, 0.02):
    # Positive prediction only when the class-1 probability exceeds the threshold
    predicted_valid = probabilities_one_valid > threshold
    print('Threshold:', threshold, 'F1:', f1_score(y_true=target_valid, y_pred=predicted_valid))

Threshold: 0.0 F1: 0.45622688039457454
Threshold: 0.02 F1: 0.45622688039457454
Threshold: 0.04 F1: 0.45622688039457454
Threshold: 0.06 F1: 0.45622688039457454
Threshold: 0.08 F1: 0.45622688039457454
Threshold: 0.1 F1: 0.45622688039457454
Threshold: 0.12 F1: 0.45622688039457454
Threshold: 0.14 F1: 0.45622688039457454
Threshold: 0.16 F1: 0.45622688039457454
Threshold: 0.18 F1: 0.45622688039457454
Threshold: 0.2 F1: 0.45622688039457454
Threshold: 0.22 F1: 0.45622688039457454
Threshold: 0.24 F1: 0.45622688039457454
Threshold: 0.26 F1: 0.45622688039457454
Threshold: 0.28 F1: 0.45622688039457454
Threshold: 0.3 F1: 0.45622688039457454
Threshold: 0.32 F1: 0.45622688039457454
Threshold: 0.34 F1: 0.45622688039457454
Threshold: 0.36 F1: 0.45622688039457454
Threshold: 0.38 F1: 0.45622688039457454
Threshold: 0.4 F1: 0.45622688039457454
Threshold: 0.42 F1: 0.45622688039457454
Threshold: 0.44 F1: 0.45622688039457454


Custom threshold doesn't make any difference in Decision Tree Classifier model f1 score

In [27]:
# Hyperparameter tuning: evaluate class-weighted decision trees with max_depth 1 through 9
for depth in range(1, 10):
    model = DecisionTreeClassifier(max_depth=depth, class_weight='balanced', random_state=12345)
    model.fit(features_train, target_train)

    predicted_valid = model.predict(features_valid)
    probabilities_valid = model.predict_proba(features_valid)
    probabilities_one_valid = probabilities_valid[:, 1]

    print(
        'max_depth =', depth, ':',
        'F1:', f1_score(target_valid, predicted_valid),
        'AUC-ROC =', roc_auc_score(target_valid, probabilities_one_valid),
    )

max_depth = 1 : F1: 0.4994903160040775 AUC-ROC = 0.6925565119556736
max_depth = 2 : F1: 0.541015625 AUC-ROC = 0.7501814673449512
max_depth = 3 : F1: 0.541015625 AUC-ROC = 0.7980472601455368
max_depth = 4 : F1: 0.5277777777777778 AUC-ROC = 0.8190853743368881
max_depth = 5 : F1: 0.5963791267305644 AUC-ROC = 0.8310244134068074
max_depth = 6 : F1: 0.5581835383159887 AUC-ROC = 0.7999473744699641
max_depth = 7 : F1: 0.5559999999999999 AUC-ROC = 0.7937290934496337
max_depth = 8 : F1: 0.5401174168297456 AUC-ROC = 0.7745230433283531
max_depth = 9 : F1: 0.5338567222767419 AUC-ROC = 0.7652984532933299


The Decision Tree Classifier with balanced class weights and a max depth of 5 achieved the best F1 score (0.596, about 0.6) and also the highest AUC-ROC score (0.831) among the depths tested

### Random Forest Model:

In [28]:
# Random forest (5 trees) with balanced class weights
model = RandomForestClassifier(
    n_estimators=5,
    class_weight='balanced',
    random_state=12345,
).fit(features_train, target_train)

# Validation-set predictions and class-1 probabilities
predicted_valid = model.predict(features_valid)
probabilities_valid = model.predict_proba(features_valid)
probabilities_one_valid = probabilities_valid[:, 1]
auc_roc = roc_auc_score(y_true=target_valid, y_score=probabilities_one_valid)

print('F1 Score =', f1_score(y_true=target_valid, y_pred=predicted_valid))
print('AUC-ROC =', auc_roc)
print('Accuracy Score =', accuracy_score(y_true=target_valid, y_pred=predicted_valid))

F1 Score = 0.5393258426966292
AUC-ROC = 0.7842156678905631
Accuracy Score = 0.836


In [29]:
# Hyperparameter Tuning: pick the forest size (1-99 trees) with the best validation F1
best_F1 = 0
best_est = 0
for est in range(1, 100): # choose hyperparameter range
    model = RandomForestClassifier(random_state=12345, n_estimators=est, class_weight = 'balanced') # set number of trees
    model.fit(features_train,target_train)
    predicted_valid = model.predict(features_valid)
    # Score each candidate once and reuse the value. Probabilities and AUC-ROC are not
    # needed to choose n_estimators, so they are no longer computed in this loop.
    current_F1 = f1_score(target_valid, predicted_valid)
    if current_F1 > best_F1:
        best_F1 = current_F1
        best_est = est
    # Print loading pattern
    print("Tuning in progress" + "."*(est%4), end="\r")

print('Best n_estimators=', best_est, 'F1 =', best_F1)

Best n_estimators= 85 F1 = 0.5657492354740061


Hyperparameter tuning on the validation set shows that n_estimators = 85 yields the highest F1 score

In [30]:
# Random Forest with n_estimators = 85 and balanced class weights
model = RandomForestClassifier(n_estimators=85, class_weight='balanced', random_state=12345)
model.fit(features_train, target_train)

# Class-1 probabilities are the second column of predict_proba
probabilities_valid = model.predict_proba(features_valid)
probabilities_one_valid = probabilities_valid[:, 1]
predicted_valid = model.predict(features_valid)
auc_roc = roc_auc_score(y_true=target_valid, y_score=probabilities_one_valid)

print('F1 Score =', f1_score(y_true=target_valid, y_pred=predicted_valid))
print('AUC-ROC =', auc_roc)
print('Accuracy Score =', accuracy_score(y_true=target_valid, y_pred=predicted_valid))

F1 Score = 0.5657492354740061
AUC-ROC = 0.8380237601243656
Accuracy Score = 0.858


In [31]:
# Custom threshold: sweep decision thresholds for the model fitted in the previous cell
probabilities_valid = model.predict_proba(features_valid)
probabilities_one_valid = probabilities_valid[:, 1]
# AUC-ROC depends only on the probabilities, not on the threshold, so compute it once
# instead of on every loop iteration.
auc_roc = roc_auc_score(target_valid, probabilities_one_valid)

for threshold in np.arange(0, 1, 0.02):
    # Predict the positive class only when its probability exceeds the threshold
    predicted_valid = probabilities_one_valid > threshold
    print('Threshold:', threshold, 'F1:', f1_score(target_valid,predicted_valid))

Threshold: 0.0 F1: 0.36339754816112085
Threshold: 0.02 F1: 0.3800652072659525
Threshold: 0.04 F1: 0.41379310344827586
Threshold: 0.06 F1: 0.44470314318975557
Threshold: 0.08 F1: 0.4604141291108404
Threshold: 0.1 F1: 0.48626925653047554
Threshold: 0.12 F1: 0.5138686131386861
Threshold: 0.14 F1: 0.5289634146341463
Threshold: 0.16 F1: 0.5410235580828594
Threshold: 0.18 F1: 0.5583405358686258
Threshold: 0.2 F1: 0.5759849906191369
Threshold: 0.22 F1: 0.5890410958904109
Threshold: 0.24 F1: 0.5960539979231567
Threshold: 0.26 F1: 0.6152173913043479
Threshold: 0.28 F1: 0.6202247191011236
Threshold: 0.3 F1: 0.6193853427895981
Threshold: 0.32 F1: 0.6172539489671932
Threshold: 0.34 F1: 0.6159509202453988
Threshold: 0.36 F1: 0.6101265822784809
Threshold: 0.38 F1: 0.6081258191349934
Threshold: 0.4 F1: 0.5997286295793759
Threshold: 0.42 F1: 0.5989010989010989
Threshold: 0.44 F1: 0.5940594059405939
Threshold: 0.46 F1: 0.5830903790087464
Threshold: 0.48 F1: 0.5663189269746647
Threshold: 0.5 F1: 0.56574

Best Random Forest Classifier model has F1 score of 0.62 with balanced class weights and custom threshold of 0.28

In [32]:
# Final model testing using the test set, applying the custom threshold of 0.28
probabilities_test = model.predict_proba(features_test)
probabilities_one_test = probabilities_test[:, 1]
auc_roc = roc_auc_score(y_true=target_test, y_score=probabilities_one_test)
# Positive prediction only when the class-1 probability exceeds 0.28
predicted_test = probabilities_one_test > 0.28

print('F1 Score =', f1_score(y_true=target_test, y_pred=predicted_test))
print('AUC-ROC =', auc_roc)
print('Accuracy Score =', accuracy_score(y_true=target_test, y_pred=predicted_test))

F1 Score = 0.6283185840707965
AUC-ROC = 0.8544053031836192
Accuracy Score = 0.832


## Conclusions:

- Logistic Regression model had the lowest F1 score prior to balancing classes
- Random Forest model performed best among the models prior to balancing classes
- Decision Tree model doesn't benefit from threshold adjustment
- F1 score is more sensitive to class imbalance than AUC-ROC score. The improvement in F1 score after balancing classes and adjusting thresholds is significantly more than AUC-ROC.
- The Random Forest model with n_estimators = 85, balanced class weights, and a custom threshold of 0.28 has the best combination of F1, AUC-ROC score, and accuracy compared to others and the constant model. This is the recommended model to use to predict which bank customers are likely to leave.