In [57]:
# Import library
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer

In [17]:
customer_df = pd.read_csv("./Churn_Modelling.csv", index_col="RowNumber")

In [18]:
customer_df.head()

Unnamed: 0_level_0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [38]:
customer_df.Exited.value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [21]:
num_features = ['CreditScore','Age','Tenure','Balance','NumOfProducts', 'HasCrCard','IsActiveMember']
cat_features = ['Geography', 'Gender']

X = customer_df[num_features+cat_features]
y = customer_df.iloc[:,-1]

In [22]:
# Inputing numerical values with median
num_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy="median")),
    ('scaler', MinMaxScaler())
])

# Inputing missing values with most frequent one for categorical data
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features)
], remainder='passthrough')

In [52]:
new_X = preprocessor.fit_transform(X)

In [26]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)

In [58]:
lr = LogisticRegression()

def classification_report_with_f1_score(y_true, y_pred):
    print(classification_report(y_true, y_pred)) # print classification report
    return f1_score(y_true, y_pred) # return accuracy score

scores = cross_val_score(lr, new_X, y, cv=cv, scoring=make_scorer(classification_report_with_f1_score))
scores.mean()

              precision    recall  f1-score   support

           0       0.83      0.96      0.89      1592
           1       0.60      0.22      0.32       408

    accuracy                           0.81      2000
   macro avg       0.71      0.59      0.60      2000
weighted avg       0.78      0.81      0.77      2000

              precision    recall  f1-score   support

           0       0.83      0.96      0.89      1592
           1       0.57      0.21      0.31       408

    accuracy                           0.81      2000
   macro avg       0.70      0.58      0.60      2000
weighted avg       0.77      0.81      0.77      2000

              precision    recall  f1-score   support

           0       0.82      0.96      0.89      1593
           1       0.57      0.20      0.30       407

    accuracy                           0.81      2000
   macro avg       0.70      0.58      0.59      2000
weighted avg       0.77      0.81      0.77      2000

              preci

0.30839669018141025

- Although the accuracy ~ 82% for overall predictition
- Recall for class "1" ~ 20%, meaning that, we are only able to detect 20% of customer who want to exit (i.e: class 1)
- Also, F1 score is only around 30%

If we use `class_weight="balanced"`, we will provide the Logistic Regression with Balance Class

In [69]:
scores = cross_val_score(LogisticRegression(class_weight="balanced"), new_X, y, cv=cv, scoring='f1')
scores.mean()

0.4941147661454902

## Sampling Technique for Imbalance Class
### Under Sampling
- Random undersampling: involves randomly selecting examples from the majority class
- NearMiss: a collection of undersampling methods that select examples based on the distance of majority class examples to minority class examples
    - NearMiss-1: Majority class examples with minimum average distance to three closest minority class examples.
    - NearMiss-2: Majority class examples with minimum average distance to three furthest minority class examples.
    - NearMiss-3: Majority class examples with minimum distance to each minority class example.

In [71]:
from imblearn.under_sampling import RandomUnderSampler, NearMiss
rus = RandomUnderSampler()

X_under, y_under = rus.fit_resample(new_X, y)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)
scores = cross_val_score(lr, X_under, y_under, scoring='f1', cv=cv, n_jobs=-1)
scores.mean()

0.7034329225941041

In [72]:
nearmiss_sampling = NearMiss(version=2, n_neighbors=3)

X_under, y_under = nearmiss_sampling.fit_resample(new_X,y)

In [60]:
y_under.value_counts()

0    2037
1    2037
Name: Exited, dtype: int64

In [61]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)
scores = cross_val_score(lr, X_under, y_under, scoring='f1', cv=cv, n_jobs=-1)
scores.mean()

0.7352055438348664

In [64]:
for i in range(1,4):
    nearmiss_sampling = NearMiss(version=i, n_neighbors=3)
    X_under, y_under = nearmiss_sampling.fit_resample(new_X,y)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)
    scores = cross_val_score(lr, X_under, y_under, scoring='f1', cv=cv, n_jobs=-1)
    print(f"Version {i}: {round(scores.mean(),2)}")

Version 1: 0.7
Version 2: 0.74
Version 3: 0.65


In this case, **near-miss** version `2` has the best F1 score, and also better than RUS (Random Under Sampling)

### Oversampling 
- **Random Over Sampling**
- **SMOTE**: Synthetic (Generate) Minority Oversampling Technique

In [77]:
from imblearn.over_sampling import SMOTE, RandomOverSampler

smote = SMOTE()

X_over, y_over = smote.fit_resample(new_X, y)
y_over.value_counts()

1    7963
0    7963
Name: Exited, dtype: int64

In [78]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)
scores = cross_val_score(lr, X_over, y_over, scoring='f1', cv=cv, n_jobs=-1)
scores.mean()

0.7067188980308134

In [75]:
ros = RandomUnderSampler()
X_over, y_over = ros.fit_resample(new_X, y)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)
scores = cross_val_score(lr, X_over, y_over, scoring='f1', cv=cv, n_jobs=-1)
scores.mean()

0.694178429506676