In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_predict
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
df = pd.read_csv("customer_churn(in).csv")
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [3]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


# Explore & Pre-processing

In [4]:
filename = "customer_churn(in).csv"
df = pd.read_csv(filename)

df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [5]:
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

## train/test split

In [6]:
def split_Xy(df):
    
    X = df.drop(columns=["Churn"])
    X = X.select_dtypes(include=["number"])
    y = df["Churn"]
    return X,y
# split train / test on dataframe (protech data leakage)
r = int(0.7*len(df))
df_train = df[:r]
df_test = df[r:]

In [7]:
X_train, y_train = split_Xy(df_train)
X_test, y_test = split_Xy(df_test)
print("No",len( y_train[y_train == "No"]))
print("Yes",len( y_train[y_train == "Yes"]))

No 3641
Yes 1289


# Build Model , baseline

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [9]:
model = Pipeline([
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier())
])

param_grid = {
    "knn__n_neighbors": [1,3,5,7,9,11,13,15,17],
    "knn__weights": ["uniform", "distance"]
}
grid = GridSearchCV(model, param_grid, cv=5)

knn_baseline = grid.fit(X_train, y_train)
y_pred_baseline = knn_baseline.predict(X_test)

print(knn_baseline.best_params_)
print("Base line classification")
print(classification_report(y_test,y_pred_baseline))

{'knn__n_neighbors': 11, 'knn__weights': 'uniform'}
Base line classification
              precision    recall  f1-score   support

          No       0.82      0.90      0.86      1533
         Yes       0.64      0.46      0.54       580

    accuracy                           0.78      2113
   macro avg       0.73      0.68      0.70      2113
weighted avg       0.77      0.78      0.77      2113



# fixing size

In [10]:
# -- fixing size of No and Yes at 1000
n_yes = len(df_train[df_train["Churn"] == "Yes"])
n_no = len(df_train[df_train["Churn"] == "No"])

_min = min(n_yes, n_no)

yes_df_sample = df_train[df_train["Churn"] == "Yes"].sample(n=_min, random_state=42)
no_df_sample = df_train[df_train["Churn"] == "No"].sample(n=_min, random_state=42)
fix_size_df = pd.concat([yes_df_sample, no_df_sample], axis=0).reset_index(drop=True)

print(len( fix_size_df[fix_size_df["Churn"] == "Yes"]), len( fix_size_df[fix_size_df["Churn"] == "No"]))

X_fix_train, y_fix_train = split_Xy(fix_size_df)

knn_fixsize = grid.fit(X_fix_train, y_fix_train)

y_pred_fix = knn_fixsize.predict(X_test)

print(classification_report(y_test, y_pred_fix))

1289 1289
              precision    recall  f1-score   support

          No       0.89      0.71      0.79      1533
         Yes       0.49      0.76      0.60       580

    accuracy                           0.72      2113
   macro avg       0.69      0.73      0.69      2113
weighted avg       0.78      0.72      0.74      2113



# Over-sampling by duplication the minority class

In [11]:
yes_df = df_train[df_train['Churn'] == "Yes"]
dup_yes = pd.concat([yes_df, yes_df])


dup_clean_df = pd.concat([df, dup_yes])
print(len(dup_clean_df[dup_clean_df["Churn"] == "No"]))
print(len(dup_clean_df[dup_clean_df["Churn"] == "Yes"]))

X_over_samp, y_over_samp = split_Xy(dup_clean_df)

knn_oversampling = grid.fit(X_over_samp, y_over_samp)

y_pred_oversamp = knn_oversampling.predict(X_test)

print(classification_report(y_test, y_pred_oversamp))

5174
4447
              precision    recall  f1-score   support

          No       0.98      0.98      0.98      1533
         Yes       0.95      0.94      0.95       580

    accuracy                           0.97      2113
   macro avg       0.97      0.96      0.96      2113
weighted avg       0.97      0.97      0.97      2113



# Over-sampling by SMOTE

In [12]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)

print("Yes/No before smote")
print("No", len(y_train[y_train == "No"]))
print("Yes", len(y_train[y_train == "Yes"]))
X_train_SMOTE, y_train_SMOTE = smote.fit_resample(X_train, y_train)

print("Yes/No after smote")
print("No", len(y_train_SMOTE[y_train_SMOTE == "No"]))
print("Yes", len(y_train_SMOTE[y_train_SMOTE == "Yes"]))
knn_SMOTE = grid.fit(X_train_SMOTE, y_train_SMOTE)


y_pred_SMOTE = knn_SMOTE.predict(X_test)

print(classification_report(y_test, y_pred_SMOTE))

Yes/No before smote
No 3641
Yes 1289
Yes/No after smote
No 3641
Yes 3641
              precision    recall  f1-score   support

          No       0.83      0.75      0.79      1533
         Yes       0.48      0.61      0.54       580

    accuracy                           0.71      2113
   macro avg       0.66      0.68      0.66      2113
weighted avg       0.74      0.71      0.72      2113



# Compare 3 method

In [13]:
print(knn_baseline.best_params_)
print("Base line classification")
print(classification_report(y_test,y_pred_baseline))

{'knn__n_neighbors': 5, 'knn__weights': 'distance'}
Base line classification
              precision    recall  f1-score   support

          No       0.82      0.90      0.86      1533
         Yes       0.64      0.46      0.54       580

    accuracy                           0.78      2113
   macro avg       0.73      0.68      0.70      2113
weighted avg       0.77      0.78      0.77      2113



In [14]:
print(knn_fixsize.best_params_)
print("fix size")
print(classification_report(y_test,y_pred_fix))

{'knn__n_neighbors': 5, 'knn__weights': 'distance'}
fix size
              precision    recall  f1-score   support

          No       0.89      0.71      0.79      1533
         Yes       0.49      0.76      0.60       580

    accuracy                           0.72      2113
   macro avg       0.69      0.73      0.69      2113
weighted avg       0.78      0.72      0.74      2113



In [15]:
print(knn_oversampling.best_params_)
print("over sampling")
print(classification_report(y_test,y_pred_oversamp))

{'knn__n_neighbors': 5, 'knn__weights': 'distance'}
over sampling
              precision    recall  f1-score   support

          No       0.98      0.98      0.98      1533
         Yes       0.95      0.94      0.95       580

    accuracy                           0.97      2113
   macro avg       0.97      0.96      0.96      2113
weighted avg       0.97      0.97      0.97      2113



In [16]:
print(knn_SMOTE.best_params_)
print("SMOTE algorithm")
print(classification_report(y_test,y_pred_SMOTE))

{'knn__n_neighbors': 5, 'knn__weights': 'distance'}
SMOTE algorithm
              precision    recall  f1-score   support

          No       0.83      0.75      0.79      1533
         Yes       0.48      0.61      0.54       580

    accuracy                           0.71      2113
   macro avg       0.66      0.68      0.66      2113
weighted avg       0.74      0.71      0.72      2113

