In [187]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix

In [188]:
df = pd.read_csv("customer_churn(in).csv")
print(df.describe())
print(df.columns)

       SeniorCitizen       tenure  MonthlyCharges
count    7043.000000  7043.000000     7043.000000
mean        0.162147    32.371149       64.761692
std         0.368612    24.559481       30.090047
min         0.000000     0.000000       18.250000
25%         0.000000     9.000000       35.500000
50%         0.000000    29.000000       70.350000
75%         0.000000    55.000000       89.850000
max         1.000000    72.000000      118.750000
Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')


## About Dataset

### Context
"Predict behavior to retain customers. You can analyze all relevant customer data and develop focused customer retention programs." [IBM Sample Data Sets]

### Content
Each row represents a customer, each column contains customer's attributes described on the column

- Customers who left within the last month – the column is called Churn

- Services that each customer has signed up for – phone, multiple lines, internet, online security, online backup, device protection, tech support, and streaming TV and movies

- Customer account information – how long they’ve been a customer, contract, payment method, paperless billing, monthly charges, and total charges

- Demographic info about customers – gender, age range, and if they have partners and dependents


# Explore & Pre-processing

In [189]:
filename = "customer_churn(in).csv"
df = pd.read_csv(filename)

print(df.describe())
df

       SeniorCitizen       tenure  MonthlyCharges
count    7043.000000  7043.000000     7043.000000
mean        0.162147    32.371149       64.761692
std         0.368612    24.559481       30.090047
min         0.000000     0.000000       18.250000
25%         0.000000     9.000000       35.500000
50%         0.000000    29.000000       70.350000
75%         0.000000    55.000000       89.850000
max         1.000000    72.000000      118.750000


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [190]:
df.isnull().sum()

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [191]:
clean_df = df.dropna()

def split_Xy(df):
    X = df.drop(columns=["Churn"])
    X = X.select_dtypes(include=["number"])
    y = df["Churn"]
    return X,y

X,y = split_Xy(clean_df)

# Build Model

In [192]:
model = KNeighborsClassifier(n_neighbors=5)
y_pred = cross_val_predict(model, X, y, cv=5)

print(classification_report(y_pred, y))

              precision    recall  f1-score   support

          No       0.88      0.82      0.85      5583
         Yes       0.46      0.59      0.51      1460

    accuracy                           0.77      7043
   macro avg       0.67      0.70      0.68      7043
weighted avg       0.80      0.77      0.78      7043



f1-score ของ 'YES' ต่ำมากซึ่งน่าจะมีเหตุผลมาจากการที่ data imbalance ที่ 'YES' เป็น minority class

# fixing size

In [193]:
# -- fixing size of No and Yes at 1000
clean_df_sample = clean_df.sample(n=1000, random_state=42)
X_fix,y_fix = split_Xy(clean_df_sample)

y_pred = cross_val_predict(model, X_fix, y_fix, cv=5)
print(classification_report(y_pred,y_fix))

              precision    recall  f1-score   support

          No       0.87      0.83      0.85       775
         Yes       0.50      0.59      0.54       225

    accuracy                           0.78      1000
   macro avg       0.69      0.71      0.70      1000
weighted avg       0.79      0.78      0.78      1000



# Over-sampling by duplication the minority class

In [194]:
yes_df = clean_df[clean_df['Churn'] == "Yes"]
dup_yes = pd.concat([yes_df, yes_df])

dup_clean_df = pd.concat([clean_df, dup_yes])

X, y = split_Xy(dup_clean_df)

y_pred = cross_val_predict(model, X_fix, y_fix, cv=5)
print(classification_report(y_pred,y_fix))

              precision    recall  f1-score   support

          No       0.87      0.83      0.85       775
         Yes       0.50      0.59      0.54       225

    accuracy                           0.78      1000
   macro avg       0.69      0.71      0.70      1000
weighted avg       0.79      0.78      0.78      1000



# Over-sampling by SMOTE

In [195]:
from imblearn.over_sampling import SMOTE


smote = SMOTE(random_state=42)
X_train_SMOTE, y_train_SMOTE = smote.fit_resample(X, y)

knn = KNeighborsClassifier(n_neighbors=5)
y_pred_SMOTE = cross_val_predict(knn, X_train_SMOTE, y_train_SMOTE, cv=5)

In [196]:
knn = KNeighborsClassifier(n_neighbors=5)
y_pred_SMOTE = cross_val_predict(knn, X_train_SMOTE, y_train_SMOTE, cv=5)

In [197]:
# Output results
print("Classification Report (KNN with PCA, no pipeline):")
print(classification_report(y_train_SMOTE, y_pred_SMOTE))

print("Confusion Matrix:")
print(confusion_matrix(y_train_SMOTE, y_pred_SMOTE))

Classification Report (KNN with PCA, no pipeline):
              precision    recall  f1-score   support

          No       0.77      0.71      0.74      5607
         Yes       0.73      0.79      0.76      5607

    accuracy                           0.75     11214
   macro avg       0.75      0.75      0.75     11214
weighted avg       0.75      0.75      0.75     11214

Confusion Matrix:
[[3962 1645]
 [1188 4419]]


จากการลองทำนายด้วย knn โดยที่ไม่ผ่านการ sampling ข้อมูลได้ accuracy อยู่ที่ 0.77 แต่ f1-score ของ minority data ต่ำมากอยู่ที่ 0.51
การลอง pre-processing data ด้วยการ fixing size, duplicate minority class, SMOTE ได้ผลว่าการใช้ fixing size และ duplicate minority ไม่ช่วยให้ f1-score ของ minority class ดีขึ้นเนื่องจากผลลัพค่า f1 ยังคงอยู่ในช่วง 0.5 แต่การทำ SMOTE ช่วยให้ f1-score ของ minority class เพิ่มขึ้นมาเป็น 0.76 แต่ accuracy โดยรวมจะลดลงนิดหน่อย