Import libraries 

In [2]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import optuna
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import shap
from lime.lime_tabular import LimeTabularExplainer
import time

Load dataset

In [3]:
df = pd.read_csv('Telco_customer_churn.csv')
df.head()

Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,...,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Score,CLTV,Churn Reason
0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,...,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,86,3239,Competitor made better offer
1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,Female,...,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,67,2701,Moved
2,9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,Female,...,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,1,86,5372,Moved
3,7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,Female,...,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes,1,84,5003,Moved
4,0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,Male,...,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,Yes,1,89,5340,Competitor had better devices


EDA - Check for and handle missing values

In [4]:
df.loc[(df['Total Charges'] == ' '), 'Total Charges'] = 0
df['Total Charges'] = pd.to_numeric(df['Total Charges'])
df.dtypes

CustomerID            object
Count                  int64
Country               object
State                 object
City                  object
Zip Code               int64
Lat Long              object
Latitude             float64
Longitude            float64
Gender                object
Senior Citizen        object
Partner               object
Dependents            object
Tenure Months          int64
Phone Service         object
Multiple Lines        object
Internet Service      object
Online Security       object
Online Backup         object
Device Protection     object
Tech Support          object
Streaming TV          object
Streaming Movies      object
Contract              object
Paperless Billing     object
Payment Method        object
Monthly Charges      float64
Total Charges        float64
Churn Label           object
Churn Value            int64
Churn Score            int64
CLTV                   int64
Churn Reason          object
dtype: object

Data preprocessing and dropping non-essential columns

In [5]:
df.drop(['CustomerID', 'State', 'Country', 'Count', 'Lat Long', 'Churn Label', 'Churn Score', 'CLTV', 'Churn Reason'], axis=1, inplace=True)


Replace white spaces with underscores

In [6]:

df.columns = df.columns.str.replace(' ', '_')
df.replace(' ', '_', regex=True, inplace=True)


Prepare features and target


In [7]:
X = df.drop('Churn_Value', axis=1).copy()
y = df['Churn_Value'].copy()

 One-hot encoding

In [8]:

X_encoded = pd.get_dummies(X, columns=[
    'City', 'Gender', 'Senior_Citizen', 'Partner', 'Dependents', 
    'Phone_Service', 'Multiple_Lines', 'Internet_Service',
    'Online_Security', 'Online_Backup', 'Device_Protection',
    'Tech_Support', 'Streaming_TV', 'Streaming_Movies',
    'Contract', 'Paperless_Billing', 'Payment_Method'
])

Split into training and testing sets

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42, stratify=y)


Define the Optuna objective function

In [10]:

def objective(trial):
    smote_sampling_strategy = trial.suggest_uniform("sampling_strategy", 0.5, 1.0)
    smote_k_neighbors = trial.suggest_int("k_neighbors", 3, 10)
    
    xgb_n_estimators = trial.suggest_int("n_estimators", 50, 300)
    xgb_learning_rate = trial.suggest_loguniform("learning_rate", 0.01, 0.3)
    xgb_max_depth = trial.suggest_int("max_depth", 3, 20)
    xgb_subsample = trial.suggest_uniform("subsample", 0.6, 1.0)
    xgb_colsample_bytree = trial.suggest_uniform("colsample_bytree", 0.6, 1.0)
    
    smote = SMOTE(sampling_strategy=smote_sampling_strategy, k_neighbors=smote_k_neighbors, random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    
    xgb_model = XGBClassifier(
        n_estimators=xgb_n_estimators,
        learning_rate=xgb_learning_rate,
        max_depth=xgb_max_depth,
        subsample=xgb_subsample,
        colsample_bytree=xgb_colsample_bytree,
        random_state=42,
        eval_metric='logloss'
    )
    xgb_model.fit(X_train_resampled, y_train_resampled)
    y_pred = xgb_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy


Run the Optuna study

In [11]:

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)
print("Best parameters:", study.best_params)


[I 2025-01-14 15:16:16,275] A new study created in memory with name: no-name-2509f39f-d838-4b2a-8ca7-dd62a6630276
  smote_sampling_strategy = trial.suggest_uniform("sampling_strategy", 0.5, 1.0)
  xgb_learning_rate = trial.suggest_loguniform("learning_rate", 0.01, 0.3)
  xgb_subsample = trial.suggest_uniform("subsample", 0.6, 1.0)
  xgb_colsample_bytree = trial.suggest_uniform("colsample_bytree", 0.6, 1.0)
[I 2025-01-14 15:16:20,835] Trial 0 finished with value: 0.7934705464868701 and parameters: {'sampling_strategy': 0.9289286637265602, 'k_neighbors': 8, 'n_estimators': 281, 'learning_rate': 0.07086021227405134, 'max_depth': 3, 'subsample': 0.7133131134418179, 'colsample_bytree': 0.8688769118714237}. Best is trial 0 with value: 0.7934705464868701.
  smote_sampling_strategy = trial.suggest_uniform("sampling_strategy", 0.5, 1.0)
  xgb_learning_rate = trial.suggest_loguniform("learning_rate", 0.01, 0.3)
  xgb_subsample = trial.suggest_uniform("subsample", 0.6, 1.0)
  xgb_colsample_bytree

Best parameters: {'sampling_strategy': 0.8536176465594532, 'k_neighbors': 6, 'n_estimators': 183, 'learning_rate': 0.014091842595230886, 'max_depth': 13, 'subsample': 0.7510702511586398, 'colsample_bytree': 0.7574647114519422}


 Apply SMOTE with best parameters



In [12]:
best_smote = SMOTE(sampling_strategy=study.best_params["sampling_strategy"], k_neighbors=study.best_params["k_neighbors"], random_state=42)
X_train_best, y_train_best = best_smote.fit_resample(X_train, y_train)


In [13]:
# Train final XGBoost model
best_xgb_model = XGBClassifier(
    n_estimators=study.best_params["n_estimators"],
    learning_rate=study.best_params["learning_rate"],
    max_depth=study.best_params["max_depth"],
    subsample=study.best_params["subsample"],
    colsample_bytree=study.best_params["colsample_bytree"],
    random_state=42,
    eval_metric='logloss'
)
best_xgb_model.fit(X_train_best, y_train_best)



 Evaluate the model


In [14]:
y_pred_best = best_xgb_model.predict(X_test)
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_best))
print("\nClassification Report:\n", classification_report(y_test, y_pred_best))
print("\nAccuracy Score:", accuracy_score(y_test, y_pred_best))



Confusion Matrix:
 [[908 127]
 [152 222]]

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.88      0.87      1035
           1       0.64      0.59      0.61       374

    accuracy                           0.80      1409
   macro avg       0.75      0.74      0.74      1409
weighted avg       0.80      0.80      0.80      1409


Accuracy Score: 0.8019872249822569


SVM 

In [15]:
from sklearn.svm import SVC

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42, stratify=y)


In [17]:
# Define the Optuna objective function
def objective(trial):
    smote_sampling_strategy = trial.suggest_uniform("sampling_strategy", 0.5, 1.0)
    smote_k_neighbors = trial.suggest_int("k_neighbors", 3, 10)
    
    svm_C = trial.suggest_loguniform("C", 0.1, 10)
    svm_kernel = trial.suggest_categorical("kernel", ["linear", "rbf"])
    
    smote = SMOTE(sampling_strategy=smote_sampling_strategy, k_neighbors=smote_k_neighbors, random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    
    svm_model = SVC(C=svm_C, kernel=svm_kernel, probability=True, random_state=42, class_weight='balanced')
    svm_model.fit(X_train_resampled, y_train_resampled)
    y_pred = svm_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy


In [1]:
# Run the Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)
print("Best parameters:", study.best_params)

[I 2025-01-14 15:23:33,561] A new study created in memory with name: no-name-be8fc9a8-1fc2-4b12-b1aa-29cf4b80f244
  smote_sampling_strategy = trial.suggest_uniform("sampling_strategy", 0.5, 1.0)
  svm_C = trial.suggest_loguniform("C", 0.1, 10)
[I 2025-01-14 15:29:00,263] Trial 0 finished with value: 0.46983676366217175 and parameters: {'sampling_strategy': 0.7078369712803996, 'k_neighbors': 5, 'C': 1.841517701723231, 'kernel': 'rbf'}. Best is trial 0 with value: 0.46983676366217175.
  smote_sampling_strategy = trial.suggest_uniform("sampling_strategy", 0.5, 1.0)
  svm_C = trial.suggest_loguniform("C", 0.1, 10)
[I 2025-01-14 15:33:43,913] Trial 1 finished with value: 0.47338537970191624 and parameters: {'sampling_strategy': 0.5801626720996049, 'k_neighbors': 9, 'C': 6.201471467931814, 'kernel': 'rbf'}. Best is trial 1 with value: 0.47338537970191624.
  smote_sampling_strategy = trial.suggest_uniform("sampling_strategy", 0.5, 1.0)
  svm_C = trial.suggest_loguniform("C", 0.1, 10)
[I 2025-