Install Required Libraries

In [None]:

!pip install shap xgboost seaborn scikit-optimize catboost lightgbm

Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

from skopt import BayesSearchCV

Load and Preprocess Data

In [None]:
# --- Section 3: Load and Preprocess Data ---
# Read the raw table from the working directory.
data = pd.read_csv('./Telco Customer Churn.csv')

# Coerce 'TotalCharges' to numeric; entries that cannot be parsed become NaN.
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

# Drop every row that contains a NaN (reassignment instead of inplace=True so
# re-running the cell is side-effect free), then remove the row identifier.
# NOTE(review): the public Telco churn CSV ships a unique 'customerID' column;
# presumably it is present here too. Label-encoding it would feed the models an
# arbitrary per-row integer, so it is excluded from the features.
# errors='ignore' keeps the cell working if the column is absent.
data = data.dropna().drop(columns=['customerID'], errors='ignore')

# Integer-encode every remaining object (string) column, including the
# 'Churn' target. The single encoder instance is re-fitted for each column.
label_encoder = LabelEncoder()
for col in data.columns:
    if data[col].dtype == 'object':
        data[col] = label_encoder.fit_transform(data[col].astype(str))

# Feature matrix = everything except the target; target = encoded 'Churn'.
X = data.drop('Churn', axis=1)
y = data['Churn']


Split Data and Define Cross-Validation


In [None]:
# --- Section 4: Split Data and Define Cross-Validation ---
# Hold out 20% of the rows as a test set; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Shuffled, stratified 5-fold splitter used for the hyperparameter search.
cv = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)


Define Models and Hyperparameter Search Spaces

In [None]:
# --- Section 5: Define Models and Hyperparameter Search Spaces ---
# Registry of candidate classifiers. Each entry holds:
#   'model'  - the estimator used as the 'classifier' pipeline step
#   'params' - the search space, keyed by pipeline-step parameter names
#              ('feature_selection__k' tunes how many features are kept,
#              'classifier__*' tunes the estimator itself).
# Tuples are (low, high[, prior]) ranges; lists are categorical choices.
models = {
    'Logistic Regression': {
        'model': LogisticRegression(max_iter=1000, random_state=42),
        'params': {
            'feature_selection__k': (5, X_train.shape[1]),
            'classifier__C': (1e-3, 1e3, 'log-uniform'),
            'classifier__penalty': ['l2'],
            'classifier__solver': ['lbfgs']
        }
    },

    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'feature_selection__k': (5, X_train.shape[1]),
            'classifier__n_estimators': (50, 300),
            'classifier__max_depth': (3, 30),
            'classifier__min_samples_split': (2, 20),
            'classifier__min_samples_leaf': (1, 20),
            'classifier__max_features': ['sqrt', 'log2']
        }
    },
    'XGBoost': {
        # 'use_label_encoder' is no longer passed: current XGBoost releases do
        # not use it and warn about the unused parameter on every fit.
        'model': xgb.XGBClassifier(eval_metric='logloss', random_state=42),
        'params': {
            'feature_selection__k': (5, X_train.shape[1]),
            'classifier__n_estimators': (50, 300),
            'classifier__max_depth': (3, 30),
            'classifier__learning_rate': (0.01, 0.3, 'log-uniform'),
            'classifier__subsample': (0.5, 1.0),
            'classifier__colsample_bytree': (0.5, 1.0)
        }
    },
    'LightGBM': {
        # subsample_freq=1 enables bagging on every iteration; with LightGBM's
        # default of 0 the searched 'subsample' fraction would have no effect.
        # verbose=-1 silences LightGBM's per-fit log output.
        'model': lgb.LGBMClassifier(random_state=42, subsample_freq=1, verbose=-1),
        'params': {
            'feature_selection__k': (5, X_train.shape[1]),
            'classifier__n_estimators': (50, 300),
            'classifier__max_depth': (3, 30),
            'classifier__learning_rate': (0.01, 0.3, 'log-uniform'),
            'classifier__num_leaves': (20, 50),
            'classifier__subsample': (0.5, 1.0),
            'classifier__colsample_bytree': (0.5, 1.0)
        }
    },
    'CatBoost': {
        'model': CatBoostClassifier(verbose=0, random_seed=42),
        'params': {
            'feature_selection__k': (5, X_train.shape[1]),
            'classifier__iterations': (50, 300),
            'classifier__depth': (3, 10),
            'classifier__learning_rate': (0.01, 0.3, 'log-uniform'),
            'classifier__l2_leaf_reg': (1, 10)
        }
    },
    'AdaBoost': {
        'model': AdaBoostClassifier(random_state=42),
        'params': {
            'feature_selection__k': (5, X_train.shape[1]),
            'classifier__n_estimators': (50, 300),
            'classifier__learning_rate': (0.01, 1.0, 'log-uniform')
        }
    }
}


Train Models, Optimize, and Save Results

In [None]:
# --- Section 6: Train Models, Optimize, and Save Results ---
# Metric functions applied to the tuned model's test-set predictions, keyed by
# the label used in both the printed report and the results table.
optimized_metrics = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1-Score': f1_score,
}

results = []          # one summary row per model
best_estimators = {}  # model name -> refitted best pipeline from the search

for name, m in models.items():
    print(f"--- {name} ---")

    # Baseline: keep every feature (k='all') and use the estimator as configured.
    pipe_base = Pipeline(steps=[
        ('feature_selection', SelectKBest(score_func=f_classif, k='all')),
        ('classifier', m['model']),
    ])
    y_pred_base = pipe_base.fit(X_train, y_train).predict(X_test)
    acc_base = accuracy_score(y_test, y_pred_base)
    print(f"Base Accuracy: {acc_base:.4f}")

    # Tuned: Bayesian search over the feature count and the classifier's
    # hyperparameters, scored by cross-validated accuracy on the training set.
    pipe_opt = Pipeline(steps=[
        ('feature_selection', SelectKBest(score_func=f_classif)),
        ('classifier', m['model']),
    ])
    opt = BayesSearchCV(
        estimator=pipe_opt,
        search_spaces=m['params'],
        n_iter=30,
        scoring='accuracy',
        cv=cv,
        random_state=42,
        n_jobs=-1,
    )
    opt.fit(X_train, y_train)

    # Evaluate the search's best model on the held-out test set.
    y_pred_opt = opt.predict(X_test)
    scores = {label: metric(y_test, y_pred_opt) for label, metric in optimized_metrics.items()}

    for label, value in scores.items():
        print(f"Optimized {label}: {value:.4f}")
    print("Best params:", opt.best_params_)
    print()

    summary_row = {'Model': name, 'Base Accuracy': acc_base}
    summary_row.update({f'Optimized {label}': value for label, value in scores.items()})
    results.append(summary_row)

    best_estimators[name] = opt.best_estimator_

# Build the summary table once from the collected rows.
results_df = pd.DataFrame(results)


Display Results Table

In [None]:
# --- Section 7: Display Results Table ---
# Render the summary table with a viridis background gradient; the styled
# object is the cell's last expression so the notebook displays it.
styled_results = results_df.style.background_gradient(cmap='viridis')
styled_results


Plot Accuracy Comparison Before and After Optimization

In [None]:
# --- Section 8: Plot Accuracy Comparison ---
# Grouped bar chart: for each model, the baseline accuracy next to the
# accuracy after Bayesian optimization, both measured on the test set.
bar_width = 0.35
index = np.arange(len(results_df))
base_scores = results_df['Base Accuracy']
tuned_scores = results_df['Optimized Accuracy']

fig, ax = plt.subplots(figsize=(14, 7))
ax.bar(index, base_scores, bar_width, label='Base Accuracy', alpha=0.7)
ax.bar(index + bar_width, tuned_scores, bar_width, label='Optimized Accuracy', alpha=0.7)

# Centre each model label between its pair of bars.
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(results_df['Model'], rotation=45)
ax.set_ylim(0, 1)
ax.set_ylabel('Accuracy')
ax.set_title('Comparison of Model Accuracies Before and After Bayesian Optimization')
ax.legend()

# Annotate every bar with its value, placed slightly above the bar top.
for pos, (base, tuned) in enumerate(zip(base_scores, tuned_scores)):
    ax.text(pos, base + 0.02, f"{base:.4f}", ha='center')
    ax.text(pos + bar_width, tuned + 0.02, f"{tuned:.4f}", ha='center')

fig.tight_layout()
plt.show()


Plot ROC Curves for Optimized Models

In [None]:
# --- Section 9: Plot ROC Curves for Optimized Models ---
# One ROC curve per tuned pipeline, evaluated on the held-out test set.
plt.figure(figsize=(10, 8))
for name, estimator in best_estimators.items():
    if hasattr(estimator.named_steps['classifier'], "predict_proba"):
        # Probability of the positive class (column 1).
        y_score = estimator.predict_proba(X_test)[:, 1]
    else:
        # Use the raw decision values directly. ROC depends only on the
        # ranking of the scores, so rescaling them to [0, 1] is unnecessary,
        # and a min-max rescale would divide by zero when all scores are equal.
        y_score = estimator.decision_function(X_test)

    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.3f})')

# Diagonal reference line (a classifier no better than chance).
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Optimized Models')
plt.legend(loc="lower right")
plt.show()
