<a href="https://colab.research.google.com/github/Akshatha7710/telco-customer-churn/blob/main/telco_customer_churn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# Core imports
import os, json
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.neural_network import MLPClassifier

# Optional libraries: each import is attempted and an *_AVAILABLE flag records whether it succeeded
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except Exception:
    XGBOOST_AVAILABLE = False

try:
    import shap
    SHAP_AVAILABLE = True
except Exception:
    SHAP_AVAILABLE = False

try:
    from imblearn.over_sampling import SMOTE
    IMBLEARN_AVAILABLE = True
except Exception:
    IMBLEARN_AVAILABLE = False

# Seed reused by the randomised steps below (synthetic data, split, SMOTE, models)
RANDOM_STATE = 42

# Every plot and the results file are written under this folder
os.makedirs("outputs", exist_ok=True)

# Load Dataset
data_path = "/content/sample_data/Telco-Customer-Churn.csv"
if os.path.exists(data_path):
    df = pd.read_csv(data_path)
else:
    # No CSV at the expected location: build a synthetic binary dataset with a
    # "Churn" column so the remaining steps can still run.
    print(f"[WARN] Dataset not found at {data_path}, creating synthetic data")
    from sklearn.datasets import make_classification
    X_synthetic, y_synthetic = make_classification(
        n_samples=2000,
        n_features=20,
        n_informative=10,
        n_redundant=5,
        n_classes=2,
        weights=[0.69, 0.31],
        random_state=RANDOM_STATE,
    )
    synthetic_columns = [f"feature_{i}" for i in range(20)]
    df = pd.DataFrame(X_synthetic, columns=synthetic_columns)
    # Label 1 is reported as "Yes", everything else as "No"
    df["Churn"] = np.where(y_synthetic == 1, "Yes", "No")

print(f"[INFO] Dataset loaded. Shape: {df.shape}")

# EDA (Task 1)
print("----- EDA -----")


def _save_and_close(path):
    """Save the current matplotlib figure to *path*, then close it."""
    plt.savefig(path)
    plt.close()


# Coerce TotalCharges to numeric; entries that cannot be parsed become NaN
if 'TotalCharges' in df.columns:
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Preview for the report appendix while customerID (if present) is still in the frame
print(df.head())

# customerID is an identifier, not a feature, so it is removed before model prep
if 'customerID' in df.columns:
    df = df.drop(columns=['customerID'])

# 1. Basic checks
print(df.head())
print("\nMissing values:\n", df.isnull().sum())
print("\nTarget distribution:\n", df["Churn"].value_counts())

# 2. Target distribution
plt.figure(figsize=(6, 4))
sns.countplot(x='Churn', data=df)
plt.title('Churn Distribution BEFORE SMOTE')
_save_and_close("outputs/churn_distribution_before_smote.png")

# 3. Numeric features EDA: one histogram and one per-class boxplot per column
numerical_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)
for col in numerical_cols:
    plt.figure(figsize=(8, 5))
    sns.histplot(df[col], bins=30, kde=True)
    plt.title(f'{col} Histogram')
    _save_and_close(f"outputs/{col}_hist.png")

    # Boxplot split by Churn, used for outlier detection
    plt.figure(figsize=(8, 5))
    sns.boxplot(x='Churn', y=col, data=df)
    plt.title(f'{col} Boxplot by Churn')
    _save_and_close(f"outputs/{col}_boxplot.png")

# 4. Correlation heatmap over the numeric columns only
plt.figure(figsize=(12, 10))
numeric_corr = df[numerical_cols].corr()
sns.heatmap(numeric_corr, annot=True, fmt=".2f", cmap='coolwarm')
plt.title("Correlation Heatmap of Numeric Features")
_save_and_close("outputs/correlation_heatmap.png")

print("[INFO] EDA plots saved in outputs/")

# Preprocessing (Task 2)
def preprocess(df, preprocessor=None, resample=True):
    """Impute, scale and one-hot encode *df*, optionally rebalancing with SMOTE.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus a "Churn" column holding "Yes"/"No" labels.
    preprocessor : ColumnTransformer, optional
        An already fitted transformer. When given it is only applied
        (``transform``) and never refitted, so held-out rows are encoded with
        statistics learned from the training rows. When ``None`` (default) a
        new transformer is built and fitted on *df*.
    resample : bool, default True
        Apply SMOTE to the transformed data when imbalanced-learn could be
        imported. Pass ``False`` for validation/test rows so that they only
        contain real observations.

    Returns
    -------
    tuple
        ``(X_trans, y, preprocessor, categorical_cols, numerical_cols)`` where
        ``y`` maps "No" to 0 and "Yes" to 1.
    """
    categorical_cols = df.select_dtypes(include=["object"]).columns.tolist()
    if "Churn" in categorical_cols:
        # The target must not be encoded as a feature
        categorical_cols.remove("Churn")

    numerical_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()

    X = df.drop(columns="Churn")
    y = df["Churn"].map({"No": 0, "Yes": 1})

    if preprocessor is None:
        cat_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        num_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])

        preprocessor = ColumnTransformer([
            ('num', num_pipeline, numerical_cols),
            ('cat', cat_pipeline, categorical_cols)
        ], remainder='passthrough')

        X_trans = preprocessor.fit_transform(X)
    else:
        # Reuse the statistics/categories learned earlier; do not refit
        X_trans = preprocessor.transform(X)

    # Handle imbalance (only on request, and only if imbalanced-learn is installed)
    if resample and IMBLEARN_AVAILABLE:
        sm = SMOTE(random_state=RANDOM_STATE)
        X_trans, y = sm.fit_resample(X_trans, y)

    return X_trans, y, preprocessor, categorical_cols, numerical_cols


# Split the raw rows FIRST. Fitting the transformer or running SMOTE before the
# split would leak test-set statistics into training and place synthetic
# samples in the test set, which inflates every reported metric.
train_df, test_df = train_test_split(
    df, test_size=0.2, stratify=df["Churn"], random_state=RANDOM_STATE
)

# Fit + SMOTE on the training rows only
X_train, y_train, preprocessor, cat_cols, num_cols = preprocess(train_df)

# Encode the untouched test rows with the transformer fitted on the training rows
X_test, y_test = preprocess(test_df, preprocessor=preprocessor, resample=False)[:2]

# Names used further down: the (resampled) training matrix and its labels
X, y = X_train, y_train

# VISUALIZE BALANCED DATA (Figure 1 - After SMOTE)
# Shows the training labels after resampling; the test split is left as-is.
plt.figure(figsize=(6,4))
sns.countplot(x=y.map({0: 'No', 1: 'Yes'}))
plt.title('Churn Distribution AFTER SMOTE')
plt.ylabel('Count (Balanced)')
plt.xlabel('Churn Status')
plt.savefig("outputs/churn_distribution_after_smote.png")
plt.close()

print("[INFO] 'After SMOTE' distribution plot saved for Figure 1.")
# Models & Hyperparameter Tuning
# Insertion order matters: it is the training order and the legend order later on.
models = {
    "dummy": DummyClassifier(strategy="most_frequent"),
    "random_forest": RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
}

if XGBOOST_AVAILABLE:
    models["xgboost"] = xgb.XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=RANDOM_STATE,
    )

# Tuned Decision Tree: 3-fold grid search scored on F1
dt_model = DecisionTreeClassifier(random_state=RANDOM_STATE)
dt_param_grid = {
    'max_depth': [5, 10, 15],
    'min_samples_leaf': [5, 10],
}
tuned_dt = GridSearchCV(dt_model, dt_param_grid, scoring='f1', cv=3, n_jobs=1)
models["tuned_decision_tree"] = tuned_dt

# Tuned Neural Network: same search setup over layer sizes and L2 penalty
nn_model = MLPClassifier(max_iter=300, early_stopping=True, random_state=RANDOM_STATE)
nn_param_grid = {
    'hidden_layer_sizes': [(32, 16), (64, 32), (64,)],
    'alpha': [0.0001, 0.001],
}
tuned_nn = GridSearchCV(nn_model, nn_param_grid, scoring='f1', cv=3, n_jobs=1)
models["tuned_neural_network"] = tuned_nn

# Training & Evaluation
results = {}
for name, model in models.items():
    print(f"[INFO] Training {name}")
    model.fit(X_train, y_train)
    # Grid searches expose the refitted winner; plain estimators are used directly
    best_model = getattr(model, 'best_estimator_', model)

    y_pred = best_model.predict(X_test)
    if hasattr(best_model, "predict_proba"):
        # Probability of the positive class (column 1)
        y_proba = best_model.predict_proba(X_test)[:, 1]
    else:
        # No probabilities available: fall back to the hard predictions
        y_proba = y_pred

    # Computed once; reused for the JSON record and for the heatmap below
    cm = confusion_matrix(y_test, y_pred)

    results[name] = {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
        "f1": f1_score(y_test, y_pred, zero_division=0),
        "roc_auc": roc_auc_score(y_test, y_proba),
        "pr_auc": average_precision_score(y_test, y_proba),
        "confusion_matrix": cm.tolist(),
    }

    if hasattr(model, 'best_params_'):
        results[name]["best_params"] = model.best_params_
        print(f"Best Parameters: {model.best_params_}")

    # Confusion matrix plot
    plt.figure(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title(f"{name} Confusion Matrix")
    plt.savefig(f"outputs/{name}_confusion_matrix.png")
    plt.close()

# Save results
with open("outputs/results.json", "w") as results_file:
    json.dump(results, results_file, indent=4)
print("[INFO] Training complete. Results saved to outputs/results.json")

# ROC-AUC Curve Comparison (Figure 11)
from sklearn.metrics import roc_curve, auc  # needed for the curves drawn below
plt.figure(figsize=(10, 8))

for name, model in models.items():
    # The dummy baseline is left out; only the main models (DT, NN, RF, XGBoost) are compared
    if name == "dummy":
        continue

    best_model = getattr(model, 'best_estimator_', model)

    # A ROC curve needs class probabilities; skip estimators that cannot provide them
    if not hasattr(best_model, "predict_proba"):
        continue

    y_proba = best_model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{name.replace("_", " ").title()} (AUC = {roc_auc:.3f})')

# Diagonal reference line for a classifier with no discriminative power
plt.plot([0, 1], [0, 1], 'k--', label='Baseline (AUC = 0.5)')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC-AUC Curve Comparison of Models')
plt.legend(loc="lower right")
plt.grid(True)
plt.savefig("outputs/roc_auc_comparison_curve.png")
plt.close()

print("[INFO] ROC-AUC comparison plot saved for Figure 11.")

# Feature Importance & Model Plots
# feature_names must be a NumPy array in BOTH branches: it is indexed with an
# integer array below and converted with array methods later on. A plain list
# in the fallback branch would raise as soon as it is used.
try:
    feature_names = np.asarray(preprocessor.get_feature_names_out())
except AttributeError:
    # The transformer does not offer get_feature_names_out (presumably an older
    # scikit-learn release); fall back to positional names, one per column of
    # the matrix the models were trained on.
    feature_names = np.array([f"feature_{i}" for i in range(X_train.shape[1])])

# Feature importance for tree models
for name in ["tuned_decision_tree", "random_forest", "xgboost"]:
    if name not in models:
        continue

    # Grid searches expose the refitted winner; plain estimators are used directly
    model_to_check = getattr(models[name], 'best_estimator_', models[name])
    if not hasattr(model_to_check, "feature_importances_"):
        continue

    fi = model_to_check.feature_importances_
    # argsort is ascending, so the last 20 indices are the 20 largest importances
    sorted_idx = fi.argsort()[-20:]
    plt.figure(figsize=(12, 8))
    plt.barh(feature_names[sorted_idx], fi[sorted_idx])
    plt.title(f"{name} Feature Importances")
    plt.tight_layout()
    plt.savefig(f"outputs/{name}_feature_importance.png")
    plt.close()

# MLP loss curve
tuned_nn_search = models.get("tuned_neural_network")
if tuned_nn_search is not None:
    # The refitted best network from the grid search
    nn_model = tuned_nn_search.best_estimator_
    loss_history = getattr(nn_model, "loss_curve_", None)
    if loss_history is not None:
        plt.figure(figsize=(8, 5))
        plt.plot(loss_history)
        plt.title("Tuned Neural Network Training Loss Curve")
        plt.xlabel("Iteration")
        plt.ylabel("Loss")
        plt.grid(True)
        plt.tight_layout()
        plt.savefig("outputs/nn_loss_curve.png")
        plt.close()

# SHAP Explanations (Ethics Discussion)
# Best-effort step: any failure here is reported as a warning and must not
# abort the script after all models have already been trained and saved.
if SHAP_AVAILABLE and XGBOOST_AVAILABLE:
    print("[INFO] Generating SHAP explanations")
    xgb_model = getattr(models["xgboost"], 'best_estimator_', models["xgboost"])

    if hasattr(xgb_model, 'feature_importances_'):
        try:
            # Explainer construction is inside the try as well, so a failure
            # there is handled like any other SHAP failure.
            explainer = shap.TreeExplainer(xgb_model)
            # Explain only the first 200 test rows
            X_sample = X_test[:200]
            # list() accepts both a NumPy array and a plain list of names
            feature_names_list = list(feature_names)

            shap_values = explainer.shap_values(X_sample)
            X_sample_df = pd.DataFrame(X_sample, columns=feature_names_list)
            shap.summary_plot(shap_values, X_sample_df, show=False)
            plt.savefig("outputs/shap_summary.png")
            plt.close()
        except Exception as e:
            print(f"[WARN] SHAP plotting failed: {e}")

print("[INFO] All outputs saved in: outputs/")
print("[INFO] AI Ethics: SMOTE used to address class imbalance, SHAP for explainability.")
print("[INFO] Post-deployment: Monitor model drift, retrain periodically, log misclassifications for fairness analysis.")


[INFO] Dataset loaded. Shape: (7043, 21)
----- EDA -----
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  Tec