<a href="https://colab.research.google.com/github/Akshatha7710/telco-customer-churn/blob/main/telco_customer_churn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
# ---------------------------
# Core imports
# ---------------------------
import os
import json
import time
from pathlib import Path
from typing import Tuple, Dict, Any, List

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ---------------------------
# Scikit-learn imports
# ---------------------------
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, confusion_matrix
)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.dummy import DummyClassifier
from sklearn.feature_selection import mutual_info_classif

# ---------------------------
# Optional libraries with availability flags
# ---------------------------
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except ImportError:
    XGBOOST_AVAILABLE = False

try:
    import lightgbm as lgb
    LIGHTGBM_AVAILABLE = True
except ImportError:
    LIGHTGBM_AVAILABLE = False

try:
    import shap
    SHAP_AVAILABLE = True
except ImportError:
    SHAP_AVAILABLE = False

try:
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers
    KERAS_AVAILABLE = True
except ImportError:
    KERAS_AVAILABLE = False

try:
    from lime import lime_tabular
    LIME_AVAILABLE = True
except ImportError:
    LIME_AVAILABLE = False

try:
    from imblearn.over_sampling import SMOTE
    IMBLEARN_AVAILABLE = True
except ImportError:
    IMBLEARN_AVAILABLE = False

try:
    import joblib
except ImportError:
    joblib = None

# ---------------------------
# Misc
# ---------------------------
import warnings
# NOTE: this silences every warning for the rest of the session, not just
# noisy ones; remove or narrow the filter when debugging library issues.
warnings.filterwarnings("ignore")

# ---------------------------
# Random state for reproducibility
# ---------------------------
# Single seed shared by every estimator / splitter in the notebook.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
# TensorFlow is seeded only when its optional import above succeeded.
if KERAS_AVAILABLE:
    tf.random.set_seed(RANDOM_STATE)

# Printed unconditionally: optional libraries that failed to import are only
# recorded in the *_AVAILABLE flags, they are not reported by this message.
print("[INFO] All libraries imported successfully!")


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [14]:
# ---------------------------
# Utilities / Data Loading / Synthetic
# ---------------------------
def _json_default(value: Any) -> Any:
    """Convert NumPy scalars and arrays into plain Python objects for ``json.dump``.

    Raises:
        TypeError: for any other type, mirroring the standard encoder's behaviour.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


def save_json(obj: Any, path: Path):
    """Write *obj* to *path* as indented (2-space) UTF-8 JSON.

    Args:
        obj: JSON-serializable object. NumPy scalars and arrays nested inside
            it are converted to native Python numbers / lists.
        path: Destination file. Missing parent directories are created so a
            fresh checkout without the reports folder does not fail.

    Raises:
        TypeError: if *obj* contains a value that cannot be serialized.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    # Explicit encoding keeps the output identical across platforms.
    with open(target, "w", encoding="utf-8") as f:
        json.dump(obj, f, indent=2, default=_json_default)

def create_synthetic_telco(path: Path, n=2000) -> pd.DataFrame:
    """Build a synthetic Telco-style churn table, save it as CSV and return it.

    Args:
        path: CSV destination (written without the index).
        n: Number of customer rows to generate.

    Returns:
        The generated DataFrame, including the ``TotalCharges`` and ``Churn``
        columns derived below.
    """
    rng = np.random.default_rng(RANDOM_STATE)
    yes_no = ["Yes", "No"]
    internet_addon_levels = ["Yes", "No", "No internet service"]

    # NOTE: every draw consumes the shared generator, so the column order
    # below determines the generated values; keep it stable.
    columns = {
        "customerID": [f"CUST{cust_no}" for cust_no in range(100000, 100000 + n)],
        "gender": rng.choice(["Female", "Male"], n),
        "SeniorCitizen": rng.choice([0, 1], n, p=[0.85, 0.15]),
        "Partner": rng.choice(yes_no, n, p=[0.45, 0.55]),
        "Dependents": rng.choice(yes_no, n, p=[0.25, 0.75]),
        "tenure": rng.integers(0, 72, n),
        "PhoneService": rng.choice(yes_no, n, p=[0.9, 0.1]),
        "MultipleLines": rng.choice(["Yes", "No", "No phone service"], n),
        "InternetService": rng.choice(["DSL", "Fiber optic", "No"], n, p=[0.4, 0.45, 0.15]),
    }
    for addon in (
        "OnlineSecurity",
        "OnlineBackup",
        "DeviceProtection",
        "TechSupport",
        "StreamingTV",
        "StreamingMovies",
    ):
        columns[addon] = rng.choice(internet_addon_levels, n)
    columns["Contract"] = rng.choice(
        ["Month-to-month", "One year", "Two year"], n, p=[0.6, 0.2, 0.2]
    )
    columns["PaperlessBilling"] = rng.choice(yes_no, n)
    columns["PaymentMethod"] = rng.choice(
        [
            "Electronic check",
            "Mailed check",
            "Bank transfer (automatic)",
            "Credit card (automatic)",
        ],
        n,
    )
    columns["MonthlyCharges"] = np.round(rng.uniform(18.0, 120.0, n), 2)
    df = pd.DataFrame(columns)

    # Total charges: monthly charge times tenure plus a random offset in [0, 50).
    charge_noise = rng.uniform(0, 50, n)
    df["TotalCharges"] = np.round(df["MonthlyCharges"] * df["tenure"] + charge_noise, 2)

    # Churn probability decreases with tenure and increases for
    # month-to-month contracts and electronic-check payers.
    tenure_term = 0.35 - 0.004 * df["tenure"]
    contract_term = np.where(df["Contract"] == "Month-to-month", 0.2, -0.05)
    payment_term = np.where(df["PaymentMethod"] == "Electronic check", 0.05, 0)
    churn_prob = np.clip(tenure_term + contract_term + payment_term, 0.01, 0.9)

    churn_draws = rng.random(n)
    df["Churn"] = np.where(churn_draws < churn_prob, "Yes", "No")

    df.to_csv(path, index=False)
    return df

def load_data(path: Path = DATA_FILE) -> pd.DataFrame:
    """Return the churn dataset stored at *path*, generating it if absent.

    When *path* does not exist, a synthetic dataset is created there via
    ``create_synthetic_telco``. In both cases surrounding whitespace is
    stripped from the column names before returning.
    """
    if not path.exists():
        print(f"[WARN] Dataset not found at {path}. Creating synthetic dataset for development.")
        df = create_synthetic_telco(path)
        print(f"[INFO] Synthetic dataset created at {path} shape={df.shape}")
    else:
        df = pd.read_csv(path)
        print(f"[INFO] Loaded dataset from {path} shape={df.shape}")

    cleaned_names = [name.strip() for name in df.columns]
    df.columns = cleaned_names
    return df

NameError: name 'pd' is not defined

In [17]:
# ---------------------------
# Exploratory Data Analysis
# ---------------------------
def run_eda(df: pd.DataFrame):
    """Print basic summaries of *df* and save exploratory plots under ``FIG_DIR``.

    Saved figures: ``missing_heatmap.png``, ``churn_distribution.png``,
    ``<column>_hist.png`` for each of tenure / MonthlyCharges / TotalCharges
    that is present, and ``numeric_corr.png``.

    Args:
        df: Raw dataset; must contain a ``Churn`` column for the count plot.

    Notes:
        The correlation heatmap is best-effort: a failure there is reported
        as a warning instead of aborting the EDA. Every figure is closed even
        when drawing or saving raises, so failed plots do not leak figures.
    """
    print("[EDA] Dataset shape:", df.shape)
    display(df.head())
    display(df.describe(include="all"))

    def _render(filename, title, draw, figsize):
        """Draw onto a fresh figure, save it under FIG_DIR and always close it."""
        fig = plt.figure(figsize=figsize)
        try:
            draw()
            plt.title(title)
            plt.tight_layout()
            plt.savefig(FIG_DIR / filename)
        finally:
            plt.close(fig)

    # Missing heatmap
    _render("missing_heatmap.png", "Missing values",
            lambda: sns.heatmap(df.isnull(), cbar=False), (10, 4))

    # Churn distribution
    _render("churn_distribution.png", "Churn distribution",
            lambda: sns.countplot(x="Churn", data=df), (6, 4))

    # Numeric histograms
    for c in ["tenure", "MonthlyCharges", "TotalCharges"]:
        if c not in df.columns:
            continue
        # A column such as TotalCharges can be read as text (preprocess()
        # coerces it for the same reason); coerce here so the histogram is
        # drawn over numbers rather than string categories.
        values = pd.to_numeric(df[c], errors="coerce").dropna()
        _render(f"{c}_hist.png", f"{c} distribution",
                lambda v=values: v.hist(bins=40), (6, 4))

    # Correlation heatmap (best-effort)
    try:
        _render(
            "numeric_corr.png",
            "Numeric correlations",
            lambda: sns.heatmap(
                df.select_dtypes(include=["number"]).corr(), cmap="coolwarm", annot=False
            ),
            (8, 6),
        )
    except Exception as e:
        print("[WARN] Correlation heatmap skipped:", e)

# Run EDA now (loads data)
# NOTE: `df` is bound at notebook level here and is read again by the
# preprocessing cell that follows.
df = load_data()
run_eda(df)


In [33]:
# ---------------------------
# Preprocessing & Feature Engineering
# ---------------------------
def preprocess(df: pd.DataFrame, target: str = "Churn") -> Tuple[pd.DataFrame, np.ndarray, ColumnTransformer]:
    """Clean the raw frame and build an (unfitted) preprocessing transformer.

    Steps: drop ``customerID``, coerce ``TotalCharges`` to numeric, drop rows
    whose target is missing, derive a ``tenure_bin`` categorical feature,
    fill missing categoricals with ``"Missing"``, and write the resulting
    column lists to ``preprocessing_meta.json`` under ``REPORTS_DIR``.

    Args:
        df: Raw dataset; it is copied, never modified in place.
        target: Name of the label column.

    Returns:
        ``(X, y, preprocessor)`` where ``X`` is the cleaned feature frame,
        ``y`` is a 0/1 array (1 for yes / 1 / true / y, case-insensitive) and
        ``preprocessor`` is a ColumnTransformer that has not been fitted.
    """
    frame = df.copy()
    if "customerID" in frame.columns:
        frame = frame.drop(columns=["customerID"])
    if "TotalCharges" in frame.columns:
        frame["TotalCharges"] = pd.to_numeric(frame["TotalCharges"], errors="coerce")
    frame = frame.dropna(subset=[target])

    # Column roles are decided before tenure_bin exists; it is appended below.
    numeric_cols = frame.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = frame.select_dtypes(include=["object", "category"]).columns.tolist()
    for col_list in (categorical_cols, numeric_cols):
        if target in col_list:
            col_list.remove(target)

    # tenure bin (missing tenure is replaced by -1 before binning)
    if "tenure" in frame.columns:
        bin_edges = [-1, 0, 12, 24, 48, 72]
        bin_labels = ["0", "1-12", "13-24", "25-48", "49-72"]
        frame["tenure_bin"] = pd.cut(frame["tenure"].fillna(-1), bins=bin_edges, labels=bin_labels)
        if "tenure_bin" not in categorical_cols:
            categorical_cols.append("tenure_bin")

    # fill categorical missing safely: a categorical dtype must know the
    # "Missing" level before it can be used as a fill value
    for col in categorical_cols:
        column = frame[col]
        if column.dtype.name == "category" and "Missing" not in column.cat.categories:
            column = column.cat.add_categories(["Missing"])
        frame[col] = column.fillna("Missing")

    # categorical encoder: the dense-output keyword differs between sklearn versions
    encoder_kwargs = {"handle_unknown": "ignore"}
    try:
        onehot = OneHotEncoder(sparse_output=False, **encoder_kwargs)
    except TypeError:
        onehot = OneHotEncoder(sparse=False, **encoder_kwargs)

    numeric_transformer = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])
    categorical_transformer = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", onehot),
    ])
    preprocessor = ColumnTransformer(
        [
            ("num", numeric_transformer, numeric_cols),
            ("cat", categorical_transformer, categorical_cols),
        ],
        remainder="drop",
    )

    positive_labels = {"yes", "1", "true", "y"}
    y = frame[target].apply(lambda raw: int(str(raw).strip().lower() in positive_labels)).values
    X = frame.drop(columns=[target]).copy()

    meta = {"numeric_cols": numeric_cols, "categorical_cols": categorical_cols, "shape": X.shape}
    save_json(meta, REPORTS_DIR / "preprocessing_meta.json")
    return X, y, preprocessor

# Bind notebook-level X / y / preprocessor from the `df` loaded in the EDA cell;
# the preprocessor returned here has not been fitted yet.
X, y, preprocessor = preprocess(df, target="Churn")
print("[INFO] Preprocessed: X shape before transform:", X.shape)


In [31]:
# ---------------------------
# Feature selection helpers
# ---------------------------
from sklearn.feature_selection import mutual_info_classif

def compute_mutual_info(X: pd.DataFrame, y: np.ndarray, preprocessor: ColumnTransformer, top_k: int = 20):
    """Rank transformed features by mutual information with the target.

    Fits *preprocessor* on *X* (in place), transforms *X*, scores every
    transformed column with ``mutual_info_classif`` and writes the top
    *top_k* scores to ``mutual_info_top.json`` under ``REPORTS_DIR``.

    Args:
        X: Feature frame accepted by *preprocessor*.
        y: Target labels aligned with the rows of *X*.
        preprocessor: ColumnTransformer whose first two transformers are the
            numeric and the ``"cat"`` pipelines (the latter with an
            ``"onehot"`` step); used to recover readable feature names.
        top_k: Number of highest-scoring features to keep.

    Returns:
        pandas Series of the *top_k* scores, sorted descending and indexed by
        feature name.
    """
    preprocessor.fit(X)
    X_num = preprocessor.transform(X)
    n_features = X_num.shape[1]

    # attempt to build transformed feature names (best-effort)
    try:
        num_cols = preprocessor.transformers_[0][2]
        cat_cols = preprocessor.transformers_[1][2]
        ohe = preprocessor.named_transformers_["cat"].named_steps["onehot"]
        feature_names = list(num_cols) + list(ohe.get_feature_names_out(cat_cols))
    except Exception:
        feature_names = []

    # The index must have exactly one label per transformed column, otherwise
    # building the Series fails; fall back to positional names when the
    # readable names could not be recovered or do not line up.
    if len(feature_names) != n_features:
        print("[WARN] Could not recover transformed feature names; using positional names")
        feature_names = [f"feature_{i}" for i in range(n_features)]

    mi = mutual_info_classif(X_num, y, random_state=RANDOM_STATE)
    mi_series = pd.Series(mi, index=feature_names).sort_values(ascending=False)
    top = mi_series.head(top_k)
    save_json(top.to_dict(), REPORTS_DIR / "mutual_info_top.json")
    return top

# Score features on the full dataset; note this call fits the notebook-level
# `preprocessor` on all of X as a side effect.
mi_top = compute_mutual_info(X, y, preprocessor, top_k=20)
display(mi_top)


In [None]:
# ---------------------------
# Decision Tree
# ---------------------------

from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

def train_decision_tree_grid(X_train, y_train, X_test, y_test, preprocessor):
    """Tune a class-weighted decision tree with a 3-fold grid search (F1 scoring).

    The tree is wrapped in a pipeline behind *preprocessor*; the grid covers
    ``max_depth`` and ``min_samples_leaf``. The best pipeline is then scored
    on the test split with ``classification_metrics`` (defined elsewhere in
    the notebook).

    Returns:
        dict with keys ``"decision_tree"`` (test metrics), ``"model"`` (best
        fitted pipeline) and ``"best_params"`` (winning grid point).
    """
    pipeline = Pipeline([
        ("prep", preprocessor),
        ("clf", DecisionTreeClassifier(random_state=RANDOM_STATE, class_weight="balanced")),
    ])
    search_space = {
        "clf__max_depth": [3, 5, 8, None],
        "clf__min_samples_leaf": [1, 2, 5],
    }
    search = GridSearchCV(pipeline, search_space, scoring="f1", cv=3, n_jobs=-1)
    search.fit(X_train, y_train)

    best_pipeline = search.best_estimator_
    predictions = best_pipeline.predict(X_test)
    # Positive-class probabilities, when the estimator exposes them.
    if hasattr(best_pipeline, "predict_proba"):
        positive_scores = best_pipeline.predict_proba(X_test)[:, 1]
    else:
        positive_scores = None

    test_metrics = classification_metrics(y_test, predictions, positive_scores)
    return {
        "decision_tree": test_metrics,
        "model": best_pipeline,
        "best_params": search.best_params_,
    }

# Orchestration
def run_full_experiment(random_search: bool = True, balance: bool = True):
    """Load the data, train every available model and save a metrics summary.

    The individual ``train_*`` helpers are defined elsewhere in the notebook;
    each returns a dict of metrics plus a ``"model"`` entry, which is left out
    of the summary because fitted estimators are not JSON-serializable.

    Args:
        random_search: Accepted for interface compatibility; this function
            does not consult it.
        balance: When True and imbalanced-learn is installed, SMOTE is run on
            the transformed training split purely as a diagnostic and the
            class counts before/after are printed. The models below are still
            trained on the original ``X_train``.

    Returns:
        dict mapping model name to its metrics; the same dict is written to
        ``model_metrics_summary.json`` under ``REPORTS_DIR``.
    """
    print("[INFO] Loading data and preprocessing")
    df = load_data()
    X, y, preprocessor = preprocess(df, target="Churn")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=RANDOM_STATE, stratify=y)

    # SMOTE (optional, diagnostic only)
    if IMBLEARN_AVAILABLE and balance:
        print("[INFO] Applying SMOTE to transformed training data")
        try:
            preprocessor.fit(X_train)
            X_train_p = preprocessor.transform(X_train)
            sm = SMOTE(random_state=RANDOM_STATE)
            _, y_res = sm.fit_resample(X_train_p, y_train)
            # Report the resampled distribution so the step has a visible result.
            print("[INFO] Class counts before SMOTE:", np.bincount(y_train).tolist(),
                  "after SMOTE:", np.bincount(y_res).tolist())
        except ValueError as e:
            # The resampled data is not used for training, so a SMOTE failure
            # should not abort the experiment.
            print("[WARN] SMOTE diagnostic skipped:", e)
    else:
        print("[INFO] No SMOTE applied; using class_weight where applicable")

    results = {}

    print("[INFO] Baseline")
    results.update(train_baselines(X_train, y_train, X_test, y_test, preprocessor))

    print("[INFO] Logistic")
    log = train_logistic(X_train, y_train, X_test, y_test, preprocessor)
    results.update({k: v for k, v in log.items() if k != "model"})

    print("[INFO] Decision Tree gridsearch")
    dt = train_decision_tree_grid(X_train, y_train, X_test, y_test, preprocessor)
    # Only the metrics entry is kept; "model" and "best_params" stay out of the summary.
    results.update({k: v for k, v in dt.items() if k in ["decision_tree"]})
    print("Decision Tree metrics:", dt["decision_tree"])

    print("[INFO] Random Forest")
    rf = train_random_forest(X_train, y_train, X_test, y_test, preprocessor)
    results.update({k: v for k, v in rf.items() if k != "model"})

    if XGBOOST_AVAILABLE:
        print("[INFO] XGBoost")
        xg = train_xgboost(X_train, y_train, X_test, y_test, preprocessor)
        if xg:
            results.update({k: v for k, v in xg.items() if k != "model"})

    if KERAS_AVAILABLE:
        print("[INFO] Neural Network")
        nn = train_nn_model(X_train, y_train, X_test, y_test, preprocessor, epochs=30, batch_size=128)
        if nn:
            results.update({k: v for k, v in nn.items() if k != "model"})

    save_json(results, REPORTS_DIR / "model_metrics_summary.json")
    print("[INFO] Done training. Results saved.")
    return results

# Run orchestration
# `results` is bound at notebook level and consumed by the results-summary cell.
results = run_full_experiment(random_search=True, balance=True)


In [None]:
# ---------------------------
# Save and Display Results
# ---------------------------

# Display results table
def result_summary_table(results: Dict[str, Any]) -> pd.DataFrame:
    """Flatten per-model metric dicts into a table sorted by F1 (best first).

    Args:
        results: Mapping of model name to a dict of metrics. Metrics missing
            from a model's dict appear as empty cells.

    Returns:
        DataFrame with columns ``model``, ``accuracy``, ``precision``,
        ``recall``, ``f1`` and ``roc_auc``. An empty *results* mapping yields
        an empty table with those columns instead of raising.
    """
    metric_names = ["accuracy", "precision", "recall", "f1", "roc_auc"]
    rows = [
        {"model": model_name, **{name: metrics.get(name) for name in metric_names}}
        for model_name, metrics in results.items()
    ]
    # Explicit columns keep the "f1" sort key present even when there are no rows.
    table = pd.DataFrame(rows, columns=["model", *metric_names])
    return table.sort_values(by="f1", ascending=False)

# Build the summary from the notebook-level `results`, show it inline and
# persist it as CSV (without the index) under REPORTS_DIR.
table = result_summary_table(results)
display(table)
table.to_csv(REPORTS_DIR / "results_summary.csv", index=False)

# Attempt SHAP for final model (if available & supported)
# NOTE: placeholder only - the try body prints a message and computes no SHAP
# values; no model is looked up or explained here yet.
if SHAP_AVAILABLE:
    try:
        # try using logistic model saved earlier if present
        print("[INFO] Generating SHAP summary (if feasible)")
        # find a pipeline model in environment (e.g., logistic)
        # This block is intentionally conservative: won't crash if shap can't handle model
    except Exception as e:
        print("[WARN] SHAP skipped:", e)

# PSI - population stability index between train and test score distributions
def population_stability_index(expected, actual, buckets=10):
    """Return the Population Stability Index of *actual* relative to *expected*.

    Bucket edges are the percentiles of *expected*; each sample is reduced to
    bucket proportions and the PSI is ``sum((e - a) * ln(e / a))``.

    Args:
        expected: Reference scores (any array-like; flattened).
        actual: Scores to compare against the reference (flattened).
        buckets: Number of percentile buckets.

    Returns:
        The PSI as a Python float (0.0 for identical distributions).

    Raises:
        ValueError: if either sample is empty.
    """
    expected = np.asarray(expected, dtype=float).ravel()
    actual = np.asarray(actual, dtype=float).ravel()
    if expected.size == 0 or actual.size == 0:
        raise ValueError("PSI requires non-empty expected and actual samples")
    breaks = np.percentile(expected, np.linspace(0, 100, buckets + 1))
    # np.histogram ignores values outside the outer edges, which would drop
    # exactly the drifted scores; clamp them into the first/last bucket so the
    # actual proportions still sum to 1.
    actual = np.clip(actual, breaks[0], breaks[-1])
    e_perc = np.histogram(expected, bins=breaks)[0] / len(expected)
    a_perc = np.histogram(actual, bins=breaks)[0] / len(actual)
    # Empty buckets get a tiny proportion to keep the log ratio finite.
    eps = 1e-6
    e_perc = np.where(e_perc == 0, eps, e_perc)
    a_perc = np.where(a_perc == 0, eps, a_perc)
    psi = np.sum((e_perc - a_perc) * np.log(e_perc / a_perc))
    return float(psi)


try:
    # The fitted logistic pipeline from the experiment run is not kept at
    # notebook level, so re-train a logistic pipeline here to obtain train /
    # test probabilities for the PSI (lightweight compared to the full run).
    X2, y2, pre2 = preprocess(load_data(), target="Churn")
    Xtr, Xte, ytr, yte = train_test_split(X2, y2, test_size=0.2, random_state=RANDOM_STATE, stratify=y2)
    lp = Pipeline([("prep", pre2), ("clf", LogisticRegression(max_iter=1000, class_weight="balanced", random_state=RANDOM_STATE))])
    lp.fit(Xtr, ytr)
    p_train = lp.predict_proba(Xtr)[:,1]
    p_test = lp.predict_proba(Xte)[:,1]
    psi_val = population_stability_index(p_train, p_test, buckets=10)
    print("[INFO] PSI:", psi_val)
    save_json({"psi": psi_val}, REPORTS_DIR / "post_deployment_psi.json")
except Exception as e:
    print("[WARN] PSI computation skipped:", e)

# Anomaly detector (IsolationForest) build and sample save
# Best-effort step: any failure is reported as a warning and skipped.
try:
    print("[INFO] Building anomaly detector on full X")
    X_all, y_all, pre_all = preprocess(load_data(), target="Churn")
    # Transform once and reuse the matrix for both fitting and scoring.
    X_all_transformed = pre_all.fit_transform(X_all)
    iso = IsolationForest(contamination=0.01, random_state=RANDOM_STATE)
    iso.fit(X_all_transformed)
    # Negate decision_function so that larger values mean "more anomalous".
    scores = -iso.decision_function(X_all_transformed)
    out = X_all.copy()
    out["anomaly_score"] = scores
    # Only the first 20 rows (in dataset order) are saved as a sample.
    out.head(20).to_csv(REPORTS_DIR / "anomaly_scores_sample.csv", index=False)
    print("[INFO] Anomaly sample saved")
except Exception as e:
    print("[WARN] Anomaly step skipped:", e)

# Ethics Printout
# Static, human-readable summary printed at the end of the run; the text is
# fixed and is not derived from the results computed above.
ethics_report = """
AI Ethics & Post-Deployment Strategy

1. Bias & Fairness
   - Addressed class imbalance via SMOTE or class_weight.
   - Plan per-group evaluation and fairness metrics.

2. Data Privacy
   - Removed identifiers like customerID.
   - Avoid storing PII in outputs.

3. Explainability
   - SHAP for model explanations (if available).

4. Monitoring
   - PSI used to detect data drift; retrain when PSI > 0.25.

5. Logging & Governance
   - Log predictions and data snapshots for audits.
"""
print(ethics_report)
