# Customer Churn EDA + PCA + ROC (Multi-Model) — Colab Notebook

This notebook performs end-to-end analysis on a churn dataset:
- **EDA**: summary stats, missing values, class balance, correlations, distributions.
- **PCA**: dimensionality reduction for visualization and as an optional step in the ML pipeline.
- **ML Models**: Logistic Regression, Random Forest, Gradient Boosting, SVM, KNN, Naive Bayes, MLP.
- **ROC Curves**: ROC curves for the binary churn target and AUC comparison across models.
- **Model Selection**: pick best model by ROC-AUC and save it.

> Tip: If the dataset is not already available at the default path, upload it to the session (e.g. via the Colab Files pane) and set `csv_path` in the Load Data cell to the uploaded file's path.

In [None]:
# If you're in Google Colab, uncomment the below to install any missing libs
# !pip install scikit-learn pandas matplotlib seaborn xgboost==1.7.6 imbalanced-learn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    roc_curve, RocCurveDisplay, accuracy_score, precision_recall_fscore_support
)

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

import json
import os

# Lift pandas' column-display limit so wide DataFrames are shown in full
# instead of being truncated with "..." in the notebook output.
pd.set_option('display.max_columns', None)

## Load Data

In [None]:
# Location of the churn CSV. Point this at wherever your copy of the file lives
# (for example, the path of a file uploaded into this notebook session).
csv_path = "/mnt/data/churn_dataset.csv"  # Replace with your uploaded file path if needed

# Load the whole file into a DataFrame; later cells read the target from a
# column named `churn`.
df = pd.read_csv(csv_path)
print("Shape:", df.shape)
# Last expression of the cell, so the notebook renders the first five rows.
df.head()

## EDA: Overview & Cleaning Checks

In [None]:
# Structural overview: dtype of every column, number of missing values per
# column, and the share of each target class (normalize=True -> proportions).
print("Data Types\n", df.dtypes)
print("\nMissing values per column:\n", df.isna().sum())
print("\nClass balance:\n", df['churn'].value_counts(normalize=True))

# Basic stats for numeric columns
display(df.describe(include='number'))

# Categorical overview: the 10 most frequent levels per column, with NaN
# counted as its own level.
# Select "everything that is not numeric" -- the same rule the preprocessing
# cell uses to decide which columns get one-hot encoded -- so category, bool
# and string-dtype columns are inspected too (an include=['object'] filter
# would silently skip them).
for c in df.select_dtypes(exclude=[np.number]).columns:
    print(f"\nValue counts for {c}:")
    print(df[c].value_counts(dropna=False).head(10))

# Correlation heatmap (numeric only; includes the target when it is numeric)
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
plt.figure(figsize=(8,6))
sns.heatmap(df[numeric_cols].corr(), annot=False)
plt.title("Correlation Heatmap (Numeric)")
plt.show()

# Distribution of target (absolute counts per class)
plt.figure(figsize=(5,4))
df['churn'].value_counts().plot(kind='bar')
plt.title("Target Distribution (churn)")
plt.xlabel("Class")
plt.ylabel("Count")
plt.show()

## Train/Test Split & Preprocessing

In [None]:
# Separate the feature matrix from the target column.
X = df.drop(columns=['churn'])
y = df['churn']

# Route columns by dtype: numeric columns are imputed + scaled, everything
# else is treated as categorical and one-hot encoded.
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(exclude=[np.number]).columns.tolist()

# Median imputation is robust to outliers; scaling matters for the
# distance/gradient based models (SVM, KNN, MLP, logistic regression) and PCA.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# handle_unknown="ignore" encodes categories unseen during fit as all-zeros
# instead of raising at predict time.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# sparse_threshold=0 forces a dense ndarray output. With the default (0.3) a
# wide one-hot block makes the transformer return a SciPy sparse matrix, which
# GaussianNB rejects outright and which PCA only accepts on recent
# scikit-learn releases. Dense output keeps every downstream step working.
# NOTE: for very high-cardinality categoricals this costs memory; reduce the
# cardinality first if that becomes a problem.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    sparse_threshold=0,
)

# Stratify on the target so train and test keep the same churn ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Train shape:", X_train.shape, " Test shape:", X_test.shape)

## PCA (Visual Exploration)

In [None]:
# Fit a 2-component PCA on the preprocessed training data, for visualization
# only (the classifiers below are trained without PCA).
pca_for_plot = Pipeline(steps=[
    ("preprocess", preprocess),
    ("pca", PCA(n_components=2, random_state=42))
])

X_train_pca2 = pca_for_plot.fit_transform(X_train)
plt.figure(figsize=(6,5))
# Colour each point by its target value to see whether the classes separate.
plt.scatter(X_train_pca2[:,0], X_train_pca2[:,1], c=y_train, alpha=0.6)
plt.title("PCA (2 Components) of Training Data")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()

# Explained variance using a deeper PCA.
# PCA raises if n_components exceeds min(n_samples, n_features), so cap the
# requested 10 components by the size of the preprocessed training matrix.
# components_ has shape (n_components, n_features), which gives the feature
# count after one-hot encoding without transforming the data again.
n_features_pre = pca_for_plot.named_steps["pca"].components_.shape[1]
n_pcs = min(10, n_features_pre, len(X_train))

pca_exp = Pipeline(steps=[
    ("preprocess", preprocess),
    ("pca", PCA(n_components=n_pcs, random_state=42))
])
pca_exp.fit(X_train)
explained = pca_exp.named_steps["pca"].explained_variance_ratio_
print(f"Explained variance ratio (first {n_pcs} PCs):", np.round(explained, 4))
plt.figure(figsize=(6,4))
# Plot against 1..k so the x-axis really is the number of components kept
# (the implicit index would start at 0 and be off by one).
pc_counts = np.arange(1, n_pcs + 1)
plt.plot(pc_counts, np.cumsum(explained), marker='o')
plt.xticks(pc_counts)
plt.title(f"Cumulative Explained Variance (First {n_pcs} PCs)")
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Variance Explained")
plt.grid(True)
plt.show()

## Train Multiple Models & Evaluate

In [None]:
# Candidate classifiers, keyed by the label used in the results table and plots.
models = dict(
    LogisticRegression=LogisticRegression(max_iter=500, n_jobs=None),
    RandomForest=RandomForestClassifier(n_estimators=300, random_state=42),
    GradientBoosting=GradientBoostingClassifier(random_state=42),
    SVM_RBF=SVC(kernel='rbf', probability=True, random_state=42),
    KNN=KNeighborsClassifier(n_neighbors=15),
    NaiveBayes=GaussianNB(),
    MLP=MLPClassifier(hidden_layer_sizes=(64,32), activation='relu', max_iter=300, random_state=42),
)

# results: one metrics record per model; probas: per-model test-set scores
# kept for the ROC plot.
results, probas = [], {}
for name, clf in models.items():
    # Same preprocessing for every model, followed by the classifier itself.
    pipe = Pipeline(steps=[("preprocess", preprocess), ("clf", clf)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # Pick the best available continuous score for ROC/AUC.
    final_step = pipe.named_steps["clf"]
    if hasattr(final_step, "predict_proba"):
        # Probability of the second class column.
        y_score = pipe.predict_proba(X_test)[:,1]
    elif hasattr(final_step, "decision_function"):
        # No probabilities: min-max scale the decision values into [0, 1]
        # (the small epsilon guards against a zero range).
        dec = pipe.decision_function(X_test)
        lo, hi = dec.min(), dec.max()
        y_score = (dec - lo) / (hi - lo + 1e-9)
    else:
        # Last resort: hard predictions, which yields a much coarser AUC.
        y_score = y_pred.astype(float)

    auc = roc_auc_score(y_test, y_score)
    acc = accuracy_score(y_test, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average="binary", zero_division=0
    )

    metric_names = ("model", "AUC", "Accuracy", "Precision", "Recall", "F1")
    results.append(dict(zip(metric_names, (name, auc, acc, prec, rec, f1))))
    probas[name] = y_score
    print(f"{name}: AUC={auc:.4f}, Acc={acc:.4f}, Prec={prec:.4f}, Rec={rec:.4f}, F1={f1:.4f}")

# Leaderboard, best AUC first; shown as the cell's output.
results_df = pd.DataFrame(results).sort_values("AUC", ascending=False)
results_df

## ROC Curves

In [None]:
# Overlay the test-set ROC curve of every model on a single set of axes.
fig, ax = plt.subplots(figsize=(7,6))
for name, y_score in probas.items():
    fpr, tpr, _ = roc_curve(y_test, y_score)
    ax.plot(fpr, tpr, label=f"{name}")
# Diagonal reference line: the ROC of a random classifier.
ax.plot([0,1], [0,1], linestyle='--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curves (Binary Churn)")
ax.legend()
ax.grid(True)
plt.show()

## Pick Best Model by AUC & Save

In [None]:
# results_df is sorted by AUC in descending order, so row 0 is the winner.
best = results_df.iloc[0]
best_model_name = best['model']
print("Best model:", best_model_name)

# Rebuild the winner's pipeline and fit it on the training split.
best_clf = models[best_model_name]
best_pipe = Pipeline([("preprocess", preprocess), ("clf", best_clf)]).fit(X_train, y_train)

# Held-out evaluation of the selected model: per-class report ...
y_pred_best = best_pipe.predict(X_test)
print("\nClassification Report (Best Model)\n")
print(classification_report(y_test, y_pred_best))

# ... and confusion matrix (rows = true class, columns = predicted class).
cm = confusion_matrix(y_test, y_pred_best)
fig, ax = plt.subplots(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt="d", ax=ax)
ax.set_title(f"Confusion Matrix — {best_model_name}")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()

# Persist the fitted pipeline (preprocessing + classifier) and the leaderboard.
import joblib
joblib.dump(best_pipe, "best_churn_model.joblib")
results_df.to_csv("model_results.csv", index=False)
print("Saved best model to best_churn_model.joblib and metrics to model_results.csv")

## Notes
- You can toggle PCA as a preprocessing step by inserting it inside the pipeline before the classifier. In this notebook, we use PCA only for visualization to avoid losing too much signal.
- Replace `csv_path` with the path to your dataset file if you upload it directly in Colab.
- To reuse the trained pipeline: `joblib.load("best_churn_model.joblib")` and call `.predict` or `.predict_proba`.