
# Churn Modeling — Imbalanced Data, Leakage-Safe, Production-Ready

**What this notebook does (best-practice pipeline):**
- Cleanly loads data and reports class imbalance.
- Leakage-safe **Pipeline** with `SMOTENC` (resampling **inside** CV), `ColumnTransformer`, and a class-weighted **Logistic Regression**.
- Tunes the **decision threshold** on out-of-fold cross-validated predictions from the training split to maximize **F1** (swap the metric if needed).
- Evaluates with **PR-AUC**, **ROC-AUC**, confusion matrix, precision, recall, F1 on a held-out test set.
- Optional feature importance view (from the linear model coefficients).


In [None]:

# ------------------ CONFIG ------------------
# Data source and schema
FILE_PATH = 'churnintelecom.csv'  # CSV read by the load cell; adjust if the file lives elsewhere
TARGET = 'churn'                  # binary label column: 1 = churn, 0 = no churn
DROP_COLS = ['phone number', 'state']                 # identifier / high-cardinality columns removed before modeling
CAT_COLS = ['international plan', 'voice mail plan']  # flag columns handled as categorical features

# Experiment settings
N_SPLITS = 5        # number of stratified CV folds
TEST_SIZE = 0.2     # fraction of rows held out for the final test evaluation
RANDOM_STATE = 42   # seed shared by the train/test split, CV shuffling and SMOTENC


In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (classification_report, confusion_matrix,
                             precision_recall_curve, average_precision_score,
                             roc_auc_score, roc_curve, f1_score, precision_score, recall_score)

from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline as ImbPipeline


In [None]:

# ------------------ LOAD DATA ------------------
# Read the raw CSV into `df` and print a quick structural overview.
df = pd.read_csv(FILE_PATH)
print("Shape:", df.shape)
print("\nFirst rows:")
display(df.head())

print("\nInfo:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()
print("\nMissing values:")
print(df.isnull().sum())


In [None]:

# ------------------ CLASS BALANCE ------------------
# Fail fast with a readable message when the configured target column is absent.
if TARGET not in df.columns:
    raise ValueError(f"TARGET '{TARGET}' not found in columns: {df.columns.tolist()}")

y_raw = df[TARGET]

# Absolute counts per class, then the same breakdown as rounded percentages.
class_counts = y_raw.value_counts()
class_share_pct = (y_raw.value_counts(normalize=True) * 100).round(2)

print("\nTarget value counts:")
print(class_counts)
print("\nTarget distribution (%):")
print(class_share_pct)


In [None]:

# ------------------ CLEAN & SPLIT ------------------
# Work on a copy so the raw `df` stays untouched; missing DROP_COLS are ignored.
df2 = df.drop(columns=DROP_COLS, errors='ignore').copy()
df2[TARGET] = df2[TARGET].astype(int)

X = df2.drop(columns=[TARGET])
y = df2[TARGET].values

# Identify columns by dtype. Anything listed in CAT_COLS is treated as
# categorical even if it is stored as a number.
num_cols = X.select_dtypes(include=np.number).columns.difference(CAT_COLS).tolist()
cat_cols = [c for c in CAT_COLS if c in X.columns]

# Columns that are neither numeric nor declared categorical would be dropped by
# the ColumnTransformer (remainder="drop") but would still reach SMOTENC, which
# treats every non-categorical column as continuous. Remove them up front so
# the sampler only sees features the model actually uses. X keeps its original
# column order and is unchanged when there are no such columns.
modeled_cols = set(num_cols) | set(cat_cols)
unused_cols = [c for c in X.columns if c not in modeled_cols]
if unused_cols:
    print("Dropping unmodeled columns (add them to CAT_COLS to use them):", unused_cols)
    X = X.drop(columns=unused_cols)

print("Numeric columns:", num_cols)
print("Categorical columns:", cat_cols)

# Train/test split (stratified so both parts keep the class ratio)
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE)

print("\nTrain shape:", Xtr.shape, " Test shape:", Xte.shape)
print("Train class balance:", np.bincount(ytr))
print("Test  class balance:", np.bincount(yte))


In [None]:

# ------------------ PIPELINE ------------------
# SMOTENC receives the raw feature frame, so it needs the positional index of
# every categorical column within X.
cat_idx = list(map(X.columns.get_loc, cat_cols))

# Standardize numeric features and one-hot encode the categorical ones;
# every other column is dropped.
preprocess = ColumnTransformer(
    remainder="drop",
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(drop=None, handle_unknown="ignore", sparse_output=False), cat_cols),
    ],
)

clf = LogisticRegression(C=1.0, solver="lbfgs", class_weight="balanced", max_iter=2000)

# Resampling is the first pipeline step, so it is part of whatever gets fit
# when the pipeline is cross-validated.
pipe = ImbPipeline(
    steps=[
        ("smote", SMOTENC(categorical_features=cat_idx, k_neighbors=5, random_state=RANDOM_STATE)),
        ("prep", preprocess),
        ("clf", clf),
    ]
)

pipe


In [None]:

# ------------------ CV THRESHOLD TUNING ------------------
cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

# Cross-validated class probabilities for the training rows; keep the
# positive-class column only.
proba_cv_all = cross_val_predict(pipe, Xtr, ytr, cv=cv, method="predict_proba")
proba_cv = proba_cv_all[:, 1]

prec, rec, thr = precision_recall_curve(ytr, proba_cv)

# F1 at every curve point; the tiny epsilon avoids 0/0 when precision and
# recall are both zero.
f1s = (2 * prec * rec) / (prec + rec + 1e-12)
best_idx = np.nanargmax(f1s)

# Fall back to 0.5 when the best point has no matching entry in `thr`.
if best_idx < len(thr):
    best_thr = thr[best_idx]
else:
    best_thr = 0.5

print(f"Chosen threshold (F1-optimal on CV): {best_thr:.3f}")
print("PR-AUC (train CV):", average_precision_score(ytr, proba_cv))
print("ROC-AUC (train CV):", roc_auc_score(ytr, proba_cv))

# Optional: plot PR curve (train CV)
fig, ax = plt.subplots()
ax.plot(rec, prec)
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall (Train CV)")
ax.grid(True)
plt.show()


In [None]:

# ------------------ FINAL FIT & TEST EVAL ------------------
# Fit on the whole training split, then score the held-out test split.
pipe.fit(Xtr, ytr)
proba_te = pipe.predict_proba(Xte)[:, 1]
# Turn probabilities into labels with the threshold chosen during CV tuning.
pred_te = (proba_te >= best_thr).astype(int)

# Threshold-free ranking metrics use the probabilities; the rest use the labels.
print("PR-AUC (test):", average_precision_score(yte, proba_te))
print("ROC-AUC (test):", roc_auc_score(yte, proba_te))
print("F1 (test):", f1_score(yte, pred_te))
print("Precision (test):", precision_score(yte, pred_te))
print("Recall (test):", recall_score(yte, pred_te))

print("\nConfusion Matrix (test):\n", confusion_matrix(yte, pred_te))
print("\nClassification Report (test):\n", classification_report(yte, pred_te, digits=3))

# PR curve (test)
prec_te, rec_te, _ = precision_recall_curve(yte, proba_te)
plt.figure()
plt.plot(rec_te, prec_te)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall (Test)")
plt.grid(True)
plt.show()

# ROC curve (test); roc_curve comes from the notebook's top-level imports cell
fpr, tpr, _ = roc_curve(yte, proba_te)
plt.figure()
plt.plot(fpr, tpr)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title("ROC (Test)")
plt.grid(True)
plt.show()


In [None]:

# ------------------ COEFFICIENTS / FEATURE IMPORTANCE (LOGREG) ------------------
# Fit the pipeline on the training split so the fitted encoder and classifier
# can be inspected below.
pipe.fit(Xtr, ytr)

# Feature names in the order the ColumnTransformer emits them:
# numeric columns first, then the one-hot encoded categorical columns.
ohe = pipe.named_steps['prep'].named_transformers_['cat']
num_names = num_cols
has_encoded_cats = ohe is not None and len(cat_cols) > 0
cat_names = ohe.get_feature_names_out(cat_cols).tolist() if has_encoded_cats else []

feat_names = num_names + cat_names

# One coefficient per transformed feature, sorted from most negative to most positive.
coef = pipe.named_steps['clf'].coef_.ravel()
coef_df = pd.DataFrame({"feature": feat_names, "coefficient": coef})
coef_df = coef_df.sort_values("coefficient")

print(coef_df.head())
print(coef_df.tail())

# Horizontal bar plot, one bar per feature
coef_df_plot = coef_df.set_index("feature")
coef_df_plot.plot.barh(y="coefficient")
plt.title("LogReg Coefficients")
plt.xlabel("Coefficient")
plt.grid(True)
plt.show()



## (Optional) Use a Different Threshold Objective
If your business goal is **high recall** (catch most churners) with a minimum precision, you can pick the threshold that **maximizes recall given Precision ≥ X**. Replace the tuning cell with logic that searches thresholds under that constraint.
