# Customer Churn ML Notebook (Telecom)

This notebook reproduces the churn pipeline: loading data, preprocessing, training Logistic Regression & Random Forest, evaluation, explainability (permutation importance), segmentation, and saving outputs (cleaned CSV, segments, models, figures, and a PowerPoint report).

**Before running:** place your dataset at `/mnt/data/customer_churn_data.csv` or change the `INPUT_CSV` variable below. The final report cell additionally requires the `python-pptx` package.

In [None]:
# %%
# Notebook configuration: where the input CSV is read from and where outputs are written.
import os

INPUT_CSV = '/mnt/data/customer_churn_data.csv'   # change to your path if needed
OUTPUT_DIR = '/mnt/data'
FIGS_DIR = f'{OUTPUT_DIR}/figs'   # figures are kept in a sub-folder of OUTPUT_DIR

# Create the figures folder up front; an already-existing folder is not an error.
os.makedirs(FIGS_DIR, exist_ok=True)

import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix, RocCurveDisplay)
from sklearn.inspection import permutation_importance
import joblib, warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides library deprecation warnings; consider
# narrowing the filter if upgrade problems need to be visible.
warnings.filterwarnings('ignore')

# Confirmation that the setup cell finished.
print('Ready. Change INPUT_CSV if needed and run the notebook cells sequentially.')

## Load and inspect data

In [None]:
# %%
# Read the raw dataset from INPUT_CSV, report its dimensions and preview the first rows.
df = pd.read_csv(INPUT_CSV)
print(f'Rows, cols: {df.shape}')
display(df.head(5))

## Detect and normalize target column (robust mapping)

In [None]:
# %%
def map_to_binary_series(s):
    """Convert a label-like Series into nullable integers (1 = churned, 0 = retained).

    Values are compared after casting to ``str``, stripping whitespace and
    lower-casing. Recognised "yes"-style labels become 1, "no"-style labels
    become 0 and everything else becomes missing.

    If more than half of the values are unrecognised, the column is assumed to
    hold numbers stored as text and is parsed with ``pd.to_numeric`` instead
    (unparseable entries become missing).

    Parameters
    ----------
    s : pandas.Series
        Raw target column.

    Returns
    -------
    pandas.Series
        ``Int64`` Series in the normal case. When the numeric fallback yields
        non-integer values that cannot be cast to ``Int64``, the label-mapped
        Series is returned as-is.
    """
    s = s.astype(str).str.strip().str.lower()
    yes = {'yes','y','true','1','churn','churned','left','inactive','lost'}
    no  = {'no','n','false','0','active','subscribed','subscriber'}

    # One lookup table; labels absent from it map to NaN.
    lookup = {**dict.fromkeys(yes, 1), **dict.fromkeys(no, 0)}
    mapped = s.map(lookup)

    if mapped.isnull().mean() > 0.5:
        try:
            return pd.to_numeric(s, errors='coerce').astype('Int64')
        except (TypeError, ValueError):
            # Non-integer numbers cannot be cast to Int64; keep the label mapping.
            return mapped
    return mapped.astype('Int64')

# Find the target column. Heuristics are tried in order of decreasing confidence:
#   1. a well-known churn column name (case-insensitive),
#   2. the first column whose non-missing values are exactly {0, 1},
#   3. a column named "status" (case-insensitive).
known_target_names = {'churn','is_churn','churned','target','label','churn_flag','churn_status'}
target = next((c for c in df.columns if str(c).lower() in known_target_names), None)
if target is None:
    for c in df.columns:
        vals = df[c].dropna().unique()
        # Require both classes: an all-missing or constant column would pass a
        # plain subset test but can never serve as a binary target.
        if len(vals) == 2 and set(vals) == {0, 1}:
            target = c
            break
if target is None:
    target = next((c for c in df.columns if str(c).lower() == 'status'), None)
if target is None:
    raise ValueError('No churn-like target found. Add a column named "churn" or similar.')

# Normalize: text labels are mapped to 0/1, rows without a usable label are
# dropped, and the column is stored as plain integers.
if not pd.api.types.is_numeric_dtype(df[target]):
    df[target] = map_to_binary_series(df[target])
df = df[df[target].notnull()].reset_index(drop=True)
if df.empty:
    # Fail here with a clear message instead of deep inside the train/test split.
    raise ValueError(f'No rows with a usable value in target column "{target}".')
df[target] = df[target].astype(int)
print('Using target column:', target)
display(df[target].value_counts(normalize=True))

## Feature engineering (simple examples)

In [None]:
# %%
# Add aggregate usage features (total_call_duration, recharge_count_agg) when
# matching source columns are available.
#
# Only numeric columns are summed, and the derived columns themselves are
# excluded from the sources so that re-running this cell does not fold the
# previous aggregate back into the new one.
derived_cols = {'total_call_duration', 'recharge_count_agg'}
source_cols = [
    c for c in df.columns
    if c not in derived_cols and pd.api.types.is_numeric_dtype(df[c])
]

call_cols = [
    c for c in source_cols
    if 'call' in str(c).lower()
    and any(key in str(c).lower() for key in ('duration', 'mins', 'minutes'))
]
if call_cols:
    df['total_call_duration'] = df[call_cols].sum(axis=1, skipna=True)

re_cols = [c for c in source_cols if 'recharge' in str(c).lower()]
if re_cols:
    df['recharge_count_agg'] = df[re_cols].sum(axis=1, skipna=True)

# Remove exact duplicate rows, then persist the cleaned table.
df.drop_duplicates(inplace=True)
clean_path = os.path.join(OUTPUT_DIR, 'churn_cleaned.csv')
df.to_csv(clean_path, index=False)
print('Saved cleaned dataset to', clean_path)

## Prepare features and split

In [None]:
# %%
# Keep the customer id aside if present: it is excluded from the features.
# First look for a name containing both "customer" and "id", then for an exact
# "customer_id" / "id" match.
cust_id_col = next(
    (c for c in df.columns if 'id' in c.lower() and 'customer' in c.lower()),
    None,
)
if cust_id_col is None:
    cust_id_col = next((c for c in df.columns if c.lower() in ('customer_id', 'id')), None)

X = df.drop(columns=[target] + ([cust_id_col] if cust_id_col else []), errors='ignore')
y = df[target].astype(int)

# 'number' covers every numeric width (int32, float32, nullable Int64, ...);
# listing only int64/float64 would let the ColumnTransformer silently drop the rest.
numeric_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = X.select_dtypes(include=['object','category','bool']).columns.tolist()

print('Numeric cols:', numeric_cols)
print('Categorical cols:', cat_cols)

# Columns of any other dtype are dropped by remainder='drop'; say so explicitly.
ignored_cols = [c for c in X.columns if c not in numeric_cols and c not in cat_cols]
if ignored_cols:
    print('Ignored cols (unsupported dtype):', ignored_cols)


def _make_one_hot_encoder():
    """Return a dense-output OneHotEncoder on both old and new scikit-learn.

    The ``sparse`` argument was renamed to ``sparse_output`` in scikit-learn 1.2
    and removed in 1.4; an unknown keyword raises ``TypeError``.
    """
    try:
        return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        return OneHotEncoder(handle_unknown='ignore', sparse=False)


numeric_proc = Pipeline([('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])
if len(cat_cols) > 0:
    categorical_proc = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', _make_one_hot_encoder()),
    ])
else:
    categorical_proc = 'passthrough'
preproc = ColumnTransformer([('num', numeric_proc, numeric_cols), ('cat', categorical_proc, cat_cols)], remainder='drop')

# Stratified 75/25 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25, random_state=42)
print('Train size:', X_train.shape[0], 'Test size:', X_test.shape[0])

## Train models

In [None]:
# %%
# Train a logistic-regression and a random-forest pipeline on the training split
# and persist both fitted pipelines.
from sklearn.base import clone  # local import: only this cell needs it

# Each pipeline gets its own unfitted copy of the preprocessor. Sharing one
# instance would mean that fitting (or later refitting) one pipeline silently
# changes the preprocessing inside the other.
log_pipe = Pipeline([('pre', clone(preproc)), ('clf', LogisticRegression(max_iter=1000))])
rf_pipe = Pipeline([('pre', clone(preproc)), ('clf', RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1))])

log_pipe.fit(X_train, y_train)
rf_pipe.fit(X_train, y_train)

# save models
joblib.dump(log_pipe, os.path.join(OUTPUT_DIR, 'logistic_model.pkl'))
joblib.dump(rf_pipe, os.path.join(OUTPUT_DIR, 'rf_model.pkl'))
print('Saved models to', OUTPUT_DIR)

## Evaluate models and save figures

In [None]:
# %%
def evaluate(model, X_te, y_te):
    """Score a fitted classifier on a held-out set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    X_te : features to score.
    y_te : true labels for ``X_te``.

    Returns
    -------
    tuple
        ``(metrics, y_proba, y_pred)`` where ``metrics`` is a dict with the keys
        accuracy, precision, recall, f1, roc_auc and confusion_matrix,
        ``y_proba`` is column 1 of ``predict_proba`` and ``y_pred`` is the
        output of ``predict``.
    """
    labels = model.predict(X_te)
    scores = model.predict_proba(X_te)[:, 1]

    metrics = {'accuracy': accuracy_score(y_te, labels)}
    # These three share the same call shape; zero_division=0 yields 0 instead of
    # a warning when a denominator is empty.
    for key, scorer in (('precision', precision_score),
                        ('recall', recall_score),
                        ('f1', f1_score)):
        metrics[key] = scorer(y_te, labels, zero_division=0)
    metrics['roc_auc'] = roc_auc_score(y_te, scores)
    metrics['confusion_matrix'] = confusion_matrix(y_te, labels)

    return metrics, scores, labels

# Score both pipelines on the test split and print their metric dictionaries.
log_metrics, log_proba, log_pred = evaluate(log_pipe, X_test, y_test)
rf_metrics, rf_proba, rf_pred = evaluate(rf_pipe, X_test, y_test)

print('Logistic metrics:', log_metrics)
print('RF metrics:', rf_metrics)

# ROC figures: one PNG per model, written to FIGS_DIR.
for fitted_pipe, roc_title, roc_file in (
    (log_pipe, 'Logistic ROC', 'roc_logistic.png'),
    (rf_pipe, 'RF ROC', 'roc_rf.png'),
):
    RocCurveDisplay.from_estimator(fitted_pipe, X_test, y_test)
    plt.title(roc_title)
    plt.savefig(os.path.join(FIGS_DIR, roc_file))
    plt.close()

# Confusion matrices, drawn as heat-maps and saved the same way.
cm1 = log_metrics['confusion_matrix']
cm2 = rf_metrics['confusion_matrix']
for matrix, cm_title, cm_file in (
    (cm1, 'Logistic Confusion Matrix', 'cm_log.png'),
    (cm2, 'RF Confusion Matrix', 'cm_rf.png'),
):
    plt.imshow(matrix)
    plt.title(cm_title)
    plt.colorbar()
    plt.savefig(os.path.join(FIGS_DIR, cm_file))
    plt.close()

print('Saved figures in', FIGS_DIR)

## Feature importance (permutation)

In [None]:
# %%
# Permutation importance of the fitted random forest, measured on the
# transformed test features. Best-effort: any failure is printed and the
# notebook carries on.
try:
    pre = rf_pipe.named_steps['pre']
    rf_clf = rf_pipe.named_steps['clf']
    X_test_trans = pre.transform(X_test)
    perm = permutation_importance(
        rf_clf, X_test_trans, y_test,
        n_repeats=5, random_state=42, n_jobs=-1,
    )

    # Labels follow the transformer order: numeric columns first, then the
    # one-hot encoded categorical columns.
    feat_names = [*numeric_cols]
    if cat_cols:
        ohe = pre.named_transformers_['cat'].named_steps['onehot']
        feat_names.extend(ohe.get_feature_names_out(cat_cols))

    imp_scores = pd.Series(perm.importances_mean, index=feat_names).sort_values(ascending=False)
    top_scores = imp_scores.head(20)
    top_scores.plot.bar(figsize=(10, 4))
    plt.tight_layout()
    plt.savefig(os.path.join(FIGS_DIR, 'perm_imp.png'))
    plt.close()
    display(top_scores)
except Exception as e:
    print('Permutation importance failed:', e)

## Segmentation (Loyal / At Risk / Dormant)

In [None]:
# %%
# Assign every customer to a segment from the random-forest churn probability.
# NOTE(review): X includes the rows the model was trained on, so the
# probabilities for those rows are in-sample and likely optimistic.
full_proba = rf_pipe.predict_proba(X)[:,1]
seg_df = df.copy()
seg_df['_churn_proba'] = full_proba
seg_df['_is_churn'] = df[target].astype(int)
seg_df['_segment'] = 'Other'

# Loyal / At Risk apply only to customers who have not churned.
not_churned = seg_df['_is_churn'] == 0
seg_df.loc[not_churned & (seg_df['_churn_proba'] < 0.2), '_segment'] = 'Loyal'
seg_df.loc[not_churned & (seg_df['_churn_proba'] >= 0.6), '_segment'] = 'At Risk'

# Dormant: still-unassigned customers in the bottom quartile of usage.
# Call duration is preferred; recharge activity is the fallback.
usage_col = next(
    (c for c in ('total_call_duration', 'recharge_count_agg') if c in seg_df.columns),
    None,
)
if usage_col is not None:
    low_usage = seg_df[usage_col].quantile(0.25)
    dormant = (seg_df[usage_col] <= low_usage) & (seg_df['_segment'] == 'Other')
    seg_df.loc[dormant, '_segment'] = 'Dormant'

segments_path = os.path.join(OUTPUT_DIR, 'segments.csv')
seg_df.to_csv(segments_path, index=False)
print('Saved segments to', segments_path)

# Save the segment distribution as the pie chart the report cell looks for
# (figs/segments_pie.png). A 4:3 figure is used for the chart.
seg_counts = seg_df['_segment'].value_counts()
seg_counts.plot.pie(autopct='%1.1f%%', figsize=(8, 6))
plt.ylabel('')
plt.title('Customer Segments')
plt.tight_layout()
plt.savefig(os.path.join(FIGS_DIR, 'segments_pie.png'))
plt.close()

display(seg_counts)

## PowerPoint report (automated)

In [None]:
# %%
# Create a PowerPoint report that includes the saved figures and a recommendation slide (requires python-pptx)
from pptx import Presentation
from pptx.util import Inches

prs = Presentation()

# Title slide: headline plus a one-line dataset summary in the second placeholder.
n_rows, n_cols = df.shape
title_slide = prs.slides.add_slide(prs.slide_layouts[0])
title_slide.shapes.title.text = 'Customer Churn Analysis - Telecom'
title_slide.placeholders[1].text = f'Rows: {n_rows} | Columns: {n_cols} | Target: {target}'

# Performance slide: one paragraph per model with its test-set ROC AUC.
perf_slide = prs.slides.add_slide(prs.slide_layouts[1])
perf_slide.shapes.title.text = 'Model Performance (Test set)'
perf_body = perf_slide.shapes.placeholders[1].text_frame
perf_body.text = 'Logistic ROC AUC: {:.3f}'.format(log_metrics['roc_auc'])
rf_line = perf_body.add_paragraph()
rf_line.text = 'Random Forest ROC AUC: {:.3f}'.format(rf_metrics['roc_auc'])

def add_img_slide(prs, title, img_path):
    """Append a titled slide showing one image, scaled to stay on the slide.

    Parameters
    ----------
    prs : presentation object the slide is appended to.
    title : text for the slide title.
    img_path : path of the image file to insert.

    The picture is placed 1 inch from the left and 1.2 inches from the top at a
    width of 8 inches. If that makes it taller than the space left below the
    title, it is shrunk proportionally and centred horizontally.
    """
    sl = prs.slides.add_slide(prs.slide_layouts[5])
    sl.shapes.title.text = title

    top = Inches(1.2)
    pic = sl.shapes.add_picture(img_path, Inches(1), top, width=Inches(8))

    # Sizing by width alone lets tall images run past the bottom edge.
    max_height = prs.slide_height - top - Inches(0.3)
    if pic.height > max_height:
        scale = max_height / pic.height
        pic.width = int(pic.width * scale)
        pic.height = int(max_height)
        pic.left = int((prs.slide_width - pic.width) / 2)

# One slide per saved figure. Figures that were not produced are skipped, but
# reported so that a missing chart does not go unnoticed.
imgs = ['roc_logistic.png', 'roc_rf.png', 'perm_imp.png', 'segments_pie.png']
missing_imgs = []
for im in imgs:
    p = os.path.join(FIGS_DIR, im)
    if os.path.exists(p):
        # Slide title is derived from the file name, e.g. 'roc_rf.png' -> 'Roc Rf'.
        add_img_slide(prs, im.replace('.png','').replace('_',' ').title(), p)
    else:
        missing_imgs.append(im)
if missing_imgs:
    print('Skipped missing figures:', missing_imgs)

# Closing slide with the recommended actions per segment.
srec = prs.slides.add_slide(prs.slide_layouts[1])
srec.shapes.title.text = 'Final Recommendations'
srec.shapes.placeholders[1].text = ('1. Target "At Risk" with personalized offers & VIP outreach.\n'
                                   '2. Reward "Loyal" with upsell & loyalty benefits.\n'
                                   '3. Re-activate "Dormant" via low-cost bundles.\n'
                                   '4. Monitor complaints & time-to-resolution.\n'
                                   '5. Run A/B tests to measure incremental uplift.')

ppt_path = os.path.join(OUTPUT_DIR, 'customer_churn_report.pptx')
prs.save(ppt_path)
print('Saved PowerPoint to', ppt_path)

## Final notes and next steps
- Use `segments.csv` for campaign selection; add `campaign_id` for experiments.
- Install `shap` to produce SHAP explanations if you need deeper explainability.
- Tune model hyperparameters and add domain features (last recharge days, days since last call, complaint counts) for better performance.