# Customer Churn Prediction (AWS Mini-Project)

Sections: Data Loading → EDA → Feature Engineering → Baseline Models → Improved Model → Evaluation → Threshold Selection → Feature Importance → (Optional) AWS SageMaker Deploy → Conclusions

Links:
- Project brief: https://github.com/springboard-curriculum/mec2-projects/blob/main/Student_MLE_MiniProject_Churn_Prediction_AWS.md
- AWS reference: https://aws.amazon.com/blogs/machine-learning/build-tune-and-deploy-an-end-to-end-churn-prediction-model-using-amazon-sagemaker-pipelines/


In [None]:
# Data Loading
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

pd.set_option('display.max_columns', 200)

# TODO: set your dataset path
CSV_PATH = Path('data/churn.csv')
if CSV_PATH.exists():
    df = pd.read_csv(CSV_PATH)
else:
    # Fall back to a tiny synthetic demo so the notebook still runs end to end.
    # Announce it explicitly: a silently substituted 10-row frame makes every
    # downstream metric meaningless, and a mistyped path would go unnoticed.
    print(f"WARNING: {CSV_PATH} not found - using a 10-row synthetic demo dataset; "
          "metrics below are illustrative only.")
    df = pd.DataFrame({
        'customer_id': range(1, 11),
        'tenure_months': [1, 3, 6, 12, 24, 36, 48, 60, 72, 84],
        'monthly_spend': [35, 40, 38, 45, 50, 60, 65, 70, 80, 90],
        'churn': [1, 0, 1, 0, 0, 0, 1, 0, 0, 0],
        'plan': ['basic', 'basic', 'plus', 'plus', 'pro', 'pro', 'pro', 'plus', 'pro', 'basic'],
    })

df.head()


In [None]:
# EDA: overall churn rate and class balance
assert 'churn' in df.columns, "Dataset must include 'churn' column (0/1)."

# Mean of the target column is the fraction of churners.
churn_rate = df['churn'].mean()
print(f"Churn rate: {churn_rate:.3f}")

# Bar chart of per-class row counts. sort_index() orders the bars by label, so
# the first bar/colour/tick label belongs to the smaller label value (0 for a
# 0/1 target that contains both classes).
class_counts = df['churn'].value_counts().sort_index()
ax = class_counts.plot(kind='bar', color=['#4daf4a', '#e41a1c'])
ax.set_xticklabels(['No churn (0)', 'Churn (1)'], rotation=0)
ax.set_title('Class Balance')
plt.show()

# Summary statistics for every column, numeric and non-numeric alike
display(df.describe(include='all'))


In [None]:
# Feature schema & utilities
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import numpy as np

TARGET = 'churn'
assert TARGET in df.columns, f"Target '{TARGET}' missing"

# Row identifiers carry no generalisable signal and let tree models memorise
# rows, so keep them out of the feature set. Extend this list for your data.
ID_COLS = [c for c in ['customer_id'] if c in df.columns]

# Identify feature types (simple heuristic).
# is_numeric_dtype is used instead of comparing against the object dtype so
# that pandas 'category' columns (e.g. tenure_bucket when this cell is re-run)
# and dedicated string dtypes are not mistaken for numeric columns.
feature_cols = [c for c in df.columns if c != TARGET and c not in ID_COLS]
num_cols = [c for c in feature_cols if pd.api.types.is_numeric_dtype(df[c])]
cat_cols = [c for c in feature_cols if c not in num_cols]

# Example engineered feature(s). The membership checks keep the lists free of
# duplicates when the cell is executed more than once.
if 'monthly_spend' in df.columns:
    df['annual_spend'] = df['monthly_spend'] * 12
    if 'annual_spend' not in num_cols:
        num_cols.append('annual_spend')
if 'tenure_months' in df.columns:
    # Open-ended last bin: tenures above 120 months land in 'long' instead of
    # falling outside the bins and becoming NaN.
    df['tenure_bucket'] = pd.cut(
        df['tenure_months'],
        bins=[-1, 3, 12, 36, np.inf],
        labels=['new', 'early', 'mid', 'long'],
    )
    if 'tenure_bucket' not in cat_cols:
        cat_cols.append('tenure_bucket')

X = df[num_cols + cat_cols].copy()
y = df[TARGET].astype(int).copy()

# Numeric columns: mean-impute, then scale to unit variance (no centering).
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler(with_mean=False))
])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name; try the new spelling first and fall back so
# the notebook runs on both old and new releases.
try:
    _onehot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    _onehot = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Categorical columns: fill gaps with the most frequent level, then one-hot
# encode; categories unseen during fit are encoded as all zeros.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', _onehot)
])

preprocess = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ]
)

# Helper to evaluate and plot
def evaluate(name, model, X_test, y_test, proba=None):
    """Score a fitted binary classifier on a hold-out set and plot ROC / PR curves.

    Args:
        name: Label used in the printed summary and the plot legends.
        model: Fitted estimator. Scores come from ``predict_proba`` (column 1)
            when available, else ``decision_function``, else ``predict``.
            Ignored for scoring when ``proba`` is supplied.
        X_test: Hold-out features passed to the model.
        y_test: True binary labels for ``X_test``.
        proba: Optional precomputed positive-class scores (any array-like).

    Returns:
        dict with keys ``auc``, ``precision``, ``recall`` and ``f1``; the last
        three are computed at a fixed 0.5 threshold.
    """
    if proba is None:
        if hasattr(model, 'predict_proba'):
            proba = model.predict_proba(X_test)[:, 1]
        elif hasattr(model, 'decision_function'):
            scores = np.asarray(model.decision_function(X_test), dtype=float).ravel()
            # Logistic squash, written as exp(-log(1 + exp(-s))) for numerical
            # stability. Unlike min-max scaling over the test scores, this maps
            # the decision boundary (score 0) to exactly 0.5, so the fixed
            # threshold below does not depend on the range of the test scores.
            # It is monotonic, so ranking metrics are unaffected apart from
            # ties between extreme scores that saturate to 0.0 or 1.0.
            proba = np.exp(-np.logaddexp(0.0, -scores))
        else:
            proba = model.predict(X_test)
    # Accept lists / Series as well as arrays for the threshold comparison.
    proba = np.asarray(proba, dtype=float)

    preds = (proba >= 0.5).astype(int)
    auc = roc_auc_score(y_test, proba)
    prec = precision_score(y_test, preds, zero_division=0)
    rec = recall_score(y_test, preds, zero_division=0)
    f1 = f1_score(y_test, preds, zero_division=0)
    cm = confusion_matrix(y_test, preds)
    print(f"[{name}] AUC={auc:.3f}  Precision={prec:.3f}  Recall={rec:.3f}  F1={f1:.3f}")
    print("Confusion Matrix:\n", cm)

    # ROC
    fpr, tpr, _ = roc_curve(y_test, proba)
    plt.figure(figsize=(5, 4))
    plt.plot(fpr, tpr, label=f'{name} (AUC={auc:.2f})')
    plt.plot([0, 1], [0, 1], '--', color='gray')  # chance diagonal
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title('ROC')
    plt.legend()
    plt.show()

    # PR
    precs, recs, _ = precision_recall_curve(y_test, proba)
    plt.figure(figsize=(5, 4))
    plt.plot(recs, precs, label=name)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('PR Curve')
    plt.legend()
    plt.show()

    return {"auc": auc, "precision": prec, "recall": rec, "f1": f1}

# Hold out 25% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)


In [None]:
# Baselines: Logistic Regression & Decision Tree
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline

from sklearn.base import clone

# make_pipeline stores the estimator objects it is given without copying them.
# Handing the same `preprocess` instance to several pipelines would mean that
# fitting any one of them refits the preprocessor inside all the others, so
# each pipeline gets its own unfitted clone.
logreg = make_pipeline(clone(preprocess), LogisticRegression(max_iter=1000))
logreg.fit(X_train, y_train)
res_logreg = evaluate('LogReg', logreg, X_test, y_test)

# simple tree (depth-capped to limit overfitting)
cart = make_pipeline(clone(preprocess), DecisionTreeClassifier(max_depth=6, random_state=42))
cart.fit(X_train, y_train)
res_cart = evaluate('DecisionTree', cart, X_test, y_test)


In [None]:
# Improved model: XGBoost (if available) else RandomForest
from sklearn.base import clone

try:
    from xgboost import XGBClassifier
    # Build, fit and score under a temporary name. `xgb` is bound only after
    # everything succeeded, because the threshold cell selects its model by
    # checking whether `xgb` exists in globals(); binding it before fit() would
    # leave an unfitted pipeline there whenever training fails.
    _xgb_candidate = make_pipeline(clone(preprocess), XGBClassifier(
        n_estimators=400, learning_rate=0.05, max_depth=5, subsample=0.9,
        colsample_bytree=0.8, random_state=42, eval_metric='auc'
    ))
    _xgb_candidate.fit(X_train, y_train)
    res_xgb = evaluate('XGBoost', _xgb_candidate, X_test, y_test)
    xgb = _xgb_candidate
except Exception as e:
    # Deliberately broad: besides ImportError, loading or training XGBoost can
    # fail in other ways (e.g. a native-library load error), and any of them
    # should fall back to RandomForest rather than stop the notebook.
    print('XGBoost unavailable or failed; falling back to RandomForest:', e)
    # Drop a stale `xgb` left over from an earlier run of this cell so that
    # later cells pick up the RandomForest fitted below.
    globals().pop('xgb', None)
    from sklearn.ensemble import RandomForestClassifier
    rf = make_pipeline(clone(preprocess), RandomForestClassifier(n_estimators=300, max_depth=None, random_state=42))
    rf.fit(X_train, y_train)
    # Stored under the same result name so later cells need not care which
    # model was actually trained.
    res_xgb = evaluate('RandomForest', rf, X_test, y_test)


In [None]:
# Threshold selection & business sketch
# Sweep thresholds and compute precision/recall
import pandas as pd

# Prefer the improved model when one was trained, else fall back to the baseline.
model = xgb if 'xgb' in globals() else (rf if 'rf' in globals() else logreg)
if hasattr(model, 'predict_proba'):
    proba = model.predict_proba(X_test)[:,1]
else:
    proba = model.predict(X_test).astype(float)

y_true = np.asarray(y_test)
ths = np.linspace(0.1, 0.9, 9)
rows = []
for t in ths:
    preds = (proba >= t).astype(int)
    rows.append({
        'threshold': t,
        'precision': precision_score(y_test, preds, zero_division=0),
        'recall': recall_score(y_test, preds, zero_division=0),
        'f1': f1_score(y_test, preds, zero_division=0),
        # Raw counts needed for the cost model below.
        'contacted': int(preds.sum()),                               # predicted churners
        'true_positives': int(((preds == 1) & (y_true == 1)).sum()), # churners caught
    })
thr_df = pd.DataFrame(rows)
# A bare expression in the middle of a cell is not rendered; display explicitly.
display(thr_df)

# Simple business cost sketch:
# cost_retention per contacted customer, value_recovered per true churn caught
cost_retention = 5.0
value_recovered = 50.0
# Net value on the test set = value of churners caught - cost of everyone contacted.
# (Combining the recall and precision *ratios* instead would subtract more
# cost as precision improves, which is the opposite of the intended trade-off.)
thr_df['expected_value'] = (
    thr_df['true_positives'] * value_recovered
    - thr_df['contacted'] * cost_retention
)
thr_df.sort_values('expected_value', ascending=False).head()


In [None]:
# Feature importance (tree-based) and optional SHAP

# Split the fitted pipeline into its preprocessing part and its final
# estimator. Working from the pipeline itself (rather than importing xgboost)
# means the RandomForest fallback gets an importance plot too.
_is_pipeline = hasattr(model, 'steps')
final_est = model.steps[-1][1] if _is_pipeline else model
fitted_pre = model[:-1] if _is_pipeline and len(model.steps) > 1 else None

# Column names of the transformed feature matrix, when the installed
# scikit-learn can provide them; otherwise plots fall back to positional labels.
feature_names = None
if fitted_pre is not None:
    try:
        feature_names = list(fitted_pre.get_feature_names_out())
    except (AttributeError, ValueError):
        feature_names = None

try:
    # importance in original feature space after preprocess is complex;
    # show model-native importance as a proxy
    importances = getattr(final_est, 'feature_importances_', None)
    if importances is None:
        print('Feature importance skipped: final estimator has no feature_importances_')
    else:
        labels = feature_names if feature_names is not None and len(feature_names) == len(importances) else None
        imp = pd.Series(importances, index=labels).sort_values(ascending=False).head(20)
        imp.plot(kind='bar', title='Top model feature importances (model space)')
        plt.show()
except Exception as e:
    print('Feature importance skipped:', e)

# Optional SHAP (if available)
try:
    import shap
    shap.initjs()
    if _is_pipeline and fitted_pre is not None and 'xgbclassifier' in model.named_steps:
        # Use a small background sample due to cost
        X_small = X_train.sample(min(200, len(X_train)), random_state=42)
        # Transform with the preprocessor that was fitted as part of `model`,
        # so the columns match exactly what the estimator was trained on;
        # refitting a separate preprocessor here is unnecessary.
        X_small_tx = fitted_pre.transform(X_small)
        explainer = shap.TreeExplainer(final_est)
        shap_values = explainer.shap_values(X_small_tx)
        shap.summary_plot(shap_values, X_small_tx, feature_names=feature_names, show=True)
    else:
        print('SHAP skipped: selected model is not an XGBoost pipeline')
except Exception as e:
    print('SHAP skipped:', e)


## Conclusions

- Recommended model: fill in XGBoost/RandomForest/LogReg based on AUC/PR results above.
- Threshold choice: pick threshold that maximizes expected_value (see threshold table) or balances precision/recall for your business.
- Next steps: add cross-validation; persist model with joblib; (optional) wire SageMaker Pipelines for S3 upload, training, HPO, RegisterModel, and batch transform.
- Monitoring: track p95 latency and error rate on inference; monitor drift via class priors and periodic AUC/PR checks on a labeled trickle sample.

