# Customer Retention Optimizer — End-to-End Pipeline

This notebook runs the full pipeline:
1. Download data
2. Process & engineer features
3. Train churn model
4. Optimize retention offers
5. Generate reports

In [None]:
import sys
from pathlib import Path

# Work out the project root so the `src` package used by later cells can be
# imported: when the kernel was started inside the `notebooks/` folder the
# root is one level up, otherwise the working directory itself is used.
working_dir = Path.cwd()
if working_dir.name == "notebooks":
    PROJECT_ROOT = working_dir.parent
else:
    PROJECT_ROOT = working_dir

# Put the root at the front of the import search path.
sys.path.insert(0, str(PROJECT_ROOT))
print(f"Project root: {PROJECT_ROOT}")

## 1. Download Data

In [None]:
from src.download_data import download

# Step 1 of the pipeline: run the project's download helper.
# NOTE(review): presumably this fetches the raw dataset; the source and the
# destination path are decided inside src.download_data — confirm there.
download()

## 2. Process & Engineer Features

In [None]:
from src.process import (
    get_spark,
    load_and_clean,
    build_features,
    detect_outliers,
    cluster_customers,
    save_features,
)

# Step 2 of the pipeline: load and clean the data through Spark, then run the
# feature, outlier and clustering stages in sequence and persist the result.
# Each stage takes the previous stage's output and returns the frame to pass on.
spark = get_spark()
try:
    sdf = load_and_clean(spark)
    pdf = build_features(sdf)
    pdf = detect_outliers(pdf)
    pdf = cluster_customers(pdf)
    save_features(pdf)
finally:
    # Release the Spark session even when one of the stages above raises;
    # without this a failed run leaves the session alive in the kernel.
    spark.stop()

# `pdf` is used as a DataFrame-like object here (it exposes .shape and .head()).
print(f"\nShape: {pdf.shape}")
pdf.head()

## 3. Train Churn Prediction Model

In [None]:
from src.train import load_data, train_model, FEATURE_COLS, PROJECT_ROOT as PR
import mlflow
import pandas as pd

# Step 3 of the pipeline: train the churn model, record the run in MLflow and
# write the scored customers plus the ROC curve points to disk.

# Keep MLflow's run store inside the project, under <PR>/mlruns.
mlflow.set_tracking_uri(str(PR / "mlruns"))
mlflow.set_experiment("churn-prediction")

# load_data() yields the full frame together with the model inputs and target.
df, X, y = load_data()
print(f"Samples: {len(y):,}  |  Churn rate: {y.mean():.2%}")

best_model, metrics, y_prob, roc_data = train_model(X, y)

with mlflow.start_run(run_name="notebook_run"):
    # Only plain int/float entries are logged; any other value in `metrics`
    # is skipped here and only shown by the printout at the end of the cell.
    mlflow.log_metrics({k: v for k, v in metrics.items() if isinstance(v, (int, float))})
    mlflow.sklearn.log_model(best_model, "churn_model")

# NOTE(review): this assignment assumes y_prob holds one probability per row
# of df, in the same order — confirm against train_model.
df["p_churn"] = y_prob
df.to_parquet(PR / "data" / "processed" / "customer_features.parquet", index=False)

# Make sure the outputs folder exists before writing into it, mirroring how
# the report cell prepares FIGURES_DIR; on a fresh checkout the CSV write
# would otherwise fail because the directory is missing.
outputs_dir = PR / "data" / "outputs"
outputs_dir.mkdir(parents=True, exist_ok=True)

# roc_data[0] and roc_data[1] are stored as the false/true positive rates.
roc_df = pd.DataFrame({"fpr": roc_data[0], "tpr": roc_data[1]})
roc_df.to_csv(outputs_dir / "roc_data.csv", index=False)

for k, v in metrics.items():
    print(f"  {k}: {v}")

## 4. Optimize Retention Offers

In [None]:
from src.optimize import optimize

# Step 4 of the pipeline: read back the customer table that the training cell
# saved with its `p_churn` column, run the offer optimizer on it, and store
# the resulting plan as a CSV. `pd` and `PR` come from the training cell.
data_dir = PR / "data"
df = pd.read_parquet(data_dir / "processed" / "customer_features.parquet")

result = optimize(df, budget=5000.0, max_large_pct=0.10)
result.to_csv(data_dir / "outputs" / "offer_plan.csv", index=False)

# Show the first ten rows of the plan, restricted to the key columns.
preview_columns = ["CustomerID", "p_churn", "offer", "offer_cost", "expected_saved_revenue"]
result[preview_columns].head(10)

## 5. Generate Reports

In [None]:
from src.report import (
    plot_segments,
    plot_outliers,
    plot_roc,
    plot_offers,
    FIGURES_DIR,
)

# Step 5 of the pipeline: render the four report figures into FIGURES_DIR.
# `pd` and `PR` come from the training cell.
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

features_path = PR / "data" / "processed" / "customer_features.parquet"
df = pd.read_parquet(features_path)

# The two customer-level plots both take the feature table and a target path.
for draw, filename in ((plot_segments, "segments.png"), (plot_outliers, "outliers.png")):
    draw(df, FIGURES_DIR / filename)

# plot_roc receives only the output path.
plot_roc(FIGURES_DIR / "roc_curve.png")

# NOTE(review): `result` is the in-memory offer plan from the optimization
# cell, so that cell must have been run in the same kernel session.
plot_offers(result, FIGURES_DIR / "offer_allocation.png")

In [None]:
from IPython.display import Image, display

# Show each generated figure inline, in report order. Files that are not
# present in FIGURES_DIR are skipped without any message.
figure_names = ("segments.png", "outliers.png", "roc_curve.png", "offer_allocation.png")

for fname in figure_names:
    fpath = FIGURES_DIR / fname
    if not fpath.exists():
        continue
    print(f"\n--- {fname} ---")
    display(Image(filename=str(fpath), width=700))