# Solution: The Leaky Pipeline

This is the answer key for `drill_10_scaler_leakage.ipynb`.

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Seed NumPy's global RNG so the np.random.* draws in the data-generation cell
# are reproducible on a fresh kernel (the train/test splits pass their own random_state).
np.random.seed(42)

In [None]:
# Generate data
# Synthetic customer table: one row per customer, four numeric behaviour
# features, and a binary churn label sampled from a logistic model of them.
n = 2000

# Columns are drawn in the order written, so the global RNG stream is consumed
# as: uniform -> exponential -> poisson(2) -> poisson(15).
df = pd.DataFrame(
    dict(
        customer_id=range(n),
        tenure_days=np.random.uniform(30, 1000, size=n),
        monthly_spend=np.random.exponential(100, size=n),
        support_tickets=np.random.poisson(2, size=n),
        logins_last_30d=np.random.poisson(15, size=n),
    )
)

# churn_prob = 1 / (1 + exp(z)). A smaller z means a larger probability, so in
# this synthetic model tenure, spend and logins push churn UP while support
# tickets push it DOWN.
churn_exponent = (
    2
    - 0.002 * df['tenure_days']
    - 0.005 * df['monthly_spend']
    + 0.2 * df['support_tickets']
    - 0.05 * df['logins_last_30d']
)
churn_prob = 1 / (1 + np.exp(churn_exponent))

# One uniform draw per customer; a customer churns when the draw falls below
# that customer's churn probability.
churn_draws = np.random.random(n)
df['churn'] = (churn_draws < churn_prob).astype(int)

# Model inputs (customer_id is an identifier, not a feature) and the target.
feature_cols = [
    'tenure_days',
    'monthly_spend',
    'support_tickets',
    'logins_last_30d',
]
y = df['churn']

In [None]:
# BUGGY pipeline (for comparison)
# Deliberately leaky reference implementation. Its train_auc / test_auc are
# compared against the leak-free pipeline later in the notebook, so the
# statement order here is intentional.
X = df[feature_cols].copy()
scaler = StandardScaler()
# LEAK 1 (scaler): fit_transform runs on every row, before the train/test split
# below, so the fitted scaling statistics include rows that end up in the test set.
X_scaled = scaler.fit_transform(X)

# Bucket customers into five equal-sized tenure bins (adds a 'segment' column to df in place).
df['segment'] = pd.qcut(df['tenure_days'], q=5, labels=['New', 'Active', 'Established', 'Loyal', 'Veteran'])
# LEAK 2 (target): the per-segment mean of `churn` is computed over all rows, so
# this feature is built from labels, including those of rows that end up in the test set.
# NOTE(review): grouping by a categorical column without an explicit `observed=`
# is expected to raise a FutureWarning on recent pandas versions; confirm.
segment_churn_rate = df.groupby('segment')['churn'].transform('mean')
X_scaled = np.column_stack([X_scaled, segment_churn_rate])

# The split happens only now, after both leaky steps above have seen the full dataset.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42, stratify=y)
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# AUC on each split, scored from column 1 of predict_proba.
train_auc = roc_auc_score(y_train, model.predict_proba(X_train)[:, 1])
test_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

print(f"LEAKY Pipeline: Train AUC={train_auc:.3f}, Test AUC={test_auc:.3f}")

## SOLUTION: Leak-Free Pipeline

In [None]:
# Step 1: Keep only the raw feature columns (nothing derived from the target)
X_clean = df[feature_cols].copy()
y_clean = df['churn']

# Step 2: Hold out the test set BEFORE anything is fitted
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(
    X_clean,
    y_clean,
    test_size=0.3,
    stratify=y_clean,
    random_state=42,
)

# Step 3: Fit the scaler on the training rows only, then apply that same
# fitted scaler to both splits (the test rows are transformed, never fitted on)
scaler_clean = StandardScaler().fit(X_train_clean)
X_train_scaled = scaler_clean.transform(X_train_clean)
X_test_scaled = scaler_clean.transform(X_test_clean)

# Step 4: Fit the classifier on the scaled training data
model_clean = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train_clean)

# Step 5: Score both splits from column 1 of predict_proba
train_proba_clean = model_clean.predict_proba(X_train_scaled)[:, 1]
test_proba_clean = model_clean.predict_proba(X_test_scaled)[:, 1]
train_auc_clean = roc_auc_score(y_train_clean, train_proba_clean)
test_auc_clean = roc_auc_score(y_test_clean, test_proba_clean)

print("=== Clean Pipeline Results ===")
print(f"Train AUC: {train_auc_clean:.3f}")
print(f"Test AUC:  {test_auc_clean:.3f}")

In [None]:
# Compare leaky vs clean
# Side-by-side AUC table. Both columns are right-aligned in 14-character
# fields so the numbers line up under the header.
print("=== Comparison ===")
print("\n                    Leaky Pipeline    Clean Pipeline")
print(f"  Train AUC:        {train_auc:>14.3f}    {train_auc_clean:>14.3f}")
print(f"  Test AUC:         {test_auc:>14.3f}    {test_auc_clean:>14.3f}")
# Positive value = how much the leaky pipeline's test AUC exceeds the clean one.
print(f"\n  Leakage inflation: {test_auc - test_auc_clean:+.3f}")
# The emoji was stored as mojibake (UTF-8 bytes decoded as cp1252); restored here.
print("\n💡 The 'clean' test AUC is what you'll actually get in production!")

In [None]:
# Self-check
# Outcome check: the leaky pipeline is expected to score higher on the test set.
# NOTE(review): this depends on the particular data and split, not on the
# pipeline structure; confirm it holds under Restart & Run All.
assert test_auc_clean < test_auc, "Clean pipeline should have lower (realistic) AUC"

# Structural checks: these hold regardless of how the AUCs turn out.
assert X_train_scaled.shape[1] == len(feature_cols), "Should have only safe features"
# n_samples_seen_ is the number of rows the scaler was fitted on; np.all covers
# the per-feature array form scikit-learn uses when the input contains NaNs.
assert np.all(scaler_clean.n_samples_seen_ == len(X_train_clean)), \
    "Scaler should be fitted on the training rows only"

# The check marks were stored as mojibake (UTF-8 bytes decoded as cp1252); restored here.
print("✓ Pipeline fixed!")
print("✓ Removed leaky feature")
print("✓ Scaler fitted on train only")
print(f"✓ Realistic AUC: {test_auc_clean:.3f}")

## Sample Postmortem

### What happened:
- Model showed AUC of ~0.92 in testing but dropped to ~0.72 in production.

### Root cause:
- Two leakage sources:
  1. **Scaler leakage:** StandardScaler was fit on ALL data before splitting, leaking test set statistics.
  2. **Target leakage:** `segment_churn_rate` was calculated using ALL churn labels, including test set labels.

### How to prevent:
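- **Always split first.** Split into train and test before any preprocessing, then fit every transformation (scaler, encoder, imputer) on the training set only and merely apply it to the test set.
- **Review all features.** Any feature derived from the target (such as a per-segment churn rate) is suspicious; if it is needed at all, compute it from training labels only.
- **Use pipelines.** Wrapping the preprocessing steps and the model in sklearn's `Pipeline` means the transformers are fitted only on the data passed to `fit`, which enforces the correct fit/transform order.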