# Solution: Debug Drill 06 - The Runaway Booster

This is the solution notebook for the boosting overfitting drill.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# XGBoost is an optional dependency. HAS_XGBOOST records whether the import
# succeeded so the modelling cells can choose between XGBClassifier and
# scikit-learn's GradientBoostingClassifier.
try:
    from xgboost import XGBClassifier
    HAS_XGBOOST = True
except ImportError:
    HAS_XGBOOST = False

# Seed NumPy's global random number generator for this notebook session.
np.random.seed(42)

In [None]:
# Load and prepare data
DATA_URL = 'https://raw.githubusercontent.com/189investmentai/ml-foundations-interactive/main/shared/data/'
customers = pd.read_csv(DATA_URL + 'streamcart_customers.csv')

# Derive the engineered columns only when the CSV does not already provide them.
if 'tenure_days' not in customers.columns:
    reference_date = pd.to_datetime('2024-01-01')
    signup_dates = pd.to_datetime(customers['signup_date'])
    customers['tenure_days'] = (reference_date - signup_dates).dt.days
if 'avg_order_value' not in customers.columns:
    # Treat zero orders as one so the division never hits a zero denominator.
    safe_order_counts = customers['orders_total'].replace(0, 1)
    customers['avg_order_value'] = customers['total_spend'] / safe_order_counts

feature_cols = [
    'tenure_days',
    'orders_total',
    'total_spend',
    'support_tickets_total',
    'avg_order_value',
]
# Keep only the requested features that are actually present in the frame.
available_features = [name for name in feature_cols if name in customers.columns]

feature_frame = customers[available_features]
X = feature_frame.fillna(0)
y = customers['churn_30d']

# Two stratified splits with the same settings: first carve off the test set,
# then split the remainder into train and validation.
split_options = {'test_size': 0.2, 'random_state': 42}
X_temp, X_test, y_temp, y_test = train_test_split(X, y, stratify=y, **split_options)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, stratify=y_temp, **split_options)

In [None]:
# Find where validation accuracy peaks
n_estimators_list = [10, 25, 50, 100, 200, 300, 400, 500]

# Pick the estimator class and its keyword arguments once; only n_estimators
# changes between iterations of the sweep.
if HAS_XGBOOST:
    sweep_estimator_cls = XGBClassifier
    sweep_params = {'max_depth': 10, 'learning_rate': 0.5, 'random_state': 42, 'verbosity': 0}
else:
    sweep_estimator_cls = GradientBoostingClassifier
    sweep_params = {'max_depth': 10, 'learning_rate': 0.5, 'random_state': 42}

results = []
for n in n_estimators_list:
    # Train a fresh model with n boosting rounds and score it on the validation split.
    model = sweep_estimator_cls(n_estimators=n, **sweep_params)
    model.fit(X_train, y_train)

    val_predictions = model.predict(X_val)
    results.append({
        'n_estimators': n,
        'val_acc': accuracy_score(y_val, val_predictions),
    })

results_df = pd.DataFrame(results)
# idxmax returns the first row holding the highest validation accuracy.
optimal_idx = results_df['val_acc'].idxmax()
optimal_n = results_df.loc[optimal_idx, 'n_estimators']
print(f"Optimal n_estimators: {optimal_n}")

In [None]:
# SOLUTION: Fixed gradient boosting
#
# Compared with the overfit configuration this uses shallower trees
# (max_depth=4), a smaller learning rate (0.1), and a number of boosting
# rounds chosen on the validation split instead of always training 500.
if HAS_XGBOOST:
    gb_fixed = XGBClassifier(
        n_estimators=500,            # upper bound; early stopping decides the real count
        max_depth=4,
        learning_rate=0.1,
        early_stopping_rounds=20,    # stop after 20 rounds without validation improvement
        random_state=42,
        verbosity=0
    )
    gb_fixed.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    # best_iteration is a 0-based index, so the number of rounds kept is index + 1.
    stopped_at = gb_fixed.best_iteration + 1
else:
    # scikit-learn fallback: choose the round count using the *fixed*
    # hyperparameters. A round count tuned with max_depth=10 / learning_rate=0.5
    # does not carry over to this much slower-learning configuration.
    gb_probe = GradientBoostingClassifier(
        n_estimators=500,
        max_depth=4,
        learning_rate=0.1,
        random_state=42
    )
    gb_probe.fit(X_train, y_train)
    # staged_predict yields the predictions after 1, 2, ..., 500 rounds.
    staged_val_acc = [
        accuracy_score(y_val, stage_pred)
        for stage_pred in gb_probe.staged_predict(X_val)
    ]
    # argmax returns the first (smallest) round count reaching the best
    # validation accuracy; +1 converts the 0-based position to a count.
    stopped_at = int(np.argmax(staged_val_acc)) + 1

    # Refit with exactly that many rounds so gb_fixed predicts with the chosen size.
    gb_fixed = GradientBoostingClassifier(
        n_estimators=stopped_at,
        max_depth=4,
        learning_rate=0.1,
        random_state=42
    )
    gb_fixed.fit(X_train, y_train)

# Accuracy on all three splits; the validation score was used to pick the
# round count, so the test score is the unbiased one.
train_acc_fixed = accuracy_score(y_train, gb_fixed.predict(X_train))
val_acc_fixed = accuracy_score(y_val, gb_fixed.predict(X_val))
test_acc_fixed = accuracy_score(y_test, gb_fixed.predict(X_test))

print("=== Fixed Gradient Boosting ===")
print(f"  Stopped at round: {stopped_at}")
print(f"  max_depth: 4")
print(f"  learning_rate: 0.1")
print(f"\n  Train Accuracy: {train_acc_fixed:.1%}")
print(f"  Val Accuracy:   {val_acc_fixed:.1%}")
print(f"  Test Accuracy:  {test_acc_fixed:.1%}")

In [None]:
# Compare to overfit model
#
# Retrain the original (broken) configuration: 500 rounds, deep trees,
# high learning rate, no early stopping. Then print it side by side with the
# fixed model's metrics computed in the previous cell.
if HAS_XGBOOST:
    gb_overfit = XGBClassifier(n_estimators=500, max_depth=10, learning_rate=0.5,
                               random_state=42, verbosity=0)
else:
    gb_overfit = GradientBoostingClassifier(n_estimators=500, max_depth=10, learning_rate=0.5,
                                            random_state=42)
gb_overfit.fit(X_train, y_train)

train_acc = accuracy_score(y_train, gb_overfit.predict(X_train))
test_acc = accuracy_score(y_test, gb_overfit.predict(X_test))

print("\n=== Comparison ===")
print(f"                    Overfit Model   Fixed Model")
print(f"  Rounds:           {500:>12}    {stopped_at:>12}")
print(f"  max_depth:        {10:>12}    {4:>12}")
print(f"  learning_rate:    {0.5:>12}    {0.1:>12}")
print(f"  Train Acc:        {train_acc:>12.1%}    {train_acc_fixed:>12.1%}")
print(f"  Test Acc:         {test_acc:>12.1%}    {test_acc_fixed:>12.1%}")
# Use the '+' sign flag rather than a literal '+', so a regression prints as
# "-1.2%" instead of the malformed "+-1.2%".
print(f"\n  Test improvement: {test_acc_fixed - test_acc:+.1%}")

## Sample Postmortem

### What happened:
- The model had high train accuracy but poor test accuracy
- The model performed terribly in production after deployment

### Root cause:
Three configuration issues:
1. No early stopping — model overtrained for 500 rounds
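2. Learning rate too high (0.5) — each tree's correction is applied at half strength rather than a small fraction, so the ensemble starts fitting noise within a few rounds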
3. Trees too deep (10) — individual trees were too complex

### How to prevent:
- ALWAYS use early stopping with gradient boosting
- Use learning_rate 0.01-0.1 (not 0.3+)
- Use max_depth 3-6 (not 8+)
- Monitor validation loss during training