# Module 6: Ensemble Methods for Churn Prediction

**Goal:** Compare single trees, Random Forests, and Gradient Boosting for churn prediction. Find the best ensemble configuration.

**Prerequisites:** Module 5 (Decision Trees)

**Expected Runtime:** ~45 minutes

**Outputs:**
- Random Forest and Gradient Boosting model comparison
- Hyperparameter tuning with early stopping
- Feature importance extraction and interpretation
- Stakeholder summary of ensemble results

---

## Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
import warnings
# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# XGBoost is optional. HAS_XGBOOST records whether the import succeeded so
# later cells can branch on it.
try:
    from xgboost import XGBClassifier
except ImportError:
    HAS_XGBOOST = False
    print("⚠ XGBoost not installed, using sklearn GradientBoosting")
else:
    HAS_XGBOOST = True
    print("✓ XGBoost available")

print("✓ Libraries loaded")

## 1. Load and Prepare Data

In [None]:
DATA_URL = 'https://raw.githubusercontent.com/189investmentai/ml-foundations-interactive/main/shared/data/'

customers = pd.read_csv(DATA_URL + 'streamcart_customers.csv')
print(f"Loaded {len(customers)} customers")

# Feature engineering: derive a column only when the CSV does not already
# provide it AND its source columns exist. A feature that cannot be derived is
# reported below instead of raising a bare KeyError here.
if 'tenure_days' not in customers.columns and 'signup_date' in customers.columns:
    # Tenure is measured against a fixed reference date (2024-01-01).
    customers['tenure_days'] = (pd.to_datetime('2024-01-01') - pd.to_datetime(customers['signup_date'])).dt.days
if 'avg_order_value' not in customers.columns and {'total_spend', 'orders_total'} <= set(customers.columns):
    # Zero order counts are replaced by 1 to avoid dividing by zero.
    customers['avg_order_value'] = customers['total_spend'] / customers['orders_total'].replace(0, 1)

# Select features, keeping only the ones actually present in the data.
feature_cols = ['tenure_days', 'orders_total', 'total_spend', 'support_tickets_total', 'avg_order_value']
available_features = [c for c in feature_cols if c in customers.columns]
missing_features = [c for c in feature_cols if c not in customers.columns]
if missing_features:
    print(f"⚠ Feature columns not found (skipped): {missing_features}")
if not available_features:
    # Fail here with a clear message rather than deep inside a model fit.
    raise ValueError(f"None of the expected feature columns are present: {feature_cols}")
print(f"Features: {available_features}")

# Missing feature values are filled with 0; the target is the 30-day churn flag.
X = customers[available_features].fillna(0)
y = customers['churn_30d']

# Train-validation-test split, stratified on the target.
# 20% is held out as test, then 20% of the remaining 80% becomes validation,
# giving roughly 64% train / 16% validation / 20% test.
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42, stratify=y_temp)

print(f"\nTrain: {len(X_train)}, Validation: {len(X_val)}, Test: {len(X_test)}")
print(f"Churn rate: {y.mean():.1%}")

## 2. Baseline: Single Decision Tree

In [None]:
# Baseline model: one decision tree with depth capped at 5, used as the
# reference point for the ensembles below.
single_tree = DecisionTreeClassifier(random_state=42, max_depth=5)
single_tree.fit(X_train, y_train)

# Score each split once, then report.
tree_train_acc = single_tree.score(X_train, y_train)
tree_val_acc = single_tree.score(X_val, y_val)
tree_test_acc = single_tree.score(X_test, y_test)

print("Single Decision Tree:")
print(f"  Train Accuracy: {tree_train_acc:.1%}")
print(f"  Val Accuracy: {tree_val_acc:.1%}")
print(f"  Test Accuracy: {tree_test_acc:.1%}")
print(f"  Depth: {single_tree.get_depth()}, Leaves: {single_tree.get_n_leaves()}")

## 3. Random Forest

In [None]:
# Random Forest: 100 trees with no depth cap; min_samples_leaf=5 is the only
# constraint on tree growth. n_jobs=-1 trains on all available cores.
rf_params = dict(
    n_estimators=100,
    max_depth=None,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1,
)
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, y_train)

# Score each split once, then report.
rf_train_acc = rf.score(X_train, y_train)
rf_val_acc = rf.score(X_val, y_val)
rf_test_acc = rf.score(X_test, y_test)

print("Random Forest (100 trees):")
print(f"  Train Accuracy: {rf_train_acc:.1%}")
print(f"  Val Accuracy: {rf_val_acc:.1%}")
print(f"  Test Accuracy: {rf_test_acc:.1%}")

## 4. TODO: Experiment with Number of Trees

Your task: Find how many trees you need before returns diminish.

In [None]:
# Experiment: refit the forest at increasing sizes and record accuracy on
# every split, to show where adding trees stops helping.

n_trees_list = [1, 5, 10, 25, 50, 100, 200, 500]


def _evaluate_forest(n_trees):
    """Fit a forest with `n_trees` trees and return its accuracy on each split."""
    forest = RandomForestClassifier(
        n_estimators=n_trees,
        max_depth=None,
        min_samples_leaf=5,
        random_state=42,
        n_jobs=-1,
    )
    forest.fit(X_train, y_train)
    return {
        'n_trees': n_trees,
        'train_acc': forest.score(X_train, y_train),
        'val_acc': forest.score(X_val, y_val),
        'test_acc': forest.score(X_test, y_test),
    }


rf_results = [_evaluate_forest(n_trees) for n_trees in n_trees_list]

# One row per forest size; the trailing expression displays the table.
rf_results_df = pd.DataFrame(rf_results)
rf_results_df

In [None]:
# Plot accuracy per split against forest size. The x-axis is log-scaled
# because the tested sizes span 1 to 500.
accuracy_curves = [
    ('train_acc', 'b-o', 'Train'),
    ('val_acc', 'g-s', 'Validation'),
    ('test_acc', 'r-^', 'Test'),
]

fig_rf, ax_rf = plt.subplots(figsize=(10, 5))
for column, line_style, legend_label in accuracy_curves:
    ax_rf.plot(rf_results_df['n_trees'], rf_results_df[column], line_style, label=legend_label, linewidth=2)

ax_rf.set_xlabel('Number of Trees')
ax_rf.set_ylabel('Accuracy')
ax_rf.set_title('Random Forest: Accuracy vs Number of Trees')
ax_rf.legend()
ax_rf.grid(True, alpha=0.3)
ax_rf.set_xscale('log')
plt.show()

print("\nObservation: Accuracy typically plateaus around 50-100 trees.")

## 5. Gradient Boosting

In [None]:
# Gradient Boosting: both back ends share the same hyperparameters, so they
# are defined once.
gb_params = dict(
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    random_state=42,
)

# Use XGBoost when it imported successfully, otherwise sklearn's implementation.
gb = XGBClassifier(verbosity=0, **gb_params) if HAS_XGBOOST else GradientBoostingClassifier(**gb_params)
gb.fit(X_train, y_train)

# Score each split once, then report.
gb_train_acc = gb.score(X_train, y_train)
gb_val_acc = gb.score(X_val, y_val)
gb_test_acc = gb.score(X_test, y_test)

print("Gradient Boosting (100 rounds):")
print(f"  Train Accuracy: {gb_train_acc:.1%}")
print(f"  Val Accuracy: {gb_val_acc:.1%}")
print(f"  Test Accuracy: {gb_test_acc:.1%}")

## 6. Early Stopping (Boosting Best Practice)

In [None]:
# Demonstrate overfitting without early stopping: many rounds, deep trees and
# a high learning rate, a configuration chosen to open a train/test gap.
overfit_params = dict(
    n_estimators=500,   # Many rounds
    max_depth=8,        # Deep trees
    learning_rate=0.3,  # Fast learning
    random_state=42,
)

# Same back-end choice as the main boosting model, so the demo still runs
# when XGBoost is not installed instead of silently doing nothing.
if HAS_XGBOOST:
    gb_overfit = XGBClassifier(verbosity=0, **overfit_params)
else:
    gb_overfit = GradientBoostingClassifier(**overfit_params)
gb_overfit.fit(X_train, y_train)

# Score once and reuse: every .score() call re-predicts the whole split.
overfit_train_acc = gb_overfit.score(X_train, y_train)
overfit_test_acc = gb_overfit.score(X_test, y_test)

print("Overfitting Example (500 rounds, depth 8, lr 0.3):")
print(f"  Train Accuracy: {overfit_train_acc:.1%}")
print(f"  Test Accuracy: {overfit_test_acc:.1%}")
print(f"  Gap: {overfit_train_acc - overfit_test_acc:.1%} ← OVERFITTING!")

In [None]:
# Fix with early stopping: allow up to 500 rounds, but halt once 10
# consecutive rounds bring no improvement on held-out data.
early_params = dict(
    n_estimators=500,
    max_depth=4,
    learning_rate=0.1,
    random_state=42,
)

if HAS_XGBOOST:
    gb_early = XGBClassifier(verbosity=0, early_stopping_rounds=10, **early_params)
    # XGBoost monitors the explicit validation split passed via eval_set.
    gb_early.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )
    # best_iteration is the 0-based index of the best-scoring round, so +1
    # gives the number of boosting rounds in the best model.
    rounds_summary = f"  Best round: {gb_early.best_iteration + 1} of {early_params['n_estimators']}"
else:
    # sklearn fallback: n_iter_no_change enables early stopping, monitored on
    # an internal hold-out of validation_fraction of the training data
    # (X_val cannot be passed in directly).
    gb_early = GradientBoostingClassifier(
        n_iter_no_change=10,
        validation_fraction=0.2,
        **early_params
    )
    gb_early.fit(X_train, y_train)
    # n_estimators_ is the number of rounds actually fitted before stopping.
    rounds_summary = f"  Rounds fitted: {gb_early.n_estimators_} of {early_params['n_estimators']}"

print(f"With Early Stopping:")
print(rounds_summary)
print(f"  Train Accuracy: {gb_early.score(X_train, y_train):.1%}")
print(f"  Test Accuracy: {gb_early.score(X_test, y_test):.1%}")

## 7. Feature Importance Comparison

In [None]:
# Side-by-side feature-importance charts, one panel per fitted model.
importance_panels = [
    ('Single Tree', single_tree, '#3b82f6'),
    ('Random Forest', rf, '#22c55e'),
    ('Gradient Boosting', gb, '#8b5cf6'),
]

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for panel, (panel_title, fitted_model, bar_color) in zip(axes, importance_panels):
    # Ascending sort puts the most important feature at the top of the
    # horizontal bar chart.
    ranked = pd.Series(fitted_model.feature_importances_, index=available_features).sort_values()
    panel.barh(ranked.index, ranked.values, color=bar_color)
    panel.set_title(panel_title)
    panel.set_xlabel('Importance')

plt.tight_layout()
plt.show()

## 8. Final Model Comparison

In [None]:
# Final comparison: score every fitted model on the train and test splits and
# tabulate the results.
models = {
    'Single Tree': single_tree,
    'Random Forest': rf,
    'Gradient Boosting': gb
}


def _summarize_model(name, model):
    """Return one pre-formatted comparison row for a fitted model."""
    train_acc = model.score(X_train, y_train)
    test_acc = model.score(X_test, y_test)
    test_f1 = f1_score(y_test, model.predict(X_test))
    return {
        'Model': name,
        'Train Acc': f'{train_acc:.1%}',
        'Test Acc': f'{test_acc:.1%}',
        'Test F1': f'{test_f1:.3f}',
        # Train-minus-test accuracy, shown as a rough overfitting indicator.
        'Overfit Gap': f'{train_acc - test_acc:.1%}'
    }


print("Final Model Comparison (Test Set)")
print("=" * 50)

results = [_summarize_model(name, model) for name, model in models.items()]

results_df = pd.DataFrame(results)
print(results_df.to_string(index=False))

In [None]:
# Model shown in the detailed report. The choice is manual (usually Random
# Forest or Gradient Boosting); change both names together to inspect another.
best_name = 'Random Forest'
best_model = rf

# Per-class precision / recall / F1 on the test split.
best_predictions = best_model.predict(X_test)
report_text = classification_report(y_test, best_predictions, target_names=['Retained', 'Churned'])

print(f"\n{best_name} - Detailed Classification Report:")
print(report_text)

## 9. Stakeholder Summary

### TODO: Write a 3-bullet summary (~100 words) for the leadership team

Template:
• **Recommendation:** [Which model do you recommend (RF or GB)? Why?]
• **Performance:** Test accuracy ___% vs single tree ___% - ensemble is [X] percentage points better/worse
• **Key drivers:** Top 2-3 features driving predictions and what they suggest about churn

**Your Summary:**

_[Write your summary here]_

## Self-Check

Run the cell below after training all models; its asserts verify that your ensemble models work correctly.

In [None]:
# SELF-CHECK: Verify your ensemble models
# Run this after training all models

from sklearn.metrics import accuracy_score

# Test-set accuracy of each fitted model, computed the same way for all three.
tree_acc, rf_acc, gb_acc = (
    accuracy_score(y_test, fitted.predict(X_test))
    for fitted in (single_tree, rf, gb)
)

# Each ensemble may trail the single tree by at most 0.02 accuracy.
min_allowed_acc = tree_acc - 0.02

assert rf_acc >= min_allowed_acc, f"Random Forest ({rf_acc:.3f}) should match or beat single tree ({tree_acc:.3f})"
assert gb_acc >= min_allowed_acc, f"Gradient Boosting ({gb_acc:.3f}) should match or beat single tree ({tree_acc:.3f})"
assert len(rf.feature_importances_) > 0, "Feature importances should be available"
assert len(rf_results_df) > 0, "Should have results from n_trees experiment"

print("✅ Self-check passed!")
print(f"   Single Tree: {tree_acc:.1%}")
print(f"   Random Forest: {rf_acc:.1%}")
print(f"   Gradient Boosting: {gb_acc:.1%}")
print(f"   RF improvement over tree: {rf_acc - tree_acc:+.1%}")

---

## Self-Assessment Checklist

- [ ] I compared single tree vs Random Forest vs Gradient Boosting
- [ ] I observed diminishing returns with more trees
- [ ] I used early stopping to prevent boosting overfitting
- [ ] I extracted and compared feature importance
- [ ] I can explain why ensembles outperform single trees

## Next Steps

1. **Debug Drill:** Fix an overfit boosting model
2. **Module 7:** Feature Engineering - make your features better