# Solution: Debug Drill 05 - The Memorizing Tree

This is the solution notebook for the overfit tree drill.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Seed NumPy's global RNG. The split and every tree below also pass
# random_state=42 explicitly, so they do not depend on this global seed.
np.random.seed(42)

In [None]:
# Load and prepare data
# Pull the StreamCart customer table from the shared course repository.
DATA_URL = 'https://raw.githubusercontent.com/189investmentai/ml-foundations-interactive/main/shared/data/'
customers = pd.read_csv(DATA_URL + 'streamcart_customers.csv')

# Engineered columns are derived only when the CSV does not already ship them.
if 'tenure_days' not in customers.columns:
    # Whole days between each signup date and the fixed 2024-01-01 reference date.
    customers['tenure_days'] = (
        pd.to_datetime('2024-01-01') - pd.to_datetime(customers['signup_date'])
    ).dt.days
if 'avg_order_value' not in customers.columns:
    # A zero order count is swapped for 1 in the divisor to avoid dividing by zero.
    customers['avg_order_value'] = customers['total_spend'].div(
        customers['orders_total'].replace(0, 1)
    )

# Candidate predictors; only the ones actually present in the table are used.
feature_cols = [
    'tenure_days',
    'orders_total',
    'total_spend',
    'support_tickets_total',
    'avg_order_value',
]
available_features = [name for name in feature_cols if name in customers.columns]

# Feature matrix (missing values filled with 0) and the churn label.
X = customers.loc[:, available_features].fillna(0)
y = customers['churn_30d']

# 70/30 hold-out split, stratified on the label so both parts keep its class mix.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [None]:
# Find optimal depth
#
# The depth is chosen by 5-fold cross-validation on the TRAINING split only.
# Picking the depth that maximises test accuracy would tune the model on the
# hold-out set, so every test score reported afterwards would be optimistic.
# Train and test accuracy are still recorded per depth, but purely as
# diagnostics for the overfitting curve -- they play no part in the selection.
depths = range(1, 21)
results = []

for depth in depths:
    tree_temp = DecisionTreeClassifier(max_depth=depth, random_state=42)

    # cross_val_score fits clones of the estimator, so tree_temp itself is
    # still unfitted afterwards and is fitted on the full training split below.
    cv_acc = cross_val_score(
        tree_temp, X_train, y_train, cv=5, scoring='accuracy'
    ).mean()

    tree_temp.fit(X_train, y_train)

    results.append({
        'depth': depth,
        'train_acc': accuracy_score(y_train, tree_temp.predict(X_train)),
        'test_acc': accuracy_score(y_test, tree_temp.predict(X_test)),  # diagnostic only
        'cv_acc': cv_acc,  # selection criterion
        'n_leaves': tree_temp.get_n_leaves()
    })

results_df = pd.DataFrame(results)

# idxmax returns the first maximum, so ties resolve to the shallowest depth.
optimal_idx = results_df['cv_acc'].idxmax()
# Cast to a built-in int so max_depth receives a plain Python integer.
optimal_depth = int(results_df.loc[optimal_idx, 'depth'])

print(f"Optimal depth: {optimal_depth}")

In [None]:
# SOLUTION: Fixed tree with proper constraints
# Three brakes on memorisation: a depth cap (taken from the depth search), at
# least 10 samples in every leaf, and at least 20 samples before a node may split.
tree_fixed = DecisionTreeClassifier(
    random_state=42,
    min_samples_split=20,
    min_samples_leaf=10,
    max_depth=optimal_depth,
).fit(X_train, y_train)

# Accuracy on the data the tree was fitted on vs. the held-out split.
train_acc_fixed = accuracy_score(y_true=y_train, y_pred=tree_fixed.predict(X_train))
test_acc_fixed = accuracy_score(y_true=y_test, y_pred=tree_fixed.predict(X_test))

# Single print of the joined report; output matches line-by-line printing.
print("\n".join([
    "=== Fixed Tree ===",
    f"  Max Depth: {tree_fixed.get_depth()}",
    f"  Number of Leaves: {tree_fixed.get_n_leaves()}",
    f"  Train Accuracy: {train_acc_fixed:.1%}",
    f"  Test Accuracy:  {test_acc_fixed:.1%}",
    f"  Gap: {train_acc_fixed - test_acc_fixed:.1%}",
]))

In [None]:
# Compare to overfit tree
# Baseline for the comparison: the unconstrained configuration (depth up to 20,
# single-sample leaves allowed), fitted on the same training split.
tree_overfit = DecisionTreeClassifier(max_depth=20, min_samples_leaf=1, random_state=42)
tree_overfit.fit(X_train, y_train)

train_acc = accuracy_score(y_train, tree_overfit.predict(X_train))
test_acc = accuracy_score(y_test, tree_overfit.predict(X_test))

# Side-by-side table: overfit tree in the first column, fixed tree in the second.
print("\n=== Comparison ===")
print(f"                    Overfit Tree    Fixed Tree")
print(f"  Depth:            {tree_overfit.get_depth():>8}        {tree_fixed.get_depth():>8}")
print(f"  Leaves:           {tree_overfit.get_n_leaves():>8}        {tree_fixed.get_n_leaves():>8}")
print(f"  Train Acc:        {train_acc:>8.1%}        {train_acc_fixed:>8.1%}")
print(f"  Test Acc:         {test_acc:>8.1%}        {test_acc_fixed:>8.1%}")
print(f"  Gap:              {train_acc - test_acc:>8.1%}        {train_acc_fixed - test_acc_fixed:>8.1%}")
# The `+` flag in the format spec emits the sign itself, so a regression prints
# as "-x.x%" instead of the malformed "+-x.x%" a hard-coded plus would produce.
print(f"\n  Test improvement: {test_acc_fixed - test_acc:+.1%}")

In [None]:
# Visualize fixed tree
# Explicit figure/axes handles instead of the implicit pyplot "current axes".
fig, ax = plt.subplots(figsize=(16, 10))
plot_tree(
    tree_fixed,
    ax=ax,
    feature_names=available_features,
    class_names=['Retained', 'Churned'],
    filled=True,
    rounded=True,
    fontsize=10,
)
ax.set_title('Fixed Decision Tree (Interpretable!)')
fig.tight_layout()
plt.show()

## Sample Postmortem

### What happened:
- The tree had 99% train accuracy but only 65% test accuracy
- The 30%+ gap indicated severe overfitting

### Root cause:
- max_depth=20 was too deep — the tree created thousands of tiny leaves
- min_samples_leaf=1 allowed single-sample leaves
- The tree memorized training data noise instead of learning patterns

### How to prevent:
- Always compare train vs test accuracy before deployment
- Use max_depth=3-5 as a starting point
- Require min_samples_leaf >= 10 to prevent memorization
- A large train-test gap (>15%) is a red flag for overfitting