# Module 5: Decision Trees for Interpretable Predictions

**Goal:** Build and interpret decision trees, understand overfitting, and extract business rules.

**Prerequisites:** Module 4 (Logistic Regression)

**Expected Runtime:** ~45 minutes

**Outputs:**
- Trained decision tree with visualization
- Overfitting diagnosis via depth/leaf constraints
- Extracted business rules from tree structure
- Tree vs logistic regression performance comparison

---

## Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score
import warnings
# Silence every Python warning so the notebook output stays uncluttered.
# NOTE(review): this blanket filter also hides library warnings raised by later
# cells (e.g. from model fitting) — consider narrowing it if results look odd.
warnings.filterwarnings('ignore')

# Confirmation message that the setup cell ran to completion
print("✓ Libraries loaded")

## 1. Load and Prepare Data

In [None]:
# Base URL of the shared course datasets (raw files served from GitHub)
DATA_URL = 'https://raw.githubusercontent.com/189investmentai/ml-foundations-interactive/main/shared/data/'

# Read the customer table directly from the remote CSV
customers_csv = f"{DATA_URL}streamcart_customers.csv"
customers = pd.read_csv(customers_csv)
print(f"Loaded {len(customers)} customers")

# Preview the first rows (rendered by the notebook as the cell output)
customers.head()

In [None]:
# --- Derived features: computed only when the dataset does not already ship them ---
existing_cols = set(customers.columns)
if 'tenure_days' not in existing_cols:
    # Tenure is measured in whole days up to a fixed reference date
    signup_dates = pd.to_datetime(customers['signup_date'])
    customers['tenure_days'] = (pd.to_datetime('2024-01-01') - signup_dates).dt.days
if 'avg_order_value' not in existing_cols:
    # Zero-order customers are treated as having one order so the division never hits zero
    safe_order_counts = customers['orders_total'].replace(0, 1)
    customers['avg_order_value'] = customers['total_spend'] / safe_order_counts

# --- Feature matrix and target ---
feature_cols = ['tenure_days', 'orders_total', 'total_spend', 'support_tickets_total', 'avg_order_value']
# Keep only the requested features that are actually present, preserving their order
available_features = [col for col in feature_cols if col in customers.columns]
print(f"Features: {available_features}")

X = customers[available_features].fillna(0)  # missing values become 0
y = customers['churn_30d']

# --- Stratified 70/30 hold-out split (fixed seed for reproducibility) ---
split = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split
print(f"\nTrain: {len(X_train)}, Test: {len(X_test)}")
print(f"Churn rate: {y.mean():.1%}")

## 2. Baseline: Logistic Regression

In [None]:
# Baseline for comparison: logistic regression on the same features and split
logreg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Accuracy on both sides of the split, plus the train-test gap
logreg_train_acc = accuracy_score(y_train, logreg.predict(X_train))
logreg_test_acc = accuracy_score(y_test, logreg.predict(X_test))
logreg_gap = logreg_train_acc - logreg_test_acc

print("Logistic Regression Baseline:")
print(f"  Train Accuracy: {logreg_train_acc:.1%}")
print(f"  Test Accuracy: {logreg_test_acc:.1%}")
print(f"  Gap: {logreg_gap:.1%}")

## 3. Unconstrained Tree (Overfitting Demo)

In [None]:
# Grow a tree with no depth or leaf limits to demonstrate overfitting
tree_deep = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score the same model on the training split first, then on the held-out split
deep_train_acc, deep_test_acc = (
    tree_deep.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)
deep_gap = deep_train_acc - deep_test_acc

print("Unconstrained Decision Tree:")
print(f"  Train Accuracy: {deep_train_acc:.1%}")
print(f"  Test Accuracy: {deep_test_acc:.1%}")
print(f"  Gap: {deep_gap:.1%}")
# Size of the fitted tree: a proxy for how complex the learned model is
print(f"\n  Depth: {tree_deep.get_depth()}")
print(f"  Number of leaves: {tree_deep.get_n_leaves()}")

### Self-Check: Overfitting

Notice the huge gap between train and test accuracy! The unconstrained tree has memorized the training data.

## 4. TODO: Find Optimal Depth

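Your task: Try different `max_depth` values and find the one that maximizes TEST accuracy. (To keep this exercise simple we compare depths on the test set; in a real project, tune depth on a separate validation set or with cross-validation, and keep the test set for the final evaluation only.)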

In [None]:
# Depth sweep: fit one tree per max_depth and record train/test accuracy and size

depths = range(1, 15)
results = []

for max_depth in depths:
    candidate = DecisionTreeClassifier(max_depth=max_depth, random_state=42).fit(X_train, y_train)

    # One row per depth; the keys become the DataFrame columns below
    results.append(dict(
        depth=max_depth,
        train_acc=candidate.score(X_train, y_train),
        test_acc=candidate.score(X_test, y_test),
        n_leaves=candidate.get_n_leaves(),
    ))

results_df = pd.DataFrame(results)
results_df

In [None]:
# Two side-by-side panels: accuracy curves (left) and tree size (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
depth_axis = results_df['depth']

# Left panel: train and test accuracy per depth, with the logistic regression test accuracy as reference
for column, line_style, legend_label in (('train_acc', 'b-o', 'Train'), ('test_acc', 'r-o', 'Test')):
    ax1.plot(depth_axis, results_df[column], line_style, label=legend_label, linewidth=2)
ax1.axhline(y=logreg_test_acc, color='green', linestyle='--', label='Logistic Regression')
ax1.set(xlabel='Max Depth', ylabel='Accuracy', title='Train vs Test Accuracy by Depth')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Right panel: leaf count as a measure of model complexity
ax2.plot(depth_axis, results_df['n_leaves'], 'purple', marker='s', linewidth=2)
ax2.set(xlabel='Max Depth', ylabel='Number of Leaves', title='Model Complexity by Depth')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Pick the sweep row with the highest test accuracy (idxmax returns the first one on ties)
optimal_idx = results_df['test_acc'].idxmax()

# Scalar lookups keep each value in its column's dtype, so depth stays an integer
optimal_depth = results_df.at[optimal_idx, 'depth']
best_test_acc = results_df.at[optimal_idx, 'test_acc']
best_train_acc = results_df.at[optimal_idx, 'train_acc']
best_n_leaves = results_df.at[optimal_idx, 'n_leaves']

print(f"=== Optimal max_depth: {optimal_depth} ===")
print(f"Test accuracy: {best_test_acc:.1%}")
print(f"Train accuracy: {best_train_acc:.1%}")
print(f"Number of leaves: {best_n_leaves}")

## 5. Train the Optimal Tree

In [None]:
# Fit the final tree at the depth chosen by the sweep (optimal_depth comes from the previous cell).
# A minimum leaf size of 10 is added on top of the depth limit.

tree_optimal = DecisionTreeClassifier(
    max_depth=optimal_depth,
    min_samples_leaf=10,
    random_state=42,
).fit(X_train, y_train)

# Score once and reuse the numbers in the report below
optimal_train_acc = tree_optimal.score(X_train, y_train)
optimal_test_acc = tree_optimal.score(X_test, y_test)

print(f"=== Optimal Decision Tree (depth={optimal_depth}) ===")
print(f"  Actual Depth: {tree_optimal.get_depth()}")
print(f"  Number of Leaves: {tree_optimal.get_n_leaves()}")
print(f"\n  Train Accuracy: {optimal_train_acc:.1%}")
print(f"  Test Accuracy: {optimal_test_acc:.1%}")
print(f"  Gap: {optimal_train_acc - optimal_test_acc:.1%}")

## 6. Visualize the Tree

In [None]:
# Draw the fitted tree on an explicit, large axes so node labels stay readable
tree_fig, tree_ax = plt.subplots(figsize=(20, 12))
plot_tree(
    tree_optimal,
    feature_names=available_features,
    class_names=['Retained', 'Churned'],
    filled=True,
    rounded=True,
    fontsize=10,
    ax=tree_ax,
)
tree_ax.set_title('Decision Tree for Churn Prediction')
tree_fig.tight_layout()
plt.show()

In [None]:
# Indented plain-text dump of the tree's splits, printed under a heading
tree_rules = export_text(tree_optimal, feature_names=available_features)
print("Decision Tree Rules:", tree_rules, sep="\n")

## 7. Feature Importance

In [None]:
# Rank the features by the importance scores of the fitted tree
importances = (
    pd.DataFrame({
        'feature': available_features,
        'importance': tree_optimal.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)

print("Feature Importances:")
print(importances.to_string(index=False))

# Horizontal bar chart, most important feature at the top
imp_fig, imp_ax = plt.subplots(figsize=(10, 5))
imp_ax.barh(importances['feature'], importances['importance'], color='#22c55e')
imp_ax.set_xlabel('Importance (Gini reduction)')
imp_ax.set_title('Feature Importance in Churn Prediction')
imp_ax.invert_yaxis()  # barh draws the first row at the bottom by default
imp_fig.tight_layout()
plt.show()

## 8. Extract Business Rules

In [None]:
def extract_rules(tree, feature_names, class_names):
    """
    Extract human-readable rules for the positive class from a decision tree.

    Walks the tree from the root, collecting one "<feature> <= / > <threshold>"
    condition per split, and records every leaf whose majority class is index 1
    (the churn class in this notebook) as an "A AND B AND ..." rule.

    Args:
        tree: Fitted estimator exposing a ``tree_`` attribute with the arrays
            ``feature``, ``threshold``, ``children_left``, ``children_right``,
            ``value`` and ``n_node_samples`` (e.g. ``DecisionTreeClassifier``).
        feature_names: Sequence of feature names, indexed by feature id.
        class_names: Sequence of class labels, indexed by class id.

    Returns:
        pandas.DataFrame with columns ``rule``, ``prediction``, ``samples`` and
        ``confidence``, sorted by ``confidence`` descending. The frame is empty
        (but still has these columns) when no leaf predicts class 1.
    """
    tree_ = tree.tree_
    columns = ['rule', 'prediction', 'samples', 'confidence']
    rules = []

    def recurse(node, conditions):
        left = tree_.children_left[node]
        right = tree_.children_right[node]

        # A split node has two distinct children; a leaf stores the same
        # sentinel for both. This avoids importing the private `_tree` module.
        if left != right:
            name = feature_names[tree_.feature[node]]
            threshold = tree_.threshold[node]
            recurse(left, conditions + [f"{name} <= {threshold:.1f}"])
            recurse(right, conditions + [f"{name} > {threshold:.1f}"])
            return

        # Leaf node: per-class weights for the single output
        value = tree_.value[node][0]
        class_idx = int(np.argmax(value))
        if class_idx != 1:  # Only show churn rules
            return

        rules.append({
            'rule': " AND ".join(conditions),
            'prediction': class_names[class_idx],
            # `value` may hold class fractions rather than counts (recent
            # scikit-learn releases), so take the sample count from
            # n_node_samples instead of summing `value`.
            'samples': int(tree_.n_node_samples[node]),
            # The ratio is the same whether `value` holds counts or fractions
            'confidence': float(value[class_idx] / value.sum()),
        })

    recurse(0, [])
    # Passing the columns explicitly keeps sort_values from raising KeyError
    # when no churn leaf was found.
    return pd.DataFrame(rules, columns=columns).sort_values('confidence', ascending=False)

# Build the churn rule table and print the five highest-confidence rules
churn_rules = extract_rules(tree_optimal, available_features, ['Retained', 'Churned'])
print("Top Churn Rules:")
for top_rule in churn_rules.head(5).itertuples(index=False):
    print(f"\nIF {top_rule.rule}")
    print(f"   → {top_rule.prediction} ({top_rule.confidence:.0%} of {top_rule.samples} customers)")

## 9. Compare Models

In [None]:
# Side-by-side summary: (title, test accuracy, interpretability note) per model
comparison = [
    ("Logistic Regression", logreg_test_acc, "Coefficients"),
    (
        f"Decision Tree (depth={optimal_depth})",
        tree_optimal.score(X_test, y_test),
        f"{tree_optimal.get_n_leaves()} rules",
    ),
    ("Unconstrained Tree", deep_test_acc, f"{tree_deep.get_n_leaves()} rules (too many!)"),
]

print("Model Comparison:")
print("=" * 50)
for title, test_accuracy, interpretability in comparison:
    print(f"\n{title}:")
    print(f"  Test Accuracy: {test_accuracy:.1%}")
    print(f"  Interpretability: {interpretability}")

## 10. Stakeholder Summary

### TODO: Write a 3-bullet summary (~100 words) for the support team

Template:
• **The rules:** Top 2-3 rules that identify high-risk customers (e.g., "tenure < 30 days AND support tickets > 2")
• **Accuracy:** Test accuracy ___%, catching ___% of churners with these rules
• **How to use:** [Should they prioritize certain customer segments? What action should they take?]

**Your Summary:**

_[Write your summary here]_

In [None]:
# SELF-CHECK: sanity-test the outcome of the depth experiment
# Run this cell only after the depth experiment and the optimal tree cells

# The depth search must have produced a value inside the searched range
assert 'optimal_depth' in globals(), "Should have found optimal_depth"
assert 1 <= optimal_depth <= 14, f"Optimal depth {optimal_depth} seems unusual"

# The final tree must have been trained
assert 'tree_optimal' in globals(), "Should have trained tree_optimal"

# The final tree should generalize: train and test accuracy must stay close
train_acc, test_acc = tree_optimal.score(X_train, y_train), tree_optimal.score(X_test, y_test)
gap = train_acc - test_acc
assert gap < 0.20, f"Train-test gap ({gap:.1%}) suggests overfitting"

# The final tree should be simpler (fewer leaves) than the unconstrained one
optimal_leaves = tree_optimal.get_n_leaves()
unconstrained_leaves = tree_deep.get_n_leaves()
assert optimal_leaves < unconstrained_leaves, "Optimal tree should have fewer leaves"

print("✅ Self-check passed!")
print(f"   Optimal depth: {optimal_depth}")
print(f"   Train accuracy: {train_acc:.1%}")
print(f"   Test accuracy: {test_acc:.1%}")
print(f"   Train-test gap: {gap:.1%}")
print(f"   Leaves: {optimal_leaves} (vs {unconstrained_leaves} unconstrained)")

---

## Self-Assessment Checklist

- [ ] I identified the overfitting problem with unconstrained trees
- [ ] I found the optimal depth using train/test comparison
- [ ] I can visualize and interpret tree structure
- [ ] I extracted human-readable business rules
- [ ] I compared tree vs logistic regression tradeoffs

## Next Steps

1. **Debug Drill:** Fix an overfit tree
2. **Module 6:** Ensemble Methods — Random Forests and Boosting