# L04: Random Forests

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Digital-AI-Finance/methods-algorithms/blob/master/notebooks/L04_random_forests.ipynb)

**Course**: Methods and Algorithms - MSc Data Science

---

## Learning Objectives

By the end of this notebook, you will be able to:

1. Build decision trees and understand splitting criteria
2. Implement Random Forests with bootstrap and feature randomization
3. Interpret feature importance for business decisions
4. Use OOB error for model validation

## Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Seed NumPy's global generator so every stochastic step below is repeatable.
np.random.seed(42)

# Notebook-wide matplotlib defaults (figure size in inches, base font size).
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'font.size': 12,
})

print('Setup complete!')

## 1. Generate Fraud Detection Data

In [None]:
# Simulate fraud detection dataset
# Re-seed here (same seed as the setup cell) so that re-running this cell on
# its own regenerates exactly the same data instead of continuing the global
# random stream from wherever previous executions left it. On a fresh
# "Restart & Run All" the generated data is unchanged.
np.random.seed(42)

n_samples = 1000
fraud_rate = 0.1

n_fraud = int(n_samples * fraud_rate)
n_normal = n_samples - n_fraud

# Normal transactions: moderate amounts, daytime hours, mostly domestic,
# older accounts, frequent activity, small distance from the usual location.
# NOTE: the draws happen in dict order; reordering keys changes the data.
normal_data = {
    'amount': np.random.lognormal(4, 1, n_normal),
    'hour': np.random.choice(range(8, 22), n_normal),
    'is_foreign': np.random.binomial(1, 0.1, n_normal),
    'previous_fraud': np.random.binomial(1, 0.02, n_normal),
    'account_age_days': np.random.randint(100, 2000, n_normal),
    'transaction_freq': np.random.uniform(5, 20, n_normal),
    'device_changed': np.random.binomial(1, 0.05, n_normal),
    'location_distance': np.random.exponential(20, n_normal),
    'is_fraud': np.zeros(n_normal)
}

# Fraudulent transactions (different patterns): larger and more dispersed
# amounts, night-time hours, mostly foreign, very young accounts, low
# activity, frequent device changes, large distances.
fraud_data = {
    'amount': np.random.lognormal(6, 1.5, n_fraud),
    'hour': np.random.choice([0, 1, 2, 3, 4, 22, 23], n_fraud),
    'is_foreign': np.random.binomial(1, 0.7, n_fraud),
    'previous_fraud': np.random.binomial(1, 0.3, n_fraud),
    'account_age_days': np.random.randint(10, 100, n_fraud),
    'transaction_freq': np.random.uniform(0.5, 3, n_fraud),
    'device_changed': np.random.binomial(1, 0.6, n_fraud),
    'location_distance': np.random.exponential(500, n_fraud),
    'is_fraud': np.ones(n_fraud)
}

# Combine datasets
df = pd.concat([pd.DataFrame(normal_data), pd.DataFrame(fraud_data)], ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)  # Shuffle

print(f'Dataset shape: {df.shape}')
print(f'Fraud rate: {df.is_fraud.mean():.1%}')
df.head()

In [None]:
# Visualize data distribution
def _plot_class_hist(ax, column, bins, xlabel, title):
    """Overlay Normal vs Fraud histograms of `df[column]` on `ax`.

    Both classes are binned on the same `bins` edges, so the bar heights of
    the two histograms are directly comparable (same bin widths/positions).
    """
    ax.hist(df.loc[df.is_fraud == 0, column], bins=bins, alpha=0.6, label='Normal', color='blue')
    ax.hist(df.loc[df.is_fraud == 1, column], bins=bins, alpha=0.6, label='Fraud', color='red')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.set_title(title)


fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Amount distribution: amounts are drawn from log-normal distributions, so
# use log-spaced shared bins and a log x-axis to make both classes visible.
amount_bins = np.geomspace(df.amount.min(), df.amount.max(), 31)
_plot_class_hist(axes[0, 0], 'amount', amount_bins,
                 'Transaction Amount', 'Amount Distribution')
axes[0, 0].set_xscale('log')

# Hour distribution: one bin per hour of the day, centred on the integer hour.
hour_bins = np.arange(25) - 0.5
_plot_class_hist(axes[0, 1], 'hour', hour_bins,
                 'Hour of Day', 'Transaction Hour')

# Account age vs Amount: one scatter per class so the colours get a legend.
for class_label, class_color, class_value in [('Normal', 'blue', 0), ('Fraud', 'red', 1)]:
    class_rows = df[df.is_fraud == class_value]
    axes[1, 0].scatter(class_rows.account_age_days, class_rows.amount,
                       color=class_color, alpha=0.5, s=30, label=class_label)
axes[1, 0].set_xlabel('Account Age (days)')
axes[1, 0].set_ylabel('Amount')
axes[1, 0].legend()
axes[1, 0].set_title('Account Age vs Amount')

# Location distance: 30 equal-width bins spanning both classes.
distance_bins = np.histogram_bin_edges(df.location_distance, bins=30)
_plot_class_hist(axes[1, 1], 'location_distance', distance_bins,
                 'Location Distance (km)', 'Location Distance')

plt.tight_layout()
plt.show()

## 2. Single Decision Tree

In [None]:
# Prepare data: feature matrix X and binary target y (1 = fraud)
feature_cols = ['amount', 'hour', 'is_foreign', 'previous_fraud', 
                'account_age_days', 'transaction_freq', 'device_changed', 'location_distance']
X = df[feature_cols].values
y = df['is_fraud'].values

# Split data (80/20). Stratify on y so the rare fraud class (~10%) keeps the
# same proportion in both the training and the test set; an unstratified
# split of imbalanced data can leave the test set with too few frauds.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f'Training samples: {len(X_train)}')
print(f'Test samples: {len(X_test)}')

In [None]:
# Fit one shallow decision tree (depth capped at 4) as the baseline model.
# `fit` returns the estimator itself, so construction and training chain.
tree = DecisionTreeClassifier(max_depth=4, random_state=42).fit(X_train, y_train)

# Accuracy on the data the tree was trained on vs. the held-out test set.
train_acc, test_acc = (tree.score(features, labels)
                       for features, labels in ((X_train, y_train), (X_test, y_test)))

print(f'Single Tree - Train Accuracy: {train_acc:.3f}')
print(f'Single Tree - Test Accuracy: {test_acc:.3f}')

In [None]:
# Draw the fitted tree on an explicit, wide figure so the node text fits.
tree_fig, tree_ax = plt.subplots(figsize=(20, 10))
plot_tree(
    tree,
    feature_names=feature_cols,
    class_names=['Normal', 'Fraud'],
    filled=True,
    rounded=True,
    fontsize=10,
    ax=tree_ax,
)
tree_ax.set_title('Decision Tree for Fraud Detection')
tree_fig.tight_layout()
plt.show()

In [None]:
# Show tree variance - train the same tree configuration on 20 different
# random halves of the training data and measure how much the test accuracy
# moves. The tree's own random_state is fixed, so the spread comes only from
# which training samples the tree sees.
accuracies = []
for seed in range(20):
    X_sample, _, y_sample, _ = train_test_split(X_train, y_train, 
                                                 test_size=0.5, random_state=seed)
    tree_temp = DecisionTreeClassifier(max_depth=4, random_state=42)
    tree_temp.fit(X_sample, y_sample)
    accuracies.append(tree_temp.score(X_test, y_test))

# np.std returns a standard deviation, so label it as one (not "variance").
print(f'Single Tree Accuracy Std Dev: {np.std(accuracies):.3f}')
print(f'Accuracy Range: [{min(accuracies):.3f}, {max(accuracies):.3f}]')

## 3. Random Forest

In [None]:
# Fit a 100-tree Random Forest; oob_score=True also records the
# out-of-bag accuracy estimate during training.
rf = RandomForestClassifier(
    n_estimators=100,
    max_features='sqrt',
    oob_score=True,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Accuracy on training data, on out-of-bag samples, and on the test set.
train_acc = accuracy_score(y_train, rf.predict(X_train))
oob_score = rf.oob_score_
test_acc = accuracy_score(y_test, rf.predict(X_test))

print(f'Random Forest - Train Accuracy: {train_acc:.3f}')
print(f'Random Forest - OOB Score: {oob_score:.3f}')
print(f'Random Forest - Test Accuracy: {test_acc:.3f}')

In [None]:
# OOB error vs number of trees
# Start at 20 trees and go up to 200 inclusive. With only a handful of trees
# many training samples are never out-of-bag, so scikit-learn warns and the
# OOB estimate for those forest sizes is not reliable.
n_trees_range = range(20, 201, 10)
oob_errors = []

# Grow ONE forest incrementally: with warm_start=True each fit() keeps the
# trees already built and only adds the missing ones, so 200 trees are
# trained in total instead of refitting every forest size from scratch.
rf_temp = RandomForestClassifier(max_features='sqrt', oob_score=True,
                                 warm_start=True, random_state=42, n_jobs=-1)

for n_trees in n_trees_range:
    rf_temp.set_params(n_estimators=n_trees)
    rf_temp.fit(X_train, y_train)
    oob_errors.append(1 - rf_temp.oob_score_)  # error = 1 - OOB accuracy

plt.figure(figsize=(10, 6))
plt.plot(n_trees_range, oob_errors, 'o-', linewidth=2, markersize=6)
plt.xlabel('Number of Trees')
plt.ylabel('OOB Error Rate')
plt.title('OOB Error vs Number of Trees')
plt.grid(True, alpha=0.3)
plt.show()

## 4. Feature Importance

In [None]:
# Impurity-based importances from the fitted forest, sorted ascending so the
# most important feature ends up as the top bar of the horizontal chart.
importances = rf.feature_importances_
indices = np.argsort(importances)
sorted_names = [feature_cols[k] for k in indices]
sorted_scores = importances[indices]
bar_positions = range(len(feature_cols))

# Horizontal bar chart
mdi_fig, mdi_ax = plt.subplots(figsize=(10, 6))
mdi_ax.barh(bar_positions, sorted_scores, align='center')
mdi_ax.set_yticks(bar_positions)
mdi_ax.set_yticklabels(sorted_names)
mdi_ax.set_xlabel('Mean Decrease in Impurity')
mdi_ax.set_title('Feature Importance (Random Forest)')
mdi_fig.tight_layout()
plt.show()

# Text ranking, most important first
print('\nFeature Importance Ranking:')
for name, score in zip(sorted_names[::-1], sorted_scores[::-1]):
    print(f'  {name}: {score:.4f}')

In [None]:
# Translate the four most important features into suggested business actions.
print('\n=== Business Interpretation ===')
print('\nTop Fraud Indicators:')

# Hand-written action per feature; features without an entry print "N/A".
strategies = dict(
    location_distance='Flag transactions >500km from usual location',
    amount='Review transactions >$1000, especially for new accounts',
    account_age_days='New accounts (<100 days) require extra verification',
    is_foreign='Foreign transactions need additional authentication',
    device_changed='Send verification when new device detected',
    hour='Flag transactions between midnight and 5am',
    transaction_freq='Monitor sudden changes in transaction frequency',
    previous_fraud='High-risk customers need enhanced monitoring',
)

# `indices` is sorted ascending by importance, so reverse it and keep four.
top_features = [(feature_cols[k], importances[k]) for k in np.flip(indices)[:4]]

for feature, importance in top_features:
    action = strategies.get(feature, "N/A")
    print(f'\n{feature} (importance: {importance:.3f})')
    print(f'  Strategy: {action}')

## 5. Model Comparison

In [None]:
# Compare single tree vs random forest on ranking quality (ROC / AUC)
from sklearn.metrics import roc_auc_score, roc_curve

# Predicted probability of the positive (fraud) class for each model
tree_proba = tree.predict_proba(X_test)[:, 1]
rf_proba = rf.predict_proba(X_test)[:, 1]

# ROC points (thresholds are not needed, so keep only FPR and TPR)
tree_fpr, tree_tpr = roc_curve(y_test, tree_proba)[:2]
rf_fpr, rf_tpr = roc_curve(y_test, rf_proba)[:2]

# Area under each curve
tree_auc = roc_auc_score(y_test, tree_proba)
rf_auc = roc_auc_score(y_test, rf_proba)

roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
for fpr, tpr, curve_label in (
    (tree_fpr, tree_tpr, f'Single Tree (AUC = {tree_auc:.3f})'),
    (rf_fpr, rf_tpr, f'Random Forest (AUC = {rf_auc:.3f})'),
):
    roc_ax.plot(fpr, tpr, label=curve_label, linewidth=2)
roc_ax.plot([0, 1], [0, 1], 'k--', alpha=0.5)  # chance-level diagonal
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve Comparison')
roc_ax.legend()
roc_ax.grid(True, alpha=0.3)
plt.show()

In [None]:
# Confusion matrix for Random Forest (rows = actual class, columns = predicted)
y_pred = rf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(6, 5))
plt.imshow(cm, interpolation='nearest', cmap='Blues')
plt.colorbar()
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix (Random Forest)')
plt.xticks([0, 1], ['Normal', 'Fraud'])
plt.yticks([0, 1], ['Normal', 'Fraud'])

# Annotate every cell with its count. Cells above half the maximum count are
# drawn dark by the 'Blues' colormap, so switch to white text there to keep
# the numbers readable. Loop over cm's actual shape instead of assuming 2x2.
contrast_threshold = cm.max() / 2
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        text_color = 'white' if cm[i, j] > contrast_threshold else 'black'
        plt.text(j, i, cm[i, j], ha='center', va='center', fontsize=14,
                 color=text_color)

plt.tight_layout()
plt.show()

print('\nClassification Report:')
print(classification_report(y_test, y_pred, target_names=['Normal', 'Fraud']))

## Exercises

### Exercise 1: Hyperparameter Tuning
Tune `max_features` using OOB error. Compare 'sqrt', 'log2', and 0.3.

In [None]:
# Solution: Compare different max_features values using OOB score
from sklearn.model_selection import GridSearchCV

# Method 1: Manual comparison using OOB score
# NOTE: feature_cols has 8 features. Per the scikit-learn documentation,
# 'sqrt' resolves to int(sqrt(8)) = 2 and 0.3 to int(0.3 * 8) = 2 features
# per split ('log2' gives 3), so 'sqrt' and 0.3 are expected to score
# identically on this dataset.
max_features_options = ['sqrt', 'log2', 0.3]
results = []

print("Comparing max_features using OOB Score:")
print("-" * 45)

for mf in max_features_options:
    rf_temp = RandomForestClassifier(
        n_estimators=100,
        max_features=mf,
        oob_score=True,
        random_state=42,
        n_jobs=-1
    )
    rf_temp.fit(X_train, y_train)
    test_acc = rf_temp.score(X_test, y_test)
    results.append({
        'max_features': mf,
        'oob_score': rf_temp.oob_score_,
        'test_accuracy': test_acc
    })
    print(f"max_features={str(mf):6s}: OOB={rf_temp.oob_score_:.3f}, Test Acc={test_acc:.3f}")

# Pick the winner by OOB score only; test accuracy is shown for reference
# and must not drive the choice. Ties keep the first option listed.
best_by_oob = max(results, key=lambda row: row['oob_score'])
print(f"\nBest max_features by OOB score: {best_by_oob['max_features']}")

# Method 2: Grid search with cross-validation
# The grid includes max_features (the parameter this exercise is about)
# together with two tree-size controls; n_estimators stays at 100 to match
# Method 1. oob_score is left off: GridSearchCV scores on the CV folds, so
# computing an OOB estimate for every candidate would be wasted work.
print("\nGrid Search with Cross-Validation:")
print("-" * 45)

param_grid = {
    'max_features': max_features_options,
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(
    RandomForestClassifier(n_estimators=100, random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.3f}")
print(f"Test accuracy with best params: {grid_search.score(X_test, y_test):.3f}")

### Exercise 2: Permutation Importance
Calculate permutation importance on the test set and compare it to the impurity-based importance (MDI, mean decrease in impurity) from Section 4.

In [None]:
# Solution: Calculate permutation importance and compare to MDI
from sklearn.inspection import permutation_importance

# Shuffle each feature 10 times on the test set and record the score drop
perm_importance = permutation_importance(
    rf, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1
)
mdi_scores = rf.feature_importances_
perm_means = perm_importance.importances_mean
perm_stds = perm_importance.importances_std

# Side-by-side table, most important (by permutation) first
comparison_df = pd.DataFrame({
    'Feature': feature_cols,
    'MDI (Mean Decrease Impurity)': mdi_scores,
    'Permutation Importance': perm_means,
    'Perm Std': perm_stds
}).sort_values('Permutation Importance', ascending=False)

print("Feature Importance Comparison: MDI vs Permutation")
print("=" * 65)
print(comparison_df.to_string(index=False))

# Two horizontal bar charts, each sorted ascending by its own measure
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes
bar_slots = range(len(feature_cols))

# Left panel: impurity-based (MDI) importance
indices_mdi = np.argsort(mdi_scores)
ax1.barh(bar_slots, mdi_scores[indices_mdi], color='steelblue')
ax1.set_yticks(bar_slots)
ax1.set_yticklabels([feature_cols[k] for k in indices_mdi])
ax1.set(xlabel='Mean Decrease in Impurity', title='MDI Feature Importance')

# Right panel: permutation importance with +/- one std error bars
indices_perm = np.argsort(perm_means)
ax2.barh(bar_slots, perm_means[indices_perm], xerr=perm_stds[indices_perm],
         color='darkorange', capsize=3)
ax2.set_yticks(bar_slots)
ax2.set_yticklabels([feature_cols[k] for k in indices_perm])
ax2.set(xlabel='Mean Accuracy Decrease', title='Permutation Feature Importance')

plt.tight_layout()
plt.show()

print("\nKey Insight: Permutation importance is generally more reliable than MDI")
print("because MDI can be biased toward high-cardinality features.")

## Summary

Key takeaways:

1. **Decision Trees** split data using impurity criteria (Gini, entropy)
2. **Random Forests** combine many trees with bootstrap + feature randomization
3. **OOB Error** provides a free, cross-validation-like estimate of generalization error, with no separate validation set needed
4. **Feature Importance** helps interpret and explain model decisions
5. **Ensembles** reduce variance by averaging many decorrelated trees, without substantially increasing bias