In [None]:
# %% [markdown]
# # 🎯 RandomForest Pruning Experiment
# 
# Reproduction and validation of the idea from:
# ["Your Random Forest Is Underperforming"](https://blog.dailydoseofds.com/p/your-random-forest-is-underperforming)
# 
# Author: **Andrej Ilin**  
# Date: 2025-10-08  
# 
# ---
# 
# ## Goal
# To test whether selecting the *best-performing trees* in a `RandomForestClassifier`
# can improve model accuracy or inference speed.

# %%
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm

# %%
# ## 1. Dataset Preparation
# Binary classification on sklearn's breast_cancer dataset, split 60 / 20 / 20
# into train / validation / test, stratified on the label at both steps.

data = load_breast_cancer()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data=data.target)

# Step 1: hold out 40% of the rows; the remaining 60% form the training set.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
    stratify=y,
)

# Step 2: cut the held-out part in half -> validation set and test set.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

# Cell output: the shapes of the three feature frames.
X_train.shape, X_val.shape, X_test.shape

# %%
# ## 2. Train a RandomForest
# 200 trees, so the later top-k selection has a sufficiently diverse pool to pick from.

rf = RandomForestClassifier(
    n_estimators=200,  # number of trees in the forest
    random_state=42,   # fixed seed so the fitted forest is reproducible
    n_jobs=-1,         # fit using all available cores
)
rf.fit(X_train, y_train)

# %%
# ## 3. Evaluate each individual tree on the validation set
# We'll compute accuracy per tree and rank them

# The forest passes plain arrays to its sub-estimators during fit, so calling a
# single tree with a DataFrame makes sklearn warn "X has feature names, but
# DecisionTreeClassifier was fitted without feature names". Feed the raw values
# instead; the predictions are identical.
X_val_values = X_val.to_numpy()

tree_scores = []
for i, tree in enumerate(tqdm(rf.estimators_)):
    y_pred_tree = tree.predict(X_val_values)
    acc = accuracy_score(y_val, y_pred_tree)
    # (position of the tree in rf.estimators_, validation accuracy)
    tree_scores.append((i, acc))

# Best trees first. The sort is stable, so equally accurate trees keep their
# original index order.
tree_scores = sorted(tree_scores, key=lambda x: x[1], reverse=True)

# %%
# Ten highest-ranked trees (tree_scores is already sorted by validation accuracy, best first)
ranking_columns = ["Tree ID", "Val Accuracy"]
tree_ranking = pd.DataFrame(data=tree_scores, columns=ranking_columns)
tree_ranking.head(10)

# %%
# ## 4. Test ensembles built from top-k trees
# We'll test several k values to see the trade-off between size and quality.

k_values = [5, 15, 25, 35, 46, 100, 200]
results = []

# tree_scores[:k] silently truncates when k exceeds the number of ranked trees,
# which would produce a row labelled with a k that was never actually evaluated.
assert max(k_values) <= len(tree_scores), "k cannot exceed the number of ranked trees"

# Individual trees were fitted on plain arrays; passing the DataFrame would
# trigger sklearn's "X has feature names" UserWarning, so use the raw values.
X_test_values = X_test.to_numpy()

# Predict once per tree, in validation-rank order, instead of re-predicting the
# same trees for every k. Column j holds the test-set votes of the j-th best tree.
ranked_tree_ids = [idx for idx, _ in tree_scores]
ranked_votes = np.column_stack(
    [rf.estimators_[i].predict(X_test_values) for i in ranked_tree_ids]
)

for k in k_values:
    best_tree_ids = ranked_tree_ids[:k]
    preds = ranked_votes[:, :k]

    # majority vote over 0/1 labels; an exact tie (possible for even k) is not
    # "> 0.5" and therefore resolves to class 0
    y_pred = (np.mean(preds, axis=1) > 0.5).astype(int)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results.append((k, acc, f1))

# One row per k: ensemble size, test accuracy, test F1.
df_results = pd.DataFrame(results, columns=["k", "Accuracy", "F1"])
df_results

# %%
# ## 5. Plot Results

fig, ax = plt.subplots(figsize=(8, 5))

# One curve per metric: (df_results column, marker, legend label)
for column, marker, label in (
    ("Accuracy", "o", "Accuracy"),
    ("F1", "s", "F1-score"),
):
    ax.plot(df_results["k"], df_results[column], marker=marker, label=label)

ax.set_title("RandomForest Pruning: Quality vs Number of Trees")
ax.set_xlabel("Number of Top Trees (k)")
ax.set_ylabel("Score")
ax.legend()
ax.grid(True)

# Write the figure to disk before showing it.
fig.tight_layout()
fig.savefig("rf_pruning_results.png", dpi=200)
plt.show()

# %%
# ## 6. Analysis
# Every number printed here is read from df_results / the fitted forest, so the
# summary cannot contradict the table if k_values or n_estimators are changed.

# Best row by test accuracy (idxmax picks the first such row on ties).
best_acc = df_results["Accuracy"].max()
best_k = df_results.loc[df_results["Accuracy"].idxmax(), "k"]

# The full-forest reference is the row whose k equals the actual forest size,
# rather than a hard-coded 200 that raises IndexError once the settings change.
n_trees = len(rf.estimators_)
full_forest = df_results[df_results["k"] == n_trees]

# Smallest ensemble that was evaluated, used for the size/quality comparison.
smallest_idx = df_results["k"].idxmin()
small_k = int(df_results.loc[smallest_idx, "k"])
small_acc = df_results.loc[smallest_idx, "Accuracy"]
small_f1 = df_results.loc[smallest_idx, "F1"]

print(f"🔹 Best accuracy: {best_acc:.3f} at k = {best_k}")
if full_forest.empty:
    print(f"🔹 Full forest accuracy: n/a (k = {n_trees} is not in k_values)")
else:
    full_acc = full_forest["Accuracy"].iloc[0]
    full_f1 = full_forest["F1"].iloc[0]
    print(f"🔹 Full forest accuracy: {full_acc:.3f}")
    print()
    print(
        f"✅ Result: k = {small_k} scores accuracy {small_acc:.3f} / F1 {small_f1:.3f} "
        f"vs {full_acc:.3f} / {full_f1:.3f} for all {n_trees} trees."
    )
    # Tree-count ratio only: this notebook never times predictions.
    print(
        f"⚙️ That is ~{n_trees / small_k:.0f}× fewer trees per prediction "
        "(inference time itself was not measured)."
    )

# %%
# ## 7. Conclusion
# 
# - Reducing the number of trees from 200 → 5 has **no negative impact** on accuracy or F1.
# - Therefore, the “best-tree selection” technique can speed up inference but **does not improve** model quality.
# - The accuracy gain mentioned in the original article likely comes from:
#     - Information leakage between validation and training splits
#     - Random fluctuations or small-sample instability
# - The method is still practical for model compression or latency-sensitive applications.

# %%
# Final results table: one row per evaluated k with its test Accuracy and F1
# (the same frame that section 4 builds and displays).
df_results
