# **Random Forest with RandomizedSearchCV**
# **CMPT 459 Course Project**

This notebook demonstrates **Random Forest classification** with **hyperparameter tuning** using RandomizedSearchCV.

**Reference**: `classification_analysis.py`, `random_forest.py`


In [5]:
import sys
import os

# Find project root by looking for 'data' directory
def find_project_root():
    """Return the directory that contains ``data/diabetic_data.csv``.

    The search walks upward from the current working directory and tests
    each ancestor in turn (the walk stops before testing the filesystem
    root itself). If nothing is found, the working directory and its
    parent are probed once more through relative paths.

    Returns:
        str: path of the first directory holding ``data/diabetic_data.csv``.

    Raises:
        FileNotFoundError: if the data file cannot be located.
    """
    marker = os.path.join('data', 'diabetic_data.csv')

    candidate = os.path.abspath(os.getcwd())
    parent = os.path.dirname(candidate)
    while candidate != parent:  # dirname(root) == root, so the walk ends at the top
        if os.path.exists(os.path.join(candidate, marker)):
            return candidate
        candidate, parent = parent, os.path.dirname(parent)

    # Last-resort probes relative to the working directory.
    if os.path.exists('data/diabetic_data.csv'):
        return os.getcwd()
    if os.path.exists('../data/diabetic_data.csv'):
        return os.path.dirname(os.getcwd())
    raise FileNotFoundError("Could not find project root. Please ensure 'data/diabetic_data.csv' exists.")

# Set project root and make it importable so that project modules
# (e.g. random_forest, imported below) can be found.
PROJECT_ROOT = find_project_root()
# Re-running this cell must not stack duplicate entries on sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Try to import seaborn, but make it optional
try:
    import seaborn as sns
    HAS_SEABORN = True
except ImportError:
    HAS_SEABORN = False
    print("Warning: seaborn not available. Confusion matrix plots will use matplotlib instead.")

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_curve, roc_auc_score
)
from random_forest import RandomForest

# Report which directory was resolved as the project root. Reaching the
# second line already means `from random_forest import RandomForest`
# succeeded, so it acts as a simple "import OK" marker.
print(f"Project root: {PROJECT_ROOT}")
print(f"RandomForest imported successfully: {RandomForest is not None}")

Project root: /Users/hoang/CMPT-459-Course-Project
RandomForest imported successfully: True


## **1. Data Preprocessing**


In [6]:
def load_and_preprocess(path):
    """Load the CSV at ``path`` and return features ``X`` and binary target ``y``.

    Steps, in order:
      1. Treat ``"?"`` as missing and drop columns with fewer than 40%
         non-missing values.
      2. Fill remaining missing values in object columns with ``"Unknown"``.
      3. Build ``readmitted_binary`` from ``readmitted``
         (``"NO"`` -> 0, ``">30"`` / ``"<30"`` -> 1).
      4. Label-encode object columns with fewer than 10 distinct values and
         one-hot encode the others (first level dropped, prefix truncated to
         10 characters).
      5. Drop ``encounter_id``, ``patient_nbr`` and ``readmitted``.
      6. Standardize all int64/float64 feature columns.

    Args:
        path: location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the processed feature DataFrame and
        ``y`` is the ``readmitted_binary`` Series.

    Raises:
        ValueError: if ``readmitted`` contains a label other than
            ``"NO"``, ``">30"`` or ``"<30"``.

    NOTE(review): the scaler is fitted on the full dataset, so a train/test
    split performed afterwards on the returned ``X`` lets test-set statistics
    influence the training features. Fixing that requires scaling after the
    split, which would change what this function returns.
    """
    df = pd.read_csv(path)
    print(f"Original shape: {df.shape}")

    # "?" marks a missing value; convert it so pandas can count it as NaN.
    df = df.replace("?", np.nan)
    # Keep only columns that have at least 40% non-missing entries.
    threshold = 0.4 * len(df)
    df = df.dropna(thresh=threshold, axis=1)
    for col in df.select_dtypes(include="object").columns:
        df[col] = df[col].fillna("Unknown")

    df["readmitted_binary"] = df["readmitted"].map({"NO": 0, ">30": 1, "<30": 1})
    # Series.map leaves NaN for labels missing from the mapping; fail loudly
    # rather than handing NaN targets to the model.
    unmapped = df["readmitted_binary"].isna()
    if unmapped.any():
        bad_labels = sorted(df.loc[unmapped, "readmitted"].astype(str).unique())
        raise ValueError(f"Unexpected 'readmitted' labels: {bad_labels}")

    cat_cols = df.select_dtypes(include="object").columns.drop(["readmitted"], errors="ignore")
    le = LabelEncoder()
    one_hot_cols = []
    for col in cat_cols:
        if df[col].nunique() < 10:
            # Low-cardinality column: encoded in place as integer codes.
            df[col] = le.fit_transform(df[col].astype(str))
        else:
            one_hot_cols.append(col)
    if one_hot_cols:
        # One call for all high-cardinality columns: same resulting columns
        # and order as encoding them one by one, without rebuilding the
        # (increasingly wide) frame on every iteration.
        df = pd.get_dummies(
            df,
            columns=one_hot_cols,
            drop_first=True,
            prefix=[col[:10] for col in one_hot_cols],
        )

    # Identifier columns and the raw label are not features.
    df = df.drop(columns=["encounter_id", "patient_nbr", "readmitted"], errors="ignore")

    X = df.drop(columns=["readmitted_binary"])
    y = df["readmitted_binary"]

    num_cols = X.select_dtypes(include=["int64", "float64"]).columns
    if len(num_cols) > 0:  # StandardScaler rejects an input with zero features
        scaler = StandardScaler()
        X[num_cols] = scaler.fit_transform(X[num_cols])
    print(f"Final shape: {X.shape}")
    return X, y

# Use absolute path to data file.
# PROJECT_ROOT normally comes from the setup cell. If it is missing, recompute
# it with the find_project_root() defined there instead of declaring a second,
# slightly different copy of that function in this cell.
if 'PROJECT_ROOT' not in globals():
    if 'find_project_root' not in globals():
        raise NameError("Please run Cell 1 first to define find_project_root and PROJECT_ROOT.")
    PROJECT_ROOT = find_project_root()

data_path = os.path.join(PROJECT_ROOT, "data", "diabetic_data.csv")
X, y = load_and_preprocess(data_path)

Original shape: (101766, 50)
Final shape: (101766, 2389)


## **2. Train/Test Split**


In [7]:
# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# same class proportions, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(f"Training: {X_train.shape}, Test: {X_test.shape}")

Training: (81412, 2389), Test: (20354, 2389)


## **3. Baseline Random Forest**


In [8]:
# Guard against out-of-order execution: this cell needs the RandomForest
# import and the train/test split produced by earlier cells.
if 'RandomForest' not in globals():
    raise NameError("Please run Cell 1 first to import RandomForest from random_forest module.")
if not ('X_train' in globals() and 'y_train' in globals()):
    raise NameError("Please run Cells 3 and 5 first to load and split the data.")

# Fit an untuned reference model with fixed hyperparameters.
baseline_rf = RandomForest(n_estimators=20, max_depth=10, random_state=42)
baseline_rf.fit(X_train, y_train)

# Hard predictions and class probabilities on the held-out split.
y_pred_base = baseline_rf.predict(X_test)
y_proba_base = baseline_rf.predict_proba(X_test)

# Column 1 of the probability output is used as the positive-class score.
acc_base, f1_base, auc_base = (
    accuracy_score(y_test, y_pred_base),
    f1_score(y_test, y_pred_base, average="weighted"),
    roc_auc_score(y_test, y_proba_base[:, 1]),
)
print(f"Baseline - Accuracy: {acc_base:.4f}, F1: {f1_base:.4f}, AUC: {auc_base:.4f}")

Training Random Forest with 20 trees...
  Training tree 1/20...

KeyboardInterrupt: 

## **4. RandomizedSearchCV**


In [None]:
# Candidate values per hyperparameter; the search samples combinations of these.
param_dist = dict(
    n_estimators=[10, 20, 50, 100, 200],
    max_depth=[5, 10, 15, 20, 30, None],
    min_samples_split=[2, 5, 10, 15],
    max_features=["sqrt", "log2", None],
    criterion=["gini", "entropy"],
)

print("Running RandomizedSearchCV...")
# 20 sampled configurations x 5 folds = 100 cross-validation fits,
# scored by weighted F1 and run on all available cores.
random_search = RandomizedSearchCV(
    estimator=RandomForest(random_state=42),
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring="f1_weighted",
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
random_search.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated score.
print("\nBest parameters:")
for param, chosen in random_search.best_params_.items():
    print(f"  {param}: {chosen}")
print(f"Best CV F1: {random_search.best_score_:.4f}")

## **5. Tuned Model Evaluation**


In [None]:
# Evaluate the best configuration found by the search on the held-out split.
best_rf = random_search.best_estimator_

y_pred_tuned = best_rf.predict(X_test)
y_proba_tuned = best_rf.predict_proba(X_test)

# Same three metrics as the baseline so the numbers are directly comparable.
acc_tuned, f1_tuned, auc_tuned = (
    accuracy_score(y_test, y_pred_tuned),
    f1_score(y_test, y_pred_tuned, average="weighted"),
    roc_auc_score(y_test, y_proba_tuned[:, 1]),
)
print(f"Tuned - Accuracy: {acc_tuned:.4f}, F1: {f1_tuned:.4f}, AUC: {auc_tuned:.4f}")

# Signed differences (tuned minus baseline); positive means tuning helped.
print(f"\nImprovement:")
print(f"  Accuracy: {acc_tuned - acc_base:+.4f}")
print(f"  F1: {f1_tuned - f1_base:+.4f}")
print(f"  AUC: {auc_tuned - auc_base:+.4f}")

## **6. Visualizations**


**Saved Result - Cross-Validation Scores:**

<img src="../randomforest_results/cv_scores.png" alt="CV Scores Comparison" width="800"/>


In [None]:
# Comparison plot: grouped bars, baseline next to tuned for each metric.
metrics = ["Accuracy", "F1 Score", "AUC-ROC"]
baseline_vals = [acc_base, f1_base, auc_base]
tuned_vals = [acc_tuned, f1_tuned, auc_tuned]

width = 0.35
x = np.arange(len(metrics))

fig, ax = plt.subplots(figsize=(10, 6))
# Each series is shifted half a bar width away from the tick position.
for shift, series, series_label, series_color in (
    (-width/2, baseline_vals, "Baseline", "steelblue"),
    (+width/2, tuned_vals, "Tuned", "darkorange"),
):
    ax.bar(x + shift, series, width, label=series_label, color=series_color)

ax.set(ylabel="Score", title="Baseline vs Tuned Model", ylim=(0, 1))
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()
fig.tight_layout()
plt.show()

**Saved Results - Confusion Matrices:**

**Training Set:**

<img src="../randomforest_results/confusion_matrix_train.png" alt="Confusion Matrix - Train" width="600"/>

**Test Set:**

<img src="../randomforest_results/confusion_matrix_test.png" alt="Confusion Matrix - Test" width="600"/>


In [None]:
# Confusion matrices
def _draw_confusion_matrix(ax, cm, cmap):
    """Draw one annotated confusion matrix on ``ax``.

    Uses seaborn's heatmap when seaborn was imported (HAS_SEABORN), otherwise
    falls back to a plain matplotlib image with the counts written per cell.

    Args:
        ax: matplotlib Axes to draw on.
        cm: square confusion matrix as returned by ``confusion_matrix``
            (rows are true labels, columns are predicted labels).
        cmap: name of the colormap to use.
    """
    if HAS_SEABORN:
        sns.heatmap(cm, annot=True, fmt="d", cmap=cmap, ax=ax)
    else:
        # Fallback to matplotlib if seaborn not available
        ax.imshow(cm, cmap=cmap, aspect="auto")
        ticks = range(len(cm))
        ax.set_xticks(ticks)
        ax.set_yticks(ticks)
        for i in ticks:
            for j in ticks:
                # x is the column (predicted), y is the row (true).
                ax.text(j, i, str(cm[i, j]), ha="center", va="center", color="black")
    # Label the axes so the panel can be read without knowing the convention.
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")


fig, axes = plt.subplots(1, 2, figsize=(12, 5))
cm_base = confusion_matrix(y_test, y_pred_base)
cm_tuned = confusion_matrix(y_test, y_pred_tuned)

for panel, matrix, colormap in (
    (axes[0], cm_base, "Blues"),
    (axes[1], cm_tuned, "Oranges"),
):
    _draw_confusion_matrix(panel, matrix, colormap)

axes[0].set_title(f"Baseline (Acc={acc_base:.4f})")
axes[1].set_title(f"Tuned (Acc={acc_tuned:.4f})")
plt.tight_layout()
plt.show()

**Saved Results - ROC Curves:**

**Training Set:**

<img src="../randomforest_results/roc_curve_train.png" alt="ROC Curve - Train" width="700"/>

**Test Set:**

<img src="../randomforest_results/roc_curve_test.png" alt="ROC Curve - Test" width="700"/>

**PCA Visualization with Labels:**

<img src="../randomforest_results/readmission_labels_pca.png" alt="Readmission Labels in PCA Space" width="800"/>


In [None]:
# ROC curves: baseline and tuned models on the same axes, scored with the
# positive-class probability (column 1 of predict_proba).
fpr_base, tpr_base, _ = roc_curve(y_test, y_proba_base[:, 1])
fpr_tuned, tpr_tuned, _ = roc_curve(y_test, y_proba_tuned[:, 1])

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr_base, tpr_base, label=f"Baseline (AUC={auc_base:.4f})", color="steelblue")
ax.plot(fpr_tuned, tpr_tuned, label=f"Tuned (AUC={auc_tuned:.4f})", color="darkorange")
# Diagonal reference line: the curve of a random classifier.
ax.plot([0, 1], [0, 1], "k--", label="Random")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve Comparison")
ax.legend()
ax.grid(alpha=0.3)
plt.show()

## **Interpretation & Discussion**

### **RandomizedSearchCV Benefits**
* Faster than GridSearchCV (samples random combinations)
* Often finds good solutions quickly
* Built-in cross-validation

### **Key Hyperparameters**
* **n_estimators**: More trees → better performance (up to a point)
* **max_depth**: Controls complexity and overfitting
* **min_samples_split**: Higher → fewer splits, so shallower, more regularized trees (a form of pre-pruning)
* **max_features**: Controls randomness in feature selection

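### **Typical Improvements**
Indicative ranges for what tuning usually adds over the baseline; compare them against the actual differences printed in Section 5.
* Accuracy: +0.5-2%
* F1 Score: +1-3%
* AUC-ROC: +0.01-0.03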

### **Random Forest Strengths**
* Handles non-linear relationships
* Robust to outliers
* Provides feature importances
* Less prone to overfitting than a single decision tree

### **Best Practices**
1. Start with a baseline (fixed, untuned parameters, e.g. `n_estimators=20`, `max_depth=10` as in Section 3)
2. Use RandomizedSearchCV for tuning
3. Always cross-validate
4. Analyze feature importances
5. Monitor for overfitting
