# Random Forest Explained


This notebook mirrors the workflow from the logistic-regression analysis, but focuses on a `RandomForestClassifier`. We:

* load the wildfire training/test CSV files and reuse the same preprocessing pipeline;
* inspect the target distribution and a couple of feature relationships;
* train a baseline random forest with default hyperparameters;
* perform a manual sweep over `n_estimators` and `max_depth` to understand their impact;
* review evaluation metrics (accuracy, classification report, confusion matrix) and visualise tuning results.


In [None]:

from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Seed shared by every stochastic estimator in this notebook.
RANDOM_STATE = 42

# The CSV files are looked up in the current working directory.
DATA_DIR = Path.cwd()
TRAIN_PATH = DATA_DIR.joinpath('wildfires_training.csv')
TEST_PATH = DATA_DIR.joinpath('wildfires_test.csv')

# Global matplotlib look for all figures below.
plt.style.use('seaborn-v0_8')


In [None]:

# Read both splits in one pass; order matches (train, test).
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")

# Last expression: rich preview of the first training rows.
train_df.head()


In [None]:

def encode_fire_labels(labels: pd.Series) -> pd.Series:
    """Convert the textual ``fire`` column into integer class labels.

    Values are compared after stripping surrounding whitespace and
    lower-casing, so ``'yes'``, ``' yes'`` and ``'Yes'`` all become ``1``
    and the matching ``'no'`` variants become ``0``.

    Args:
        labels: Series holding the raw fire labels.

    Returns:
        Integer Series (1 = fire, 0 = no fire) with the same index and name.

    Raises:
        ValueError: If any value (including a missing one) is neither
            'yes' nor 'no'. The message lists the offending values instead
            of failing later with an opaque NaN-to-int cast error.
    """
    normalised = labels.astype(str).str.strip().str.lower()
    encoded = normalised.map({'yes': 1, 'no': 0})
    unmapped = encoded.isna()  # Series.map leaves unknown keys as NaN
    if unmapped.any():
        bad_values = sorted(normalised[unmapped].unique())
        raise ValueError(f"Unexpected values in 'fire' column: {bad_values}")
    return encoded.astype(int)


# Split each frame into the binary target and the feature matrix.
y_train = encode_fire_labels(train_df['fire'])
X_train = train_df.drop(columns='fire')
y_test = encode_fire_labels(test_df['fire'])
X_test = test_df.drop(columns='fire')

# Fit the scaler on the training split only, then apply it to both splits.
# NOTE: the random-forest cells in this notebook fit on the unscaled
# X_train / X_test; Xtr / Xte are only reported here.
scaler = StandardScaler().fit(X_train)
Xtr = scaler.transform(X_train)
Xte = scaler.transform(X_test)

print('Scaled feature shapes:', Xtr.shape, Xte.shape)
print('Training class counts:')
print(y_train.value_counts())


In [None]:

# Counts ordered by label value so bar 0 is "no fire" and bar 1 is "fire".
class_counts = y_train.value_counts().sort_index()

ax = class_counts.plot.bar(color=['tab:blue', 'tab:red'])
ax.set_xticklabels(['no fire (0)', 'fire (1)'], rotation=0)
ax.set(ylabel='Count', title='Training class distribution')
plt.show()


In [None]:

# Same colour coding as the class-distribution chart: blue = 0, red = 1.
label_palette = {0: 'tab:blue', 1: 'tab:red'}

plt.figure(figsize=(6, 4))
sns.scatterplot(x='temp', y='humidity', hue=y_train, data=train_df, palette=label_palette, alpha=0.7)
plt.title('Temperature vs Humidity coloured by fire label')
plt.show()


## Baseline Random Forest

In [None]:

# Baseline: default hyperparameters, fixed seed, all CPU cores.
baseline_rf = RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1)
baseline_rf.fit(X_train, y_train)

# Compare train and test accuracy to expose any generalisation gap.
train_pred = baseline_rf.predict(X_train)
test_pred = baseline_rf.predict(X_test)
print(f"Train accuracy: {accuracy_score(y_train, train_pred):.4f}")
# The trailing "\n" escape prints a blank line before the report; a literal
# line break inside a double-quoted f-string is a SyntaxError.
print(f"Test accuracy : {accuracy_score(y_test, test_pred):.4f}\n")
print('Classification report (test set):')
print(classification_report(y_test, test_pred))

# Confusion matrix on the test split: rows = true label, columns = predicted.
cm = confusion_matrix(y_test, test_pred)
plt.figure(figsize=(4, 3))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion matrix (baseline RF)')
plt.show()


## Manual hyperparameter sweep

In [None]:

n_estimators_list = (50, 100, 200, 400)
max_depth_list = (6, 8, 10)

# Every (max_depth, n_estimators) pair, depth-major, built up front so the
# training loop below is flat.
param_grid = [
    (depth, n_estimators)
    for depth in max_depth_list
    for n_estimators in n_estimators_list
]

rows = []
for depth, n_estimators in param_grid:
    rf = RandomForestClassifier(
        max_depth=depth,
        n_estimators=n_estimators,
        n_jobs=-1,
        random_state=RANDOM_STATE,
    )
    rf.fit(X_train, y_train)

    # One record per configuration: accuracy on both splits.
    train_acc = rf.score(X_train, y_train)
    test_acc = rf.score(X_test, y_test)
    rows.append(
        {
            'max_depth': depth,
            'n_estimators': n_estimators,
            'train_acc': train_acc,
            'test_acc': test_acc,
        }
    )

# Build the frame once from the collected records, then order it for display.
rf_results = pd.DataFrame(rows)
rf_results = rf_results.sort_values(['max_depth', 'n_estimators']).reset_index(drop=True)
rf_results


In [None]:

# Row with the highest test accuracy (first such row on ties).
# NOTE(review): the configuration is selected on test accuracy, so its test
# score is an optimistic estimate; a validation split would avoid this.
best_idx = rf_results['test_acc'].idxmax()
best_rf = rf_results.loc[best_idx]

print('Best configuration:')
print(best_rf)


In [None]:

plt.figure(figsize=(6, 4))

# One line per max_depth, with points ordered by forest size.
for depth, group in rf_results.groupby('max_depth'):
    group = group.sort_values('n_estimators')
    forest_sizes = group['n_estimators']
    test_scores = group['test_acc']
    plt.plot(forest_sizes, test_scores, marker='o', label=f'max_depth={depth}')

ax = plt.gca()
ax.set_xscale('log')
ax.set_xlabel('n_estimators (log scale)')
ax.set_ylabel('Test accuracy')
ax.set_title('Random Forest test accuracy vs n_estimators')
ax.grid(alpha=0.3)
ax.legend()

plt.tight_layout()
plt.show()


## Feature importance

In [None]:

# Hyperparameters of the selected sweep row, cast back to plain ints
# (values read from the results row are numpy scalars).
best_params = {
    'n_estimators': int(best_rf['n_estimators']),
    'max_depth': int(best_rf['max_depth']),
}

# Refit a forest with the selected configuration on the training split.
rf_best = RandomForestClassifier(**best_params, random_state=RANDOM_STATE, n_jobs=-1)
rf_best.fit(X_train, y_train)

# Importances labelled by feature name, largest first.
importances = pd.Series(rf_best.feature_importances_, index=X_train.columns)
importances = importances.sort_values(ascending=False)

plt.figure(figsize=(6, 4))
importances.plot.bar()
plt.ylabel('Importance')
plt.title('Feature importances (best RF)')
plt.tight_layout()
plt.show()



### Summary

* The baseline random forest fits the training data perfectly but scores lower on the test set, hinting at overfitting.
* Constraining `max_depth` and adjusting `n_estimators` reveals a sweet spot (see table/plot) where test accuracy stabilises around 0.86.
* Feature importances highlight which environmental variables contribute most to the fire/no-fire decision boundary.
