In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error

print("--- TRACK A: COMPETING RANDOM FOREST AGAINST THE XGBOOST CHAMPION ON ORIGINAL DATA ---")

# ==============================================================================
# STEP 1: LOAD AND PREPARE THE ORIGINAL df_analysis.csv DATA
# ==============================================================================
print("\n--- Step 1: Loading and Preparing df_analysis.csv ---")

# Forward slashes resolve on Windows, macOS and Linux alike. The earlier
# backslash literal depended on '\d' being an unrecognised (deprecated) escape
# sequence and could only ever resolve on Windows.
DATA_PATH = '../data/df_analysis.csv'

try:
    # We load the original dataset, not df_historical
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print("❌ FATAL ERROR: df_analysis.csv not found.")
    # Re-raise so this cell fails visibly and "Run All" stops here; calling
    # exit() from a notebook ends the kernel session instead.
    raise

# Build a calendar date from the Year and Day_of_Year columns ('%j' is the
# day-of-year directive) and use it as the index for the time-based steps below.
df['Date'] = pd.to_datetime(
    df['Year'].astype(str) + '-' + df['Day_of_Year'].astype(str),
    format='%Y-%j',
)
df = df.set_index('Date')
print("✅ Successfully loaded and processed df_analysis.csv")



--- TRACK A: COMPETING RANDOM FOREST AGAINST THE XGBOOST CHAMPION ON ORIGINAL DATA ---

--- Step 1: Loading and Preparing df_analysis.csv ---
✅ Successfully loaded and processed df_analysis.csv


In [6]:
# This is the full feature engineering from PHASE 1. No RH features are included.
print("Engineering features to match the Phase 1 Champion model...")

# Engineer on a copy so `df` stays exactly as loaded. The shift/rolling + dropna
# steps below trim rows from the start and the end of the frame; applied in
# place, every re-run of this cell would silently shrink the dataset further.
df_features = df.copy()

# NOTE(review): shift()/rolling() operate on row position, so the lags, windows
# and targets below assume one row per consecutive day — confirm the index has
# no gaps.
df_features['Daily_Temp_Range'] = df_features['Max_Temp_C'] - df_features['Min_Temp_C']

# Lagged observations (1, 2, 3 and 7 rows back) for temperatures and precipitation.
for lag in [1, 2, 3, 7]:
    for col in ['Max_Temp_C', 'Min_Temp_C', 'Precipitation_mm']:
        df_features[f'{col}_lag_{lag}'] = df_features[col].shift(lag)

# Rolling statistics over 7- and 30-row windows (current row included).
for window in [7, 30]:
    for col in ['Max_Temp_C', 'Min_Temp_C']:
        rolling_temp = df_features[col].rolling(window=window)
        df_features[f'{col}_rolling_mean_{window}d'] = rolling_temp.mean()
        df_features[f'{col}_rolling_std_{window}d'] = rolling_temp.std()
    rolling_precip = df_features['Precipitation_mm'].rolling(window=window)
    df_features[f'Precipitation_mm_rolling_sum_{window}d'] = rolling_precip.sum()
    df_features[f'Precipitation_mm_rolling_std_{window}d'] = rolling_precip.std()

df_features['Daily_Temp_Range_lag_1'] = df_features['Daily_Temp_Range'].shift(1)

# Discard rows with any missing value (this includes the leading rows that lack
# a full lag / rolling history).
df_features = df_features.dropna()

# Create targets (3-day horizon) and define X, y
for h in range(1, 4):
    df_features[f'Target_Max_Temp_C_t+{h}'] = df_features['Max_Temp_C'].shift(-h)
    df_features[f'Target_Min_Temp_C_t+{h}'] = df_features['Min_Temp_C'].shift(-h)

target_cols = [col for col in df_features.columns if 'Target_' in str(col)]

# Drop the trailing rows whose future values are unknown. Checking every target
# column (not just the t+3 max) guarantees y contains no NaN for any horizon.
df_features = df_features.dropna(subset=target_cols)

# Same-day Max/Min temperatures are excluded from the features; every other
# non-target column is used as a predictor.
feature_cols = [
    col for col in df_features.columns
    if col not in ['Max_Temp_C', 'Min_Temp_C'] + target_cols
]
X = df_features[feature_cols]
y = df_features[target_cols]

# Use the same train/test split for a fair comparison
split_date = '2020-01-01'
# X and y share df_features' index, so one boolean mask splits both consistently.
is_train = X.index < split_date
X_train, X_test = X[is_train], X[~is_train]
y_train, y_test = y[is_train], y[~is_train]
print("✅ Data prepared for modeling. This is the same data the XGBoost Champion was trained on.")

# ==============================================================================
# STEP 2: TUNE AND TRAIN THE RANDOM FOREST MODEL
# ==============================================================================
print("\n--- Step 2: Tuning Random Forest with RandomizedSearchCV ---")

# Search space sampled by RandomizedSearchCV.
param_dist = dict(
    n_estimators=[100, 250, 400, 500],      # number of trees in the forest
    max_features=[0.5, 0.75, 1.0],          # fraction of features considered per split
    max_depth=[10, 20, 30, 40, None],       # depth cap per tree (None = unlimited)
    min_samples_split=[2, 5, 10, 15],       # samples needed before a node may split
    min_samples_leaf=[1, 2, 4, 6],          # samples required in every leaf
    bootstrap=[True],                       # fixed: sample rows with replacement
)

# Hyperparameters are searched against a single series (next-day max temperature);
# the winning configuration is then reused for every target further down.
primary_target_train = y_train['Target_Max_Temp_C_t+1']

rf_estimator = RandomForestRegressor(
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)

# Ordered folds: each validation fold comes after its training fold in time.
time_series_cv = TimeSeriesSplit(n_splits=5)

random_search = RandomizedSearchCV(
    rf_estimator,
    param_dist,
    n_iter=50,  # number of sampled configurations
    scoring='neg_mean_absolute_error',
    cv=time_series_cv,
    verbose=1,
    random_state=42,
)
random_search.fit(X_train, primary_target_train)

print(f"\n✅ Best parameters found for Random Forest: {random_search.best_params_}")

# One forest per target column, each built with the tuned settings.
tuned_forest = RandomForestRegressor(
    random_state=42,
    n_jobs=-1,
    **random_search.best_params_,
)
final_rf_model = MultiOutputRegressor(tuned_forest)
final_rf_model.fit(X_train, y_train)
print("✅ Final Random Forest model trained.")

# ==============================================================================
# STEP 3: EVALUATE THE RANDOM FOREST CHALLENGER
# ==============================================================================
print("\n--- Step 3: Evaluating Random Forest Performance ---")

# Predict on the hold-out period and label the raw array with the test index
# and target names so each horizon can be looked up by column.
rf_predictions = final_rf_model.predict(X_test)
rf_pred_df = pd.DataFrame(
    data=rf_predictions,
    columns=y_test.columns,
    index=X_test.index,
)

print("\n--- RANDOM FOREST MODEL PERFORMANCE (Challenging the XGBoost Champion) ---")
for horizon in (1, 2, 3):
    max_col = f'Target_Max_Temp_C_t+{horizon}'
    min_col = f'Target_Min_Temp_C_t+{horizon}'

    # MAE and RMSE for the max-temperature forecast at this horizon.
    mae_max = mean_absolute_error(y_test[max_col], rf_pred_df[max_col])
    rmse_max = root_mean_squared_error(y_test[max_col], rf_pred_df[max_col])

    # Same two metrics for the min-temperature forecast.
    mae_min = mean_absolute_error(y_test[min_col], rf_pred_df[min_col])
    rmse_min = root_mean_squared_error(y_test[min_col], rf_pred_df[min_col])

    print(f"Horizon t+{horizon}: Max Temp MAE = {mae_max:.2f}°C | Min Temp MAE = {mae_min:.2f}°C")
    print(f"Horizon t+{horizon}: Max Temp RMSE = {rmse_max:.2f}°C | Min Temp RMSE = {rmse_min:.2f}°C")

Engineering features to match the Phase 1 Champion model...
✅ Data prepared for modeling. This is the same data the XGBoost Champion was trained on.

--- Step 2: Tuning Random Forest with RandomizedSearchCV ---
Fitting 5 folds for each of 50 candidates, totalling 250 fits



✅ Best parameters found for Random Forest: {'n_estimators': 100, 'min_samples_split': 15, 'min_samples_leaf': 6, 'max_features': 0.75, 'max_depth': 20, 'bootstrap': True}
✅ Final Random Forest model trained.

--- Step 3: Evaluating Random Forest Performance ---

--- RANDOM FOREST MODEL PERFORMANCE (Challenging the XGBoost Champion) ---
Horizon t+1: Max Temp MAE = 1.40°C | Min Temp MAE = 0.96°C
Horizon t+1: Max Temp RMSE = 1.98°C | Min Temp RMSE = 1.27°C
Horizon t+2: Max Temp MAE = 1.60°C | Min Temp MAE = 1.07°C
Horizon t+2: Max Temp RMSE = 2.22°C | Min Temp RMSE = 1.43°C
Horizon t+3: Max Temp MAE = 1.68°C | Min Temp MAE = 1.13°C
Horizon t+3: Max Temp RMSE = 2.34°C | Min Temp RMSE = 1.51°C
