
# Random Forest (tree-based) Regression Experiment (seems to be the best model)


In [5]:
import numpy as np
import pandas as pd
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
import sys
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Make the project's helper modules importable. "../lib" is resolved against the
# current working directory, so this depends on where the kernel was started.
# The membership check keeps sys.path free of duplicates when this cell is re-run.
lib_dir = os.path.abspath(os.path.join(os.getcwd(), "../lib"))
if lib_dir not in sys.path:
    sys.path.append(lib_dir)

from data_load import load_data
from validation import rmse_validation, mae_validation, r2_validation

## Load data

In [6]:

# Observation window: two weeks starting at midnight on 2025-04-13.
# date_start is also the zero point for the elapsed-minutes feature built in the
# per-garage loop.
date_start, date_end = (
    pd.to_datetime(stamp)
    for stamp in ("2025-04-13 12:00:00 AM", "2025-04-27 12:00:00 AM")
)

# load_data is the project helper imported from data_load; how it treats the
# window endpoints (inclusive/exclusive) is not visible here -- TODO confirm.
all_data = load_data(date_start, date_end)

# Garage names to model; each one is compared against the 'garage name' column.
garages = [area + " Garage" for area in ("North", "South", "West", "South Campus")]

## Define model parameters

In [7]:

# Keyword arguments forwarded unchanged to RandomForestRegressor for every garage:
# 100 trees, and a fixed seed so repeated runs build the same forest.
model_params = dict(
    n_estimators=100,
    random_state=42,
)


In [8]:
# Fit and evaluate one Random Forest per garage, using elapsed time as the only
# feature. For each garage: split the data, fit, plot test-set predictions, and
# print RMSE / MAE / R² computed by the project's validation helpers.
for garage in garages:
    print(f"\nProcessing {garage}...")

    # Rows belonging to this garage only
    garage_data = all_data[all_data['garage name'] == garage]

    # train_test_split needs at least two rows to produce a non-empty train and
    # test set; skip such a garage with a message instead of aborting the loop.
    if len(garage_data) < 2:
        print(f"Skipping {garage}: only {len(garage_data)} row(s) in the selected window.")
        continue

    # Feature: minutes elapsed since date_start. Target: the 'fullness' column.
    x_data = (garage_data['timestamp'] - date_start).dt.total_seconds() / 60
    y_data = garage_data['fullness']

    # sklearn expects a 2-D feature matrix (n_samples, n_features)
    X = x_data.values.reshape(-1, 1)
    y = y_data.values

    # NOTE(review): train_test_split shuffles by default, so test points are
    # interleaved in time with training points. With time as the only feature
    # the metrics below measure interpolation between known timestamps, not
    # forecasting of unseen future periods -- confirm this is the intent.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create and fit model
    model = RandomForestRegressor(**model_params)
    model.fit(X_train, y_train)

    # Predict on the held-out points
    y_pred = model.predict(X_test)

    # The shuffled split leaves test points in random order; sort by time so the
    # prediction line is drawn left to right instead of zig-zagging.
    sorted_indices = np.argsort(X_test.flatten())
    X_test_sorted = X_test[sorted_indices]
    y_test_sorted = y_test[sorted_indices]
    y_pred_sorted = y_pred[sorted_indices]

    # Plot held-out observations against the model's predictions
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.scatter(X_test, y_test, color='blue', alpha=0.5, label='raw data')
    ax.plot(X_test_sorted, y_pred_sorted, color='red', label='fitted curve')
    ax.set_xlabel('Time (minutes)')
    ax.set_ylabel('Fullness')
    ax.set_title(f'{garage} - Random Forest Regression')
    ax.legend()
    plt.show()
    # Release the figure explicitly so figures do not accumulate across garages
    plt.close(fig)

    # Calculate metrics on the held-out 20 %
    rmse = rmse_validation(y_test, y_pred)
    mae = mae_validation(y_test, y_pred)
    r2 = r2_validation(y_test, y_pred)

    print(f"\nMetrics for {garage}:")
    print(f"RMSE: {rmse}")
    print(f"MAE: {mae}")
    print(f"R²: {r2}")


Processing North Garage...


NameError: name 'train_test_split' is not defined