# Baseline Models

## STEP 0: Environment setup
Import libraries, load config, and initialize a file+stdout logger for a reproducible run.

In [None]:
import os
import sys
import logging

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Add src/ directory to path for config + logger utils
sys.path.append(os.path.abspath('../src'))
import config
from utils import setup_logger

# Logger configuration for this notebook run. The log directory comes from
# config.LOG_DIR when that attribute exists, otherwise ./log under the CWD.
# NOTE(review): mode="w" presumably truncates the previous log file on each
# run -- confirm against utils.setup_logger.
logger_options = dict(
    name="baseline-model",
    log_dir=getattr(config, "LOG_DIR", os.path.join(os.getcwd(), "log")),
    filename="baseline-log.txt",
    level=logging.INFO,
    mode="w",
)
logger = setup_logger(**logger_options)

# Select the torch device: CUDA when available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

logger.info(f"Device: {device}")
logger.info("Baseline notebook initialized.")

## STEP 1: Load and preprocess
Load the cleaned dataset and create basic time features used by the baselines (hour/minute/day-of-week).

In [None]:
# --- 1. Load Data ---
# The cleaned CSV is expected under config.DATA_DIR; abort early when absent.
cleaned_path = os.path.join(config.DATA_DIR, "vehicle_positions_cleaned.csv")

if not os.path.exists(cleaned_path):
    error_msg = f"Could not find dataset at {cleaned_path}"
    logger.error(error_msg)
    raise FileNotFoundError(error_msg)

logger.info(f"Loading data from: {cleaned_path}")
df = pd.read_csv(cleaned_path)

# --- 2. Feature Engineering (Extracting Time) ---
# Parse the raw timestamp column into a datetime column named 'dt'.
df['dt'] = pd.to_datetime(df['timestamp'])

# Report the covered time span for context.
logger.info(f"Dataset time range: {df['dt'].min()} to {df['dt'].max()}")

# Derive numeric calendar features from the parsed datetime.
dt_accessor = df['dt'].dt
df['hour'] = dt_accessor.hour
df['minute'] = dt_accessor.minute
df['day_of_week'] = dt_accessor.dayofweek  # Monday=0, Sunday=6

# --- 3. Select Features for Linear Regression ---
# Spatial (lat/lon) plus temporal (hour/minute/day) inputs. Both the feature
# list and the target name can be overridden through config.
default_feature_cols = ['latitude', 'longitude', 'hour', 'minute', 'day_of_week']
feature_cols = getattr(config, 'BASELINE_FEATURES', default_feature_cols)
target_col = getattr(config, 'TARGET_COL', 'delay_seconds')

logger.info(f"Selected Features: {feature_cols}")
logger.info(f"Target Variable: {target_col}")

# Keep only rows that are complete in every modelling column.
initial_rows = len(df)
modelling_cols = feature_cols + [target_col]
df_model = df.dropna(subset=modelling_cols).copy()
dropped_rows = initial_rows - len(df_model)

if dropped_rows:
    logger.warning(f"Dropped {dropped_rows} rows due to missing values in feature/target columns.")
else:
    logger.info("No rows dropped; data is clean.")

logger.info(f"Data ready for modeling. Final Row Count: {len(df_model):,}")

## STEP 2: Split and scale
Split into train/test and apply `StandardScaler` (fit on train only) to keep the evaluation fair and stable.

In [None]:
# --- 1. Prepare Data for Splitting ---
# Feature matrix and target vector taken from the NaN-free modelling frame.
y = df_model[target_col]
X = df_model[feature_cols]

# Split parameters: config values when present, otherwise 80/20 with seed 42.
random_state = getattr(config, 'RANDOM_STATE', 42)
test_size = getattr(config, 'TEST_SIZE', 0.2)

logger.info(f"Splitting data with Test Size: {test_size} and Random State: {random_state}")

# --- 2. Split Data ---
split_kwargs = {"test_size": test_size, "random_state": random_state}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

logger.info("Data split complete.")
logger.info(f"  - Train: {len(X_train):,} rows")
logger.info(f"  - Test:  {len(X_test):,} rows")

# --- 3. Scale Data (Normalize inputs) ---
# The scaler statistics are learned from the training rows only; the same
# fitted transform is then applied to both splits so no test information
# leaks into the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

logger.info("Feature scaling complete.")

# Defensive check: report (but do not abort on) NaNs in the scaled arrays.
nan_after_scaling = any(
    np.isnan(scaled).any() for scaled in (X_train_scaled, X_test_scaled)
)
if nan_after_scaling:
    logger.error("NaNs detected after scaling. Check for constant features.")
else:
    logger.info("Scaling verification passed: No NaNs detected.")

## STEP 3: Dummy baseline
Establish a minimum bar: a model that always predicts the mean delay of the training set.

In [None]:
# --- 4. Establishing the 'Dummy' Baseline ---
logger.info("--- Starting Dummy Baseline Training ---")

# A regressor that ignores its inputs and always outputs the mean of y_train.
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X_train_scaled, y_train)

# Constant predictions for the held-out rows.
y_pred_dummy = dummy_regr.predict(X_test_scaled)

# Error metrics on the test split (RMSE is derived from MSE).
r2_dummy = r2_score(y_test, y_pred_dummy)
mse_dummy = mean_squared_error(y_test, y_pred_dummy)
rmse_dummy = np.sqrt(mse_dummy)
mae_dummy = mean_absolute_error(y_test, y_pred_dummy)

# Report the results.
logger.info("--- DUMMY BASELINE RESULTS ---")
logger.info(f"Strategy: Always predict global mean ({y_train.mean():.2f} sec)")
logger.info(f"MAE (Mean Absolute Error):  {mae_dummy:.2f} seconds")
logger.info(f"RMSE (Root Mean Sq Error):  {rmse_dummy:.2f} seconds")
logger.info(f"R2 Score (Variance expl.):  {r2_dummy:.4f}")

# Keep the dummy MAE under a second name for later comparisons.
baseline_mae = mae_dummy

## STEP 4: Linear regression
Train a simple linear model on (lat, lon, time) features and compare against the dummy baseline.

In [None]:
# --- 5. Training Linear Regression ---
logger.info("--- Starting Linear Regression Training ---")

# 1. Initialize and Train
# fit_intercept can be overridden via config.LIN_REG_FIT_INTERCEPT.
fit_intercept = getattr(config, 'LIN_REG_FIT_INTERCEPT', True)
lin_reg = LinearRegression(fit_intercept=fit_intercept)
lin_reg.fit(X_train_scaled, y_train)
logger.info("Model training complete.")

# 2. Predict on Test Set
y_pred_lin = lin_reg.predict(X_test_scaled)

# 3. Evaluate (RMSE is derived from MSE)
r2_lin = r2_score(y_test, y_pred_lin)
mse_lin = mean_squared_error(y_test, y_pred_lin)
rmse_lin = np.sqrt(mse_lin)
mae_lin = mean_absolute_error(y_test, y_pred_lin)

logger.info("--- LINEAR REGRESSION RESULTS ---")
logger.info(f"MAE (Mean Absolute Error): {mae_lin:.2f} seconds")
logger.info(f"RMSE (Root Mean Sq Error): {rmse_lin:.2f} seconds")
logger.info(f"R² Score:                  {r2_lin:.4f}")

# 4. Compare with Baseline: a positive value means a lower MAE than the dummy.
improvement = mae_dummy - mae_lin
logger.info(f"Improvement over Dummy:    {improvement:.2f} seconds")

linear_beats_dummy = improvement > 0
if linear_beats_dummy:
    logger.info("RESULT: The model SUCCESSFULY beat the baseline.")
else:
    logger.warning("RESULT: The model FAILED to beat the baseline. Features may be non-predictive.")

# 5. Inspect Coefficients
# One weight per input column, in feature_cols order. The inputs were
# standardized in Step 2, so each weight is per scaled unit of its feature.
logger.info("--- Learned Coefficients (Weights) ---")
learned_weights = zip(feature_cols, lin_reg.coef_)
for feature_name, coef in learned_weights:
    logger.info(f"  {feature_name}: {coef:.4f}")
logger.info(f"  Intercept: {lin_reg.intercept_:.4f}")

## STEP 5: Diagnostics (linear regression)
Plot coefficients, actual vs predicted, and residual distribution; save the figure under `plots/`.

In [None]:
# --- 6. Visualization: Linear Regression Diagnostics ---
# Draws a 1x3 figure (coefficients, actual-vs-predicted scatter, residual
# histogram) for the linear model, saves it as a PNG in the plots directory,
# and logs the coefficients sorted by absolute weight.
logger.info("--- Generating Linear Regression Visualizations ---")

# Output directory: config.PLOTS_DIR when defined, otherwise ./plots under the CWD.
plots_dir = os.path.abspath(getattr(config, 'PLOTS_DIR', os.path.join(os.getcwd(), 'plots')))
os.makedirs(plots_dir, exist_ok=True)

plt.figure(figsize=(20, 6))

# --- A. COEFFICIENTS ---
plt.subplot(1, 3, 1)

# One row per feature; sorted by absolute weight so the largest-magnitude
# coefficients are listed first. The model was fit on standardized inputs.
coeffs = pd.DataFrame({
    'Feature': feature_cols,
    'Weight (Coefficient)': lin_reg.coef_,
})
coeffs['Abs_Weight'] = coeffs['Weight (Coefficient)'].abs()
coeffs = coeffs.sort_values(by='Abs_Weight', ascending=False)

sns.barplot(
    x='Weight (Coefficient)',
    y='Feature',
    hue='Feature',
    data=coeffs,
    palette='coolwarm',
    legend=False,
 )
plt.title('Linear Regression: Feature Coefficients', fontsize=16)
plt.xlabel('Weight (Impact on Delay)')
plt.grid(axis='x', alpha=0.3)

# --- B. ACTUAL VS PREDICTED ---
plt.subplot(1, 3, 2)

# Plot at most 500 points. The sampled values are row positions, used with
# .iloc on y_test and plain indexing on the prediction array, so each actual
# value stays paired with its own prediction. The RNG is seeded for a
# reproducible sample.
if len(y_test) > 500:
    rng = np.random.RandomState(getattr(config, 'RANDOM_STATE', 42))
    indices = rng.choice(len(y_test), 500, replace=False)
    y_test_sample = y_test.iloc[indices]
    y_pred_sample = y_pred_lin[indices]
    logger.info("Subsampled 500 points for clearer scatter plot.")
else:
    y_test_sample = y_test
    y_pred_sample = y_pred_lin

sns.scatterplot(x=y_test_sample, y=y_pred_sample, alpha=0.6, color='blue', label='Predictions')

# Reference diagonal (y = x) spanning the combined range of the plotted values.
axis_min = min(y_test_sample.min(), y_pred_sample.min())
axis_max = max(y_test_sample.max(), y_pred_sample.max())
plt.plot([axis_min, axis_max], [axis_min, axis_max], 'r--', linewidth=2, label='Perfect Prediction')

plt.xlabel('Actual Delay (seconds)')
plt.ylabel('Predicted Delay (seconds)')
plt.title(f'Actual vs Predicted (MAE: {mae_lin:.0f}s)', fontsize=16)
plt.legend()
plt.grid(True, alpha=0.3)

# --- C. RESIDUAL DISTRIBUTION ---
plt.subplot(1, 3, 3)
# Residual = actual - predicted, over the full test split (not the subsample).
residuals = y_test - y_pred_lin

sns.histplot(residuals, bins=50, kde=True, color='green', line_kws={'linewidth': 2})
plt.axvline(0, color='red', linestyle='--')
plt.title('Distribution of Errors (Residuals)', fontsize=16)
plt.xlabel('Error (Seconds)')
plt.ylabel('Frequency')
# NOTE(review): the x-axis is clipped to +/-500 s, so residuals outside that
# window are not visible; the 50 bins are presumably computed over the full
# residual range, which may leave only a few bars in view -- confirm.
plt.xlim(-500, 500)

plt.tight_layout()

# Save before plt.show().
plot_path = os.path.join(plots_dir, 'baseline_linear_regression_diagnostics.png')
plt.savefig(plot_path, dpi=150, bbox_inches='tight')
logger.info(f"Saved plot: {plot_path}")
plt.show()

# Log the coefficients in the same descending-|weight| order used for the bar chart.
logger.info("Top coefficients (by absolute weight):")
for _, row in coeffs.iterrows():
    logger.info(f"  {row['Feature']}: {row['Weight (Coefficient)']:.4f}")

## STEP 6: Random Forest baseline
Train a non-linear baseline and evaluate on the held-out test split.

In [None]:
# --- Non-Linear Baseline (Random Forest) ---
logger.info("--- Starting Random Forest Training ---")

# 1. Initialize Random Forest
# Hyperparameters come from config when defined; otherwise 50 trees of
# depth 10 with seed 42.
n_estimators = getattr(config, 'RF_N_ESTIMATORS', 50)
max_depth = getattr(config, 'RF_MAX_DEPTH', 10)
random_state = getattr(config, 'RANDOM_STATE', 42)

logger.info(f"Training Random Forest (n_estimators={n_estimators}, max_depth={max_depth})...")

rf_settings = {
    "n_estimators": n_estimators,
    "max_depth": max_depth,
    "random_state": random_state,
    "n_jobs": -1,  # use all available CPU cores
}
rf_model = RandomForestRegressor(**rf_settings)

# 2. Train
rf_model.fit(X_train_scaled, y_train)
logger.info("Random Forest training complete.")

# 3. Predict
y_pred_rf = rf_model.predict(X_test_scaled)

# 4. Evaluate (RMSE is derived from MSE)
r2_rf = r2_score(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)

logger.info("--- RANDOM FOREST RESULTS ---")
logger.info(f"MAE (Mean Absolute Error): {mae_rf:.2f} seconds")
logger.info(f"RMSE (Root Mean Sq Error): {rmse_rf:.2f} seconds")
logger.info(f"R² Score:                  {r2_rf:.4f}")

# MAE gain relative to the dummy baseline (positive = better than dummy).
improvement_rf = mae_dummy - mae_rf
logger.info(f"Improvement over Dummy:    {improvement_rf:.2f} seconds")

# The linear-model comparison is reported only if the linear cell has run.
if 'mae_lin' in locals():
    improvement_lin = mae_lin - mae_rf
    logger.info(f"Improvement over Linear:   {improvement_lin:.2f} seconds")

rf_beats_dummy = improvement_rf > 0
if rf_beats_dummy:
    logger.info("RESULT: Random Forest SUCCESSFULY beat the baseline.")
else:
    logger.warning("RESULT: Random Forest FAILED to beat the baseline.")

# 5. Feature Importance
# Importance per input column, listed from most to least important.
importance_table = pd.DataFrame({
    'Feature': feature_cols,
    'Importance': rf_model.feature_importances_
})
importances = importance_table.sort_values(by='Importance', ascending=False)

logger.info("\n--- What actually drives delay? (Feature Importance) ---")
for feat_label, feat_score in zip(importances['Feature'], importances['Importance']):
    logger.info(f"Feature: {feat_label:<15} | Importance: {feat_score:.4f}")

## STEP 7: Diagnostics (Random Forest)
Save a diagnostics figure under `plots/` to validate fit and residual behavior.

In [None]:
# --- 7. Visualization: Random Forest Diagnostics ---
# Draws a 1x3 figure (feature importance, actual-vs-predicted scatter,
# residual histogram) for the basic random forest and saves it as a PNG.
logger.info("--- Generating Random Forest Diagnostics ---")

# Output directory: config.PLOTS_DIR when defined, otherwise ./plots under the CWD.
plots_dir = os.path.abspath(getattr(config, 'PLOTS_DIR', os.path.join(os.getcwd(), 'plots')))
os.makedirs(plots_dir, exist_ok=True)

plt.figure(figsize=(20, 6))

# --- A. FEATURE IMPORTANCE ---
plt.subplot(1, 3, 1)

# One row per input feature, sorted from most to least important.
feat_imp = pd.DataFrame({
    'Feature': feature_cols,
    'Importance': rf_model.feature_importances_,
}).sort_values(by='Importance', ascending=False)

sns.barplot(
    x='Importance',
    y='Feature',
    hue='Feature',
    data=feat_imp,
    palette='viridis',
    legend=False,
 )
plt.title('Random Forest: Feature Importance', fontsize=16)
plt.xlabel('Importance Score')
plt.grid(axis='x', alpha=0.3)

# --- B. ACTUAL VS PREDICTED ---
plt.subplot(1, 3, 2)

# Plot at most 1,000 points. The sampled values are row positions, applied to
# both y_test (.iloc) and the prediction array, so pairs stay aligned. The
# RNG is seeded for a reproducible sample.
if len(y_test) > 1000:
    rng = np.random.RandomState(getattr(config, 'RANDOM_STATE', 42))
    indices = rng.choice(len(y_test), 1000, replace=False)
    y_sample = y_test.iloc[indices]
    pred_sample = y_pred_rf[indices]
    logger.info("Subsampled 1,000 points for scatter plot clarity.")
else:
    y_sample = y_test
    pred_sample = y_pred_rf

sns.scatterplot(x=y_sample, y=pred_sample, alpha=0.5, color='royalblue', s=40)

# Reference diagonal (y = x) spanning the combined range of the plotted values.
axis_min = min(y_sample.min(), pred_sample.min())
axis_max = max(y_sample.max(), pred_sample.max())
plt.plot([axis_min, axis_max], [axis_min, axis_max], 'r--', linewidth=2, label='Perfect Prediction')

plt.title('Actual vs. Predicted Delays', fontsize=16)
plt.xlabel('Actual Delay (s)')
plt.ylabel('Predicted Delay (s)')
plt.legend()
plt.grid(True, alpha=0.3)

# --- C. RESIDUAL DISTRIBUTION ---
plt.subplot(1, 3, 3)
# Residual = actual - predicted, over the full test split (not the subsample).
residuals = y_test - y_pred_rf

sns.histplot(residuals, bins=50, kde=True, color='purple', line_kws={'linewidth': 2})
plt.axvline(0, color='red', linestyle='--')
plt.title('Distribution of Prediction Errors (Residuals)', fontsize=16)
plt.xlabel('Error (Seconds)')
plt.ylabel('Frequency')
# NOTE(review): the x-axis is clipped to +/-500 s; residuals outside that
# window are not visible in the saved figure.
plt.xlim(-500, 500)

plt.tight_layout()

# Save before plt.show().
diag_plot_path = os.path.join(plots_dir, 'baseline_rf_diagnostics.png')
plt.savefig(diag_plot_path, dpi=150, bbox_inches='tight')
logger.info(f"Saved plot: {diag_plot_path}")
plt.show()

## STEP 8: Stop history feature (target encoding)
Add `stop_history_mean` computed on the train split (then mapped to test) to avoid leakage.

In [None]:
# --- 8. Advanced Feature Engineering (Target Encoding) ---
logger.info("--- Starting Advanced Feature Engineering (Target Encoding) ---")

# The stop identifier is read from the full dataframe loaded in Step 1, so it
# must exist there before anything else is attempted.
if 'last_stop_id' not in df.columns:
    error_msg = "Column 'last_stop_id' missing from dataframe. Cannot perform target encoding."
    logger.error(error_msg)
    raise ValueError(error_msg)

# 1. Build the per-stop lookup from TRAINING rows only
# Attach the target and the stop ID (looked up in df by the training index)
# to a copy of the training features. Test rows are deliberately excluded so
# the encoding cannot leak test targets.
train_df_temp = X_train.assign(
    target=y_train,
    last_stop_id=df.loc[X_train.index, 'last_stop_id'],
)

# Mean training delay per stop, plus the overall training mean that serves as
# the fallback for stops absent from the training split.
per_stop_target = train_df_temp.groupby('last_stop_id')['target']
stop_history = per_stop_target.mean()
global_mean_delay = y_train.mean()

logger.info(f"Learned historical delays for {len(stop_history)} unique stops.")
logger.info(f"Global mean delay (fallback for new stops): {global_mean_delay:.2f} seconds")

# 2. Map this lookup onto the feature matrices
def add_history_feature(X_data, original_df, mapping, global_mean, *, id_col='last_stop_id'):
    """
    Build a one-column feature holding the historical mean delay of each row's stop.

    Args:
        X_data: Frame whose index selects the rows of interest; only its
            index is used.
        original_df: Frame containing ``id_col``, indexed compatibly with
            ``X_data`` (every label in ``X_data.index`` must exist in it).
        mapping: Stop ID -> historical mean lookup (a Series or dict), as
            accepted by ``Series.map``.
        global_mean: Fallback value for rows whose ID is missing from
            ``mapping`` (including rows whose ID is NaN).
        id_col: Name of the ID column in ``original_df``. Defaults to
            ``'last_stop_id'``, the column this notebook encodes.

    Returns:
        A float array of shape ``(len(X_data), 1)``, in ``X_data`` row order,
        ready to be horizontally stacked onto a 2-D feature matrix.
    """
    # IDs for exactly the rows of this split, in the split's own order.
    ids = original_df.loc[X_data.index, id_col]

    # Unknown IDs map to NaN, which is then replaced by the global fallback.
    encoded = ids.map(mapping).fillna(global_mean)

    # Force a float column so stacking onto the scaled features keeps a
    # numeric dtype.
    return encoded.to_numpy(dtype=float).reshape(-1, 1)

# Append the stop-history column to the scaled feature arrays.
# X_train_scaled / X_test_scaled are plain numpy arrays, so the new column is
# joined along axis 1 rather than assigned by name.
train_stop_history = add_history_feature(X_train, df, stop_history, global_mean_delay)
test_stop_history = add_history_feature(X_test, df, stop_history, global_mean_delay)

X_train_history = np.concatenate([X_train_scaled, train_stop_history], axis=1)
X_test_history = np.concatenate([X_test_scaled, test_stop_history], axis=1)

logger.info(f"Feature Engineering Complete. Feature count increased: {X_train_scaled.shape[1]} -> {X_train_history.shape[1]}")

# 3. Retrain Random Forest with the extra feature
# Default depth is 12 here (the basic forest defaults to 10); all three
# hyperparameters can be overridden through config.
n_estimators = getattr(config, 'RF_N_ESTIMATORS', 50)
max_depth_smart = getattr(config, 'RF_SMART_MAX_DEPTH', 12)
random_state = getattr(config, 'RANDOM_STATE', 42)

logger.info(f"Training Enhanced Random Forest (Max Depth: {max_depth_smart})...")

smart_rf_settings = {
    "n_estimators": n_estimators,
    "max_depth": max_depth_smart,
    "random_state": random_state,
    "n_jobs": -1,
}
rf_smart = RandomForestRegressor(**smart_rf_settings)
rf_smart.fit(X_train_history, y_train)
logger.info("Enhanced Random Forest training complete.")

# 4. Predict & Evaluate (RMSE is derived from MSE)
y_pred_smart = rf_smart.predict(X_test_history)

r2_smart = r2_score(y_test, y_pred_smart)
mse_smart = mean_squared_error(y_test, y_pred_smart)
rmse_smart = np.sqrt(mse_smart)
mae_smart = mean_absolute_error(y_test, y_pred_smart)

logger.info("--- ENHANCED BASELINE RESULTS (with Stop History) ---")
logger.info(f"MAE (Mean Absolute Error): {mae_smart:.2f} seconds")
logger.info(f"RMSE (Root Mean Sq Error): {rmse_smart:.2f} seconds")
logger.info(f"R² Score:                  {r2_smart:.4f}")

# MAE gain relative to the basic random forest (positive = better).
improvement_smart = mae_rf - mae_smart
logger.info(f"Improvement over Standard RF: {improvement_smart:.2f} seconds")

stop_history_helped = improvement_smart > 0
if stop_history_helped:
    logger.info("SUCCESS: Target Encoding (Stop History) improved model performance.")
else:
    logger.warning("No improvement detected. The historical average might not be predictive.")

## STEP 9: Enhanced Random Forest diagnostics
Visualize the effect of stop history and compare MAE across baseline variants.

In [None]:
# --- 9. Visualizing the Enhanced Baseline & Model Comparison ---
# Draws a dashboard for the stop-history forest: feature importance (top
# left), actual-vs-predicted scatter (top right), and an MAE comparison of
# the four models trained so far (bottom), then saves it as a PNG.
logger.info("--- Generating Enhanced Model Visualizations ---")

# Output directory: config.PLOTS_DIR when defined, otherwise ./plots under the CWD.
plots_dir = os.path.abspath(getattr(config, 'PLOTS_DIR', os.path.join(os.getcwd(), 'plots')))
os.makedirs(plots_dir, exist_ok=True)

plt.figure(figsize=(20, 12))

# --- A. NEW FEATURE IMPORTANCE ---
plt.subplot(2, 2, 1)

# Column order must match how X_train_history was assembled: the scaled base
# features first, then the appended stop-history column.
feature_names_enhanced = feature_cols + ['stop_history_mean']

feat_imp_smart = pd.DataFrame({
    'Feature': feature_names_enhanced,
    'Importance': rf_smart.feature_importances_,
}).sort_values(by='Importance', ascending=False)

sns.barplot(
    x='Importance',
    y='Feature',
    hue='Feature',
    data=feat_imp_smart,
    palette='magma',
    legend=False,
 )
plt.title('Enhanced RF: Feature Importance', fontsize=16, fontweight='bold')
plt.xlabel('Importance Score')
plt.grid(axis='x', alpha=0.3)

# --- B. ACTUAL VS PREDICTED ---
plt.subplot(2, 2, 2)

# Plot at most 1,000 points. The sampled values are row positions, applied to
# both y_test (.iloc) and the prediction array, so pairs stay aligned.
if len(y_test) > 1000:
    rng = np.random.RandomState(getattr(config, 'RANDOM_STATE', 42))
    indices = rng.choice(len(y_test), 1000, replace=False)
    y_sample = y_test.iloc[indices]
    pred_sample = y_pred_smart[indices]
    logger.info("Subsampled 1,000 points for Enhanced Scatter Plot.")
else:
    y_sample = y_test
    pred_sample = y_pred_smart

sns.scatterplot(x=y_sample, y=pred_sample, alpha=0.5, color='forestgreen', s=40)

# Reference diagonal (y = x) spanning the combined range of the plotted values.
axis_min = min(y_sample.min(), pred_sample.min())
axis_max = max(y_sample.max(), pred_sample.max())
plt.plot([axis_min, axis_max], [axis_min, axis_max], 'r--', linewidth=2, label='Perfect Prediction')

plt.title(f'Enhanced Model: Actual vs Predicted (MAE: {mae_smart:.0f}s)', fontsize=16, fontweight='bold')
plt.xlabel('Actual Delay (s)')
plt.ylabel('Predicted Delay (s)')
plt.legend()
plt.grid(True, alpha=0.3)

# --- C. MODEL COMPARISON (MAE) ---
# A 2x1 grid position is used here so this panel occupies the whole bottom
# row beneath the two 2x2 panels above.
plt.subplot(2, 1, 2)

# Sorted ascending so the lowest-MAE model is listed first.
progress_df = pd.DataFrame({
    'Model': ['Dummy', 'Linear Regression', 'Basic RF', 'Enhanced RF'],
    'MAE': [mae_dummy, mae_lin, mae_rf, mae_smart],
}).sort_values(by='MAE')

sns.barplot(x='MAE', y='Model', hue='Model', data=progress_df, palette='coolwarm', legend=False)
plt.title('Baseline Comparison (Lower MAE is Better)', fontsize=18, fontweight='bold')
plt.xlabel('Mean Absolute Error (seconds)')
plt.ylabel('Model')
plt.grid(axis='x', alpha=0.3)

plt.tight_layout()
# Save before plt.show().
comp_plot_path = os.path.join(plots_dir, 'baseline_enhanced_rf_comparison_dashboard.png')
plt.savefig(comp_plot_path, dpi=150, bbox_inches='tight')
logger.info(f"Saved plot: {comp_plot_path}")
plt.show()

## STEP 10: Lag feature
Add `prev_stop_delay` (the delay recorded at the previous observation of the same trip, or 0 for a trip's first observation) to provide real-time context for the final baseline.

In [None]:
# --- 10. Temporal Context (Lag Features) ---
# Adds 'prev_stop_delay' (the target value of the previous observation in the
# same trip), stacks it onto the stop-history feature matrix, trains the final
# random forest and logs its test metrics.
logger.info("--- Starting Temporal Context Engineering (Lag Features) ---")

# 1. Sort to ensure correct order
# The WHOLE dataframe is processed so every trip's full sequence is available.
# Sorting uses the parsed datetime column 'dt' (created in Step 1) rather than
# the raw 'timestamp' column: if the raw values are strings, sorting them
# orders lexicographically, which is only chronological for uniformly
# formatted timestamps.
logger.info("Sorting dataframe by Trip ID and Timestamp...")
df_sorted = df.sort_values(by=['trip_id', 'dt']).copy()

# 2. Shift the target by one row within each trip to get the "previous delay"
# Grouping by trip_id keeps a delay from one trip from spilling into the next.
# The configured target column is used (not a hard-coded 'delay_seconds') so
# the lag always describes the same quantity the models are trained to predict.
logger.info("Calculating previous stop delays...")
df_sorted['prev_stop_delay'] = df_sorted.groupby('trip_id')[target_col].shift(1)

# 3. Handle NaNs
# The first observation of every trip has no predecessor; it is filled with 0
# (i.e. the trip is assumed to start on time). The same fill also applies when
# the preceding row's delay is itself missing.
df_sorted['prev_stop_delay'] = df_sorted['prev_stop_delay'].fillna(0)

# 4. Re-attach to the existing train/test splits
# The split was random, so rows are looked up by their original index labels,
# which sort_values preserves.
# NOTE(review): with a random row-level split, the previous observation of a
# test row may be a training row of the same trip, so this feature can carry
# information that is very close to the test target -- confirm this is the
# intended evaluation protocol.
logger.info("Mapping lag features back to training/testing sets...")
X_train_lag = df_sorted.loc[X_train.index, 'prev_stop_delay'].to_numpy().reshape(-1, 1)
X_test_lag = df_sorted.loc[X_test.index, 'prev_stop_delay'].to_numpy().reshape(-1, 1)

# 5. Stack it onto the existing "History" features
# Final column layout: [scaled base features] + [stop history] + [prev delay]
X_train_final = np.hstack([X_train_history, X_train_lag])
X_test_final = np.hstack([X_test_history, X_test_lag])

logger.info(f"Feature Engineering Complete. Final feature count: {X_train_final.shape[1]}")

# 6. Train Final Random Forest
# Hyperparameters come from config when defined; otherwise 50 trees, depth 12, seed 42.
n_estimators = getattr(config, 'RF_N_ESTIMATORS', 50)
max_depth_final = getattr(config, 'RF_FINAL_MAX_DEPTH', 12)
random_state = getattr(config, 'RANDOM_STATE', 42)

logger.info(f"Training Final 'Context-Aware' Random Forest (Max Depth: {max_depth_final})...")

rf_final = RandomForestRegressor(
    n_estimators=n_estimators,
    max_depth=max_depth_final,
    random_state=random_state,
    n_jobs=-1,
)
rf_final.fit(X_train_final, y_train)
logger.info("Final Random Forest training complete.")

# 7. Evaluate (RMSE is derived from MSE)
y_pred_final = rf_final.predict(X_test_final)

mae_final = mean_absolute_error(y_test, y_pred_final)
mse_final = mean_squared_error(y_test, y_pred_final)
rmse_final = np.sqrt(mse_final)
r2_final = r2_score(y_test, y_pred_final)

logger.info("--- FINAL CONTEXT-AWARE MODEL RESULTS ---")
logger.info(f"MAE (Mean Absolute Error): {mae_final:.2f} seconds")
logger.info(f"RMSE (Root Mean Sq Error): {rmse_final:.2f} seconds")
logger.info(f"R² Score:                  {r2_final:.4f}")

# MAE gain relative to the stop-history model (positive = better).
improvement_final = mae_smart - mae_final
logger.info(f"Improvement over History Model: {improvement_final:.2f} seconds")

if improvement_final > 0:
    logger.info("SUCCESS: Adding real-time lag features improved accuracy.")
else:
    logger.warning("No improvement from lag features. Check if 'prev_stop_delay' has valid data.")

# --- Collect final metrics for later comparison with the GNN ---
# The metrics are kept in memory and written to the log only; nothing is
# persisted to config or to a separate file here.
final_metrics = {
    "MAE": mae_final,
    "RMSE": rmse_final,
    "R2": r2_final
}
logger.info(f"Final Baselines Established: {final_metrics}")

## STEP 11: Final baseline dashboard
Generate a single dashboard (feature importance, scatter, residuals, leaderboard) and save it under `plots/`.

In [None]:
# --- 11. Final Visualization & Leaderboard ---
# Draws a 2x2 dashboard for the final forest (feature importance, scatter,
# residual histogram, MAE leaderboard over all five models) and saves it as a PNG.
logger.info("--- Generating Final Model Dashboard ---")

# Output directory: config.PLOTS_DIR when defined, otherwise ./plots under the CWD.
plots_dir = os.path.abspath(getattr(config, 'PLOTS_DIR', os.path.join(os.getcwd(), 'plots')))
os.makedirs(plots_dir, exist_ok=True)

plt.figure(figsize=(20, 14))

# --- A. FEATURE IMPORTANCE ---
plt.subplot(2, 2, 1)

# Column order must match how X_train_final was assembled: scaled base
# features, then stop history, then the lag column.
final_feature_names = feature_cols + ['stop_history_mean', 'prev_stop_delay']

feat_imp_final = pd.DataFrame({
    'Feature': final_feature_names,
    'Importance': rf_final.feature_importances_,
}).sort_values(by='Importance', ascending=False)

sns.barplot(
    x='Importance',
    y='Feature',
    hue='Feature',
    data=feat_imp_final,
    palette='rocket',
    legend=False,
 )
plt.title('Final Model: Feature Importance', fontsize=16, fontweight='bold')
plt.xlabel('Importance Score')
plt.grid(axis='x', alpha=0.3)

# --- B. ACTUAL VS PREDICTED ---
plt.subplot(2, 2, 2)

# Plot at most 1,000 points. The sampled values are row positions, applied to
# both y_test (.iloc) and the prediction array, so pairs stay aligned.
if len(y_test) > 1000:
    rng = np.random.RandomState(getattr(config, 'RANDOM_STATE', 42))
    indices = rng.choice(len(y_test), 1000, replace=False)
    y_sample = y_test.iloc[indices]
    pred_sample = y_pred_final[indices]
    logger.info("Subsampled 1,000 points for Final Scatter Plot.")
else:
    y_sample = y_test
    pred_sample = y_pred_final

sns.scatterplot(x=y_sample, y=pred_sample, alpha=0.6, color='darkorange', s=40)

# Reference diagonal (y = x) spanning the combined range of the plotted values.
axis_min = min(y_sample.min(), pred_sample.min())
axis_max = max(y_sample.max(), pred_sample.max())
plt.plot([axis_min, axis_max], [axis_min, axis_max], 'k--', linewidth=3, label='Perfect Prediction')

plt.title(f'Actual vs. Predicted (R²: {r2_final:.2f})', fontsize=16, fontweight='bold')
plt.xlabel('Actual Delay (s)')
plt.ylabel('Predicted Delay (s)')
plt.legend()
plt.grid(True, alpha=0.3)

# --- C. RESIDUAL DISTRIBUTION ---
plt.subplot(2, 2, 3)
# Residual = actual - predicted, over the full test split (not the subsample).
residuals_final = y_test - y_pred_final
sns.histplot(residuals_final, bins=50, kde=True, color='teal', line_kws={'linewidth': 2})
plt.axvline(0, color='black', linestyle='--')
plt.title('Final Model Residual Distribution', fontsize=16, fontweight='bold')
plt.xlabel('Error (Seconds)')
plt.ylabel('Frequency')
# NOTE(review): the x-axis is clipped to +/-500 s; residuals outside that
# window are not visible in the saved figure.
plt.xlim(-500, 500)

# --- D. LEADERBOARD ---
plt.subplot(2, 2, 4)
# Test-set MAE of every model trained in this notebook, sorted ascending so
# the lowest-MAE model is listed first.
leaderboard_df = pd.DataFrame({
    'Model': ['Dummy', 'Linear', 'Basic RF', 'Enhanced RF', 'Final RF'],
    'MAE': [mae_dummy, mae_lin, mae_rf, mae_smart, mae_final],
}).sort_values(by='MAE')
sns.barplot(x='MAE', y='Model', hue='Model', data=leaderboard_df, palette='Spectral', legend=False)
plt.title('Leaderboard (Lower MAE is Better)', fontsize=16, fontweight='bold')
plt.xlabel('Mean Absolute Error (seconds)')
plt.ylabel('Model')
plt.grid(axis='x', alpha=0.3)

plt.tight_layout()
# Save before plt.show().
dashboard_path = os.path.join(plots_dir, 'baseline_final_dashboard.png')
plt.savefig(dashboard_path, dpi=150, bbox_inches='tight')
logger.info(f"Saved plot: {dashboard_path}")
plt.show()