# IRI Estimation using Random Forest with Group Cross-Validation

This notebook trains a Random Forest regression model to predict the
International Roughness Index (IRI) from Z-axis vibration features,
the `acc_mag_rms` feature, and vehicle speed.

The workflow is designed to:
- avoid data leakage using GroupKFold
- stabilize training by fitting on log-transformed IRI (`log1p`), converting predictions back with `expm1`
- evaluate performance using RMSE, relative RMSE, and correlation
- save per-fold predictions and plots for analysis


## 1. Import Required Libraries

This section imports all necessary Python libraries for:
- data handling
- machine learning
- evaluation
- plotting
- saving trained models

Random Forest is used as the regression model.


In [21]:
import matplotlib
matplotlib.use("Agg")

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.dummy import DummyRegressor


## 2. Configuration Settings

This section defines:
- output directory for results
- input feature columns
- target variable
- grouping column for cross-validation
- number of folds
- Random Forest hyperparameters

Keeping configuration in one place makes the experiment reproducible.


In [22]:
# Root folder for every artifact of a run: models, metrics, per-frame
# predictions and plots are all written below it.
BASE_RESULTS_DIR = "Raw_data/results_rf_6videos_feature"
os.makedirs(BASE_RESULTS_DIR, exist_ok=True)

# Column roles in the training table.
FEATURES = [
    "z_std",
    "z_rms",
    "z_peak_to_peak",
    "acc_mag_rms",
    "speed",
]
TARGET = "iri_est"
GROUP = "sensor_video_id"

# Number of GroupKFold folds.
N_SPLITS = 4

# Keyword arguments forwarded unchanged to RandomForestRegressor.
RF_PARAMS = dict(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=2,
    min_samples_split=5,
    max_features="sqrt",
    random_state=42,
    n_jobs=-1,
)


## 3. Utility Functions

These helper functions are used throughout the notebook to:
- create directories
- compute evaluation metrics
- compute a dummy baseline RMSE
- build the Random Forest model

The helpers that generate per-frame predictions and plot true vs
predicted IRI per video are defined in Section 4.

No modeling logic is placed here; only reusable utilities.


In [23]:
def ensure_dir(path):
    """
    Make sure the directory `path` exists and hand the path back.

    Missing parent directories are created as well; a directory that
    already exists is left untouched. Returning the path lets callers
    create and bind a directory in a single expression.
    """
    os.makedirs(name=path, exist_ok=True)
    return path


def compute_metrics(y_true, y_pred):
    """
    Score regression predictions against the ground truth.

    Returns a tuple (rmse, rrmse, corr):
    - rmse:  root mean squared error
    - rrmse: rmse divided by the mean of y_true (a tiny epsilon is
             added to the denominator to avoid division by zero)
    - corr:  Pearson correlation coefficient, or NaN when fewer than
             two samples are given
    """
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)

    mean_truth = np.mean(y_true)
    rrmse = rmse / (mean_truth + 1e-8)

    if len(y_true) > 1:
        corr = pearsonr(y_true, y_pred)[0]
    else:
        corr = np.nan

    return rmse, rrmse, corr


def compute_dummy_rmse(y_true):
    """
    Return the RMSE of a baseline that always predicts the mean of
    `y_true`.

    The baseline is fitted and scored on the same values, so the result
    is the error of the best constant prediction for that set.
    """
    # The features are a single all-zero placeholder column; only the
    # targets determine the mean-strategy prediction.
    placeholder_X = np.zeros((len(y_true), 1))

    baseline = DummyRegressor(strategy="mean")
    baseline.fit(placeholder_X, y_true)
    mean_pred = baseline.predict(placeholder_X)

    return np.sqrt(mean_squared_error(y_true, mean_pred))


def build_rf():
    """
    Create a fresh, unfitted RandomForestRegressor configured from the
    module-level RF_PARAMS dictionary.
    """
    forest = RandomForestRegressor(**RF_PARAMS)
    return forest


## 4. Plotting and Prediction Utilities

These functions:
- generate per-frame IRI predictions
- plot true vs predicted IRI for each video

Plots are saved per fold for visual inspection.


In [24]:
def plot_per_video(df, out_dir, title_prefix):
    """
    Save one true-vs-predicted IRI line plot per video.

    Parameters:
    - df: DataFrame holding the GROUP column, an "mt" column (x-axis),
      the TARGET column (true IRI) and an "iri_pred" column.
    - out_dir: directory that receives the PNG files; created if missing.
    - title_prefix: text used in the plot title and as the file-name
      prefix ("<title_prefix>_video_<video id>.png").

    Returns nothing; the plots are written to disk.
    """
    ensure_dir(out_dir)

    for vid in df[GROUP].unique():
        vdf = df[df[GROUP] == vid]

        # Work on an explicit figure handle so it can be closed reliably.
        fig, ax = plt.subplots(figsize=(10, 4))
        try:
            ax.plot(vdf["mt"], vdf[TARGET], label="True IRI", linewidth=2)
            ax.plot(vdf["mt"], vdf["iri_pred"], "--", label="Predicted IRI")
            ax.set_xlabel("Frame (MT)")
            ax.set_ylabel("IRI")
            ax.set_title(f"{title_prefix} | Video {vid}")
            ax.legend()
            fig.tight_layout()
            fig.savefig(os.path.join(out_dir, f"{title_prefix}_video_{vid}.png"))
        finally:
            # Close the figure even when plotting or saving fails, so
            # open figures do not pile up across videos and folds.
            plt.close(fig)


def predict_per_frame(model, df):
    """
    Return a copy of `df` with an added "iri_pred" column.

    The model predicts in log1p space from the FEATURES columns; the
    predictions are mapped back to the IRI scale with expm1. The input
    DataFrame itself is not modified.
    """
    log_preds = model.predict(df[FEATURES])

    out = df.copy()
    out["iri_pred"] = np.expm1(log_preds)
    return out


## 5. Random Forest Training with Group Cross-Validation

This is the main training loop.

For each fold:
1. Split data using GroupKFold
2. Train Random Forest on log-transformed IRI
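3. Evaluate on train and test sets (on the original IRI scale, after inverting the log transform)
4. Compare against a dummy baseline that predicts the mean IRI of the split being scored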
5. Save metrics, predictions, and plots


In [25]:
def _score_split(y_true, y_pred):
    """
    Build the metrics row stored for one split (train or test).

    Keys: "rmse", "dummy_rmse" (RMSE of a mean predictor fitted on
    y_true itself), "rrmse" (rmse / dummy_rmse) and "correlation"
    (Pearson r, NaN when fewer than two samples are available).
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    dummy_rmse = compute_dummy_rmse(y_true)
    # Same guard as compute_metrics: a one-row split yields NaN instead
    # of an exception from pearsonr.
    corr = pearsonr(y_true, y_pred)[0] if len(y_true) > 1 else np.nan
    return {
        "rmse": rmse,
        "dummy_rmse": dummy_rmse,
        "rrmse": rmse / (dummy_rmse + 1e-8),
        "correlation": corr,
    }


def run_iri_rf_cv(df):
    """
    Train and evaluate the Random Forest with GroupKFold cross-validation.

    For every fold the function:
    1. splits `df` so that no GROUP value appears in both train and test
    2. fits a Random Forest on log1p(TARGET)
    3. saves the fitted model with joblib
    4. writes train/test metrics, per-frame predictions and per-video
       plots below BASE_RESULTS_DIR/fold_<k>/

    Parameters:
    - df: DataFrame containing the FEATURES, TARGET and GROUP columns
      plus the "mt" column used as x-axis of the plots.

    Returns nothing; all results are written to disk and a one-line
    summary per fold is printed.

    Raises KeyError if a required column is missing.
    """
    # Fail before any training: "mt" is otherwise only touched by the
    # plotting step, after a full fold has been trained and saved.
    required = [*FEATURES, TARGET, GROUP, "mt"]
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"Input data is missing required column(s): {missing}")

    ensure_dir(BASE_RESULTS_DIR)

    gkf = GroupKFold(n_splits=N_SPLITS)
    splits = gkf.split(df[FEATURES].values, df[TARGET].values, df[GROUP].values)

    for fold, (train_idx, test_idx) in enumerate(splits, start=1):
        print(f"\n========== FOLD {fold} ==========")

        train_df = df.iloc[train_idx].copy()
        test_df = df.iloc[test_idx].copy()

        # The model is fitted in log space; "iri_log" is also kept in the
        # per-frame CSV output.
        train_df["iri_log"] = np.log1p(train_df[TARGET])
        test_df["iri_log"] = np.log1p(test_df[TARGET])

        fold_dir = ensure_dir(os.path.join(BASE_RESULTS_DIR, f"fold_{fold}"))
        train_dir = ensure_dir(os.path.join(fold_dir, "train_results"))
        test_dir = ensure_dir(os.path.join(fold_dir, "test_results"))

        model = build_rf()
        model.fit(train_df[FEATURES], train_df["iri_log"])

        joblib.dump(model, os.path.join(train_dir, "models_rf.pkl"))

        # Predict once per split; the same predictions feed the metrics,
        # the per-frame CSVs and the plots.
        train_out = predict_per_frame(model, train_df)
        test_out = predict_per_frame(model, test_df)

        # Metrics are computed on the original IRI scale.
        train_metrics = _score_split(
            train_out[TARGET].values, train_out["iri_pred"].values
        )
        test_metrics = _score_split(
            test_out[TARGET].values, test_out["iri_pred"].values
        )

        pd.DataFrame([train_metrics]).to_csv(
            os.path.join(train_dir, "train_metrics.csv"), index=False
        )
        pd.DataFrame([test_metrics]).to_csv(
            os.path.join(test_dir, "test_metrics.csv"), index=False
        )

        ensure_dir(os.path.join(train_dir, "perframe"))
        ensure_dir(os.path.join(test_dir, "perframe"))

        train_out.to_csv(
            os.path.join(train_dir, "perframe", "train_predictions.csv"),
            index=False
        )
        test_out.to_csv(
            os.path.join(test_dir, "perframe", "test_predictions.csv"),
            index=False
        )

        plot_per_video(train_out, os.path.join(train_dir, "plots"), "train")
        plot_per_video(test_out, os.path.join(test_dir, "plots"), "test")

        print(
            f"Fold {fold} | Test RMSE: {test_metrics['rmse']:.3f} "
            f"| Corr: {test_metrics['correlation']:.3f}"
        )

    print("\nAll RF folds completed.")


## 6. Run the Training Pipeline

Load the prepared training dataset and start
Random Forest training with group-based cross-validation.


In [26]:
# Load the prepared training table and run the full cross-validation
# pipeline on it; results are written below BASE_RESULTS_DIR.
# NOTE(review): the input is read from "Raw_data_1/" while results go to
# "Raw_data/" - confirm the differing folder names are intentional.
df = pd.read_csv("Raw_data_1/iri_training_data.csv")
run_iri_rf_cv(df)



Fold 1 | Test RMSE: 1.224 | Corr: 0.166

Fold 2 | Test RMSE: 1.192 | Corr: 0.613

Fold 3 | Test RMSE: 1.460 | Corr: 0.653

Fold 4 | Test RMSE: 1.597 | Corr: 0.093

All RF folds completed.
