# ETA-Insight: Multi-Model Evaluation & Advanced Visualization

This notebook trains and compares **four models** on the augmented 600k-row dataset:
1.  **XGBoost** (Gradient Boosting)
2.  **LightGBM** (Gradient Boosting - Fast)
3.  **Random Forest** (Bagging - Strong Baseline)
4.  **Linear Regression** (Baseline - Simple)

It also includes advanced visualizations:
-   **Mode-wise Error**: Boxplots showing the absolute-error distribution for Air vs. Rail vs. Road.
-   **Feature Importance**: What drives the ETA?
-   **Residual Analysis**: Are errors random or biased?

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Show every column when a DataFrame is displayed (None disables the column limit).
pd.set_option('display.max_columns', None)
# Apply seaborn's "whitegrid" style to the plots produced later in the notebook.
sns.set_style("whitegrid")

## 1. Load Data & Preprocess

In [None]:
def load_and_preprocess(file_path='Cleaned_Training_Data_Augmented.csv', test_size=0.2, random_state=42):
    """Load the training CSV and split it into train and validation frames.

    Parameters
    ----------
    file_path : str, default 'Cleaned_Training_Data_Augmented.csv'
        Path of the CSV file to read.
    test_size : float, default 0.2
        Fraction of rows held out for validation.
    random_state : int, default 42
        Seed forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, val)`` as independent copies of the selected rows.
    """
    print(f"Loading: {file_path}")
    df = pd.read_csv(file_path)

    # Split
    train, val = train_test_split(df, test_size=test_size, random_state=random_state)
    # Detach both parts from ``df``: callers add and overwrite columns on these
    # frames, which would otherwise raise SettingWithCopyWarning.
    train, val = train.copy(), val.copy()
    print(f"Training: {len(train)} | Validation: {len(val)}")
    return train, val

df_train, df_val = load_and_preprocess()

FEATURES = ['PolCode', 'PodCode', 'ModeOfTransport', 'Route']
TARGET = 'Actual_Duration_Hours'

# Feature Engineering
# 'Route' is the string "<PolCode>_<PodCode>_<ModeOfTransport>"; it is built the
# same way for both splits, so do it in one loop instead of two copied lines.
for frame in (df_train, df_val):
    frame['Route'] = (
        frame['PolCode'].astype(str)
        + "_" + frame['PodCode'].astype(str)
        + "_" + frame['ModeOfTransport'].astype(str)
    )

# Label Encoding
# Each encoder is fitted on train + validation values together so that a value
# seen only in the validation split does not make transform() fail.
combined = pd.concat([df_train[FEATURES], df_val[FEATURES]], axis=0)

# Keep the fitted encoders: the feature columns are overwritten with integer
# IDs below, and label_encoders[col].inverse_transform(...) is the only way to
# get the original labels back (e.g. for plot axes).
label_encoders = {}
for col in FEATURES:
    le = LabelEncoder()
    le.fit(combined[col].astype(str))
    df_train[col] = le.transform(df_train[col].astype(str))
    df_val[col] = le.transform(df_val[col].astype(str))
    label_encoders[col] = le

X_train = df_train[FEATURES]
y_train = df_train[TARGET]
X_val = df_val[FEATURES]
y_val = df_val[TARGET]

# Log Transform Target
# Models are fitted on log1p(target); predictions are mapped back with expm1.
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

def calculate_wape(y_true, y_pred):
    """Weighted Absolute Percentage Error: sum(|y_true - y_pred|) / sum(|y_true|).

    Both inputs are converted to flat float NumPy arrays before comparison, so
    values are matched by position. Without this, two pandas Series with
    different indices would be aligned on index labels and produce NaNs that
    ``sum`` silently skips, giving a wrong score.

    Parameters
    ----------
    y_true : array-like
        Observed values (list, NumPy array or pandas Series).
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    float
        The WAPE as a fraction (0.1 means 10%). NaN when ``sum(|y_true|)`` is
        zero, because the ratio is undefined in that case.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    total_actual = np.abs(actual).sum()
    if total_actual == 0:
        # Avoid a divide-by-zero warning and an inf/NaN that depends on the numerator.
        return np.nan
    return np.abs(actual - predicted).sum() / total_actual

## 2. Model Training (4 Models)

In [None]:
# One seed for every learner that accepts one, so a Restart & Run All
# reproduces the same models (matches the seed used for the split).
RANDOM_STATE = 42

# Maps model name -> validation predictions in hours (already mapped back from
# the log1p scale with expm1).
model_results = {}

# --- 1. Linear Regression (Baseline) ---
print("Training Linear Regression...")
lr = LinearRegression()
lr.fit(X_train, y_train_log)
preds_lr = np.expm1(lr.predict(X_val))
model_results['Linear Regression'] = preds_lr

# --- 2. Random Forest (simplified) ---
# No row sampling happens here: the forest is kept fast on the full training
# set by limiting it to 20 trees of depth <= 10.
print("Training Random Forest (reduced size for speed)...")
rf = RandomForestRegressor(n_estimators=20, max_depth=10, n_jobs=4, random_state=RANDOM_STATE)
rf.fit(X_train, y_train_log)
preds_rf = np.expm1(rf.predict(X_val))
model_results['Random Forest'] = preds_rf

# --- 3. XGBoost ---
print("Training XGBoost...")
xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=10,
    n_jobs=4,
    random_state=RANDOM_STATE,
)
xg_reg.fit(X_train, y_train_log)
preds_xgb = np.expm1(xg_reg.predict(X_val))
model_results['XGBoost'] = preds_xgb

# --- 4. LightGBM ---
print("Training LightGBM...")
lgb_train = lgb.Dataset(X_train, y_train_log)
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'num_leaves': 50,
    'learning_rate': 0.05,
    'verbose': -1,
    'seed': RANDOM_STATE,
}
gbm = lgb.train(params, lgb_train, num_boost_round=1000)
preds_lgb = np.expm1(gbm.predict(X_val))
model_results['LightGBM'] = preds_lgb

## 3. Advanced Evaluation Metrics

In [None]:
def _score_model(model_name, predictions):
    """Build one row of the comparison table for a model's validation predictions.

    Scores are computed against ``y_val`` in hours: accuracy as 100 * (1 - WAPE)
    formatted as a percentage string, plus RMSE, MAE and R2.
    """
    wape = calculate_wape(y_val, predictions)
    root_mse = np.sqrt(mean_squared_error(y_val, predictions))
    abs_err = mean_absolute_error(y_val, predictions)
    r_squared = r2_score(y_val, predictions)
    return {
        'Model': model_name,
        'Accuracy (1-WAPE)': f"{100 * (1 - wape):.2f}%",
        'RMSE (Hours)': round(root_mse, 2),
        'MAE (Hours)': round(abs_err, 2),
        'R2 Score': round(r_squared, 4),
    }


# One record per trained model, in the order the models were added.
results_df = [
    _score_model(model_name, predictions)
    for model_name, predictions in model_results.items()
]

metrics_table = pd.DataFrame(results_df).set_index('Model')
print(metrics_table)

## 4. Visualizations

In [None]:
# A. Mode-wise error analysis (boxplot)
# Store the XGBoost predictions and their absolute error on the validation
# frame so they can be grouped by transport mode.
df_val['Pred_XGB'] = preds_xgb
df_val['Abs_Error'] = (df_val['Actual_Duration_Hours'] - df_val['Pred_XGB']).abs()

# The x axis shows the label-encoded mode IDs, not the original mode names.
# NOTE(review): the IDs are presumed to follow the sorted order of the original
# labels (e.g. Air, Rail, Road) -- confirm, or inverse-transform for readable ticks.
fig_mode, ax_mode = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df_val, x='ModeOfTransport', y='Abs_Error', showfliers=False, ax=ax_mode)
ax_mode.set_title('Absolute Error Distribution by Transport Mode (XGBoost)')
ax_mode.set_ylabel('Absolute Error (Hours)')
ax_mode.set_xlabel('Mode ID (Encoded)')
plt.show()

# B. Feature importance (XGBoost), ranked by gain
xgb.plot_importance(
    xg_reg,
    importance_type='gain',
    max_num_features=10,
    title='Feature Importance (Gain)',
    height=0.5,
)
plt.show()

# C. Predicted vs actual (heteroscedasticity check)
# The dashed red diagonal marks perfect predictions.
fig_fit, ax_fit = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=y_val, y=preds_xgb, alpha=0.1, color='blue', ax=ax_fit)
max_actual = y_val.max()
ax_fit.plot([0, max_actual], [0, max_actual], 'r--', lw=2)
ax_fit.set_xlabel('Actual Hours')
ax_fit.set_ylabel('Predicted Hours')
ax_fit.set_title('XGBoost: Predicted vs Actual')
plt.show()

# D. Residual histogram (actual minus predicted, in hours)
fig_resid, ax_resid = plt.subplots(figsize=(10, 6))
residuals = y_val - preds_xgb
sns.histplot(residuals, bins=50, kde=True, color='purple', ax=ax_resid)
ax_resid.set_title('Residual Distribution (Errors)')
ax_resid.set_xlabel('Residual (Actual - Pred)')
plt.show()