# Model Comparison & Final Operational Report
## Objective
1.  **Benchmark**: Random Forest vs. XGBoost.
2.  **Metrics**: RMSE, MAE, **Huber Loss** (Robustness), and **Distribution Error**.
3.  **Final Output**: A "Next State" predictor for every company.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder

# 1. Load & Prep Data
df = pd.read_csv('shipment_booking_data_2021_2025.csv')b
df['booking_date'] = pd.to_datetime(df['booking_date'])

df_daily = df.set_index('booking_date').groupby([
    'company_name', 
    pd.Grouper(freq='D')
])['shipment_type'].value_counts().unstack(fill_value=0).reset_index()
df_daily.columns.name = None

# Feature Engineering (Reusing proven logic)
df_daily = df_daily.sort_values(['company_name', 'booking_date']).reset_index(drop=True)
target_cols = ['Air', 'Express', 'International', 'Surface']
feat_cols = []

for col in target_cols:
    if col not in df_daily.columns: df_daily[col] = 0
    # Targets
    df_daily[f'target_{col}'] = df_daily.groupby('company_name')[col].shift(-1)
    # Features
    df_daily[f'lag_1_{col}'] = df_daily.groupby('company_name')[col].shift(0)
    feat_cols.append(f'lag_1_{col}')
    df_daily[f'lag_7_{col}'] = df_daily.groupby('company_name')[col].shift(6)
    feat_cols.append(f'lag_7_{col}')
    df_daily[f'roll_7_{col}'] = df_daily.groupby('company_name')[col].transform(lambda x: x.rolling(7, min_periods=1).mean())
    feat_cols.append(f'roll_7_{col}')

df_daily['day_of_week'] = df_daily['booking_date'].dt.dayofweek
df_daily['day'] = df_daily['booking_date'].dt.day
df_daily['month'] = df_daily['booking_date'].dt.month
feat_cols.extend(['day_of_week', 'day', 'month'])

le = LabelEncoder()
df_daily['company_encoded'] = le.fit_transform(df_daily['company_name'])
feat_cols.append('company_encoded')

df_clean = df_daily.dropna().copy()
print("Data Ready. Shape:", df_clean.shape)

Data Ready. Shape: (14551, 26)


## 2. Model Benchmarking
Comparing **Random Forest** (Baseline) vs **XGBoost** (Challenger) on the Test Set (Nov-Dec 2025).

In [2]:
from sklearn.linear_model import HuberRegressor # Using sklearn's Huber loss logic for metric calc

def huber_loss(y_true, y_pred, delta=1.0):
    error = y_true - y_pred
    is_small_error = np.abs(error) <= delta
    squared_loss = 0.5 * error**2
    linear_loss = delta * (np.abs(error) - 0.5 * delta)
    return np.where(is_small_error, squared_loss, linear_loss).mean()

split_date = '2025-11-01'
train_df = df_clean[df_clean['booking_date'] < split_date]
test_df = df_clean[df_clean['booking_date'] >= split_date].copy()

results = []

for target in target_cols:
    X_train = train_df[feat_cols]
    y_train = train_df[f'target_{target}']
    X_test = test_df[feat_cols]
    y_test = test_df[f'target_{target}']
    
    # 1. Random Forest
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)
    rf_pred = rf.predict(X_test)
    
    # 2. XGBoost
    xgb = XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=6, random_state=42)
    xgb.fit(X_train, y_train)
    xgb_pred = xgb.predict(X_test)
    
    # Metrics
    for name, pred in [('Random Forest', rf_pred), ('XGBoost', xgb_pred)]:
        r_mae = mean_absolute_error(y_test, pred)
        r_rmse = np.sqrt(mean_squared_error(y_test, pred))
        r_huber = huber_loss(y_test, pred, delta=1.35)
        
        # Save comparison
        results.append({
            'Shipment Type': target,
            'Model': name,
            'MAE': r_mae,
            'RMSE': r_rmse,
            'Huber Loss': r_huber
        })
        
        # Save predictions to test_df for Distribution calc later
        # Naming convention: Pred_{Model}_{Type}
        test_df[f'Pred_{name}_{target}'] = np.maximum(0, pred)

res_df = pd.DataFrame(results)
print("\n--- MODEL PERFORMANCE --- ")
print(res_df.pivot(index='Shipment Type', columns='Model', values=['MAE', 'Huber Loss']).round(2))


--- MODEL PERFORMANCE --- 
                        MAE            Huber Loss        
Model         Random Forest XGBoost Random Forest XGBoost
Shipment Type                                            
Air                    1.99    1.94          1.90    1.84
Express                1.39    1.41          1.15    1.18
International          0.70    0.69          0.35    0.35
Surface                1.97    1.98          1.88    1.90


## 3. Distribution Accuracy
Evaluating how well each model predicts the **mix** (percentage share) of shipment types.

In [3]:
# Determine Winning Model for Final Prediction
# Calculate Total Distribution Error (Sum of absolute difference in % shares)

models_list = ['Random Forest', 'XGBoost']
dist_errors = {}

for model in models_list:
    # Calculate sum of volumes for this model's predictions
    total_pred = sum(test_df[f'Pred_{model}_{t}'] for t in target_cols)
    total_actual = sum(test_df[f'target_{t}'] for t in target_cols)
    
    error_accum = 0
    for t in target_cols:
        # Avoid div by zero
        actual_share = (test_df[f'target_{t}'] / total_actual).fillna(0)
        pred_share = (test_df[f'Pred_{model}_{t}'] / total_pred).fillna(0)
        
        # Error = |Actual% - Pred%|
        error_accum += (abs(actual_share - pred_share) * 100).mean()
        
    dist_errors[model] = error_accum / len(target_cols) # Avg % error per category

print("\n--- AVERAGE DISTRIBUTION ERROR (% deviation per category) ---")
for m, err in dist_errors.items():
    print(f"{m}: {err:.2f}%")
    
winner = min(dist_errors, key=dist_errors.get)
print(f"\n>> WINNER: {winner}")


--- AVERAGE DISTRIBUTION ERROR (% deviation per category) ---
Random Forest: 9.24%
XGBoost: 9.26%

>> WINNER: Random Forest


## 4. Final Operational Forecast (The "Oracle")
Generating the final Next Day predictions using the **Winner Model (XGBoost usually)**.

In [4]:
# Train Winning Model on FULL Data
final_models = {}
for target in target_cols:
    if winner == 'XGBoost':
        m = XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=6, random_state=42)
    else:
        m = RandomForestRegressor(n_estimators=100, random_state=42)
        
    m.fit(df_clean[feat_cols], df_clean[f'target_{target}'])
    final_models[target] = m

def predict_next_state(company_name):
    # Get latest data point
    last_row = df_clean[df_clean['company_name'] == company_name].sort_values('booking_date').iloc[-1]
    
    # Update Time Features for 'Tomorrow'
    next_date = last_row['booking_date'] + pd.Timedelta(days=1)
    
    # Construct Input Vector
    input_data = last_row[feat_cols].copy()
    input_data['day_of_week'] = next_date.dayofweek
    input_data['day'] = next_date.day
    input_data['month'] = next_date.month

    # Predict
    preds = {}
    total_vol = 0
    for t in target_cols:
        val = final_models[t].predict(pd.DataFrame([input_data]))[0]
        val = int(max(0, round(val)))
        preds[t] = val
        total_vol += val
        
    # Calculate Distribution
    dist_str = []
    for t in target_cols:
        pct = (preds[t] / total_vol * 100) if total_vol > 0 else 0
        dist_str.append(f"{t}: {pct:.1f}%")
        
    print(f"\n=== PREDICTION FOR: {company_name} ===")
    print(f"Next Booking Date: {next_date.strftime('%Y-%m-%d')} (Tomorrow)")
    print(f"Predicted Total Volume: {total_vol}")
    print(f"Shipment Distribution: {', '.join(dist_str)}")
    
    return next_date, total_vol, preds

# Run for all companies
all_companies = df['company_name'].unique()
for comp in all_companies:
    predict_next_state(comp)


=== PREDICTION FOR: BlueDart ===
Next Booking Date: 2025-12-31 (Tomorrow)
Predicted Total Volume: 12
Shipment Distribution: Air: 33.3%, Express: 25.0%, International: 0.0%, Surface: 41.7%

=== PREDICTION FOR: Delhivery ===
Next Booking Date: 2025-12-31 (Tomorrow)
Predicted Total Volume: 23
Shipment Distribution: Air: 26.1%, Express: 21.7%, International: 4.3%, Surface: 47.8%

=== PREDICTION FOR: DTDC ===
Next Booking Date: 2025-12-31 (Tomorrow)
Predicted Total Volume: 13
Shipment Distribution: Air: 30.8%, Express: 15.4%, International: 0.0%, Surface: 53.8%

=== PREDICTION FOR: FedEx India ===
Next Booking Date: 2025-12-31 (Tomorrow)
Predicted Total Volume: 10
Shipment Distribution: Air: 30.0%, Express: 20.0%, International: 10.0%, Surface: 40.0%

=== PREDICTION FOR: DHL Express ===
Next Booking Date: 2025-12-31 (Tomorrow)
Predicted Total Volume: 19
Shipment Distribution: Air: 31.6%, Express: 21.1%, International: 10.5%, Surface: 36.8%

=== PREDICTION FOR: XpressBees ===
Next Booking D