In [49]:
import pandas as pd
# Load holidays_only_df.csv (path relative to the working directory); comma is the default separator.
holidays_only_df = pd.read_csv("holidays_only_df.csv")

In [51]:
# Load test_holidays_only.csv (path relative to the working directory); comma is the default separator.
test_holidays_only = pd.read_csv("test_holidays_only.csv")

In [53]:
# Load train_holidays_only.csv (path relative to the working directory); comma is the default separator.
train_holidays_only = pd.read_csv("train_holidays_only.csv")

In [55]:
# Load no_holidays_df.csv (path relative to the working directory); comma is the default separator.
no_holidays_df = pd.read_csv("no_holidays_df.csv")

In [56]:
# Load test_no_holidays.csv (path relative to the working directory); comma is the default separator.
test_no_holidays = pd.read_csv("test_no_holidays.csv")

In [59]:
# Load train_no_holidays.csv (path relative to the working directory); comma is the default separator.
train_no_holidays = pd.read_csv("train_no_holidays.csv")

In [61]:
# Load all_dates_inclholidays.csv (path relative to the working directory); comma is the default separator.
all_dates_inclholidays = pd.read_csv("all_dates_inclholidays.csv")

In [63]:
# Load test_all_dates_inclholidays.csv (path relative to the working directory); comma is the default separator.
test_all_dates_inclholidays = pd.read_csv("test_all_dates_inclholidays.csv")

In [65]:
# Load train_all_dates_inclholidays.csv (path relative to the working directory); comma is the default separator.
train_all_dates_inclholidays = pd.read_csv("train_all_dates_inclholidays.csv")

In [66]:
# We need an extremely simple baseline that requires minimal modeling assumptions,
# so that we can show that our other “smarter” models add value.
# Trolleys are used whenever packages physically move: outbound (packages leaving)
# and inbound (packages returning). Package movement is therefore used as a proxy for trolley usage.
# Baseline:
#   packages moved yesterday = trolleys needed today
#   trolleys(t) = total_issued_packages(t-1) + total_returned_packages(t-1)
# So the predicted number of trolleys is the flow of packages in the previous period.

In [69]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

# Baseline: predicted trolleys = lag-1 issued packages + lag-1 returned packages.

def add_baseline_pred(df: pd.DataFrame,
                      issued_col: str = "total_issued_lag_1",
                      returned_col: str = "total_returned_lag_1",
                      pred_col: str = "trolleys_pred_baseline") -> pd.DataFrame:
    """Return a copy of ``df`` with the naive baseline prediction appended.

    The prediction is the row-wise sum of ``issued_col`` and ``returned_col``,
    where a missing value in either column counts as 0.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is never modified.
    issued_col, returned_col : str
        Names of the two columns that are added together.
    pred_col : str
        Name of the column that receives the prediction (overwritten in the
        copy if it already exists).

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` containing ``pred_col``.

    Raises
    ------
    KeyError
        If ``issued_col`` or ``returned_col`` is not a column of ``df``; the
        first absent one (in that order) is named in the message.
    """
    # Fail early with an explicit message rather than on the lookup below.
    absent = [col for col in (issued_col, returned_col) if col not in df.columns]
    if absent:
        raise KeyError(f"Missing column '{absent[0]}' in dataframe.")

    # Missing inputs contribute nothing to the sum instead of turning it into NaN.
    issued = df[issued_col].fillna(0)
    returned = df[returned_col].fillna(0)

    # Work on a copy so the caller's frame stays untouched.
    with_pred = df.copy()
    with_pred[pred_col] = issued + returned
    return with_pred
    
def evaluate_baseline(df: pd.DataFrame,
                      y_col: str = "trolleys",
                      pred_col: str = "trolleys_pred_baseline") -> dict:
    """Score a prediction column against the target column with MAE.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding both the observed and the predicted values.
    y_col : str
        Name of the column with the observed values.
    pred_col : str
        Name of the column with the predicted values.

    Returns
    -------
    dict
        ``{"n_rows": <number of rows in df>, "MAE": <mean absolute error>}``.
        For a frame with zero rows ``MAE`` is NaN.

    Raises
    ------
    KeyError
        If ``y_col`` or ``pred_col`` is not a column of ``df``.
    """
    # Same explicit check and message as add_baseline_pred, so a wrong column
    # name is reported clearly instead of surfacing as a bare lookup error.
    for c in (y_col, pred_col):
        if c not in df.columns:
            raise KeyError(f"Missing column '{c}' in dataframe.")

    n_rows = len(df)
    if n_rows == 0:
        # There is nothing to average over; report NaN so that one empty split
        # does not abort a loop that evaluates several splits.
        return {"n_rows": 0, "MAE": float("nan")}

    y_true = df[y_col].astype(float)
    y_pred = df[pred_col].astype(float)

    # NOTE(review): NaN values in either column are passed to
    # mean_absolute_error unchanged -- confirm how the metric should treat them.
    return {
        "n_rows": n_rows,
        "MAE": mean_absolute_error(y_true, y_pred)
    }

In [71]:
# Test splits to score; each key becomes a row label in the summary table.
test_sets = {
    "holidays_only": test_holidays_only,
    "no_holidays": test_no_holidays,
    "all_dates_inclholidays": test_all_dates_inclholidays,
}

results = []
for name, df_test in test_sets.items():
    # Add the baseline prediction to a copy of the split, then score it.
    df_pred = add_baseline_pred(df_test)
    metrics = {**evaluate_baseline(df_pred), "dataset": name}
    results.append(metrics)

# One row per split, indexed by the split label.
baseline_metrics = pd.DataFrame(results)
baseline_metrics = baseline_metrics.set_index("dataset")
print(baseline_metrics)

                        n_rows           MAE
dataset                                     
holidays_only             3265  1.435126e+06
no_holidays              14521  1.278229e+06
all_dates_inclholidays   17786  1.307031e+06


In [None]:
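# The naive logistics baseline yields a mean absolute error (MAE) of approximately 1.3 million trolleys
# on all three test sets (about 1.44M for holidays only, 1.28M for no holidays, 1.31M for all dates).
# MAE is scale-dependent, so this figure should be read against the typical magnitude of `trolleys`.
# This indicates that while total package movement provides a reasonable proxy for logistics activity,
# substantial prediction error remains due to unmodeled seasonal, product, and location-specific effects.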