In [1]:
# Load data and packages
import csv
from joblib import Parallel, delayed
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import pmdarima as pm
from pmdarima import auto_arima
import os
from scipy.stats import boxcox
from scipy.special import inv_boxcox
from statsmodels.tsa.arima.model import ARIMA

# Build the CSV location with os.path.join so the notebook also runs outside
# Windows (a hard-coded "\\" separator only resolves there).
# Note: despite its name, `directory_path` holds the path of the CSV file.
directory_path = os.path.join(os.getcwd(), "Data", "sorting_event_volumes_2023.csv")

df = pd.read_csv(directory_path)

In [2]:
def fill_missing_events(df, fill_value=0.0001):
    """Add a row for every (sorting center, day, output belt) combination missing from ``df``.

    For each sorting center the calendar runs from 1 January of the year of its
    earliest ``scanning_date`` up to the last day of the month of its latest
    ``scanning_date``. Every output belt seen for that center gets one row per
    calendar day; combinations absent from ``df`` get ``fill_value`` as their
    ``no_of_events``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``sorting_center_name``, ``scanning_date``, ``output_belt``
        and ``no_of_events``. ``scanning_date`` has to be datetime-like because
        ``.year`` / ``.month`` are read from its minimum and maximum.
    fill_value : float, default 0.0001
        Value written into ``no_of_events`` for the inserted rows. The default
        is a small positive number; presumably this keeps the series strictly
        positive for the Box-Cox transform applied later in the notebook.

    Returns
    -------
    pandas.DataFrame
        One frame with the completed rows of all centers, with a fresh
        ``RangeIndex``. An empty input is returned as an (empty) copy.
    """
    if df.empty:
        # pd.concat([]) raises ValueError; there is nothing to fill anyway.
        return df.copy()

    key_columns = ['sorting_center_name', 'scanning_date', 'output_belt']
    filled_frames = []

    for center in df['sorting_center_name'].unique():
        df_center = df[df['sorting_center_name'] == center]
        output_belts = df_center['output_belt'].unique()

        min_date = df_center['scanning_date'].min()
        max_date = df_center['scanning_date'].max()

        # Calendar: start of the first year .. end of the last observed month.
        calendar_start = pd.Timestamp(year=min_date.year, month=1, day=1)
        calendar_end = (
            pd.Timestamp(year=max_date.year, month=max_date.month, day=1)
            + pd.offsets.MonthEnd(0)
        )
        all_dates = pd.date_range(start=calendar_start, end=calendar_end)

        # Cartesian product of center x day x belt, turned into plain columns.
        all_combinations = pd.MultiIndex.from_product(
            [[center], all_dates, output_belts],
            names=key_columns,
        )
        all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()

        # Left merge keeps every combination; unmatched ones have NaN events.
        df_filled_center = pd.merge(all_combinations_df, df_center,
                                    on=key_columns,
                                    how='left')
        df_filled_center['no_of_events'] = df_filled_center['no_of_events'].fillna(fill_value)
        filled_frames.append(df_filled_center)

    return pd.concat(filled_frames, ignore_index=True)

In [None]:
# Data cleaning
print("Number of rows original dataset is: " + str(df.shape[0]))

# One non-mutating chain instead of inplace edits on a `.loc` slice (which
# goes through pandas' chained-assignment / SettingWithCopy machinery):
#   1. keep only "LAJ" events,
#   2. drop the columns that are not used further on,
#   3. drop rows with any missing value,
#   4. make the belt id an integer,
#   5. sum the events per (center, day, belt).
df = (
    df.loc[df["event_type"] == "LAJ", :]
    .drop(['event_location', 'input_belt', 'position'], axis=1)
    .dropna()
    .astype({'output_belt': int})
    .groupby(['sorting_center_name', 'scanning_date', 'output_belt'], as_index=False)['no_of_events']
    .sum()
)
# fill_missing_events reads .year/.month from the dates, so parse them first.
df['scanning_date'] = pd.to_datetime(df['scanning_date'])
df = fill_missing_events(df)

print("Number of rows cleaned dataset is: " + str(df.shape[0]))

In [None]:
# Data preparation
# Derive calendar features from the scan date; columns are added in the
# order day, month, weekday, week, week_of_month, yearday, yearday_sin/cos.
scan_dt = df['scanning_date'].dt

df['day'] = scan_dt.day
df['month'] = scan_dt.month
df['weekday'] = scan_dt.dayofweek + 1  # shift pandas' 0-based weekday to 1..7
df['week'] = scan_dt.isocalendar().week
df['week_of_month'] = (df['day'] - 1) // 7 + 1
df['yearday'] = scan_dt.day_of_year

# Cyclical encoding of the day of the year with a period of 7 days.
# NOTE(review): the columns are called "yearday_*" but the period is one week,
# not one year - confirm that a weekly cycle is what is intended.
weekly_angle = df['yearday'] / 7 * 2 * np.pi
df['yearday_sin'] = np.sin(weekly_angle)
df['yearday_cos'] = np.cos(weekly_angle)

sorting_center_names = df["sorting_center_name"].unique()
# Last expression of the cell: shows the number of rows per sorting center.
df["sorting_center_name"].value_counts()


In [5]:
def tune_hyperparameters(df_output_belt):
    """Search an ARIMA order (p, d, q) for the event series of one output belt.

    The series ``no_of_events`` is Box-Cox transformed, the first row is
    discarded (it has no differenced value), and ``auto_arima`` is run
    non-seasonally and stepwise on the first ~75% of the remaining rows.

    Parameters
    ----------
    df_output_belt : pandas.DataFrame
        Rows of a single output belt in chronological order, with a strictly
        positive ``no_of_events`` column (a Box-Cox requirement). The frame
        is not modified.

    Returns
    -------
    tuple
        ``(p, d, q)`` as reported by ``auto_arima(...).order``.
    """
    # Work on a private copy: callers pass a filtered slice of a larger frame,
    # and adding columns / dropping rows in place would mutate their object.
    belt_df = df_output_belt.copy()

    # Perform Box-Cox transformation (the fitted lambda is not needed here).
    belt_df["no_of_events_boxcox"], _ = boxcox(belt_df["no_of_events"])
    # The differenced column only serves to make dropna() remove the first row,
    # which keeps the sample identical to the one used when the model is scored.
    belt_df["no_of_events_diff"] = belt_df["no_of_events_boxcox"].diff()
    belt_df = belt_df.dropna()

    # Hold out the last 25%. An explicit end index is used because
    # `iloc[:-0]` would select nothing when the series has fewer than 4 rows.
    n_rows = len(belt_df)
    n_holdout = int(n_rows * 0.25)
    train_groupby = belt_df.iloc[:n_rows - n_holdout]

    # Tune the model
    tuning_model = auto_arima(train_groupby["no_of_events_boxcox"],
                              seasonal=False,
                              stepwise=True,
                              suppress_warnings=True,
                              trace=False)

    p, d, q = tuning_model.order
    return (p, d, q)

def defineHyperparameters(df, sortingcenter, output_dir="Data/hyperparameters ARIMA"):
    """Tune ARIMA orders for every output belt of one sorting center and save them.

    The belts are tuned in parallel with ``tune_hyperparameters``. The result
    is written to ``<output_dir>/hyperparameters_ARIMA_<sortingcenter>.csv``
    with the columns ``p``, ``d``, ``q``; row *i* belongs to the *i*-th belt in
    ``df_sorting_center["output_belt"].unique()`` order (the belt id itself is
    not stored).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``sorting_center_name``, ``output_belt`` and ``no_of_events``.
    sortingcenter :
        Value of ``sorting_center_name`` to select.
    output_dir : str, default "Data/hyperparameters ARIMA"
        Folder for the CSV; created when missing. The default is the folder
        the evaluation cell of this notebook reads the files back from, so a
        fresh run no longer needs the files to be moved by hand.

    Returns
    -------
    list
        The ``(p, d, q)`` tuples, in the same order as the CSV rows.
    """
    # .drop() without inplace returns a new frame, so the filtered slice of
    # the caller's DataFrame is never modified.
    df_sorting_center = df[df["sorting_center_name"] == sortingcenter].drop(
        columns=["sorting_center_name"]
    )
    output_belts = df_sorting_center["output_belt"].unique()

    # n_jobs=-1: let joblib use all available cores, one task per belt.
    hyperparameterList = list(Parallel(n_jobs=-1)(
        delayed(tune_hyperparameters)(
            df_sorting_center[df_sorting_center["output_belt"] == output_belt])
        for output_belt in output_belts))

    hyperparameter_df = pd.DataFrame(hyperparameterList, columns=["p", "d", "q"])
    os.makedirs(output_dir, exist_ok=True)
    hyperparameter_df.to_csv(
        os.path.join(output_dir, f'hyperparameters_ARIMA_{sortingcenter}.csv'),
        index=False)

    return hyperparameterList

In [None]:
# Tune the ARIMA orders for every sorting center in turn. Each call tunes all
# output belts of that center and writes one CSV with the resulting orders.
sorting_center_names = df["sorting_center_name"].unique()

for sorting_center_name in sorting_center_names:
    # The returned list is not needed here; the orders are read back from CSV.
    defineHyperparameters(df, sorting_center_name)

In [6]:
def plot_forecasts(train, test, forecasts, title):
    """Show the training data, the held-out data and the forecasts in one figure.

    Parameters
    ----------
    train, test :
        Objects indexable by ``"scanning_date"`` and ``"no_of_events"``
        (e.g. DataFrames); they supply the x and y values of their traces.
    forecasts :
        Predicted values, plotted against ``test["scanning_date"]``.
    title : str
        Figure title.

    Returns
    -------
    Whatever ``fig.show()`` returns; the figure is displayed as a side effect.
    """
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=train["scanning_date"], y=train["no_of_events"], name="Train"))
    fig.add_trace(go.Scatter(x=test["scanning_date"], y=test["no_of_events"], name="Test"))
    # The forecast trace needs its own legend label; it used to be a second "Test".
    fig.add_trace(go.Scatter(x=test["scanning_date"], y=forecasts, name="Forecast"))
    fig.update_layout(template="simple_white", font=dict(size=18), title_text=title,
                      width=650, title_x=0.5, height=400, xaxis_title="scanning_date",
                      yaxis_title="no_of_events")

    return fig.show()

In [32]:
# Test ARIMA per output belt
def ARIMA_model(df, sortingcenter, hyperparameters):
    """Fit one ARIMA model per output belt of a sorting center and score its forecasts.

    Per belt the ``no_of_events`` series is Box-Cox transformed and its first
    row discarded. The model is fitted on the first ~75% of the rows and
    forecasts the window that follows, i.e. the rows between the 75% and 80%
    marks. Forecasts are transformed back before the errors are computed.

    NOTE(review): the Box-Cox lambda is estimated on the whole series, test
    window included - confirm this is acceptable for the evaluation.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``sorting_center_name``, ``output_belt`` and a strictly positive
        ``no_of_events`` column; rows are assumed to be in chronological order
        per belt. The frame is not modified.
    sortingcenter :
        Value of ``sorting_center_name`` to evaluate.
    hyperparameters : sequence of (p, d, q)
        One order per belt, positionally aligned with
        ``df_sortingcenter['output_belt'].unique()``.

    Returns
    -------
    tuple of dict
        ``daily_errors`` - ``{day: {"mse": [...], "mae": [...]}}`` holding the
        squared and absolute error of every belt for forecast day ``day``
        (0-based); ``daily_mse`` and ``daily_mae`` - the per-day means of
        those lists. Belts too short to yield both a training and a test
        window are skipped.
    """
    df_sortingcenter = df[df["sorting_center_name"] == sortingcenter]
    output_belts = df_sortingcenter['output_belt'].unique()

    daily_errors = {}

    for index, output_belt in enumerate(output_belts):
        # Copy so the new columns are not written into a slice of `df`.
        df_output_belt = df_sortingcenter[df_sortingcenter["output_belt"] == output_belt].copy()

        df_output_belt["no_of_events_boxcox"], lam = boxcox(df_output_belt["no_of_events"])
        # Only used so that dropna() removes the first row, like in the tuning step.
        df_output_belt["no_of_events_diff"] = df_output_belt["no_of_events_boxcox"].diff()
        df_output_belt = df_output_belt.dropna()

        # Explicit positions instead of negative slices: `iloc[:-0]` and
        # `iloc[-k:-0]` are empty, which silently broke very short series.
        n_rows = len(df_output_belt)
        test_start = n_rows - int(n_rows * 0.25)
        test_end = n_rows - int(n_rows * 0.20)
        train = df_output_belt.iloc[:test_start]
        test = df_output_belt.iloc[test_start:test_end]
        if train.empty or test.empty:
            # Nothing to fit or nothing to score for this belt.
            continue

        # Orders read back from CSV arrive as numpy integers; use plain ints.
        p, d, q = hyperparameters[index]
        arima_model = ARIMA(train["no_of_events_boxcox"], order=(int(p), int(d), int(q))).fit()

        boxcox_forecast = arima_model.forecast(len(test))
        forecasts = inv_boxcox(boxcox_forecast, lam)

        #plot_forecasts(train, test, forecasts, "ARIMA")

        # Positional comparison: forecast step k belongs to test row k.
        residuals = test["no_of_events"].to_numpy(dtype=float) - np.asarray(forecasts, dtype=float)
        for day, residual in enumerate(residuals):
            day_errors = daily_errors.setdefault(day, {"mse": [], "mae": []})
            day_errors["mse"].append(residual ** 2)
            day_errors["mae"].append(abs(residual))

    daily_mse = {day: sum(errors["mse"]) / len(errors["mse"])
                 for day, errors in daily_errors.items()}
    daily_mae = {day: sum(errors["mae"]) / len(errors["mae"])
                 for day, errors in daily_errors.items()}

    return daily_errors, daily_mse, daily_mae


In [None]:
sorting_center_names = df["sorting_center_name"].unique()
MSE_dict = {}   # mean of the daily MSE values, per sorting center (+ "total")
VSE_dict = {}   # sample variance (ddof=1) of the daily MSE values
MAE_dict = {}   # mean of the daily MAE values
daily_errors = {}   # {sorting center: {day: {"mse": [...], "mae": [...]}}}

# Score every sorting center with the orders stored by the tuning step.
for sorting_center_name in sorting_center_names:
    hyperparameters_df = pd.read_csv("Data/hyperparameters ARIMA/hyperparameters_ARIMA_{}.csv".format(sorting_center_name))
    hyperparameterList = [tuple(row) for row in hyperparameters_df.to_numpy()]
    daily_errors_sorting_center, mse, mae = ARIMA_model(df, sorting_center_name, hyperparameterList)

    daily_errors[sorting_center_name] = daily_errors_sorting_center
    MSE_dict[sorting_center_name] = sum(mse.values()) / len(mse)
    VSE_dict[sorting_center_name] = np.var(list(mse.values()), ddof=1)
    MAE_dict[sorting_center_name] = sum(mae.values()) / len(mae)

    print(MSE_dict[sorting_center_name], VSE_dict[sorting_center_name], MAE_dict[sorting_center_name])

daily_mse = {}
daily_mae = {}

# `daily_errors` is keyed by sorting center, so len(daily_errors) is the number
# of centers, not the number of forecast days. Collect the day indices that
# actually occur in the per-center dictionaries instead.
all_days = sorted({day
                   for center_errors in daily_errors.values()
                   for day in center_errors})

# Pool the errors of all belts of all centers per forecast day.
for day in all_days:
    mse = 0
    mae = 0
    n_output_belts = 0
    for sorting_center_name in sorting_center_names:
        day_errors = daily_errors[sorting_center_name].get(day)
        if day_errors is None:
            # This center has a shorter test window than others.
            continue
        mse += sum(day_errors["mse"])
        mae += sum(day_errors["mae"])
        n_output_belts += len(day_errors["mse"])
    daily_mse[day] = mse / n_output_belts
    daily_mae[day] = mae / n_output_belts

print(daily_mse)

MSE_dict["total"] = sum(daily_mse.values()) / len(daily_mse)
VSE_dict["total"] = np.var(list(daily_mse.values()), ddof=1)
MAE_dict["total"] = sum(daily_mae.values()) / len(daily_mae)

# Make sure the target folder exists before writing the summary.
os.makedirs("Results", exist_ok=True)
with open("Results/results_ARIMA.csv", mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Sorting center', 'MSE', 'VSE', 'MAE'])

    for key in MSE_dict.keys():
        writer.writerow([key, MSE_dict[key], VSE_dict[key], MAE_dict[key]])


In [None]:
# Template for computing the MSE and its variance (VSE) over a forecast horizon.
# NOTE: this cell reloads the raw CSV and thereby replaces the cleaned `df`
# built earlier in the notebook.
df = pd.read_csv(os.path.join(os.getcwd(), "Data", "sorting_event_volumes_2023.csv"))

days = 14
output_belts = df["output_belt"].unique()
number_of_output_belts = len(output_belts)
daily_errors = {}   # {day: [squared deviation of each scored belt]}
daily_mse = {}      # {day: mean squared deviation over the scored belts}

# For each output belt make forecasts
for output_belt in output_belts:
    # TODO: fill `test` with the observed values and `forecast` with the model
    # output for this belt; both are still empty placeholders.
    test = pd.DataFrame()
    forecast = pd.DataFrame()

    # Without `days` rows on both sides there is nothing to score; indexing
    # the empty placeholders with .iloc would raise IndexError.
    if len(test) < days or len(forecast) < days:
        continue

    # For each day, calculate the squared deviation
    for day in range(days):
        actual = test.iloc[day]
        # Separate name: reusing `forecast` here would overwrite the frame
        # after the first iteration.
        predicted = forecast.iloc[day]

        squared_deviation = (actual - predicted) ** 2

        # Start an empty list the first time a day is seen.
        daily_errors.setdefault(day, []).append(squared_deviation)

# Calculate for each day the average squared deviation
for day in range(days):
    day_errors = daily_errors.get(day, [])
    if day_errors:
        # daily_errors[day] is a list (it has no .values()); average over the
        # belts that were actually scored.
        daily_mse[day] = sum(day_errors) / len(day_errors)

# NaN when there is nothing (MSE) or only a single day (VSE, ddof=1) to aggregate.
MSE = sum(daily_mse.values()) / len(daily_mse) if daily_mse else float("nan")
VSE = np.var(list(daily_mse.values()), ddof=1) if len(daily_mse) > 1 else float("nan")