In [9]:
# Load data and packages
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import pmdarima as pm
from pmdarima import auto_arima
from scipy.stats import boxcox
from scipy.special import inv_boxcox
from statsmodels.tsa.arima.model import ARIMA

# Build the CSV location from path components instead of hard-coding "\\":
# a backslash is only a path separator on Windows, so the literal form fails
# to locate the file on macOS/Linux.
directory_path = os.path.join(os.getcwd(), "Data", "sorting_event_volumes_2023.csv")

# Raw sorting-event volumes as read from disk.
df = pd.read_csv(directory_path)

In [10]:
def fill_missing_events(df, fill_value=0.0001):
    """Add a row for every (sorting center, date, output belt) combination.

    The full grid is the Cartesian product of all distinct sorting centers,
    every calendar day between the earliest and latest ``scanning_date``, and
    all distinct output belts found in ``df``. Combinations that are absent
    from ``df`` get ``fill_value`` as their ``no_of_events``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``sorting_center_name``, ``scanning_date``,
        ``output_belt`` and ``no_of_events``. ``scanning_date`` has to be a
        datetime column already, otherwise the merge keys will not match the
        generated date range.
    fill_value : float, default 0.0001
        Value written into ``no_of_events`` where no row existed. The default
        is a small positive number rather than 0, presumably so the column
        stays strictly positive for a later Box-Cox transform.

    Returns
    -------
    pandas.DataFrame
        One row per grid combination, in grid order, with the original
        columns of ``df`` joined on.
    """
    key_columns = ['sorting_center_name', 'scanning_date', 'output_belt']

    # Daily range spanning the observed period (date_range defaults to daily frequency).
    all_dates = pd.date_range(start=df['scanning_date'].min(), end=df['scanning_date'].max())

    all_combinations = pd.MultiIndex.from_product(
        [df['sorting_center_name'].unique(), all_dates, df['output_belt'].unique()],
        names=key_columns,
    )
    all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()

    # Left join keeps every grid row; combinations without data come back as NaN.
    df_filled = pd.merge(all_combinations_df, df, on=key_columns, how='left')
    df_filled['no_of_events'] = df_filled['no_of_events'].fillna(fill_value)

    return df_filled

In [None]:
# Data cleaning
print("Number of rows original dataset is: " + str(df.shape[0]))

# One non-mutating chain instead of inplace drop/dropna on a filtered slice,
# which is the pattern that triggers pandas' SettingWithCopyWarning.
# NOTE: `df` is rebound to the aggregated frame, which no longer has an
# `event_type` column, so re-running this cell requires re-running the load
# cell first.
df = (
    df.loc[df["event_type"] == "LAJ"]
    .drop(columns=['event_location', 'input_belt', 'position'])
    .dropna()
    .astype({'output_belt': int})
    # Total events per sorting center, day and output belt.
    .groupby(['sorting_center_name', 'scanning_date', 'output_belt'], as_index=False)['no_of_events']
    .sum()
    .assign(scanning_date=lambda events: pd.to_datetime(events['scanning_date']))
)

# Insert the (center, date, belt) combinations that had no events at all.
df = fill_missing_events(df)

print("Number of rows cleaned dataset is: " + str(df.shape[0]))

In [None]:
# Data preparation: derive calendar features from the scanning date
scan_dt = df['scanning_date'].dt
calendar_features = {
    'day': scan_dt.day,
    'month': scan_dt.month,
    'weekday': scan_dt.dayofweek + 1,            # 1 = Monday ... 7 = Sunday
    'week': scan_dt.isocalendar().week,          # ISO week number
    'week_of_month': (scan_dt.day - 1) // 7 + 1,  # days 1-7 -> 1, 8-14 -> 2, ...
    'yearday': scan_dt.day_of_year,
}
for feature_name, feature_values in calendar_features.items():
    df[feature_name] = feature_values

# Cyclical encoding of the day of the year.
# NOTE(review): the angle uses a period of 7 days although the columns are
# named yearday_* - confirm that a weekly cycle is what is intended here.
cycle_angle = df['yearday'] / 7 * 2 * np.pi
df['yearday_sin'] = np.sin(cycle_angle)
df['yearday_cos'] = np.cos(cycle_angle)

sorting_center_names = df["sorting_center_name"].unique()
# Row count per sorting center, shown as the cell output.
df["sorting_center_name"].value_counts()


In [None]:
# Spot-check a single (sorting center, output belt) series.
df_KUOPIO_0 = df[(df["sorting_center_name"] == "KUOPIO") & (df["output_belt"] == 0)]
# Leave the frame as the cell's last expression so Jupyter renders it as a
# rich table instead of the plain text produced by print().
df_KUOPIO_0.head()

In [None]:
# Determine hyperparameters per sorting center
# Build an independent frame so the column assignments below never write into
# a slice of `df` (the inplace drop on a filtered frame raised
# SettingWithCopyWarning).
df_vantaa = (
    df.loc[df["sorting_center_name"] == "VANTAA"]
    .drop(columns=["sorting_center_name"])
    .copy()
)

# Box-Cox transform of the event counts; `lam` is the fitted lambda and is
# needed again later to invert the transform.
# NOTE(review): df_vantaa still holds every output belt, so the Box-Cox fit,
# diff() and auto_arima below see rows of different belts back to back -
# confirm that this is the intended tuning series.
df_vantaa["no_of_events_boxcox"], lam = boxcox(df_vantaa["no_of_events"])
df_vantaa["no_of_events_diff"] = df_vantaa["no_of_events_boxcox"].diff()
# diff() leaves NaN in the first row; drop it.
df_vantaa = df_vantaa.dropna()

# Hold out the last 25 % of the rows. An explicit end position is used because
# `iloc[:-0]` would return an empty frame if the hold-out count rounded to 0.
n_holdout = int(len(df_vantaa) * 0.25)
train = df_vantaa.iloc[:len(df_vantaa) - n_holdout]

# TUNE PER SORTING CENTER
model = auto_arima(train["no_of_events_boxcox"],
                   seasonal=False,
                   stepwise=True,
                   suppress_warnings=True,
                   trace=False)

# Non-seasonal ARIMA order reused by the per-belt models.
p, d, q = model.order
print(f"Optimal order: p={p}, d={d}, q={q}")

In [None]:
# Train and test ARIMA per output belt
output_belts = df_vantaa['output_belt'].unique()
for output_belt in output_belts:
    df_output_belt = df_vantaa[df_vantaa["output_belt"] == output_belt]
    n_rows = len(df_output_belt)

    # Positional split: train is the first 75 % of the rows, test is the
    # window between 90 % and 95 %. Explicit positions replace the negative
    # slices, which collapse to an empty frame when a bound rounds to 0.
    train_end = n_rows - int(n_rows * 0.25)
    test_start = n_rows - int(n_rows * 0.10)
    test_end = n_rows - int(n_rows * 0.05)

    train = df_output_belt.iloc[:train_end]
    test = df_output_belt.iloc[test_start:test_end]

    # The Box-Cox lambda (`lam`) and the order (p, d, q) are the ones tuned
    # once per sorting center; no per-belt tuning is done here.
    arima_model = ARIMA(train["no_of_events_boxcox"], order=(p, d, q)).fit()

    # forecast() starts at the first step after the training sample, but the
    # test window begins `gap` rows later. Forecast through the end of the
    # test window and keep only the steps that fall on the test rows, so each
    # forecast is compared with the observation of the same date.
    gap = test_start - train_end
    boxcox_forecast = arima_model.forecast(gap + len(test)).iloc[gap:]
    # Back-transform to the original event-count scale.
    forecasts = inv_boxcox(boxcox_forecast, lam)

    # Only the first output belt is processed for now.
    break

In [None]:
def plot_forecasts(forecast: list[float], title: str) -> None:
    """Plot the training data, the test data and the forecast in one figure.

    Parameters
    ----------
    forecast : list[float]
        Forecast values, one per row of the module-level ``test`` frame; they
        are drawn against ``test["scanning_date"]``.
    title : str
        Figure title.

    Notes
    -----
    Reads the module-level ``train`` and ``test`` frames produced by the
    ARIMA cell (columns ``scanning_date`` and ``no_of_events``), so that cell
    must have been run first.
    """
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=train["scanning_date"], y=train["no_of_events"], name="Train"))
    fig.add_trace(go.Scatter(x=test["scanning_date"], y=test["no_of_events"], name="Test"))
    # Plot the argument (not the global `forecasts`) and give the trace its
    # own legend label instead of a second "Test".
    fig.add_trace(go.Scatter(x=test["scanning_date"], y=forecast, name="Forecast"))
    fig.update_layout(template="simple_white", font=dict(size=18), title_text=title,
                      width=650, title_x=0.5, height=400, xaxis_title="scanning_date",
                      yaxis_title="no_of_events")

    fig.show()


plot_forecasts(forecasts, "ARIMA")