In [2]:
# Load data and packages
from joblib import Parallel, delayed
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import pmdarima as pm
from pmdarima import auto_arima
import os
from scipy.stats import boxcox
from scipy.special import inv_boxcox
from statsmodels.tsa.arima.model import ARIMA

directory_path = os.getcwd() + "\\Data\\sorting_event_volumes_2023.csv"

df = pd.read_csv(directory_path)

In [3]:
def fill_missing_events(df):
    df_filled_list = []
    
    for center in df['sorting_center_name'].unique():
        df_center = df[df['sorting_center_name'] == center]
        output_belts = df_center['output_belt'].unique()
        
        min_date = df_center['scanning_date'].min()
        max_date = df_center['scanning_date'].max()

        all_dates = pd.date_range(start=pd.Timestamp(year=min_date.year, month=1, day=1), end=pd.Timestamp(year=max_date.year, month=max_date.month, day=1) + pd.offsets.MonthEnd(0))
        
        all_combinations = pd.MultiIndex.from_product(
            [[center], all_dates, output_belts],
            names=['sorting_center_name', 'scanning_date', 'output_belt']
        )
        
        all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()
        df_filled_center = pd.merge(all_combinations_df, df_center, 
                                    on=['sorting_center_name', 'scanning_date', 'output_belt'], 
                                    how='left')
        
        df_filled_center['no_of_events'] = df_filled_center['no_of_events'].fillna(0.0001)
        df_filled_list.append(df_filled_center)
    
    df_filled = pd.concat(df_filled_list, ignore_index=True)
    
    return df_filled

In [4]:
# Data cleaning
print("Number of rows original dataset is: " + str(df.shape[0]))

df = df.loc[df["event_type"] == "LAJ", :]
df.drop(['event_location', 'input_belt', 'position'], axis=1, inplace = True)
df.dropna(inplace = True)
df['output_belt'] = df['output_belt'].astype(int)
df = df.groupby(['sorting_center_name', 'scanning_date', 'output_belt'], as_index = False)['no_of_events'].sum()
df['scanning_date'] = pd.to_datetime(df['scanning_date'])
df = fill_missing_events(df)

print("Number of rows cleaned dataset is: " + str(df.shape[0]))

Number of rows original dataset is: 8949721
Number of rows cleaned dataset is: 243090


In [None]:
# Data preparation
df['day'] = df['scanning_date'].dt.day
df['month'] = df['scanning_date'].dt.month
df['weekday'] = df['scanning_date'].dt.dayofweek + 1
df['week'] = df['scanning_date'].dt.isocalendar().week
df['week_of_month'] = (df['day'] - 1) // 7 + 1
df['yearday'] = df['scanning_date'].dt.day_of_year
df['yearday_sin'] = np.sin(df['yearday'] / 7 * 2 * np.pi)
df['yearday_cos'] = np.cos(df['yearday'] / 7 * 2 * np.pi)

sorting_center_names = df["sorting_center_name"].unique()
df["sorting_center_name"].value_counts()


In [9]:
df_KUOPIO_0 = df[(df["sorting_center_name"] == "KUOPIO") & (df["output_belt"] == 0)]
print(df_KUOPIO_0.head())

    sorting_center_name scanning_date  output_belt  no_of_events
0                KUOPIO    2023-01-01            0        0.0001
110              KUOPIO    2023-01-02            0     1439.0000
220              KUOPIO    2023-01-03            0     1109.0000
330              KUOPIO    2023-01-04            0     1073.0000
440              KUOPIO    2023-01-05            0     1034.0000


In [37]:
# Determine hyperparameters per output belt
def defineHyperparameters(df, sortingcenter):
    df_sorting_center = df[df["sorting_center_name"] == sortingcenter]
    df_sorting_center.drop(["sorting_center_name"], axis=1, inplace=True)
    
    hyperparameterList = []
    output_belts = df_sorting_center["output_belt"].unique()
    print(output_belts)
    for output_belt in output_belts:
        df_output_belt = df_sorting_center[df_sorting_center["output_belt"] == output_belt]
        df_output_belt["no_of_events_boxcox"], lam = boxcox(df_output_belt["no_of_events"])
        df_output_belt["no_of_events_diff"] = df_output_belt["no_of_events_boxcox"].diff()
        df_output_belt.dropna(inplace=True)

        train_groupby = df_output_belt.iloc[:-int(len(df_output_belt) * 0.25)]

        # TUNE PER SORTING CENTER
        tuning_model = auto_arima(train_groupby["no_of_events_boxcox"], 
                            seasonal=False, 
                            stepwise=True,  
                            suppress_warnings=True, 
                            trace=False)   

        p, d, q = tuning_model.order
        #print(f"Optimal order {output_belt}: p={p}, d={d}, q={q}")
        hyperparameterList.append([p,d,q])

    return hyperparameterList


In [5]:
def tune_hyperparameters(df_output_belt):
    # Perform Box-Cox transformation
    df_output_belt["no_of_events_boxcox"], lam = boxcox(df_output_belt["no_of_events"])
    df_output_belt["no_of_events_diff"] = df_output_belt["no_of_events_boxcox"].diff()
    df_output_belt.dropna(inplace=True)

    train_groupby = df_output_belt.iloc[:-int(len(df_output_belt) * 0.25)]

    # Tune the model
    tuning_model = auto_arima(train_groupby["no_of_events_boxcox"], 
                               seasonal=False, 
                               stepwise=True,  
                               suppress_warnings=True, 
                               trace=False)

    p, d, q = tuning_model.order
    return (p, d, q)

def defineHyperparameters(df, sortingcenter):
    df_sorting_center = df[df["sorting_center_name"] == sortingcenter]
    df_sorting_center.drop(["sorting_center_name"], axis=1, inplace=True)
    
    hyperparameterList = []
    output_belts = df_sorting_center["output_belt"].unique()
    print(output_belts)

    results = Parallel(n_jobs=-1)(delayed(tune_hyperparameters)(
        df_sorting_center[df_sorting_center["output_belt"] == output_belt]) 
        for output_belt in output_belts)
    
    hyperparameterList.extend(results)

    hyperparameter_df = pd.DataFrame(hyperparameterList, columns=["p", "d", "q"])
    hyperparameter_df.to_csv(f'hyperparameters_{sortingcenter}.csv', index=False)

    return hyperparameterList

In [9]:
hyperparameters_VANTAA = defineHyperparameters(df, "VANTAA")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sorting_center.drop(["sorting_center_name"], axis=1, inplace=True)


[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  88  89  90  91  92  93
  94  95  96  97  98 100 109 306 307 308 309 310 311 312 313 314 317 318
 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336
 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 300 301 303
 305 315 316 302  99 304]


In [10]:
def plot_forecasts(train, test, forecasts, title):
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=train["scanning_date"], y=train["no_of_events"], name="Train"))
    fig.add_trace(go.Scatter(x=test["scanning_date"], y=test["no_of_events"], name="Test"))
    fig.add_trace(go.Scatter(x=test["scanning_date"], y=forecasts, name="Test"))
    fig.update_layout(template="simple_white", font=dict(size=18), title_text=title,
                      width=650, title_x=0.5, height=400, xaxis_title="scanning_date",
                      yaxis_title="no_of_events")
    
    return fig.show()

In [26]:
# Train and test ARIMA per output belt
def ARIMA_model(df, sortingcenter, hyperparameters):
    df_sortingcenter = df[df["sorting_center_name"] == sortingcenter]
    output_belts = df_sortingcenter['output_belt'].unique()

    daily_errors = {}

    for index, output_belt in enumerate(output_belts):
        df_output_belt = df_sortingcenter[df_sortingcenter["output_belt"] == output_belt]
        
        df_output_belt["no_of_events_boxcox"], lam = boxcox(df_output_belt["no_of_events"])
        df_output_belt["no_of_events_diff"] = df_output_belt["no_of_events_boxcox"].diff()
        df_output_belt.dropna(inplace=True)

        train = df_output_belt.iloc[:-int(len(df_output_belt) * 0.25)]
        test = df_output_belt.iloc[-int(len(df_output_belt) * 0.25):-int(len(df_output_belt) * 0.20)]

        p, d, q = hyperparameters[index]

        arima_model = ARIMA(train["no_of_events_boxcox"], order=(p, d, q)).fit()

        boxcox_forecast = arima_model.forecast(len(test))
        forecasts = inv_boxcox(boxcox_forecast, lam)

        #plot_forecasts(train, test, forecasts, "ARIMA")

        for day in range(len(test)):
            actual = test["no_of_events"].iloc[day]
            forecast = forecasts.iloc[day]

            difference = actual - forecast

            if day not in daily_errors:
                daily_errors[day] = []
            
            daily_errors[day].append(difference)

    daily_rmse = {}
    daily_mae = {}

    for day in range(len(daily_errors)):
        rmse = 0
        mae = 0

        for index in range(len(daily_errors[day])):
            rmse += daily_errors[day][index] ** 2
            mae += abs(daily_errors[day][index])

        rmse = np.sqrt(rmse / len(daily_errors[day]))
        mae = mae / len(daily_errors[day])

        daily_rmse[day] = rmse
        daily_mae[day] = mae

    return daily_errors, daily_rmse, daily_mae


In [27]:
hyperparameters_df = pd.read_csv("hyperparameters_VANTAA.csv")
hyperparameterList = [tuple(row) for row in hyperparameters_df.to_numpy()]
print(hyperparameterList)
daily_errors, rmse, mae = ARIMA_model(df, "VANTAA", hyperparameterList)

[(5, 1, 2), (5, 0, 0), (5, 0, 0), (5, 0, 0), (5, 0, 2), (5, 0, 0), (5, 0, 2), (5, 0, 0), (5, 0, 0), (5, 0, 2), (5, 0, 2), (5, 0, 0), (5, 0, 0), (5, 1, 3), (5, 0, 0), (5, 0, 0), (5, 0, 0), (5, 0, 2), (5, 0, 0), (5, 0, 0), (5, 1, 4), (5, 0, 0), (5, 0, 0), (5, 0, 0), (5, 0, 0), (5, 0, 0), (5, 0, 0), (5, 0, 0), (5, 0, 0), (5, 0, 2), (5, 1, 3), (5, 0, 2), (5, 0, 2), (5, 0, 2), (5, 0, 0), (5, 0, 0), (5, 0, 2), (5, 0, 0), (5, 0, 0), (5, 1, 3), (5, 0, 0), (5, 0, 0), (5, 1, 3), (5, 0, 0), (5, 0, 2), (2, 1, 5), (5, 1, 3), (5, 0, 2), (5, 1, 3), (5, 0, 0), (5, 0, 2), (5, 0, 2), (5, 1, 3), (5, 0, 2), (5, 0, 0), (5, 0, 2), (5, 0, 0), (5, 0, 0), (5, 0, 0), (5, 0, 0), (5, 0, 2), (5, 1, 5), (5, 0, 2), (5, 0, 0), (5, 0, 0), (5, 1, 1), (5, 1, 4), (5, 1, 1), (5, 0, 2), (5, 0, 0), (5, 0, 2), (5, 0, 2), (5, 0, 2), (5, 0, 0), (5, 0, 0), (5, 1, 3), (5, 0, 2), (5, 0, 2), (5, 0, 2), (5, 0, 0), (5, 1, 4), (5, 0, 0), (5, 0, 0), (5, 0, 0), (5, 0, 2), (5, 0, 0), (5, 0, 2), (5, 0, 2), (5, 0, 2), (5, 0, 2), (5, 0, 2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_output_belt["no_of_events_boxcox"], lam = boxcox(df_output_belt["no_of_events"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_output_belt["no_of_events_diff"] = df_output_belt["no_of_events_boxcox"].diff()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_output_belt.dropna(inplace=True)
  self._init_dates(dates, freq)
  self._init_dates(dates, f

In [31]:
print(rmse)
print(mae)

{0: 1619.0789478535517, 1: 510.9558752779851, 2: 358.7045976509141, 3: 350.7089655200361, 4: 577.6260966888809, 5: 168.1206395394233, 6: 223.61549535205563, 7: 1518.0949224361002, 8: 1077.9596008653098, 9: 656.8875218804268, 10: 574.8678816169634, 11: 724.766124602515, 12: 149.96108716221337, 13: 426.13272635543575, 14: 1515.1766315041575, 15: 1377.732626505471, 16: 844.1386005135879, 17: 553.4179021998362, 18: 734.2128396369158}
{0: 766.1421116685465, 1: 307.65909540422666, 2: 183.91505804434067, 3: 215.97915087083192, 4: 348.8937560873525, 5: 108.52164710681684, 6: 146.47663006489216, 7: 799.1106334390777, 8: 580.4876034021157, 9: 377.82736988097076, 10: 308.730311935895, 11: 377.23273047211995, 12: 103.35749659270397, 13: 279.42773276603714, 14: 767.4728589786017, 15: 750.9841242903761, 16: 493.4318465400393, 17: 341.5463455958298, 18: 379.8216159079348}
