In [1]:
# Load data and packages
import csv
from joblib import Parallel, delayed
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import pmdarima as pm
from pmdarima import auto_arima
import os
from scipy.stats import boxcox
from scipy.special import inv_boxcox
from statsmodels.tsa.arima.model import ARIMA

directory_path = os.getcwd() + "\\Data\\sorting_event_volumes_2023.csv"

df = pd.read_csv(directory_path)

In [2]:
def fill_missing_events(df):
    df_filled_list = []
    
    for center in df['sorting_center_name'].unique():
        df_center = df[df['sorting_center_name'] == center]
        output_belts = df_center['output_belt'].unique()
        
        min_date = df_center['scanning_date'].min()
        max_date = df_center['scanning_date'].max()

        all_dates = pd.date_range(start=pd.Timestamp(year=min_date.year, month=1, day=1), end=pd.Timestamp(year=max_date.year, month=max_date.month, day=1) + pd.offsets.MonthEnd(0))
        
        all_combinations = pd.MultiIndex.from_product(
            [[center], all_dates, output_belts],
            names=['sorting_center_name', 'scanning_date', 'output_belt']
        )
        
        all_combinations_df = pd.DataFrame(index=all_combinations).reset_index()
        df_filled_center = pd.merge(all_combinations_df, df_center, 
                                    on=['sorting_center_name', 'scanning_date', 'output_belt'], 
                                    how='left')
        
        df_filled_center['no_of_events'] = df_filled_center['no_of_events'].fillna(0.0001)
        df_filled_list.append(df_filled_center)
    
    df_filled = pd.concat(df_filled_list, ignore_index=True)
    
    return df_filled

In [4]:
# Data cleaning
print("Number of rows original dataset is: " + str(df.shape[0]))

df = df.loc[df["event_type"] == "LAJ", :]
df.drop(['event_location', 'input_belt', 'position'], axis=1, inplace = True)
df.dropna(inplace = True)
df['output_belt'] = df['output_belt'].astype(int)
df = df.groupby(['sorting_center_name', 'scanning_date', 'output_belt'], as_index = False)['no_of_events'].sum()
df['scanning_date'] = pd.to_datetime(df['scanning_date'])
df = fill_missing_events(df)

print("Number of rows cleaned dataset is: " + str(df.shape[0]))

Number of rows original dataset is: 8949721


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(['event_location', 'input_belt', 'position'], axis=1, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['output_belt'] = df['output_belt'].astype(int)


Number of rows cleaned dataset is: 243090


In [None]:
# Data preparation
df['day'] = df['scanning_date'].dt.day
df['month'] = df['scanning_date'].dt.month
df['weekday'] = df['scanning_date'].dt.dayofweek + 1
df['week'] = df['scanning_date'].dt.isocalendar().week
df['week_of_month'] = (df['day'] - 1) // 7 + 1
df['yearday'] = df['scanning_date'].dt.day_of_year
df['yearday_sin'] = np.sin(df['yearday'] / 7 * 2 * np.pi)
df['yearday_cos'] = np.cos(df['yearday'] / 7 * 2 * np.pi)

sorting_center_names = df["sorting_center_name"].unique()
df["sorting_center_name"].value_counts()


In [5]:
def tune_hyperparameters(df_output_belt):
    # Perform Box-Cox transformation
    df_output_belt["no_of_events_boxcox"], lam = boxcox(df_output_belt["no_of_events"])
    df_output_belt["no_of_events_diff"] = df_output_belt["no_of_events_boxcox"].diff()
    df_output_belt.dropna(inplace=True)

    train_groupby = df_output_belt.iloc[:-int(len(df_output_belt) * 0.25)]

    # Tune the model
    tuning_model = auto_arima(train_groupby["no_of_events_boxcox"], 
                               seasonal=False, 
                               stepwise=True,  
                               suppress_warnings=True, 
                               trace=False)

    p, d, q = tuning_model.order
    return (p, d, q)

def defineHyperparameters(df, sortingcenter):
    df_sorting_center = df[df["sorting_center_name"] == sortingcenter]
    df_sorting_center.drop(["sorting_center_name"], axis=1, inplace=True)
    
    hyperparameterList = []
    output_belts = df_sorting_center["output_belt"].unique()

    results = Parallel(n_jobs=-1)(delayed(tune_hyperparameters)(
        df_sorting_center[df_sorting_center["output_belt"] == output_belt]) 
        for output_belt in output_belts)
    
    hyperparameterList.extend(results)

    hyperparameter_df = pd.DataFrame(hyperparameterList, columns=["p", "d", "q"])
    hyperparameter_df.to_csv(f'hyperparameters_ARIMA_{sortingcenter}.csv', index=False)

    return hyperparameterList

In [6]:
sorting_center_names = df["sorting_center_name"].unique()

for sorting_center_name in sorting_center_names:
    defineHyperparameters(df, sorting_center_name)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sorting_center.drop(["sorting_center_name"], axis=1, inplace=True)


[  0 601 602 603 604 605 606 607 621 622 623 624 631 632 633 634 635 636
 637 638 639 640 641 642 643 701 702 703 704 705 706 707 711 721 722 723
 724 731 732 733 734 735 736 737 738 739 740 741 742 751 801 802 803 804
 805 807 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824
 825 826 827 828 829 830 901 902 903 904 905 906 907 908 909 910 911 912
 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930
 806 808]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sorting_center.drop(["sorting_center_name"], axis=1, inplace=True)


[  0 101 102 103 104 105 107 108 109 110 111 112 113 114 115 116 117 118
 119 120 121 122 123 124 125 126 127 128 129 130 131 132 201 202 203 204
 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222
 223 224 225 226 227 228 229 230 231 232 301 303 304 305 306 307 308 309
 310 311 312 313 314 402 403 404 406 407 408 409 410 411 412 413 501 502
 503 504 505 506 507 508 509 510 601 602 603 604 605 606 607 608 609 610
 611 612 106 302 401 405]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sorting_center.drop(["sorting_center_name"], axis=1, inplace=True)


[  0 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647
 648 649 671 681 701 702 703 704 705 706 707 709 710 711 751 752 753 754
 755 756 757 759 760 801 802 803 804 805 806 807 808 809 810 851 852 853
 854 856 857 858 859 901 902 903 904 905 906 907 908 951 952 953 954 955
 956 957 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616
 617 618 619 620 621 622 623 624 625 650 651 652 653 654 655 708 758 855
 958]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sorting_center.drop(["sorting_center_name"], axis=1, inplace=True)


[ 0  1  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sorting_center.drop(["sorting_center_name"], axis=1, inplace=True)


[  0 601 605 606 622 623 624 625 626 627 628 629 630 631 632 633 634 635
 636 642 651 652 653 654 655 656 661 675 676 678 680 689 691 703 704 705
 707 708 709 710 711 712 713 714 715 716 718 719 720 721 722 723 726 727
 767 768 771 772 773 774 775 776 801 803 804 805 821 822 823 824 825 826
 827 828 829 851 852 853 854 855 871 873 875 877 879 901 902 903 904 905
 906 907 908 909 910 951 952 953 954 955 956 957 958 959 960 604 611 621
 637 638 639 640 641 673 674 677 679 681 682 686 687 688 690 692 701 702
 706 717 728 751 752 754 755 756 757 758 759 764 765 766 770 777 872 874
 876 878 753 769 724 971 725]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sorting_center.drop(["sorting_center_name"], axis=1, inplace=True)


[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  88  89  90  91  92  93
  94  95  96  97  98 100 109 306 307 308 309 310 311 312 313 314 317 318
 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336
 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 300 301 303
 305 315 316 302  99 304]


In [6]:
def plot_forecasts(train, test, forecasts, title):
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=train["scanning_date"], y=train["no_of_events"], name="Train"))
    fig.add_trace(go.Scatter(x=test["scanning_date"], y=test["no_of_events"], name="Test"))
    fig.add_trace(go.Scatter(x=test["scanning_date"], y=forecasts, name="Test"))
    fig.update_layout(template="simple_white", font=dict(size=18), title_text=title,
                      width=650, title_x=0.5, height=400, xaxis_title="scanning_date",
                      yaxis_title="no_of_events")
    
    return fig.show()

In [32]:
# Test ARIMA per output belt
def ARIMA_model(df, sortingcenter, hyperparameters):
    df_sortingcenter = df[df["sorting_center_name"] == sortingcenter]
    output_belts = df_sortingcenter['output_belt'].unique()

    daily_errors = {}
    daily_mse = {}
    daily_mae = {}

    for index, output_belt in enumerate(output_belts):
        df_output_belt = df_sortingcenter[df_sortingcenter["output_belt"] == output_belt]
        
        df_output_belt["no_of_events_boxcox"], lam = boxcox(df_output_belt["no_of_events"])
        df_output_belt["no_of_events_diff"] = df_output_belt["no_of_events_boxcox"].diff()
        df_output_belt.dropna(inplace=True)

        train = df_output_belt.iloc[:-int(len(df_output_belt) * 0.25)]
        test = df_output_belt.iloc[-int(len(df_output_belt) * 0.25):-int(len(df_output_belt) * 0.20)]

        p, d, q = hyperparameters[index]

        arima_model = ARIMA(train["no_of_events_boxcox"], order=(p, d, q)).fit()

        boxcox_forecast = arima_model.forecast(len(test))
        forecasts = inv_boxcox(boxcox_forecast, lam)

        #plot_forecasts(train, test, forecasts, "ARIMA")

        for day in range(len(test)):
            actual = test.iloc[day]["no_of_events"]
            forecast = forecasts.iloc[day]

            squared_difference = (actual - forecast) ** 2
            absolute_difference = abs(actual - forecast)

            if day not in daily_errors:
                daily_errors[day] = {}
                daily_errors[day]["mse"] = []
                daily_errors[day]["mae"] = []
            
            daily_errors[day]["mse"].append(squared_difference)
            daily_errors[day]["mae"].append(absolute_difference)

    for day in range(len(daily_errors)):
        mse = sum(daily_errors[day]["mse"]) / len(daily_errors[day]["mse"])
        mae = sum(daily_errors[day]["mae"]) / len(daily_errors[day]["mae"])
        daily_mse[day] = mse
        daily_mae[day] = mae

    return daily_errors, daily_mse, daily_mae


In [33]:
sorting_center_names = df["sorting_center_name"].unique()
MSE_dict = {}
VSE_dict = {}
MAE_dict = {}
daily_errors = {}

for sorting_center_name in sorting_center_names:
    hyperparameters_df = pd.read_csv("Data/hyperparameters ARIMA/hyperparameters_ARIMA_{}.csv".format(sorting_center_name))
    hyperparameterList = [tuple(row) for row in hyperparameters_df.to_numpy()]
    daily_errors_sorting_center, mse, mae = ARIMA_model(df, sorting_center_name, hyperparameterList)

    daily_errors[sorting_center_name] = daily_errors_sorting_center
    MSE_dict[sorting_center_name] = sum(mse.values()) / len(mse)
    VSE_dict[sorting_center_name] = np.var(list(mse.values()), ddof=1)
    MAE_dict[sorting_center_name] = sum(mae.values()) / len(mae)
    
    print(MSE_dict[sorting_center_name], VSE_dict[sorting_center_name], MAE_dict[sorting_center_name])

daily_mse = {}
daily_mae = {}

for day in range(len(daily_errors)):
    mse = 0
    mae = 0
    n_output_belts = 0
    for sorting_center_name in sorting_center_names:
        mse += sum(daily_errors[sorting_center_name][day]["mse"])
        mae += sum(daily_errors[sorting_center_name][day]["mae"])
        n_output_belts += len(daily_errors[sorting_center_name][day]["mse"])
    daily_mse[day] = mse / n_output_belts
    daily_mae[day] = mae / n_output_belts

print(daily_mse)

MSE_dict["total"] = sum(daily_mse.values()) / len(daily_mse)
VSE_dict["total"] = np.var(list(daily_mse.values()), ddof=1)
MAE_dict["total"] = sum(daily_mae.values()) / len(daily_mae)

with open("Results/results_ARIMA.csv", mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Sorting center', 'MSE', 'VSE', 'MAE'])
    
    for key in MSE_dict.keys():
        writer.writerow([key, MSE_dict[key], VSE_dict[key], MAE_dict[key]])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_output_belt["no_of_events_boxcox"], lam = boxcox(df_output_belt["no_of_events"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_output_belt["no_of_events_diff"] = df_output_belt["no_of_events_boxcox"].diff()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_output_belt.dropna(inplace=True)
  self._init_dates(dates, freq)
  self._init_dates(dates, f

35359.82806105615 682010613.2648818 123.81411689189967


  return get_prediction_index(
  return get_prediction_index(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_output_belt["no_of_events_boxcox"], lam = boxcox(df_output_belt["no_of_events"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_output_belt["no_of_events_diff"] = df_output_belt["no_of_events_boxcox"].diff()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_output_belt.dropna(inplace=Tru

37032.88678920003 1314014150.1380804 124.66102417922173


  self._init_dates(dates, freq)
  warn('Non-invertible starting MA parameters found.'
  return get_prediction_index(
  return get_prediction_index(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_output_belt["no_of_events_boxcox"], lam = boxcox(df_output_belt["no_of_events"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_output_belt["no_of_events_diff"] = df_output_belt["no_of_events_boxcox"].diff()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_

26644.323920322237 722375818.8537242 101.04815812231425


  return get_prediction_index(
  return get_prediction_index(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_output_belt["no_of_events_boxcox"], lam = boxcox(df_output_belt["no_of_events"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_output_belt["no_of_events_diff"] = df_output_belt["no_of_events_boxcox"].diff()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_output_belt.dropna(inplace=Tru

163019.43868331477 28638924687.91668 282.3833619679473


  return get_prediction_index(
  return get_prediction_index(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_output_belt["no_of_events_boxcox"], lam = boxcox(df_output_belt["no_of_events"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_output_belt["no_of_events_diff"] = df_output_belt["no_of_events_boxcox"].diff()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_output_belt.dropna(inplace=Tru

54031.593274926054 3241027535.4810185 123.01671166633479


  return get_prediction_index(
  return get_prediction_index(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_output_belt["no_of_events_boxcox"], lam = boxcox(df_output_belt["no_of_events"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_output_belt["no_of_events_diff"] = df_output_belt["no_of_events_boxcox"].diff()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_output_belt.dropna(inplace=Tru

750307.5092311929 745663505123.3707 401.94832205519526
{0: 649145.0463585601, 1: 77028.30718830753, 2: 45348.42677881614, 3: 86668.74294428015, 4: 109114.07615423444, 5: 7479.028558088205}


  return get_prediction_index(
  return get_prediction_index(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_output_belt["no_of_events_boxcox"], lam = boxcox(df_output_belt["no_of_events"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_output_belt["no_of_events_diff"] = df_output_belt["no_of_events_boxcox"].diff()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_output_belt.dropna(inplace=Tru

In [None]:
df = pd.read_csv(os.getcwd() + "\\Data\\sorting_event_volumes_2023.csv")

days = 14
output_belts = df["output_belt"].unique()
number_of_output_belts = len(output_belts)
daily_errors = {}
daily_mse = {}

# For each output belt make forecasts
for output_belt in output_belts:
    test = pd.Dataframe()
    forecast = pd.DataFrame()

    # For each day, calculate the squared deviation
    for day in range(days):
        actual = test.iloc[day]
        forecast = forecast.iloc[day]

        squared_deviation = (actual - forecast) ** 2

        # If day not in directory, add empty list
        if day not in daily_errors:
            daily_errors[day] = []
            
        daily_errors[day].append(squared_deviation)

# Calculate for each day the average squared deviation
for day in range(days):
    se = sum(daily_errors[day].values()) / number_of_output_belts
    daily_mse[day] = se

MSE = sum(daily_mse.values()) / days
VSE = np.var(list(daily_mse.values()), ddof=1)