In [163]:
import numpy as np
import os
import pandas as pd
import plotly.express as px
import warnings

from colorama import Fore, Style
from IPython.display import display
from pandas import DataFrame
from plotly.graph_objs import Figure
from skimpy import skim
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder
from typing import Dict, List

# NOTE(review): this suppresses every warning category for the whole kernel
# session, so deprecation notices raised by later cells are hidden as well.
warnings.filterwarnings("ignore")

In [164]:
# List every sub-directory and file found under the resources folder.
# The tree is walked once, and the loop variables avoid the name `dir` so the
# `dir` builtin is not rebound at notebook scope for the cells that follow.
RESOURCES_ROOT: str = "./Resources"

found_dirs: List[str] = []
found_files: List[str] = []
for root, dir_names, file_names in os.walk(RESOURCES_ROOT):
    found_dirs.extend(os.path.join(root, name) for name in dir_names)
    found_files.extend(os.path.join(root, name) for name in file_names)

print(f"{Style.BRIGHT}{Fore.BLUE}DIRECTORIES:")
for path in found_dirs:
    print(f"{Style.BRIGHT}{Fore.GREEN}    {path}")

print(f"{Style.BRIGHT}{Fore.BLUE}FILES:")
for path in found_files:
    print(f"{Style.BRIGHT}{Fore.GREEN}    {path}")

[1m[34mDIRECTORIES:
[1m[32m    ./Resources/rohlik-orders-forecasting-challenge
[1m[34mFILES:
[1m[32m    ./Resources/rohlik-orders-forecasting-challenge/test_calendar.csv
[1m[32m    ./Resources/rohlik-orders-forecasting-challenge/train.csv
[1m[32m    ./Resources/rohlik-orders-forecasting-challenge/train_calendar.csv
[1m[32m    ./Resources/rohlik-orders-forecasting-challenge/test.csv
[1m[32m    ./Resources/rohlik-orders-forecasting-challenge/solution_example.csv


In [165]:
# Load data.
# Every competition file lives in the same directory; the location and the
# date format are kept in one place so they only have to be edited once.
DATA_DIR: str = "./Resources/rohlik-orders-forecasting-challenge"
DATE_FORMAT: str = "%Y-%m-%d"

# Only the first 17 columns of train.csv are read (warehouse ... user_activity_2).
# NOTE(review): any column after those is skipped on purpose - confirm which.
train: DataFrame = pd.read_csv(
    os.path.join(DATA_DIR, "train.csv"),
    usecols=list(range(17)),
    parse_dates=[1],  # column 1 is `date`
    date_format=DATE_FORMAT,
)
# NOTE(review): column 0 of the calendar files is presumably the date - confirm.
train_calendar: DataFrame = pd.read_csv(
    os.path.join(DATA_DIR, "train_calendar.csv"),
    parse_dates=[0],
    date_format=DATE_FORMAT,
)
# Only the first 7 columns of test.csv are read (warehouse ... school_holidays).
test: DataFrame = pd.read_csv(
    os.path.join(DATA_DIR, "test.csv"),
    usecols=list(range(7)),
    parse_dates=[1],  # column 1 is `date`
    date_format=DATE_FORMAT,
)
test_calendar: DataFrame = pd.read_csv(
    os.path.join(DATA_DIR, "test_calendar.csv"),
    parse_dates=[0],
    date_format=DATE_FORMAT,
)
sub: DataFrame = pd.read_csv(os.path.join(DATA_DIR, "solution_example.csv"))

# Descriptive Data Analysis

In [166]:
# Summary statistics followed by the first rows of the training frame.
display(train.describe(), train.head())

Unnamed: 0,date,orders,holiday,shutdown,mini_shutdown,shops_closed,winter_school_holidays,school_holidays,blackout,mov_change,frankfurt_shutdown,precipitation,snow,user_activity_1,user_activity_2
count,7340,7340.0,7340.0,7340.0,7340.0,7340.0,7340.0,7340.0,7340.0,7340.0,7340.0,7070.0,7070.0,7340.0,7340.0
mean,2022-08-27 18:17:27.629427968,5535.443869,0.027248,0.000136,0.000545,0.010899,0.029973,0.007084,0.000954,0.221499,0.000272,1.621126,0.290112,1633.545777,23477.031335
min,2020-12-05 00:00:00,790.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,482.0,1797.0
25%,2021-11-23 00:00:00,4434.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,945.0,18509.0
50%,2022-09-08 00:00:00,5370.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1349.0,22267.0
75%,2023-06-13 06:00:00,7009.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2308.0,30028.75
max,2024-03-15 00:00:00,18139.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,80.0,19.84,3437.0,45769.0
std,,2182.680279,0.162816,0.011672,0.02334,0.103836,0.170524,0.083876,0.030869,0.410293,0.016506,4.542875,1.546102,784.167637,8934.848328


Unnamed: 0,warehouse,date,orders,holiday_name,holiday,shutdown,mini_shutdown,shops_closed,winter_school_holidays,school_holidays,blackout,mov_change,frankfurt_shutdown,precipitation,snow,user_activity_1,user_activity_2
0,Prague_1,2020-12-05,6895.0,,0,0,0,0,0,0,0,0.0,0,0.0,0.0,1722.0,32575.0
1,Prague_1,2020-12-06,6584.0,,0,0,0,0,0,0,0,0.0,0,0.0,0.0,1688.0,32507.0
2,Prague_1,2020-12-07,7030.0,,0,0,0,0,0,0,0,0.0,0,0.0,0.0,1696.0,32552.0
3,Prague_1,2020-12-08,6550.0,,0,0,0,0,0,0,0,0.0,0,0.8,0.0,1681.0,32423.0
4,Prague_1,2020-12-09,6910.0,,0,0,0,0,0,0,0,0.0,0,0.5,0.0,1704.0,32410.0


In [167]:
# Summary statistics followed by the first rows of the test frame.
display(test.describe(), test.head())

Unnamed: 0,date,holiday,shops_closed,winter_school_holidays,school_holidays
count,397,397.0,397.0,397.0,397.0
mean,2024-04-15 01:48:48.967254528,0.06801,0.020151,0.020151,0.070529
min,2024-03-16 00:00:00,0.0,0.0,0.0,0.0
25%,2024-03-31 00:00:00,0.0,0.0,0.0,0.0
50%,2024-04-15 00:00:00,0.0,0.0,0.0,0.0
75%,2024-04-30 00:00:00,0.0,0.0,0.0,0.0
max,2024-05-15 00:00:00,1.0,1.0,1.0,1.0
std,,0.252081,0.140694,0.140694,0.256359


Unnamed: 0,warehouse,date,holiday_name,holiday,shops_closed,winter_school_holidays,school_holidays
0,Prague_1,2024-03-16,,0,0,0,0
1,Prague_1,2024-03-17,,0,0,0,0
2,Prague_1,2024-03-18,,0,0,0,0
3,Prague_1,2024-03-19,,0,0,0,0
4,Prague_1,2024-03-20,,0,0,0,0


In [168]:
# `train` is read without `index_col`, so it carries a default integer index;
# resetting it would only add a meaningless `index` column to the report.
skim(train)

In [169]:
# `test` is read without `index_col`, so it carries a default integer index;
# resetting it would only add a meaningless `index` column to the report.
skim(test)

# Exploratory Data Analysis

In [170]:
# Orders Over Time by Warehouse
# `warehouse` and `date` are ordinary columns of `train` (no index_col is set
# when loading), so the frame is passed as-is rather than through a
# reset_index() copy that only adds an unused `index` column.
fig: Figure = px.line(
    data_frame=train,
    x="date",
    y="orders",
    line_group="warehouse",
    color="warehouse",
    title="Orders Over Time by Warehouse",
    labels={"date": "Date", "orders": "Number of Orders", "warehouse": "Warehouse"},
    template="plotly_dark",
)
fig.show()

In [171]:
# Distribution of Orders by Warehouse
# One horizontal violin per warehouse with an embedded box plot; only outlier
# points are drawn. `train` is passed directly: it has a default index, so a
# reset_index() copy would only add an unused `index` column.
fig = px.violin(
    data_frame=train,
    x="orders",
    y="warehouse",
    color="warehouse",
    labels={"warehouse": "Warehouse", "orders": "Number of Orders"},
    orientation="h",
    points="outliers",
    box=True,
    title="Distribution of Orders by Warehouse",
    template="plotly_dark",
)
fig.show()

# Preprocessing

In [172]:

# Column groups used by `transform_columns` to pick a dtype for each column.

# Cast to 32-bit integers. NOTE: "orders" is intentionally not listed here.
DISCRETE_FEATURES: List[str] = [
    "holiday", "shutdown", "mini_shutdown", "shops_closed",
    "winter_school_holidays", "school_holidays", "blackout", "mov_change",
    "frankfurt_shutdown", "user_activity_1", "user_activity_2",
]
# Cast to 16-bit floats.
CONTINUOUS_FEATURES: List[str] = ["precipitation", "snow"]
# Cast to pandas categoricals and then replaced by integer codes.
CATEGORICAL_FEATURES: List[str] = ["warehouse", "holiday_name"]
# Calendar features derived from the `date` column.
DATETIME_FEATURES: List[str] = [
    "year", "month", "day", "day_of_week", "day_of_year", "quarter",
]

# Warehouse names; the position of a name in the list is its integer code.
WAREHOUSES: List[str] = [
    "Brno_1", "Budapest_1", "Frankfurt_1", "Munich_1",
    "Prague_1", "Prague_2", "Prague_3",
]
WAREHOUSE_ENCODINGS: Dict[str, int] = dict(zip(WAREHOUSES, range(len(WAREHOUSES))))

# Holiday names; the position of a name in the list is its integer code.
# "None" (code 0) stands for "no holiday".
HOLIDAYS: List[str] = [
    "None",
    "1848 Revolution Memorial Day (Extra holiday)",
    "1st Christmas Day",
    "2nd Christmas Day",
    "All Saints Day",
    "All Saints' Day Holiday",
    "Ascension day",
    "Assumption of the Virgin Mary",
    "Christmas Eve",
    "Christmas Holiday",
    "Cyril and Methodius' Day",
    "Czech Statehood Day",
    "Day of National Unity",
    "Day of Struggle for Freedom and Democracy",
    "Easter Monday",
    "Epiphany",
    "Feast of Corpus Christi",
    "German Unity Day",
    "Good Friday",
    "Hungary National Day Holiday",
    "Independent Czechoslovak State Day",
    "Independent Hungary Day",
    "International womens day",
    "Jan Hus Day",
    "Labour Day",
    "Liberation Day",
    "Memorial Day for the Martyrs of Arad",
    "Memorial Day for the Victims of the Communist Dictatorships",
    "Memorial Day for the Victims of the Holocaust",
    "Memorial Day of the Republic",
    "Memorial day of the 1956 Revolution",
    "National Defense Day",
    "New Years Day",
    "Peace Festival in Augsburg",
    "Reformation Day",
    "State Foundation Day",
    "Whit monday",
    "Whit sunday",
]
HOLIDAY_ENCODINGS: Dict[str, int] = dict(zip(HOLIDAYS, range(len(HOLIDAYS))))

def types_of_holidays(
    train: pd.DataFrame,
    train_calendar: pd.DataFrame,
    test_calendar: pd.DataFrame
) -> np.ndarray[str]:
    """Collect the distinct holiday names that occur in any of the three frames.

    Missing values in ``holiday_name`` are ignored. The result is the sorted,
    de-duplicated array produced by ``np.unique``.
    """
    per_frame_names = [
        frame["holiday_name"].dropna().unique()
        for frame in (train, train_calendar, test_calendar)
    ]
    all_names = np.concatenate(per_frame_names)
    return np.unique(all_names)


# Alternative spellings found in ``holiday_name`` -> the English name used instead.
_HOLIDAY_TRANSLATIONS: Dict[str, str] = {
    "Corpus Christi": "Feast of Corpus Christi",
    "Cyrila a Metodej": "Cyril and Methodius' Day",
    "Den boje za svobodu a demokracii": "Day of Struggle for Freedom and Democracy",
    "Den ceske statnost": "Czech Statehood Day",
    "Den ceske statnosti": "Czech Statehood Day",
    "Den osvobozeni": "Liberation Day",
    "Den vzniku samostatneho ceskoslovenskeho statu": "Independent Czechoslovak State Day",
    "Jan Hus": "Jan Hus Day",
}


def translate_holidays_to_english(df: pd.DataFrame) -> None:
    """Rewrite known alternative holiday names to their English form, in place.

    Only exact matches in the ``holiday_name`` column are replaced; every other
    value, including missing ones, is left untouched. Nothing is returned.
    """
    for original_name, english_name in _HOLIDAY_TRANSLATIONS.items():
        matches = df["holiday_name"] == original_name
        df.loc[matches, "holiday_name"] = english_name
    
    
def transform_columns(df: pd.DataFrame) -> None:
    """Clean, re-type and encode a raw frame, modifying ``df`` in place.

    Steps, in order:
      1. Normalise holiday names and replace missing ones with ``"None"``.
      2. Cast columns by group: ``DISCRETE_FEATURES`` to int32,
         ``CONTINUOUS_FEATURES`` to float16, ``CATEGORICAL_FEATURES`` to
         pandas categoricals. Columns in none of the groups are left alone.
      3. Replace ``warehouse`` and ``holiday_name`` by their integer codes.
      4. Derive calendar features from ``date`` and drop ``date``.

    Because ``date`` is removed, the function cannot be applied a second time
    to the same frame.

    Args:
        df: Frame with at least ``warehouse``, ``holiday_name`` and a
            datetime-typed ``date`` column.

    Returns:
        None. All changes are made to ``df`` itself.
    """
    translate_holidays_to_english(df)
    # Assign the filled column back. Calling fillna(inplace=True) on the
    # selected column is chained assignment, which does not update `df` when
    # pandas Copy-on-Write is active.
    df["holiday_name"] = df["holiday_name"].fillna("None")

    # The calendar columns in DATETIME_FEATURES are created further down, so
    # they never occur in this loop and need no branch here.
    for column in df.columns:
        if column in DISCRETE_FEATURES:
            df[column] = df[column].astype(np.int32)
        elif column in CONTINUOUS_FEATURES:
            df[column] = df[column].astype(np.float16)
        elif column in CATEGORICAL_FEATURES:
            df[column] = df[column].astype("category")

    # NOTE(review): a name missing from the encoding dictionaries maps to NaN
    # here rather than raising - confirm the dictionaries cover the data.
    df["warehouse"] = df["warehouse"].map(WAREHOUSE_ENCODINGS)
    df["holiday_name"] = df["holiday_name"].map(HOLIDAY_ENCODINGS)

    date_parts = df["date"].dt
    df["year"] = date_parts.year
    df["month"] = date_parts.month
    df["day"] = date_parts.day
    df["day_of_week"] = date_parts.day_of_week
    df["day_of_year"] = date_parts.day_of_year
    df["quarter"] = date_parts.quarter

    # The raw timestamp is fully represented by the derived columns above.
    df.drop(columns=["date"], inplace=True)
    
    
def cols_with_nan(df: pd.DataFrame) -> pd.Series:
    """Count missing values per column, keeping only columns that have any.

    Returns a Series indexed by column name; it is empty when ``df`` has no
    missing values.
    """
    missing_per_column: pd.Series = df.isna().sum()
    has_missing = missing_per_column.gt(0)
    return missing_per_column.loc[has_missing]


def impute_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """Fill missing values with scikit-learn's ``IterativeImputer``.

    The imputer is fitted on ``df`` itself, using every column, and the
    completed matrix is returned as a new frame with the same column labels
    and index. ``df`` is not modified.

    The returned frame is built from the array the imputer produces, so the
    original column dtypes are not carried over.

    Args:
        df: Fully numeric frame, possibly containing NaN.

    Returns:
        A new DataFrame with the imputed values.
    """
    imputer: IterativeImputer = IterativeImputer(
        sample_posterior=True,
        max_iter=20,
        verbose=2,
        random_state=42,
    )

    # fit() already runs the whole imputation internally; fit_transform()
    # returns that result directly, avoiding a second pass via transform().
    imputed_values = imputer.fit_transform(df)

    return pd.DataFrame(imputed_values, columns=df.columns, index=df.index)

In [173]:
# Normalise the holiday names of both calendar frames (modified in place).
translate_holidays_to_english(train_calendar)
translate_holidays_to_english(test_calendar)

# NOTE: transform_columns mutates its argument and drops the `date` column,
# and `train` / `test` are then rebound to the imputed frames, so this cell
# cannot be re-run without first re-running the data-loading cell.
transform_columns(train)
train = impute_missing_values(train)
# Expected to show an empty Series once imputation has filled every gap.
display(cols_with_nan(train))

# The test frame is transformed and imputed separately from the training frame.
transform_columns(test)
test = impute_missing_values(test)
display(cols_with_nan(test))

[IterativeImputer] Completing matrix with shape (7340, 22)
[IterativeImputer] Ending imputation round 1/20, elapsed time 3.37
[IterativeImputer] Ending imputation round 2/20, elapsed time 7.16
[IterativeImputer] Ending imputation round 3/20, elapsed time 11.24
[IterativeImputer] Ending imputation round 4/20, elapsed time 14.84
[IterativeImputer] Ending imputation round 5/20, elapsed time 17.50
[IterativeImputer] Ending imputation round 6/20, elapsed time 20.76
[IterativeImputer] Ending imputation round 7/20, elapsed time 23.80
[IterativeImputer] Ending imputation round 8/20, elapsed time 28.06
[IterativeImputer] Ending imputation round 9/20, elapsed time 33.04
[IterativeImputer] Ending imputation round 10/20, elapsed time 38.21
[IterativeImputer] Ending imputation round 11/20, elapsed time 43.58
[IterativeImputer] Ending imputation round 12/20, elapsed time 48.58
[IterativeImputer] Ending imputation round 13/20, elapsed time 52.47
[IterativeImputer] Ending imputation round 14/20, elaps

Series([], dtype: int64)

[IterativeImputer] Completing matrix with shape (397, 12)
[IterativeImputer] Ending imputation round 1/20, elapsed time 0.03
[IterativeImputer] Ending imputation round 2/20, elapsed time 0.05
[IterativeImputer] Ending imputation round 3/20, elapsed time 0.08
[IterativeImputer] Ending imputation round 4/20, elapsed time 0.10
[IterativeImputer] Ending imputation round 5/20, elapsed time 0.12
[IterativeImputer] Ending imputation round 6/20, elapsed time 0.14
[IterativeImputer] Ending imputation round 7/20, elapsed time 0.16
[IterativeImputer] Ending imputation round 8/20, elapsed time 0.18
[IterativeImputer] Ending imputation round 9/20, elapsed time 0.19
[IterativeImputer] Ending imputation round 10/20, elapsed time 0.21
[IterativeImputer] Ending imputation round 11/20, elapsed time 0.24
[IterativeImputer] Ending imputation round 12/20, elapsed time 0.26
[IterativeImputer] Ending imputation round 13/20, elapsed time 0.28
[IterativeImputer] Ending imputation round 14/20, elapsed time 0.30

Series([], dtype: int64)

# Model Training