In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from core.feature_engineering import FE
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

import lightgbm as lgb

## Hard-coded Variables

In [2]:
# Run-mode switch read by the train/test split cell:
#   True  -> the model is fitted on every row left in `full_data`
#            (the evaluation rows are then part of the training data);
#   False -> only rows dated on or before 2024-01-01 are used for fitting.
# In both modes the rows dated after 2024-01-01 form the evaluation set.
TO_SUBMIT = True

## Read Data

In [3]:
def _read_sorted_by_date(csv_path):
    """Read a CSV, parse its ``date`` column as datetimes and sort the rows by date."""
    frame = pd.read_csv(csv_path, parse_dates=["date"])
    return frame.sort_values(by="date")


# The three dated inputs are all loaded the same way: parsed and put in date order.
train = _read_sorted_by_date("./data/raw_data/train.csv")
test = _read_sorted_by_date("./data/raw_data/test_calendar.csv")
test_dates = _read_sorted_by_date("./data/raw_data/test.csv")

# The example submission is read as-is (no date parsing, no sorting).
submission = pd.read_csv("./data/raw_data/solution_example.csv")

In [4]:
# Every column of the test frame plus the target; only referenced by the
# optional column filter that is left disabled below.
modelling_cols = [*test.columns, "orders"]

# Stack the train rows on top of the test rows so that later steps can work
# on both periods in a single frame.
full_data = pd.concat([train, test])
# full_data = full_data[modelling_cols]
full_data

Unnamed: 0,warehouse,date,orders,holiday_name,holiday,shutdown,mini_shutdown,shops_closed,winter_school_holidays,school_holidays,blackout,mov_change,frankfurt_shutdown,precipitation,snow,user_activity_1,user_activity_2,id
0,Prague_1,2020-12-05,6895.0,,0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,1722.0,32575.0,Prague_1_2020-12-05
2386,Prague_2,2020-12-05,4154.0,,0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,1317.0,18462.0,Prague_2_2020-12-05
3579,Prague_3,2020-12-05,4091.0,,0,0.0,0.0,0,0,0,0.0,0.0,0.0,,,964.0,17693.0,Prague_3_2020-12-05
6186,Budapest_1,2020-12-05,4623.0,,0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.3,0.0,3046.0,17840.0,Budapest_1_2020-12-05
1193,Brno_1,2020-12-05,6447.0,,0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0.0,2332.0,27392.0,Brno_1_2020-12-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1448,Frankfurt_1,2024-12-07,,,0,,,0,0,0,,,,,,,,
12,Prague_1,2024-12-07,,,0,,,0,0,0,,,,,,,,
1830,Budapest_1,2024-12-07,,,0,,,0,0,0,,,,,,,,
932,Prague_3,2024-12-07,,,0,,,0,0,0,,,,,,,,


## Feature Engineering

Feature engineering is applied to the combined train + test data because some features depend on holidays that fall in the test period (e.g. the number of days until the next holiday).

In [5]:
# Run the project's feature-engineering transformer over the combined
# train + test frame, passing the `orders` column as the second (target)
# argument. The result replaces `full_data`, so this cell is not idempotent.
# NOTE(review): FE lives in core.feature_engineering and is not visible here;
# judging by the displayed result it adds next/past-holiday columns and
# pairwise combinations of the calendar flags — confirm against its source.
full_data = FE().fit_transform(full_data, full_data["orders"])
full_data

Unnamed: 0,warehouse,date,orders,holiday_name,holiday,shutdown,mini_shutdown,shops_closed,winter_school_holidays,school_holidays,...,shops_closed*school_holidays,winter_school_holidays+school_holidays,winter_school_holidays-school_holidays,winter_school_holidays*school_holidays,next_holiday_date,next_holiday_name,days_until_next_holiday,past_holiday_date,past_holiday_name,days_after_past_holiday
0,Prague_1,2020-12-05,6895.0,,0,0.0,0.0,0,0,0,...,0,0,0,0,2020-12-24,Christmas Eve,19.0,NaT,,
1,Prague_2,2020-12-05,4154.0,,0,0.0,0.0,0,0,0,...,0,0,0,0,2020-12-24,Christmas Eve,19.0,NaT,,
2,Prague_3,2020-12-05,4091.0,,0,0.0,0.0,0,0,0,...,0,0,0,0,2020-12-24,Christmas Eve,19.0,NaT,,
3,Budapest_1,2020-12-05,4623.0,,0,0.0,0.0,0,0,0,...,0,0,0,0,2020-12-24,Christmas Eve,19.0,NaT,,
4,Brno_1,2020-12-05,6447.0,,0,0.0,0.0,0,0,0,...,0,0,0,0,2020-12-24,Christmas Eve,19.0,NaT,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9204,Frankfurt_1,2024-12-07,,,0,,,0,0,0,...,0,0,0,0,NaT,,,2024-11-29,Black Friday,8.0
9205,Prague_1,2024-12-07,,,0,,,0,0,0,...,0,0,0,0,NaT,,,2024-11-29,Black Friday,8.0
9206,Budapest_1,2024-12-07,,,0,,,0,0,0,...,0,0,0,0,NaT,,,2024-11-29,Black Friday,8.0
9207,Prague_3,2024-12-07,,,0,,,0,0,0,...,0,0,0,0,NaT,,,2024-11-29,Black Friday,8.0


In [6]:
# Dates on which at least one row carries a shutdown-type flag. They are
# captured here because the flag columns are removed from the frame just below.
shutdown_flags = full_data[
    ["shutdown", "mini_shutdown", "blackout", "frankfurt_shutdown"]
].eq(1)
dates_with_shutdown = full_data.loc[shutdown_flags.any(axis=1), "date"].unique()

# Columns removed before modelling.
# NOTE(review): in the displayed frame most of these are empty for the
# test-period rows — presumably why they are excluded; confirm.
cols_to_drop = [
    "holiday_name",
    "next_holiday_date",
    "past_holiday_date",
    "shutdown",
    "mini_shutdown",
    "blackout",
    "mov_change",
    "frankfurt_shutdown",
    "precipitation",
    "snow",
    "user_activity_1",
    "user_activity_2",
    "id",
]
full_data = full_data.drop(columns=cols_to_drop)

# Rebuild a "<warehouse>_<YYYY-MM-DD>" key to join against the submission ids.
full_data["warehouse_date"] = (
    full_data["warehouse"] + "_" + full_data["date"].dt.strftime("%Y-%m-%d")
)

# Select the rows to predict BEFORE the NaN filter: the filter removes every
# row that has any missing value, and the test-period rows have no `orders`.
submission_data = submission.merge(
    full_data,
    left_on="id",
    right_on="warehouse_date",
)
full_data = full_data.dropna()
# Alternative selection by date range, kept for reference:
# submission_data = full_data.loc[(full_data['date'] >= test_dates["date"].min()) & (full_data['date'] <= test_dates["date"].max())]

## Train Test Split

In [7]:
# Rows dated strictly after this timestamp form the evaluation set.
split_date = pd.to_datetime("2024-01-01")

if TO_SUBMIT:
    # Final fit: train on every remaining row. The evaluation rows below are
    # then a subset of the training rows, so metrics on X_test are in-sample
    # and must not be read as a hold-out score.
    X_train = full_data.copy()
else:
    # Hold-out mode: train only on rows up to and including the split date.
    # .copy() makes X_train an independent frame, so the column assignments
    # done on it later do not target a slice of full_data
    # (which would trigger SettingWithCopyWarning).
    X_train = full_data.loc[full_data["date"] <= split_date].copy()

# The evaluation set is the same in both modes.
X_test = full_data.loc[full_data["date"] > split_date].copy()

y_train = X_train["orders"]
y_test = X_test["orders"]

In [8]:
# Per-row sample weights for the regressor.
#
# The column is created as float on purpose: the halving step at the end
# produces fractional values, and writing those into an integer column relies
# on silent upcasting, which recent pandas versions warn about.
year_weights = {2021: 2.0, 2022: 4.0, 2023: 8.0, 2024: 16.0}

X_train["weights"] = 1.0  # default for any year not listed above
for _year, _weight in year_weights.items():
    X_train.loc[X_train["year"] == _year, "weights"] = _weight

# Rows dated exactly one year before a submission date get the top weight,
# whatever their year-based weight was.
same_dates_last_year = submission_data["date"] - pd.offsets.DateOffset(years=1)
X_train.loc[X_train["date"].isin(same_dates_last_year), "weights"] = 16.0

# Rows on dates with a shutdown-type event count half as much. This is applied
# last, so it also halves the overrides set above.
X_train.loc[X_train["date"].isin(dates_with_shutdown), "weights"] /= 2

weights = X_train["weights"].values

In [9]:
# Columns that must not reach the model: the raw date, the target and the
# join key. X_train additionally carries the helper `weights` column.
non_feature_cols = ["date", "orders", "warehouse_date"]

X_train = X_train.drop(columns=non_feature_cols + ["weights"])
X_test = X_test.drop(columns=non_feature_cols)
# submission_data is left untouched here:
# submission_data = submission_data.drop(["date"], axis=1)

## Model

In [10]:
# Columns handed to the categorical (one-hot) branch of the preprocessor.
categorical_features = [
    "holiday",
    "shops_closed",
    "winter_school_holidays",
    "school_holidays",
    "warehouse",
    "year",
    "month",
    "day",
    "week",
    "dayofyear",
    "dayofweek",
    "is_weekend",
    "next_holiday_name",
    "past_holiday_name",
]

# Every remaining training column, kept in the frame's own column order.
# (A set difference would return the same names, but in an order that can
# change from one interpreter run to the next.)
numerical_features = [
    column for column in X_train.columns if column not in categorical_features
]

In [11]:
# Categorical branch: one-hot encoding, with handle_unknown="ignore" so that
# categories not seen during fitting do not raise at prediction time.
onehot_step = ("onehot", OneHotEncoder(handle_unknown="ignore"))
categorical_transformer = Pipeline(steps=[onehot_step])

# Only the categorical columns are transformed; every other column is passed
# through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical_vars", categorical_transformer, categorical_features),
    ],
    remainder="passthrough",
)

# Regressor hyperparameters, gathered in one place.
# NOTE(review): `random_seed` is not the scikit-learn-style `random_state`
# keyword; presumably LightGBM accepts it as a seed alias — confirm.
lgbm_params = {
    "n_estimators": 250,
    "learning_rate": 0.248,
    "max_depth": 8,
    "min_child_samples": 0,
    "reg_alpha": 1.65,
    "reg_lambda": 1.9,
    "colsample_bytree": 1.0,
    "subsample": 1.0,
    "random_seed": 42,
    "verbose": -1,
}

pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("lightgbm_regressor", lgb.LGBMRegressor(**lgbm_params)),
    ]
)

# The sample weights are routed to the regressor step via the
# "<step name>__<parameter>" fit-parameter convention.
pipeline.fit(X_train, y_train, lightgbm_regressor__sample_weight=weights)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [12]:
# Predictions on the training rows and on the evaluation rows.
train_pred = pipeline.predict(X_train)
test_pred = pipeline.predict(X_test)

# Mean squared error. The variables are named *_mse to match what they hold
# (they were previously called *_mae although computed with mean_squared_error).
train_mse = mean_squared_error(y_train, train_pred)
test_mse = mean_squared_error(y_test, test_pred)

# Mean absolute percentage error, returned as a fraction (0.02 means 2 %).
train_mape = mean_absolute_percentage_error(y_train, train_pred)
test_mape = mean_absolute_percentage_error(y_test, test_pred)

# NOTE: when the evaluation rows were also used for fitting, the "test"
# figures below are in-sample and are not a generalisation estimate.
print(f"train MSE = {train_mse:.2f} test MSE = {test_mse:.2f}")
print(f"train MAPE = {train_mape:.2f} test MAPE = {test_mape:.2f}")

train MSE = 11337.12 test MSE = 3529.85
train MAPE = 0.02 test MAPE = 0.01


In [13]:
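# Reference metrics recorded from an earlier run
# (presumably with TO_SUBMIT = False, i.e. a true hold-out split — TODO confirm):
# train MSE = 16025.36 test MSE = 987279.86
# train MAPE = 0.02 test MAPE = 0.12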

In [14]:
# Side-by-side view of the actual and predicted orders for the evaluation
# rows, indexed like y_test.
y_test.to_frame(name="true").assign(pred=pipeline.predict(X_test))

Unnamed: 0,true,pred
6844,5846.0,5706.074569
6845,6349.0,6313.876783
6846,1820.0,1843.700522
6847,8277.0,8342.139433
6848,9457.0,9523.911944
...,...,...
7335,10777.0,10656.722224
7336,6408.0,6410.476564
7337,11917.0,11973.941088
7338,7140.0,7111.554273


## Submission file

In [15]:
# Score the submission rows using exactly the columns the model was fitted on.
submission_features = submission_data[X_train.columns]
submission_data["pred"] = pipeline.predict(submission_features)

# Attach the predictions to the example submission by id, then discard the
# example's own `orders` column.
final_predictions = submission.merge(submission_data[["id", "pred"]], on="id")
final_predictions = final_predictions.drop(columns="orders")

# Optional per-warehouse scaling of the predictions (currently disabled):
# fine_tune_params = {
#     "Brno_1": 1.057,
#     "Budapest_1": 1.006,
#     "Frankfurt_1": 1.087,
#     "Munich_1": 1.042,
#     "Prague_1": 1.048,
#     "Prague_2": 0.985,
#     "Prague_3": 0.975,
# }

# for warehouse, weight in fine_tune_params.items():
#     final_predictions.loc[
#         (final_predictions["id"].str.contains(warehouse)), "pred"
#     ] *= weight

final_predictions = final_predictions.set_index("id")

final_predictions

Unnamed: 0_level_0,pred
id,Unnamed: 1_level_1
Prague_1_2024-03-16,10732.331768
Prague_1_2024-03-17,10204.784770
Prague_1_2024-03-18,9805.108140
Prague_1_2024-03-19,9631.949796
Prague_1_2024-03-20,9397.577467
...,...
Budapest_1_2024-05-11,6596.336493
Budapest_1_2024-05-12,6268.630156
Budapest_1_2024-05-13,6146.084981
Budapest_1_2024-05-14,5971.294807


In [16]:
# final_predictions.to_csv("./data/submissions/lightgbm_8_version_4.csv")