In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline

from core.feature_engineering import FE
from core.label_encoder_wrapper import LabelEncoderWrapper
from core.standard_scaler_wrapper import StandardScalerWrapper
from core.tabnet_regressor_pandas_wrapper import TabNetRegressorPandasWrapper
from core.mape import MAPE

## Read Data

In [2]:
def _read_sorted_by_date(csv_path):
    """Read a CSV, parse its ``date`` column as datetimes and sort the rows by it."""
    return pd.read_csv(csv_path, parse_dates=["date"]).sort_values(by="date")


# The three date-indexed inputs all go through the same read -> parse -> sort steps.
train = _read_sorted_by_date("./data/raw_data/train.csv")
test = _read_sorted_by_date("./data/raw_data/test_calendar.csv")
test_dates = _read_sorted_by_date("./data/raw_data/test.csv")

# The example solution file is read as-is (no date parsing, no sorting).
submission = pd.read_csv("./data/raw_data/solution_example.csv")

In [3]:
# Model on every column present in the test calendar, plus the target.
modelling_cols = [*test.columns, "orders"]

# Stack train on top of test (row-wise) and keep only the modelling columns.
# Test rows have no "orders" value, so the target is NaN for them.
full_data = pd.concat([train, test])[modelling_cols]
full_data

Unnamed: 0,date,holiday_name,holiday,shops_closed,winter_school_holidays,school_holidays,warehouse,orders
0,2020-12-05,,0,0,0,0,Prague_1,6895.0
2386,2020-12-05,,0,0,0,0,Prague_2,4154.0
3579,2020-12-05,,0,0,0,0,Prague_3,4091.0
6186,2020-12-05,,0,0,0,0,Budapest_1,4623.0
1193,2020-12-05,,0,0,0,0,Brno_1,6447.0
...,...,...,...,...,...,...,...,...
1448,2024-12-07,,0,0,0,0,Frankfurt_1,
12,2024-12-07,,0,0,0,0,Prague_1,
1830,2024-12-07,,0,0,0,0,Budapest_1,
932,2024-12-07,,0,0,0,0,Prague_3,


### End of code to put in separate files

In [4]:
# Treat these two holidays as ordinary days by blanking out their name.
_blanked_holidays = ["Whit sunday", "Ascension day"]
full_data.loc[
    full_data["holiday_name"].isin(_blanked_holidays), "holiday_name"
] = np.nan

# Run the project's feature-engineering transformer over the combined frame,
# passing the "orders" column as the target argument.
_feature_engineer = FE()
full_data = _feature_engineer.fit_transform(full_data, full_data["orders"])
full_data

Unnamed: 0,date,holiday_name,holiday,shops_closed,winter_school_holidays,school_holidays,warehouse,orders,year,month,day,week,dayofweek,is_weekend,next_holiday_date,next_holiday_name,days_until_next_holiday,past_holiday_date,past_holiday_name,days_after_past_holiday
0,2020-12-05,,0,0,0,0,Prague_1,6895.0,2020,12,5,49,5,1,2020-12-24,Christmas Eve,19.0,NaT,,
2386,2020-12-05,,0,0,0,0,Prague_2,4154.0,2020,12,5,49,5,1,2020-12-24,Christmas Eve,19.0,NaT,,
3579,2020-12-05,,0,0,0,0,Prague_3,4091.0,2020,12,5,49,5,1,2020-12-24,Christmas Eve,19.0,NaT,,
6186,2020-12-05,,0,0,0,0,Budapest_1,4623.0,2020,12,5,49,5,1,2020-12-24,Christmas Eve,19.0,NaT,,
1193,2020-12-05,,0,0,0,0,Brno_1,6447.0,2020,12,5,49,5,1,2020-12-24,Christmas Eve,19.0,NaT,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1448,2024-12-07,,0,0,0,0,Frankfurt_1,,2024,12,7,49,5,1,NaT,,,2024-10-31,Reformation Day,37.0
12,2024-12-07,,0,0,0,0,Prague_1,,2024,12,7,49,5,1,NaT,,,2024-11-17,Den boje za svobodu a demokracii,20.0
1830,2024-12-07,,0,0,0,0,Budapest_1,,2024,12,7,49,5,1,NaT,,,2024-11-01,All Saints Day,36.0
932,2024-12-07,,0,0,0,0,Prague_3,,2024,12,7,49,5,1,NaT,,,2024-11-17,Den boje za svobodu a demokracii,20.0


In [5]:
# Remove the raw holiday name and the two holiday date columns, then append a
# "<warehouse>_<YYYY-MM-DD>" key column used to match rows to submission ids.
full_data = full_data.drop(
    columns=["holiday_name", "next_holiday_date", "past_holiday_date"]
).assign(
    warehouse_date=lambda frame: (
        frame["warehouse"] + "_" + frame["date"].dt.strftime("%Y-%m-%d")
    )
)

# Pick out the rows to predict by joining the example submission ids onto the
# key column. This must happen before the dropna below.
submission_data = pd.merge(
    submission, full_data, left_on="id", right_on="warehouse_date"
)

# Alternative selection by date range, kept for reference:
# submission_data = full_data.loc[(full_data['date'] >= test_dates["date"].min()) & (full_data['date'] <= test_dates["date"].max())]

# Keep only rows without any missing value in any column.
full_data = full_data.dropna()
full_data

Unnamed: 0,date,holiday,shops_closed,winter_school_holidays,school_holidays,warehouse,orders,year,month,day,week,dayofweek,is_weekend,next_holiday_name,days_until_next_holiday,past_holiday_name,days_after_past_holiday,warehouse_date
2405,2020-12-24,1,0,0,0,Prague_2,2722.0,2020,12,24,52,3,0,Christmas Eve,0.0,Christmas Eve,0.0,Prague_2_2020-12-24
1212,2020-12-24,1,0,0,0,Brno_1,4576.0,2020,12,24,52,3,0,Christmas Eve,0.0,Christmas Eve,0.0,Brno_1_2020-12-24
6205,2020-12-24,1,0,0,0,Budapest_1,3897.0,2020,12,24,52,3,0,Christmas Eve,0.0,Christmas Eve,0.0,Budapest_1_2020-12-24
3598,2020-12-24,1,0,0,0,Prague_3,2532.0,2020,12,24,52,3,0,Christmas Eve,0.0,Christmas Eve,0.0,Prague_3_2020-12-24
19,2020-12-24,1,0,0,0,Prague_1,4327.0,2020,12,24,52,3,0,Christmas Eve,0.0,Christmas Eve,0.0,Prague_1_2020-12-24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2385,2024-03-15,0,0,0,0,Brno_1,10777.0,2024,3,15,11,4,0,Good Friday,14.0,New Years Day,74.0,Brno_1_2024-03-15
4771,2024-03-15,0,0,0,0,Prague_3,6408.0,2024,3,15,11,4,0,Good Friday,14.0,New Years Day,74.0,Prague_3_2024-03-15
1192,2024-03-15,0,0,0,0,Prague_1,11917.0,2024,3,15,11,4,0,Good Friday,14.0,New Years Day,74.0,Prague_1_2024-03-15
3578,2024-03-15,0,0,0,0,Prague_2,7140.0,2024,3,15,11,4,0,Good Friday,14.0,New Years Day,74.0,Prague_2_2024-03-15


In [6]:
# Time-based split: rows dated up to and including 2024-01-01 are used for
# training, strictly later rows form the hold-out set.
X_train, X_test = (
    full_data.loc[row_mask]
    for row_mask in (
        full_data["date"] <= pd.to_datetime("2024-01-01"),
        full_data["date"] > pd.to_datetime("2024-01-01"),
    )
)

# Pull the target out before the non-feature columns are removed.
y_train, y_test = X_train["orders"], X_test["orders"]

# Drop the date, the target and the submission key from both feature frames.
X_train, X_test = (
    split.drop(columns=["date", "orders", "warehouse_date"])
    for split in (X_train, X_test)
)
# submission_data = submission_data.drop(["date"], axis=1)

In [7]:
# Columns handed to the label encoder and to TabNet as categorical inputs.
# Built group by group; the resulting order is significant and kept as-is.
categorical_features = (
    # calendar flags
    ["holiday", "shops_closed", "winter_school_holidays", "school_holidays"]
    # warehouse identifier
    + ["warehouse"]
    # date parts
    + ["year", "month", "day", "week", "dayofweek", "is_weekend"]
    # names of the neighbouring holidays
    + ["next_holiday_name", "past_holiday_name"]
)



In [8]:
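# For some reason the Pipeline does not work...
# NOTE(review): possibly because `eval_set` below is passed as raw `.values`;
# a Pipeline forwards fit params to the target step unchanged, so the eval data
# would bypass the encoder/scaler steps — TODO confirm.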

# pipeline = Pipeline(
#     steps=[
#         ("label_encoder", LabelEncoderWrapper(categorical_features)),
#         (
#             "standard_scaler",
#             StandardScalerWrapper(
#                 features=["days_until_next_holiday", "days_after_past_holiday"]
#             ),
#         ),
#         (
#             "tabnet_regressor",
#             TabNetRegressorPandasWrapper(
#                 categorical_features=categorical_features,
#                 n_d=64,  # from 8 to 64
#                 n_a=64,  # equal n_d
#                 n_steps=3,  # between 3 and 10
#                 gamma=1.3,  # between 1 and 2
#                 n_independent=5,  # from 1 to 5
#                 n_shared=5,  # from 1 to 5
#                 seed=42,
#                 verbose=1,
#             ),
#         ),
#     ]
# )

# pipeline.fit(
#     X_train,
#     y_train,
#     tabnet_regressor__max_epochs=50,
#     tabnet_regressor__eval_set=[
#         (X_train.values, y_train.values.reshape(-1, 1)),
#         (X_test.values, y_test.values.reshape(-1, 1)),
#     ],
#     tabnet_regressor__eval_name=["train", "valid"],
#     tabnet_regressor__eval_metric=["mse", MAPE],
#     tabnet_regressor__patience=10,
#     tabnet_regressor__batch_size=6588,
#     tabnet_regressor__virtual_batch_size=3294,  # Has to divide batch_size
#     tabnet_regressor__drop_last=False,
#     tabnet_regressor__compute_importance=False,
# )

# print('22')

In [9]:
# NOTE: this cell overwrites X_train / X_test with their transformed versions,
# so re-run the train/test split cell before re-running this one.

# Label Encoder: fitted on the training split only, then applied to both splits.
label_encoder = LabelEncoderWrapper(categorical_features)
label_encoder.fit(X_train, y_train)

X_train = label_encoder.transform(X_train)
X_test = label_encoder.transform(X_test)

# Standard Scaler: configured for the two day-count columns; fitted on the
# training split and re-used unchanged on the hold-out split.
scaler = StandardScalerWrapper(
    features=["days_until_next_holiday", "days_after_past_holiday"]
)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# TabNet Regressor
BATCH_SIZE = 6588
# The virtual batch size has to divide the batch size; deriving it from
# BATCH_SIZE (half of it, i.e. 3294) keeps the two in step when tuning.
VIRTUAL_BATCH_SIZE = BATCH_SIZE // 2
assert BATCH_SIZE % VIRTUAL_BATCH_SIZE == 0, "virtual batch size must divide batch size"

tabnet_regressor = TabNetRegressorPandasWrapper(
    categorical_features=categorical_features,
    n_d=64,  # from 8 to 64
    n_a=64,  # equal n_d
    n_steps=10,  # between 3 and 10
    gamma=2,  # between 1 and 2
    n_independent=5,  # from 1 to 5
    n_shared=5,  # from 1 to 5
    seed=42,
    verbose=1,
)
tabnet_regressor.fit(
    X_train,
    y_train,
    max_epochs=5000,
    # Both splits are evaluated every epoch; targets are passed as column vectors.
    eval_set=[
        (X_train.values, y_train.values.reshape(-1, 1)),
        (X_test.values, y_test.values.reshape(-1, 1)),
    ],
    eval_name=["train", "valid"],
    eval_metric=["mse", MAPE],
    patience=100,
    batch_size=BATCH_SIZE,
    virtual_batch_size=VIRTUAL_BATCH_SIZE,
    drop_last=False,
    compute_importance=False,
)



epoch 0  | loss: 35170701.98058| train_mse: 18947458.87614| train_MAPE: 0.65713 | valid_mse: 30341109.86915| valid_MAPE: 0.69277 |  0:00:02s
epoch 1  | loss: 35152871.91019| train_mse: 23257629.2868| train_MAPE: 0.73589 | valid_mse: 35086195.45655| valid_MAPE: 0.75331 |  0:00:04s
epoch 2  | loss: 35119146.38592| train_mse: 25911101.29986| train_MAPE: 0.79629 | valid_mse: 37479052.30222| valid_MAPE: 0.78862 |  0:00:06s
epoch 3  | loss: 35053070.47816| train_mse: 27421833.25644| train_MAPE: 0.82942 | valid_mse: 39089977.02923| valid_MAPE: 0.83254 |  0:00:08s
epoch 4  | loss: 34988062.1335| train_mse: 29067877.78714| train_MAPE: 0.86715 | valid_mse: 41091849.60523| valid_MAPE: 0.86589 |  0:00:10s
epoch 5  | loss: 34902215.06796| train_mse: 29797190.24964| train_MAPE: 0.87884 | valid_mse: 41826735.28648| valid_MAPE: 0.87446 |  0:00:12s
epoch 6  | loss: 34821721.4284| train_mse: 29810990.97487| train_MAPE: 0.87948 | valid_mse: 41716118.53028| valid_MAPE: 0.87556 |  0:00:14s
epoch 7  | loss:



In [10]:
# Side-by-side view of hold-out targets and the model's predictions for them.
pd.DataFrame(
    dict(
        true=y_test,
        pred=tabnet_regressor.predict(X_test),
    )
)

Unnamed: 0,true,pred
5494,5846.0,2980.827637
7267,6349.0,6167.690430
6122,1820.0,2064.205566
2312,8277.0,7734.812988
1119,9457.0,9498.900391
...,...,...
2385,10777.0,7868.347168
4771,6408.0,4468.337891
1192,11917.0,9055.812500
3578,7140.0,5077.192383


In [11]:
# Push the submission rows through the already-fitted label encoder and then
# the already-fitted scaler (same order as for the training data).
submission_data_2 = scaler.transform(label_encoder.transform(submission_data))

# Predict from exactly the columns used for training, in training order.
submission_data["pred"] = tabnet_regressor.predict(
    submission_data_2[list(X_train.columns)]
)

# Attach the predictions to the example submission by id, discard the
# "orders" column that comes from the example file, and index by id.
final_predictions = (
    submission
    .merge(submission_data[["id", "pred"]], on="id")
    .drop(columns="orders")
    .set_index("id")
)

final_predictions

Unnamed: 0_level_0,pred
id,Unnamed: 1_level_1
Prague_1_2024-03-16,9592.953125
Prague_1_2024-03-17,9161.693359
Prague_1_2024-03-18,9087.541016
Prague_1_2024-03-19,9271.169922
Prague_1_2024-03-20,9081.587891
...,...
Budapest_1_2024-05-11,6534.258789
Budapest_1_2024-05-12,6345.671875
Budapest_1_2024-05-13,6227.945801
Budapest_1_2024-05-14,6448.886719


In [12]:
# final_predictions.to_csv('../data/submissions/tabnet_initial_submision_2.csv')