In [None]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline

from core.feature_engineering import FE
from core.label_encoder_wrapper import LabelEncoderWrapper
from core.standard_scaler_wrapper import StandardScalerWrapper
from core.tabnet_regressor_pandas_wrapper import TabNetRegressorPandasWrapper
from core.mape import MAPE

## Read Data

In [None]:
def _read_sorted_by_date(csv_path):
    """Read a CSV file, parse its ``date`` column and return the rows ordered by date."""
    frame = pd.read_csv(csv_path, parse_dates=["date"])
    return frame.sort_values(by="date")


# Dated inputs are loaded in chronological order.
train = _read_sorted_by_date("./data/raw_data/train.csv")
test = _read_sorted_by_date("./data/raw_data/test_calendar.csv")
test_dates = _read_sorted_by_date("./data/raw_data/test.csv")

# The example solution is read as-is; later cells join on its "id" column.
submission = pd.read_csv("./data/raw_data/solution_example.csv")

In [None]:
# Keep only the columns available in the test calendar, plus the target.
modelling_cols = [*test.columns, "orders"]

# Stack train on top of test (row-wise) and restrict to the modelling columns.
full_data = pd.concat([train, test])[modelling_cols]
full_data

## Feature Engineering

Feature engineering is applied to the combined train and test data, because holidays that fall in the test period are needed to compute features such as the number of days until the next holiday.

In [None]:
# These two holiday names are blanked out before feature engineering runs.
ignored_holidays = ["Whit sunday", "Ascension day"]
is_ignored_holiday = full_data["holiday_name"].isin(ignored_holidays)
full_data.loc[is_ignored_holiday, "holiday_name"] = np.nan

# Run the project's feature-engineering transformer on the combined frame,
# passing the "orders" column as the target argument.
feature_engineer = FE()
full_data = feature_engineer.fit_transform(full_data, full_data["orders"])
full_data

In [None]:
# Columns removed before modelling.
columns_to_remove = ["holiday_name", "next_holiday_date", "past_holiday_date"]
full_data = full_data.drop(columns=columns_to_remove)

# Build a "<warehouse>_<YYYY-MM-DD>" key that is matched against the submission ids.
date_key = full_data["date"].dt.strftime("%Y-%m-%d")
full_data["warehouse_date"] = full_data["warehouse"] + "_" + date_key

# Attach the engineered features to every submission id. This happens BEFORE the
# dropna below: rows coming from the test calendar presumably have no "orders"
# value, so they would be discarded by dropna.
submission_data = submission.merge(
    full_data, left_on="id", right_on="warehouse_date"
)

# Keep only fully populated rows for training and validation.
full_data = full_data.dropna()
full_data

## Train Test Split

In [None]:
# Time-based split: rows up to and including the split date are used for
# training, strictly later rows for validation.
split_date = pd.to_datetime("2024-01-01")
non_feature_cols = ["date", "orders", "warehouse_date"]

train_rows = full_data.loc[full_data["date"] <= split_date]
test_rows = full_data.loc[full_data["date"] > split_date]

# Targets.
y_train = train_rows["orders"]
y_test = test_rows["orders"]

# Feature matrices: everything except the date, the target and the join key.
X_train = train_rows.drop(columns=non_feature_cols)
X_test = test_rows.drop(columns=non_feature_cols)

## Model

In [None]:
# Columns treated as categorical. They are passed to LabelEncoderWrapper and to
# the TabNet wrapper's `categorical_features` argument in the modelling cell.
categorical_features = [
    "holiday",
    "shops_closed",
    "winter_school_holidays",
    "school_holidays",
    "warehouse",
    "year",
    "month",
    "day",
    "week",
    "dayofyear",
    "dayofweek",
    "is_weekend",
    "next_holiday_name",
    "past_holiday_name",
]

# Every remaining column of X_train is treated as numerical.
# The list is built by filtering the columns in their original order instead of
# using a set difference: iterating a set of strings yields an order that can
# change from one kernel session to the next (string hashing is randomized),
# which made this list non-reproducible. dict.fromkeys de-duplicates while
# preserving order, so the resulting elements match the former set-based result.
_categorical_lookup = set(categorical_features)
numerical_features = [
    column
    for column in dict.fromkeys(X_train.columns)
    if column not in _categorical_lookup
]

In [None]:
# Display the list of numerical feature names computed in the previous cell.
numerical_features

In [None]:
# # For some reason the Pipeline does not work...

# pipeline = Pipeline(
#     steps=[
#         ("label_encoder", LabelEncoderWrapper(categorical_features)),
#         (
#             "standard_scaler",
#             StandardScalerWrapper(
#                 features=["days_until_next_holiday", "days_after_past_holiday"]
#             ),
#         ),
#         (
#             "tabnet_regressor",
#             TabNetRegressorPandasWrapper(
#                 categorical_features=categorical_features,
#                 n_d=64,  # from 8 to 64
#                 n_a=64,  # equal n_d
#                 n_steps=3,  # between 3 and 10
#                 gamma=1.3,  # between 1 and 2
#                 n_independent=5,  # from 1 to 5
#                 n_shared=5,  # from 1 to 5
#                 seed=42,
#                 verbose=1,
#             ),
#         ),
#     ]
# )

# pipeline.fit(
#     X_train,
#     y_train,
#     tabnet_regressor__max_epochs=50,
#     # tabnet_regressor__eval_set=[
#     #     (X_train.values, y_train.values.reshape(-1, 1)),
#     #     (X_test.values, y_test.values.reshape(-1, 1)),
#     # ],
#     # tabnet_regressor__eval_name=["train", "valid"],
#     # tabnet_regressor__eval_metric=["mse", MAPE],
#     # tabnet_regressor__patience=10,
#     tabnet_regressor__batch_size=6588,
#     tabnet_regressor__virtual_batch_size=3294,  # Has to divide batch_size
#     tabnet_regressor__drop_last=False,
#     tabnet_regressor__compute_importance=False,
# )

# print('22')

In [None]:
# NOTE: this cell overwrites X_train / X_test with their transformed versions,
# so it is not idempotent. Re-run the "Train Test Split" cell before running it
# again, otherwise already-encoded and already-scaled data is transformed twice.

# Label Encoder: fitted on the training split, then applied to the validation split.
label_encoder = LabelEncoderWrapper(categorical_features)
X_train = label_encoder.fit_transform(X_train, y_train)
X_test = label_encoder.transform(X_test)

# Standard Scaler: fitted on the training split, then applied to the validation split.
scaler = StandardScalerWrapper(features=numerical_features)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Batch sizes. The virtual batch size has to divide the batch size, so it is
# derived from it (6588 // 2 == 3294) and the constraint is checked explicitly.
TABNET_BATCH_SIZE = 6588
TABNET_VIRTUAL_BATCH_SIZE = TABNET_BATCH_SIZE // 2
assert TABNET_BATCH_SIZE % TABNET_VIRTUAL_BATCH_SIZE == 0, (
    "virtual_batch_size has to divide batch_size"
)

# TabNet Regressor
tabnet_regressor = TabNetRegressorPandasWrapper(
    categorical_features=categorical_features,
    n_d=64,  # from 8 to 64
    n_a=64,  # equal n_d
    n_steps=10,  # between 3 and 10
    gamma=2,  # between 1 and 2
    n_independent=5,  # from 1 to 5
    n_shared=5,  # from 1 to 5
    seed=42,
    verbose=1,
)

# Train with both splits as evaluation sets.
# NOTE(review): in pytorch-tabnet the last metric of the last eval set drives
# early stopping (here MAPE on "valid") -- confirm the wrapper forwards these
# arguments unchanged.
tabnet_regressor.fit(
    X_train,
    y_train,
    max_epochs=5000,
    eval_set=[
        (X_train.values, y_train.values.reshape(-1, 1)),
        (X_test.values, y_test.values.reshape(-1, 1)),
    ],
    eval_name=["train", "valid"],
    eval_metric=["mse", MAPE],
    patience=100,
    batch_size=TABNET_BATCH_SIZE,
    virtual_batch_size=TABNET_VIRTUAL_BATCH_SIZE,  # Has to divide batch_size
    drop_last=False,
    compute_importance=False,
)

In [None]:
# Side-by-side view of the validation targets and the model's predictions.
test_predictions = tabnet_regressor.predict(X_test)
pd.DataFrame({"true": y_test, "pred": test_predictions})

## Submission file

In [None]:
# Apply the already-fitted label encoder and scaler to the submission rows.
submission_data_2 = scaler.transform(label_encoder.transform(submission_data))

# Predict using exactly the columns (and column order) the model was trained on.
submission_features = submission_data_2[X_train.columns]
submission_data["pred"] = tabnet_regressor.predict(submission_features)

# Join the predictions back onto the example solution, drop its placeholder
# "orders" column and index the result by "id".
final_predictions = (
    submission.merge(submission_data[["id", "pred"]], on="id")
    .drop(columns="orders")
    .set_index("id")
)

final_predictions

In [None]:
# final_predictions.to_csv('../data/submissions/tabnet_initial_submision_2.csv')