In [107]:
import numpy as np
import pandas as pd

from IPython.display import display
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

In [12]:
# Load the three competition tables: raw transactions, the product catalogue,
# and the key listing the (date, store, product) rows to predict.
hist_data, prod_desc, subm_key = map(
    pd.read_csv,
    (
        "Resources/historical_data.csv",
        "Resources/product_descriptions.csv",
        "Resources/submission_key.csv",
    ),
)

In [113]:
def cols_with_nan(df: pd.DataFrame) -> list[str]:
    """Return the labels of the columns of ``df`` that hold at least one missing value.

    Labels are returned in the frame's own column order; the list is empty
    when no column contains a missing value.
    """
    flagged = []
    for label in df.columns:
        if df[label].isnull().any():
            flagged.append(label)
    return flagged


def prep_train(hist_data: pd.DataFrame, prod_desc: pd.DataFrame) -> pd.DataFrame:
    """Build a dense daily sales table: one row per date, store and product.

    The grid is the cartesian product of the dates and stores that occur in
    ``hist_data`` and the product ids listed in ``prod_desc``. Transaction
    quantities are summed per grid cell; cells without any transaction get a
    quantity of 0.

    Args:
        hist_data: Transactions with at least ``transaction_date``,
            ``store_id``, ``product_id`` and ``transaction_qty`` columns.
            The frame is not modified.
        prod_desc: Product catalogue keyed by ``product_id``; its remaining
            columns are joined onto every row.

    Returns:
        A frame indexed by ``transaction_date`` holding the store/product
        keys, the catalogue columns, the calendar parts ``day_of_week``,
        ``day``, ``month`` and ``year``, and the integer target ``sold_qty``.
        Object-dtype columns are converted to ``category``.
    """
    keys = ["transaction_date", "store_id", "product_id"]

    sales = hist_data.assign(
        transaction_date=pd.to_datetime(hist_data["transaction_date"])
    )

    # Every date x store x product combination, including those never sold.
    grid = pd.MultiIndex.from_product(
        [
            sales["transaction_date"].unique(),
            sales["store_id"].unique(),
            prod_desc["product_id"].unique(),
        ],
        names=keys,
    ).to_frame(index=False)

    # Total quantity actually sold for each combination that has transactions.
    daily_qty = sales.groupby(keys)["transaction_qty"].sum().reset_index()

    # Left-join onto the grid so unsold combinations survive, then zero-fill them.
    dense = grid.merge(daily_qty, on=keys, how="left")
    dense["transaction_qty"] = dense["transaction_qty"].fillna(0)
    dense = dense.merge(prod_desc, on="product_id", how="left")

    stamps = dense["transaction_date"].dt
    dense = (
        dense.assign(
            day_of_week=stamps.day_of_week,
            day=stamps.day,
            month=stamps.month,
            year=stamps.year,
            sold_qty=dense["transaction_qty"].astype(int),
        )
        .drop(columns=["transaction_qty"])
        .set_index(["transaction_date"])
    )

    text_cols = dense.select_dtypes(include="object").columns
    dense[text_cols] = dense[text_cols].astype("category")

    return dense


def prep_test(subm_key: pd.DataFrame, prod_desc: pd.DataFrame) -> pd.DataFrame:
    """Turn the submission key into a feature frame for prediction.

    Args:
        subm_key: Rows to predict, with ``ID``, ``transaction_date``,
            ``store_id`` and ``product_id`` columns. The frame is not modified.
        prod_desc: Product catalogue keyed by ``product_id``; its remaining
            columns are joined onto every row.

    Returns:
        A frame indexed by ``transaction_date``, in the row order of
        ``subm_key``, with the ``ID`` column removed, the catalogue columns
        attached and the calendar parts ``day_of_week``, ``day``, ``month``
        and ``year`` added. Object-dtype columns are converted to ``category``.
    """
    rows = subm_key.assign(
        transaction_date=pd.to_datetime(subm_key["transaction_date"])
    )

    # A left join keeps every requested row, in its original order.
    rows = rows.merge(prod_desc, on="product_id", how="left")

    stamps = rows["transaction_date"].dt
    rows = (
        rows.assign(
            day_of_week=stamps.day_of_week,
            day=stamps.day,
            month=stamps.month,
            year=stamps.year,
        )
        .drop(columns=["ID"])
        .set_index(["transaction_date"])
    )

    # NOTE(review): categories are inferred from the values present in this
    # frame only; if they differ from the training frame's categories the
    # integer codes may not line up -- confirm how the model handles that.
    text_cols = rows.select_dtypes(include="object").columns
    rows[text_cols] = rows[text_cols].astype("category")

    return rows

In [89]:
# Preview the first rows of each raw table.
display(hist_data.head(), prod_desc.head(), subm_key.head())

# One line per table listing the columns that contain missing values.
print(
    *(cols_with_nan(frame) for frame in (hist_data, prod_desc, subm_key)),
    sep="\n",
)

Unnamed: 0,transaction_id,transaction_date,transaction_time,transaction_qty,store_id,product_id,unit_price
0,1,2024-01-01,07:06:11,2,ST2,32,3.0
1,2,2024-01-01,07:08:56,2,ST2,57,3.1
2,3,2024-01-01,07:14:04,2,ST2,59,4.5
3,4,2024-01-01,07:20:24,1,ST2,22,2.0
4,5,2024-01-01,07:22:41,2,ST2,57,3.1


Unnamed: 0,product_id,product_category,product_type,product_detail
0,1,Pastries,Croissants,French Butter Croissant
1,2,Pastries,Muffins,Blueberry Muffin
2,3,Pastries,Danishes,Apple Danish
3,4,Pastries,Danishes,Cherry Danish
4,5,Pastries,Cakes,Vanilla Sponge Cake


Unnamed: 0,ID,transaction_date,store_id,product_id
0,0,2024-06-01,ST1,1
1,1,2024-06-01,ST1,2
2,2,2024-06-01,ST1,3
3,3,2024-06-01,ST1,4
4,4,2024-06-01,ST1,5


[]
[]
[]


In [114]:
# Build the dense daily training frame and preview its first rows.
train = prep_train(hist_data=hist_data, prod_desc=prod_desc)
display(train.head())

Unnamed: 0_level_0,store_id,product_id,product_category,product_type,product_detail,day_of_week,day,month,year,sold_qty
transaction_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2024-01-01,ST2,1,Pastries,Croissants,French Butter Croissant,0,1,1,2024,0
2024-01-01,ST2,2,Pastries,Muffins,Blueberry Muffin,0,1,1,2024,0
2024-01-01,ST2,3,Pastries,Danishes,Apple Danish,0,1,1,2024,0
2024-01-01,ST2,4,Pastries,Danishes,Cherry Danish,0,1,1,2024,0
2024-01-01,ST2,5,Pastries,Cakes,Vanilla Sponge Cake,0,1,1,2024,0


In [115]:
# Build the prediction-time feature frame and preview its first rows.
test = prep_test(subm_key=subm_key, prod_desc=prod_desc)
display(test.head())

Unnamed: 0_level_0,store_id,product_id,product_category,product_type,product_detail,day_of_week,day,month,year
transaction_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2024-06-01,ST1,1,Pastries,Croissants,French Butter Croissant,5,1,6,2024
2024-06-01,ST1,2,Pastries,Muffins,Blueberry Muffin,5,1,6,2024
2024-06-01,ST1,3,Pastries,Danishes,Apple Danish,5,1,6,2024
2024-06-01,ST1,4,Pastries,Danishes,Cherry Danish,5,1,6,2024
2024-06-01,ST1,5,Pastries,Cakes,Vanilla Sponge Cake,5,1,6,2024


In [116]:
# Target column and model inputs.
TARGET: str = "sold_qty"
FEATURES: list[str] = [
    # identifiers
    "store_id",
    "product_id",
    # product attributes joined from the catalogue
    "product_category",
    "product_type",
    "product_detail",
    # calendar parts derived from the transaction date
    "day_of_week",
    "day",
    "month",
    "year",
]

# Order rows chronologically so that the unshuffled split below holds out the
# final 20% of rows (the latest dates) for validation.
train.sort_values(by=["transaction_date"], inplace=True)

X_train, X_valid, y_train, y_valid = train_test_split(
    train[FEATURES],
    train[TARGET],
    test_size=0.2,
    shuffle=False,
)

X_test: pd.DataFrame = test[FEATURES]

In [127]:
model = XGBRegressor(
    # Tree booster trained on the GPU.
    booster="gbtree",
    device="cuda",
    # Accept pandas ``category`` columns directly.
    enable_categorical=True,
    # Small learning rate with a large tree budget; early stopping ends
    # training once the validation metric stops improving for 100 rounds.
    learning_rate=0.001,
    n_estimators=5000,
    early_stopping_rounds=100,
    max_depth=10,
    random_state=42,
)

# The chronological hold-out drives early stopping; log every 100 rounds.
model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=100)

[0]	validation_0-rmse:6.33845
[100]	validation_0-rmse:5.92008
[200]	validation_0-rmse:5.56320
[300]	validation_0-rmse:5.26172
[400]	validation_0-rmse:5.01024
[500]	validation_0-rmse:4.80193
[600]	validation_0-rmse:4.62846
[700]	validation_0-rmse:4.49094
[800]	validation_0-rmse:4.38444
[900]	validation_0-rmse:4.29705
[1000]	validation_0-rmse:4.22866
[1100]	validation_0-rmse:4.17641
[1200]	validation_0-rmse:4.13879
[1300]	validation_0-rmse:4.11417
[1400]	validation_0-rmse:4.09549
[1500]	validation_0-rmse:4.08471
[1600]	validation_0-rmse:4.07725
[1700]	validation_0-rmse:4.07381
[1800]	validation_0-rmse:4.07306
[1900]	validation_0-rmse:4.05812
[2000]	validation_0-rmse:4.04522
[2100]	validation_0-rmse:4.03284
[2200]	validation_0-rmse:4.02233
[2300]	validation_0-rmse:4.01360
[2400]	validation_0-rmse:4.00504
[2500]	validation_0-rmse:3.99831
[2600]	validation_0-rmse:3.99083
[2700]	validation_0-rmse:3.98404
[2800]	validation_0-rmse:3.97842
[2900]	validation_0-rmse:3.97346
[3000]	validation_0-rm

In [128]:
# Fill the submission template with the model's predictions and write it out.
# NOTE(review): assumes the rows of submission_format.csv are in the same
# order as submission_key.csv (and therefore X_test) -- TODO confirm.
subm_df: pd.DataFrame = pd.read_csv("Resources/submission_format.csv")

# A sold quantity can never be negative, but a regressor is free to emit
# negative values. Clamping at zero can only move a prediction closer to any
# true quantity >= 0, so it never increases the error.
subm_df["sold_qty"] = np.clip(model.predict(X_test), 0, None)

subm_df.to_csv("submission.csv", index=False)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [129]:
# Shell escape: upload submission.csv to the competition with the Kaggle CLI,
# using "Version 1" as the submission message.
!kaggle competitions submit predicta-1-0-practice-competition -f submission.csv -m "Version 1"

100%|██████████████████████████████████████| 48.6k/48.6k [00:01<00:00, 40.3kB/s]
Successfully submitted to Predicta 1.0 - Practice Competition