In [1]:
import pandas as pd


sales_long = pd.read_parquet("data/sales_long.parquet")

In [2]:

# Select features and target variable
features = ["cat_id", "state_id", "lag_7", "lag_14", "lag_28", 
            "rolling_mean_7", "rolling_mean_14", "rolling_mean_28", 
            "day_of_week", "month", "is_weekend", "is_holiday",
            "sell_price", "price_change"]




# Apply one-hot encoding to categorical features
sales_long_encoded = pd.get_dummies(sales_long, columns=["cat_id", "state_id"], drop_first=True)


In [3]:
sales_long_encoded.head()

Unnamed: 0,item_id,dept_id,store_id,d,sales,date,wm_yr_wk,weekday,wday,month,...,lag_28,rolling_mean_7,rolling_mean_14,rolling_mean_28,price_lag_7,price_change,cat_id_HOBBIES,cat_id_HOUSEHOLD,state_id_TX,state_id_WI
0,HOBBIES_1_001,HOBBIES_1,CA_1,d_1,0,2011-01-29,11101,Saturday,1,1,...,0.0,0.0,0.0,0.0,8.26,0.0,True,False,False,False
1,HOBBIES_1_002,HOBBIES_1,CA_1,d_1,0,2011-01-29,11101,Saturday,1,1,...,0.0,0.0,0.0,0.0,3.97,0.0,True,False,False,False
2,HOBBIES_1_003,HOBBIES_1,CA_1,d_1,0,2011-01-29,11101,Saturday,1,1,...,0.0,0.0,0.0,0.0,2.97,0.0,True,False,False,False
3,HOBBIES_1_004,HOBBIES_1,CA_1,d_1,0,2011-01-29,11101,Saturday,1,1,...,0.0,0.0,0.0,0.0,4.64,0.0,True,False,False,False
4,HOBBIES_1_005,HOBBIES_1,CA_1,d_1,0,2011-01-29,11101,Saturday,1,1,...,0.0,0.0,0.0,0.0,2.98,0.0,True,False,False,False


In [4]:
# Feature selection
features = [
    col for col in sales_long_encoded.columns
    if col.startswith("lag_") or col.startswith("rolling_mean") 
    or col in ["day_of_week", "month", "is_weekend", "is_holiday", 
               "sell_price", "price_change", "cat_id_", "state_id_"]
]

In [5]:


# Create the training and validation sets
X_train = sales_long_encoded[sales_long_encoded["d"].isin([f"d_{i}" for i in range(1, 1914)])][features]
y_train = sales_long_encoded[sales_long_encoded["d"].isin([f"d_{i}" for i in range(1, 1914)])]["sales"]

X_val = sales_long_encoded[sales_long_encoded["d"].isin([f"d_{i}" for i in range(1914, 1942)])][features]
y_val = sales_long_encoded[sales_long_encoded["d"].isin([f"d_{i}" for i in range(1914, 1942)])]["sales"]






In [6]:
# check days distribution
sales_long["d"].value_counts()

d
d_1       30490
d_1324    30490
d_1322    30490
d_1321    30490
d_1320    30490
          ...  
d_653     30490
d_652     30490
d_651     30490
d_650     30490
d_1969    30490
Name: count, Length: 1969, dtype: int64

In [7]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Train an XGBoost model using scaled data
model = XGBRegressor(
    n_estimators=500,
    learning_rate=0.03,  # Reduce learning rate
    max_depth=6,  # Reduce depth to avoid overfitting
    colsample_bytree=0.8,  # Randomly sample features for diversity
    subsample=0.8,  # Randomly sample rows
    reg_alpha=0.5,  # Increase L1 regularization to reduce noise
    reg_lambda=2.0,  # Increase L2 regularization for better generalization
    tree_method="hist",
    device="cuda",
    missing=-999
)

model.fit(X_train, y_train)

# Validate the model
y_pred = model.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
print(f"Validation MAE: {mae:.2f}")

r2 = r2_score(y_val, y_pred)
print(f"Validation R² Score: {r2:.2f}")

Validation MAE: 1.03
Validation R² Score: 0.67


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


In [8]:
# Create test dataset for forecast
X_test = sales_long[sales_long["d"].isin([f"d_{i}" for i in range(1942, 1970)])][features]

# Predict sales for next 28 days
y_pred_test = model.predict(X_test)



In [12]:
# Save results 
forecast_df = sales_long[sales_long["d"].isin([f"d_{i}" for i in range(1942, 1970)])][["item_id", "date","d",]]

forecast_df["predicted_sales"] = y_pred_test

forecast_df.to_parquet("data/m5_forecast_results.parquet", index=False)

print("Forecasting Complete! Results saved to `m5_forecast_results.parquet`")


Forecasting Complete! Results saved to `m5_forecast_results.parquet`


In [None]:
import joblib
joblib.dump(model, "model/xgb_model.pkl")

['xgb_model.pkl']

In [13]:
forecast_df

Unnamed: 0,item_id,date,d,predicted_sales
59181090,HOBBIES_1_001,2016-05-23,d_1942,1.023197
59181091,HOBBIES_1_002,2016-05-23,d_1942,0.340202
59181092,HOBBIES_1_003,2016-05-23,d_1942,0.639674
59181093,HOBBIES_1_004,2016-05-23,d_1942,1.596812
59181094,HOBBIES_1_005,2016-05-23,d_1942,1.484278
...,...,...,...,...
60034805,FOODS_3_823,2016-06-19,d_1969,0.814295
60034806,FOODS_3_824,2016-06-19,d_1969,0.591974
60034807,FOODS_3_825,2016-06-19,d_1969,1.120144
60034808,FOODS_3_826,2016-06-19,d_1969,1.808961
