<a href="https://colab.research.google.com/github/Acheon-stst/Acheon-stst/blob/main/DNSC3288.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import sqrt
from pathlib import Path
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

from itertools import product




# Load Data


In [11]:
# Load the main training set (daily sales)
sales = pd.read_csv("sales_train.csv")

# Kaggle test set
test  = pd.read_csv("test.csv")

# Metadata files
items = pd.read_csv("items.csv")
shops = pd.read_csv("shops.csv")
cats  = pd.read_csv("item_categories.csv")



 # Cleaning & Preprocessing the raw sales data

In [12]:
# Kaggle metric clips predictions to [0,20]
sales["item_cnt_day"] = sales["item_cnt_day"].clip(0, 20)

# Convert daily data → monthly data
# Aggregate total sales per (month, shop, item)
monthly = (
    sales.groupby(["date_block_num", "shop_id", "item_id"], as_index=False)
         .agg({"item_cnt_day": "sum",
               "item_price": "mean"})
)


# Renaming item_cnt_day to item_cnt_month
monthly.rename(columns={"item_cnt_day": "item_cnt_month"}, inplace=True)

# Add item_category_id
monthly = monthly.merge(items[["item_id", "item_category_id"]],
                        on="item_id", how="left")





In [58]:
# Item mean sales across all history
item_mean_all = full_matrix.groupby("item_id")["item_cnt_month"].mean().reset_index()
item_mean_all.rename(columns={"item_cnt_month": "item_mean_all"}, inplace=True)
full_matrix = full_matrix.merge(item_mean_all, on="item_id", how="left")

# Shop mean sales
shop_mean_all = full_matrix.groupby("shop_id")["item_cnt_month"].mean().reset_index()
shop_mean_all.rename(columns={"item_cnt_month": "shop_mean_all"}, inplace=True)
full_matrix = full_matrix.merge(shop_mean_all, on="shop_id", how="left")

# Category mean
cat_mean_all = full_matrix.groupby("item_category_id")["item_cnt_month"].mean().reset_index()
cat_mean_all.rename(columns={"item_cnt_month": "cat_mean_all"}, inplace=True)
full_matrix = full_matrix.merge(cat_mean_all, on="item_category_id", how="left")

# Creating a full shop-item-month matrix


In [59]:
# Missing combinations must be filled → assume 0 sales
# List unique shops, items, and months
train_matrix = monthly[["date_block_num", "shop_id", "item_id"]].copy()

# Add test rows (month 34)
test["date_block_num"] = 34
test_matrix = test[["date_block_num", "shop_id", "item_id"]].copy()

full_matrix = pd.concat([train_matrix, test_matrix], ignore_index=True).drop_duplicates()

# Merge true monthly sales
full_matrix = full_matrix.merge(
    monthly,
    on=["date_block_num", "shop_id", "item_id"],
    how="left"
)

# Missing sales → 0
full_matrix["item_cnt_month"] = full_matrix["item_cnt_month"].fillna(0)

# Add category
full_matrix = full_matrix.merge(items[["item_id", "item_category_id"]],
                                on="item_id", how="left")


print("Matrix ready:", full_matrix.shape)


Matrix ready: (1823324, 7)


In [25]:
full_matrix.columns

if "item_category_id_x" in full_matrix.columns:
    full_matrix["item_category_id"] = full_matrix["item_category_id_x"]
    full_matrix.drop(["item_category_id_x", "item_category_id_y"], axis=1, inplace=True)


full_matrix.columns


Index(['date_block_num', 'shop_id', 'item_id', 'item_cnt_month', 'item_price',
       'item_category_id'],
      dtype='object')

In [26]:
def create_lag(df, col, lags):
    """Create lag features for grouped shop-item sequences."""
    for lag in lags:
        df[f"{col}_lag_{lag}"] = df.groupby(["shop_id", "item_id"])[col].shift(lag)
    return df





In [27]:
# Lags of sales
full_matrix = create_lag(full_matrix, "item_cnt_month", [1, 2, 3, 6, 12])

# Shop avg sales
shop_avg = full_matrix.groupby(["date_block_num", "shop_id"])["item_cnt_month"].mean().reset_index()
shop_avg.rename(columns={"item_cnt_month": "shop_avg_sales"}, inplace=True)
full_matrix = full_matrix.merge(shop_avg, on=["date_block_num", "shop_id"], how="left")
full_matrix = create_lag(full_matrix, "shop_avg_sales", [1])






In [28]:
# Item avg sales
item_avg = full_matrix.groupby(["date_block_num", "item_id"])["item_cnt_month"].mean().reset_index()
item_avg.rename(columns={"item_cnt_month": "item_avg_sales"}, inplace=True)
full_matrix = full_matrix.merge(item_avg, on=["date_block_num", "item_id"], how="left")
full_matrix = create_lag(full_matrix, "item_avg_sales", [1])

In [29]:
# Category avg sales
cat_avg = full_matrix.groupby(["date_block_num", "item_category_id"])["item_cnt_month"].mean().reset_index()
cat_avg.rename(columns={"item_cnt_month": "category_avg_sales"}, inplace=True)
full_matrix = full_matrix.merge(cat_avg, on=["date_block_num", "item_category_id"], how="left")
full_matrix = create_lag(full_matrix, "category_avg_sales", [1])

# TRAIN–VALIDATION SPLIT

In [30]:
# Use month 33 (Oct 2015) as validation

train = full_matrix[full_matrix["date_block_num"] < 33]
val   = full_matrix[full_matrix["date_block_num"] == 33]

FEATURES = [
    c for c in full_matrix.columns
    if c not in ["item_cnt_month", "item_price_avg"]
]

X_train, y_train = train[FEATURES], train["item_cnt_month"]
X_val, y_val     = val[FEATURES],   val["item_cnt_month"]

print("Training rows:", X_train.shape, "Validation rows:", X_val.shape)


Training rows: (1577593, 16) Validation rows: (31531, 16)


# TRAIN XGBOOST MODEL


In [31]:
model = XGBRegressor(
    n_estimators=400,
    max_depth=10,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42,
    tree_method="hist"
)

print("Training model...")
model.fit(X_train, y_train)




Training model...


# VALIDATION EVALUATION

In [33]:
val_pred = np.clip(model.predict(X_val), 0, 20)
rmse = sqrt(mean_squared_error(y_val, val_pred))
print("Validation RMSE:", rmse)

Validation RMSE: 5.085467564895916


# Preparing test dataset

In [52]:
for col in ["item_category_id", "item_category_id_x", "item_category_id_y"]:
    if col in test.columns:
        test.drop(col, axis=1, inplace=True)

# Rebuild clean category column
test = test.merge(
    items[["item_id", "item_category_id"]],
    on="item_id",
    how="left"
)


# Restore date_block_num
test["date_block_num"] = 34

# Pull last-month features
last_month_features = full_matrix[full_matrix["date_block_num"] == 33][FEATURES].drop_duplicates()

# Merge engineered features into test
test_ff = test.merge(
    last_month_features,
    on=["shop_id", "item_id", "item_category_id"],
    how="left"
)

# Restore date_block_num again (merge can drop it)
test_ff["date_block_num"] = 34

test_ff.fillna(0, inplace=True)

X_test = test_ff[FEATURES]


test_pred = np.clip(model.predict(X_test), 0, 20)


# Submission

In [54]:
submission = pd.DataFrame({
    "ID": test["ID"],
    "item_cnt_month": test_pred
})

submission.to_csv("submission.csv", index=False)

submission

Unnamed: 0,ID,item_cnt_month
0,0,0.714192
1,1,0.841948
2,2,1.362786
3,3,0.713629
4,4,0.714192
...,...,...
214195,214195,1.032882
214196,214196,0.647461
214197,214197,0.666214
214198,214198,0.623798


In [57]:
submission.to_csv("Group8_submission.csv", index =False)