<a href="https://colab.research.google.com/github/Acheon-stst/Acheon-stst/blob/main/DNSC3288.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import sqrt
from pathlib import Path
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

from itertools import product




# Load Data


In [8]:
# Main training set: one row per daily sale record
sales = pd.read_csv("sales_train.csv")

# Kaggle test set
test = pd.read_csv("test.csv")

# Metadata lookup tables, read in the order items → shops → categories
items, shops, cats = (
    pd.read_csv(csv_name)
    for csv_name in ("items.csv", "shops.csv", "item_categories.csv")
)



# Cleaning & Preprocessing the raw sales data

In [9]:
# Convert daily data → monthly data.
# Aggregate per (month, shop, item): total units sold and mean price.
# The raw `sales` frame is left untouched so this cell can be re-run safely
# and later cells still see the original daily values.
monthly = (
    sales.groupby(["date_block_num", "shop_id", "item_id"], as_index=False)
         .agg(item_cnt_month=("item_cnt_day", "sum"),
              item_price=("item_price", "mean"))
)

# Kaggle clips the *monthly* target to [0, 20], so the clip is applied after
# aggregation. Clipping the daily rows first (as before) lets monthly sums
# exceed 20 and turns daily returns (negative counts) into 0 instead of
# letting them offset that month's sales.
monthly["item_cnt_month"] = monthly["item_cnt_month"].clip(0, 20)

# Add item_category_id
monthly = monthly.merge(items[["item_id", "item_category_id"]],
                        on="item_id", how="left")





# Creating a full shop-item-month matrix


In [10]:
# Build the complete (month, shop, item) grid.
# Missing combinations must be filled → assume 0 sales
# List unique shops, items, and months
shops_ids = monthly["shop_id"].unique()
items_ids = monthly["item_id"].unique()
months = np.arange(0, 34)   # months 0 → 33 (Jan 2013 → Oct 2015)

# MultiIndex.from_product yields the cartesian product in the same order as
# itertools.product (month varies slowest, item fastest) without first
# materialising every row as a Python tuple in a list.
full_matrix = pd.MultiIndex.from_product(
    [months, shops_ids, items_ids],
    names=["date_block_num", "shop_id", "item_id"],
).to_frame(index=False)

# Merge real monthly data into the matrix.
# item_category_id is left out of this merge: it is attached from `items`
# for every row below, and carrying it in from `monthly` as well would make
# that second merge rename the columns to item_category_id_x / _y, so later
# group-bys on "item_category_id" would fail with a KeyError.
full_matrix = full_matrix.merge(
    monthly.drop(columns=["item_category_id"], errors="ignore"),
    on=["date_block_num", "shop_id", "item_id"],
    how="left",
)

# Replace missing sales (no observations) with 0
full_matrix["item_cnt_month"] = full_matrix["item_cnt_month"].fillna(0)

# Add category ID for each row (including rows with no recorded sales)
full_matrix = full_matrix.merge(items[["item_id", "item_category_id"]],
                                on="item_id", how="left")


In [11]:
def create_lag(df, col, lags, time_col="date_block_num"):
    """
    Add lagged copies of a column for each (shop_id, item_id) series.

    For every value ``k`` in ``lags`` a new column ``f"{col}_lag_{k}"`` is
    written into ``df``; it holds the value of ``col`` found ``k`` rows
    earlier within the same (shop_id, item_id) group.
    Example: lag_1 means last month's value for the same (shop_id, item_id).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``shop_id``, ``item_id`` and ``col``. Modified in place.
    col : str
        Name of the column to lag.
    lags : iterable of int
        Row offsets to shift by.
    time_col : str, default "date_block_num"
        Column defining chronological order. If it is present and the rows
        are not already sorted by it, the shift is computed in time order
        and written back in the original row order. If it is absent, the
        existing row order is used as-is.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the lag columns added.

    Notes
    -----
    The shift is row-based, so a lag of ``k`` equals "``k`` months ago" only
    when every group has one row per consecutive month.
    """
    group_keys = ["shop_id", "item_id"]

    if time_col in df.columns and not df[time_col].is_monotonic_increasing:
        # Stable sort keeps the relative order of rows within the same month.
        order = df[time_col].to_numpy().argsort(kind="stable")
        # argsort of a permutation is its inverse: maps sorted → original rows.
        restore = order.argsort()
        source = df[group_keys + [col]].iloc[order]
    else:
        restore = None
        source = df

    # Build the grouping once and reuse it for every lag.
    grouped = source.groupby(group_keys)[col]

    for lag in lags:
        shifted = grouped.shift(lag)
        if restore is not None:
            # Put values back in df's row order; reusing df.index makes the
            # assignment positional even when the index has duplicates.
            shifted = shifted.iloc[restore].set_axis(df.index)
        df[f"{col}_lag_{lag}"] = shifted
    return df



In [12]:
# Lagged target: sales from 1, 2, 3, 6 and 12 months earlier
full_matrix = create_lag(
    full_matrix,
    col="item_cnt_month",
    lags=[1, 2, 3, 6, 12],
)





In [1]:
# Mean item_cnt_month per (month, shop), attached to every row of that pair
shop_mean = (
    full_matrix
    .groupby(["date_block_num", "shop_id"], as_index=False)["item_cnt_month"]
    .mean()
    .rename(columns={"item_cnt_month": "shop_avg_sales"})
)
full_matrix = pd.merge(
    full_matrix, shop_mean,
    how="left", on=["date_block_num", "shop_id"],
)

# One-step lag of the shop-level average
full_matrix = create_lag(full_matrix, col="shop_avg_sales", lags=[1])




NameError: name 'full_matrix' is not defined

In [None]:
# Mean item_cnt_month per (month, item), i.e. averaged over all shops
item_mean = (
    full_matrix
    .groupby(["date_block_num", "item_id"], as_index=False)["item_cnt_month"]
    .mean()
    .rename(columns={"item_cnt_month": "item_avg_sales"})
)
full_matrix = pd.merge(
    full_matrix, item_mean,
    how="left", on=["date_block_num", "item_id"],
)

# One-step lag of the item-level average
full_matrix = create_lag(full_matrix, col="item_avg_sales", lags=[1])



In [None]:
# Mean item_cnt_month per (month, item category)
cat_mean = (
    full_matrix
    .groupby(["date_block_num", "item_category_id"], as_index=False)["item_cnt_month"]
    .mean()
    .rename(columns={"item_cnt_month": "category_avg_sales"})
)
full_matrix = pd.merge(
    full_matrix, cat_mean,
    how="left", on=["date_block_num", "item_category_id"],
)

# One-step lag of the category-level average
full_matrix = create_lag(full_matrix, col="category_avg_sales", lags=[1])


In [None]:
# Mean price of each item over every row of the daily sales table
item_price_mean = (
    sales
    .groupby("item_id", as_index=False)["item_price"]
    .mean()
    .rename(columns={"item_price": "item_price_avg"})
)
full_matrix = pd.merge(full_matrix, item_price_mean, how="left", on="item_id")

# Price deviation measure: ratio of the row's price to the item's overall
# mean price (NaN wherever item_price is missing)
full_matrix["price_change_from_avg"] = (
    full_matrix["item_price"].div(full_matrix["item_price_avg"])
)

# One-step lags of the price and of its deviation ratio
full_matrix = create_lag(full_matrix, col="item_price", lags=[1])
full_matrix = create_lag(full_matrix, col="price_change_from_avg", lags=[1])