In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("../data/processed/sales_clean.csv")

df.head()


Unnamed: 0,date,store,item,sales,time_idx
0,2013-01-01,1,1,13,0
1,2013-01-01,3,17,26,0
2,2013-01-01,10,7,32,0
3,2013-01-01,3,18,45,0
4,2013-01-01,3,19,34,0


In [2]:
df["date"] = pd.to_datetime(df["date"])
df["day_of_week"] = df["date"].dt.dayofweek
df["week_of_year"] = df["date"].dt.isocalendar().week.astype(int)
df["month"] = df["date"].dt.month
df["is_weekend"] = (df["day_of_week"] >= 5).astype(int)


In [3]:
def add_lag_features(group):
    group = group.sort_values("date")
    group["lag_1"] = group["sales"].shift(1)
    group["lag_7"] = group["sales"].shift(7)
    group["lag_28"] = group["sales"].shift(28)
    return group

df = df.groupby(["store", "item"], group_keys=False).apply(add_lag_features)


  df = df.groupby(["store", "item"], group_keys=False).apply(add_lag_features)


In [4]:
def add_rolling_features(group):
    group = group.sort_values("date")
    group["rolling_mean_7"] = group["sales"].shift(1).rolling(7).mean()
    group["rolling_std_7"] = group["sales"].shift(1).rolling(7).std()
    group["rolling_mean_30"] = group["sales"].shift(1).rolling(30).mean()
    group["rolling_std_30"] = group["sales"].shift(1).rolling(30).std()
    return group

df = df.groupby(["store", "item"], group_keys=False).apply(add_rolling_features)


  df = df.groupby(["store", "item"], group_keys=False).apply(add_rolling_features)


In [5]:
from pandas.tseries.holiday import USFederalHolidayCalendar

cal = USFederalHolidayCalendar()
holidays = cal.holidays(start=df["date"].min(), end=df["date"].max())

df["is_holiday"] = df["date"].isin(holidays).astype(int)


In [6]:
df = df.dropna().reset_index(drop=True)


In [7]:
max_time = df["time_idx"].max()

train_max = int(max_time * 0.7)
val_max = int(max_time * 0.85)


In [8]:
train_df = df[df["time_idx"] <= train_max].copy()
val_df   = df[(df["time_idx"] > train_max) & (df["time_idx"] <= val_max)].copy()
test_df  = df[df["time_idx"] > val_max].copy()

train_df.shape, val_df.shape, test_df.shape


((624000, 17), (137000, 17), (137000, 17))

In [9]:
train_df.to_csv("../data/processed/train.csv", index=False)
val_df.to_csv("../data/processed/val.csv", index=False)
test_df.to_csv("../data/processed/test.csv", index=False)
