In [1]:
import pandas as pd
import os

# Directory that holds the processed datasets
DATA_DIR = "../data/processed/"

# Read the cleaned sales table from disk
sales_csv_path = os.path.join(DATA_DIR, "sales_cleaned.csv")
df = pd.read_csv(sales_csv_path)

# Parse the Date strings into datetimes (later cells use the .dt accessor on this column)
df["Date"] = pd.to_datetime(df["Date"])

print("Dataset shape:", df.shape)
df.head()


Dataset shape: (421570, 16)


Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Type,Size
0,1,1,2010-02-05,24924.5,False,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,A,151315
1,1,1,2010-02-12,46039.49,True,38.51,2.548,0.0,0.0,0.0,0.0,0.0,211.24217,8.106,A,151315
2,1,1,2010-02-19,41595.55,False,39.93,2.514,0.0,0.0,0.0,0.0,0.0,211.289143,8.106,A,151315
3,1,1,2010-02-26,19403.54,False,46.63,2.561,0.0,0.0,0.0,0.0,0.0,211.319643,8.106,A,151315
4,1,1,2010-03-05,21827.9,False,46.5,2.625,0.0,0.0,0.0,0.0,0.0,211.350143,8.106,A,151315


In [2]:
# ==============================
# TIME SERIES SORTING
# ==============================

# Order rows chronologically inside every (Store, Dept) series,
# then renumber the index so it follows the new row order.
series_order = ["Store", "Dept", "Date"]
df = df.sort_values(by=series_order)
df = df.reset_index(drop=True)

df.head()


Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Type,Size
0,1,1,2010-02-05,24924.5,False,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,A,151315
1,1,1,2010-02-12,46039.49,True,38.51,2.548,0.0,0.0,0.0,0.0,0.0,211.24217,8.106,A,151315
2,1,1,2010-02-19,41595.55,False,39.93,2.514,0.0,0.0,0.0,0.0,0.0,211.289143,8.106,A,151315
3,1,1,2010-02-26,19403.54,False,46.63,2.561,0.0,0.0,0.0,0.0,0.0,211.319643,8.106,A,151315
4,1,1,2010-03-05,21827.9,False,46.5,2.625,0.0,0.0,0.0,0.0,0.0,211.350143,8.106,A,151315


In [3]:
# ==============================
# CALENDAR / DATE FEATURES
# ==============================

# Work off a single handle to the datetime accessor of the Date column
date_parts = df["Date"].dt

# Plain calendar components
df["Year"] = date_parts.year
df["Month"] = date_parts.month

# ISO-8601 week number, cast to a plain int
df["Week"] = date_parts.isocalendar().week.astype(int)

df["Day"] = date_parts.day
df["DayOfWeek"] = date_parts.weekday  # Monday = 0 ... Sunday = 6

# 0/1 flags
# NOTE(review): the sample rows displayed in this notebook are all Fridays; if every
# Date is a Friday, DayOfWeek and IsWeekend are constant columns — confirm before modelling.
df["IsWeekend"] = (df["DayOfWeek"] >= 5).astype(int)
df["IsMonthStart"] = date_parts.is_month_start.astype(int)
df["IsMonthEnd"] = date_parts.is_month_end.astype(int)



In [4]:
# ==============================
# LAG FEATURES — short-term and long-term
# ==============================

# Sort by Store, Dept, Date (if not already sorted)
df = df.sort_values(by=["Store", "Dept", "Date"]).reset_index(drop=True)

# Lag periods in weeks:
#   1-4   → the previous one to four weeks (4 ≈ 1 month ago)
#   8, 12 → ≈ 2 and 3 months ago
#   16-24 → ≈ 4, 5 and 6 months ago
# These features give the model memory across weeks, months, and quarters.
lags = [1, 2, 3, 4, 8, 12, 16, 20, 24]

# Look lagged sales up by calendar date rather than by row position.
# A positional groupby(...).shift(lag) returns the value `lag` ROWS earlier, which is
# only "`lag` weeks ago" when a (Store, Dept) series has no missing weeks. Matching on
# Date - lag weeks yields the same values for gap-free weekly series and NaN (instead
# of a silently misaligned week) wherever the exact week is absent.
# NOTE(review): assumes Date values within a series are spaced on exact 7-day steps,
# as in the rows displayed in this notebook — confirm for the full dataset.
series_keys = ["Store", "Dept", "Date"]
sales_lookup = (
    df.drop_duplicates(subset=series_keys, keep="last")  # reindex needs unique keys
    .set_index(series_keys)["Weekly_Sales"]
)

for lag in lags:
    # Key of the observation exactly `lag` weeks before each row, same Store/Dept
    lagged_keys = pd.MultiIndex.from_arrays(
        [df["Store"], df["Dept"], df["Date"] - pd.Timedelta(weeks=lag)],
        names=series_keys,
    )
    # to_numpy() assigns positionally; rows with no matching week become NaN
    df[f"Lag_{lag}"] = sales_lookup.reindex(lagged_keys).to_numpy()



In [5]:
# ==============================
# ROLLING / MOVING WINDOW FEATURES
# ==============================

# Window lengths: 4 (short-term, 1 month ≈ 4 weeks) and 12 (medium-term, 3 months ≈ 12 weeks).
# Windows count rows within a (Store, Dept) series, not calendar weeks.
for window in (4, 12):
    per_series_sales = df.groupby(["Store", "Dept"])["Weekly_Sales"]

    # shift(1) keeps the current row out of its own window, so each feature is
    # built only from earlier rows of the same series.
    df[f"RollingMean_{window}"] = per_series_sales.transform(
        lambda sales, w=window: sales.shift(1).rolling(w).mean()
    )
    df[f"RollingStd_{window}"] = per_series_sales.transform(
        lambda sales, w=window: sales.shift(1).rolling(w).std()
    )


In [6]:
# Display the feature-engineered frame. The first rows of each (Store, Dept) series
# show NaN in the lag/rolling columns because shift/rolling have no earlier rows to use.
df

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,...,Lag_4,Lag_8,Lag_12,Lag_16,Lag_20,Lag_24,RollingMean_4,RollingStd_4,RollingMean_12,RollingStd_12
0,1,1,2010-02-05,24924.50,False,42.31,2.572,0.00,0.00,0.00,...,,,,,,,,,,
1,1,1,2010-02-12,46039.49,True,38.51,2.548,0.00,0.00,0.00,...,,,,,,,,,,
2,1,1,2010-02-19,41595.55,False,39.93,2.514,0.00,0.00,0.00,...,,,,,,,,,,
3,1,1,2010-02-26,19403.54,False,46.63,2.561,0.00,0.00,0.00,...,,,,,,,,,,
4,1,1,2010-03-05,21827.90,False,46.50,2.625,0.00,0.00,0.00,...,24924.50,,,,,,32990.7700,12832.106391,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421565,45,98,2012-09-28,508.37,False,64.88,3.997,4556.61,20.64,1.50,...,346.04,516.46,659.65,713.50,893.60,559.14,442.9350,122.132155,565.753333,157.464751
421566,45,98,2012-10-05,628.10,False,64.89,3.985,5046.74,0.00,18.82,...,352.44,727.49,695.21,856.35,745.44,605.80,483.5175,104.966869,553.146667,155.304907
421567,45,98,2012-10-12,1061.02,False,54.47,4.000,1956.28,0.00,7.89,...,605.96,500.16,845.30,622.62,795.94,619.41,552.4325,76.984436,547.554167,150.869134
421568,45,98,2012-10-19,760.01,False,56.47,3.969,2004.02,0.00,3.18,...,467.30,415.40,657.63,690.52,874.64,694.25,666.1975,271.910501,565.530833,195.748704


In [7]:
import os

# Destination folder for the engineered feature table
PROCESSED_DIR = "../data/processed/"

# Create the folder if it is missing; do nothing when it already exists
os.makedirs(PROCESSED_DIR, exist_ok=True)

# Write the frame to CSV without the row index
features_csv_path = os.path.join(PROCESSED_DIR, "sales_features.csv")
df.to_csv(features_csv_path, index=False)

print("Feature-engineered dataset saved successfully!")


Feature-engineered dataset saved successfully!
