In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import HistGradientBoostingRegressor

import joblib, json, os


In [4]:
train = pd.read_csv("/kaggle/input/walmart-store-sales-forecasting/walmart-recruiting-store-sales-forecasting/train.csv/train.csv")
features = pd.read_csv("/kaggle/input/walmart-store-sales-forecasting/walmart-recruiting-store-sales-forecasting/features.csv/features.csv")
stores = pd.read_csv("/kaggle/input/walmart-store-sales-forecasting/walmart-recruiting-store-sales-forecasting/stores.csv")

print(train.shape, features.shape, stores.shape)
train.head()

(421570, 5) (8190, 12) (45, 3)


Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-02-05,24924.5,False
1,1,1,2010-02-12,46039.49,True
2,1,1,2010-02-19,41595.55,False
3,1,1,2010-02-26,19403.54,False
4,1,1,2010-03-05,21827.9,False


In [5]:
print(train.columns)
print(features.columns)
print(stores.columns)

print(train.isna().sum().head(10))

Index(['Store', 'Dept', 'Date', 'Weekly_Sales', 'IsHoliday'], dtype='object')
Index(['Store', 'Date', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2',
       'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment',
       'IsHoliday'],
      dtype='object')
Index(['Store', 'Type', 'Size'], dtype='object')
Store           0
Dept            0
Date            0
Weekly_Sales    0
IsHoliday       0
dtype: int64


In [6]:
train["Date"] = pd.to_datetime(train["Date"])
features["Date"] = pd.to_datetime(features["Date"])

train.dtypes


Store                    int64
Dept                     int64
Date            datetime64[ns]
Weekly_Sales           float64
IsHoliday                 bool
dtype: object

In [7]:
train.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-02-05,24924.5,False
1,1,1,2010-02-12,46039.49,True
2,1,1,2010-02-19,41595.55,False
3,1,1,2010-02-26,19403.54,False
4,1,1,2010-03-05,21827.9,False


In [8]:
df = (
    train
    .merge(features, on=["Store", "Date", "IsHoliday"], how="left")
    .merge(stores, on="Store", how="left")
)

print(df.shape)
df.head()


(421570, 16)


Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Type,Size
0,1,1,2010-02-05,24924.5,False,42.31,2.572,,,,,,211.096358,8.106,A,151315
1,1,1,2010-02-12,46039.49,True,38.51,2.548,,,,,,211.24217,8.106,A,151315
2,1,1,2010-02-19,41595.55,False,39.93,2.514,,,,,,211.289143,8.106,A,151315
3,1,1,2010-02-26,19403.54,False,46.63,2.561,,,,,,211.319643,8.106,A,151315
4,1,1,2010-03-05,21827.9,False,46.5,2.625,,,,,,211.350143,8.106,A,151315


In [9]:
markdown_cols = ["MarkDown1","MarkDown2","MarkDown3","MarkDown4","MarkDown5"]

df[markdown_cols] = df[markdown_cols].fillna(0)


In [11]:
df = df.sort_values(["Store","Dept","Date"]).reset_index(drop=True)


In [13]:
df["y_next_week"] = df.groupby(["Store","Dept"])["Weekly_Sales"].shift(-1)

df[["Store","Dept","Date","Weekly_Sales","y_next_week"]].head(10)


Unnamed: 0,Store,Dept,Date,Weekly_Sales,y_next_week
0,1,1,2010-02-05,24924.5,46039.49
1,1,1,2010-02-12,46039.49,41595.55
2,1,1,2010-02-19,41595.55,19403.54
3,1,1,2010-02-26,19403.54,21827.9
4,1,1,2010-03-05,21827.9,21043.39
5,1,1,2010-03-12,21043.39,22136.64
6,1,1,2010-03-19,22136.64,26229.21
7,1,1,2010-03-26,26229.21,57258.43
8,1,1,2010-04-02,57258.43,42960.91
9,1,1,2010-04-09,42960.91,17596.96


In [14]:
df["week"] = df["Date"].dt.isocalendar().week.astype(int)
df["month"] = df["Date"].dt.month
df["year"] = df["Date"].dt.year
df["is_weekend"] = (df["Date"].dt.dayofweek >= 5).astype(int)


In [16]:
g = df.groupby(["Store","Dept"])

for lag in [1, 2, 4, 8]:
    df[f"lag_{lag}"] = g["Weekly_Sales"].shift(lag)

for win in [4, 8]:
    df[f"roll_mean_{win}"] = g["Weekly_Sales"].shift(1).rolling(win).mean()


In [17]:
df["Type"] = df["Type"].astype("category").cat.codes


In [18]:
df = df.dropna().reset_index(drop=True)

print(df.shape)


(392424, 27)


In [19]:
df.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,...,week,month,year,is_weekend,lag_1,lag_2,lag_4,lag_8,roll_mean_4,roll_mean_8
0,1,1,2010-04-02,57258.43,False,62.27,2.719,0.0,0.0,0.0,...,13,4,2010,0,26229.21,22136.64,21827.9,24924.5,22809.285,27900.0275
1,1,1,2010-04-09,42960.91,False,65.86,2.77,0.0,0.0,0.0,...,14,4,2010,0,57258.43,26229.21,21043.39,46039.49,31666.9175,31941.76875
2,1,1,2010-04-16,17596.96,False,66.32,2.808,0.0,0.0,0.0,...,15,4,2010,0,42960.91,57258.43,22136.64,41595.55,37146.2975,31556.94625
3,1,1,2010-04-23,16145.35,False,64.84,2.795,0.0,0.0,0.0,...,16,4,2010,0,17596.96,42960.91,26229.21,19403.54,36011.3775,28557.1225
4,1,1,2010-04-30,16555.11,False,67.41,2.78,0.0,0.0,0.0,...,17,4,2010,0,16145.35,17596.96,57258.43,21827.9,33490.4125,28149.84875


In [20]:
blocked = ["Date", "Weekly_Sales", "y_next_week"]
feature_cols = [c for c in df.columns if c not in blocked]

cutoff = df["Date"].quantile(0.8)

train_df = df[df["Date"] <= cutoff]
val_df   = df[df["Date"] > cutoff]

X_train, y_train = train_df[feature_cols], train_df["y_next_week"]
X_val, y_val     = val_df[feature_cols], val_df["y_next_week"]

print("Train:", X_train.shape)
print("Valid:", X_val.shape)


Train: (315818, 24)
Valid: (76606, 24)


In [21]:
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error

model = HistGradientBoostingRegressor(
    learning_rate=0.05,
    max_depth=10,
    max_iter=800
)

model.fit(X_train, y_train)

pred = model.predict(X_val)
mae = mean_absolute_error(y_val, pred)

print("Validation MAE:", mae)


Validation MAE: 1518.5795980284295


In [22]:
import joblib, json, os

os.makedirs("/kaggle/working/artifacts", exist_ok=True)

joblib.dump(model, "/kaggle/working/artifacts/model.joblib")

json.dump(
    {"features": feature_cols, "mae": float(mae)},
    open("/kaggle/working/artifacts/metadata.json","w"),
    indent=2
)


In [24]:
# Pick a random validation row
i = np.random.randint(0, len(X_val))
x_one = X_val.iloc[[i]]
y_true = y_val.iloc[i]
y_pred = model.predict(x_one)[0]

print("True next-week sales:", y_true)
print("Predicted next-week sales:", y_pred)
print("Abs error:", abs(y_true - y_pred))


True next-week sales: 39493.07
Predicted next-week sales: 39502.68947555051
Abs error: 9.619475550513016


In [26]:
# after you created df (merged train+features+stores) and filled markdown NaNs
df.to_parquet("/kaggle/working/history.parquet", index=False)