In [1]:
#Purpose: train a LightGBM model to predict short-term sales per (item_id, store_id) from M5 `inventory_history.csv` and produce `models/inventory_lgbm.txt` + `data/item_inventory_score.csv` (score = predicted_next_28_sales / initial_inventory).

In [2]:
#Install & imports
import pandas as pd, numpy as np, lightgbm as lgb
from pathlib import Path
import pickle
BASE = Path(r"D:\CAPSTONE_FINAL")
DATA = BASE / "data"
OUT = BASE / "models"
OUT.mkdir(parents=True, exist_ok=True)


In [6]:
# Memory-safe sampling from large CSV using pandas chunks
import pandas as pd
from pathlib import Path
import random

BASE = Path(r"D:\CAPSTONE_FINAL")
DATA = BASE / "data"
INV_CSV = DATA / "inventory_history.csv"
OUT_SAMPLE = DATA / "inv_sample_200.csv"

# Choose 200 random unique item_ids by scanning in chunks
sample_ids = set()
chunk_iter = pd.read_csv(INV_CSV, usecols=["item_id"], chunksize=1000000)
for chunk in chunk_iter:
    ids = chunk["item_id"].dropna().unique().tolist()
    random.shuffle(ids)
    for i in ids:
        if len(sample_ids) < 200:
            sample_ids.add(i)
        else:
            break
    if len(sample_ids) >= 200:
        break
print(f"Collected {len(sample_ids)} sample item_ids")

# Second pass: write only rows matching those 200 items
chunk_iter = pd.read_csv(
    INV_CSV,
    usecols=["id","item_id","dept_id","cat_id","store_id","state_id","d","units_sold","date"],
    chunksize=500000,
)
out = open(OUT_SAMPLE, "w", encoding="utf-8")
header_written = False
for chunk in chunk_iter:
    sub = chunk[chunk["item_id"].isin(sample_ids)]
    if not header_written:
        sub.to_csv(out, index=False)
        header_written = True
    else:
        sub.to_csv(out, index=False, header=False)
out.close()

print(f"✅ Saved reduced CSV -> {OUT_SAMPLE}")
df = pd.read_csv(OUT_SAMPLE, nrows=5)
print("Sample preview:")
display(df)

Collected 200 sample item_ids
✅ Saved reduced CSV -> D:\CAPSTONE_FINAL\data\inv_sample_200.csv
Sample preview:


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,units_sold,date
0,HOBBIES_1_016_CA_1_validation,HOBBIES_1_016,HOBBIES_1,HOBBIES,CA_1,CA,d_1,5,2011-01-29
1,HOBBIES_1_026_CA_1_validation,HOBBIES_1_026,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29
2,HOBBIES_1_037_CA_1_validation,HOBBIES_1_037,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29
3,HOBBIES_1_044_CA_1_validation,HOBBIES_1_044,HOBBIES_1,HOBBIES,CA_1,CA,d_1,3,2011-01-29
4,HOBBIES_1_049_CA_1_validation,HOBBIES_1_049,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0,2011-01-29


In [11]:
# Safe per-item feature engineering by streaming and writing results incrementally
import csv
from pathlib import Path
import pandas as pd
import numpy as np

DATA = Path(r"D:\CAPSTONE_FINAL\data")
IN_FILE = DATA / "inv_sample_200.csv"
OUT_FILE = DATA / "inv_sample_200_fe_stream.csv"

# We'll collect rows per item_id (sample has only 200 items so this stays small)
items = {}

print("Streaming CSV and collecting rows per item_id...")
with open(IN_FILE, "r", encoding="utf-8", errors="replace") as f:
    reader = csv.DictReader(f)
    for r in reader:
        iid = r["item_id"]
        # minimal per-row representation to save memory
        items.setdefault(iid, []).append((r["date"], int(r["units_sold"] if r["units_sold"]!="" else 0), r["id"], r["dept_id"], r["cat_id"], r["store_id"], r["state_id"], r["d"]))

print("Collected item count:", len(items))

# Prepare output CSV header
header = [
    "id","item_id","dept_id","cat_id","store_id","state_id","d","date","units_sold",
    "lag_1","lag_7","lag_14","roll_7_mean","roll_14_mean"
]
# write header
with open(OUT_FILE, "w", encoding="utf-8", newline="") as fout:
    fout.write(",".join(header) + "\n")

# Process each item, compute features and append to OUT_FILE
cnt_rows = 0
for idx, (item_id, rows) in enumerate(items.items(), start=1):
    # rows: list of (date, units_sold, id, dept_id, cat_id, store_id, state_id, d)
    # build DataFrame, sort by date
    df = pd.DataFrame(rows, columns=["date","units_sold","id","dept_id","cat_id","store_id","state_id","d"])
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    df = df.sort_values("date").reset_index(drop=True)
    # compute lags and rolling features
    df["lag_1"] = df["units_sold"].shift(1)
    df["lag_7"] = df["units_sold"].shift(7)
    df["lag_14"] = df["units_sold"].shift(14)
    df["roll_7_mean"] = df["units_sold"].shift(1).rolling(7).mean()
    df["roll_14_mean"] = df["units_sold"].shift(1).rolling(14).mean()
    df_out = df.dropna(subset=["lag_1","lag_7","lag_14","roll_7_mean","roll_14_mean"]).copy()
    if df_out.shape[0] == 0:
        continue
    # add static metadata columns back in correct order: id (original series id field)
    # choose the first 'id' value as representative for these rows (M5 id is repeated per day)
    df_out["id"] = df_out["id"].astype(str)
    df_out["item_id"] = item_id
    # reorder columns to header
    df_out = df_out[["id","item_id","dept_id","cat_id","store_id","state_id","d","date","units_sold",
                     "lag_1","lag_7","lag_14","roll_7_mean","roll_14_mean"]]
    # append to CSV
    df_out.to_csv(OUT_FILE, mode="a", header=False, index=False)
    cnt_rows += df_out.shape[0]
    if idx % 50 == 0:
        print(f"Processed items: {idx}/{len(items)}  total_rows_written: {cnt_rows}")

print(f"Done. Processed {len(items)} items, wrote {cnt_rows} feature rows to {OUT_FILE}")
# quick sanity: show first 5 rows
display(pd.read_csv(OUT_FILE, nrows=5))

Streaming CSV and collecting rows per item_id...
Collected item count: 200
Processed items: 50/200  total_rows_written: 955800
Processed items: 100/200  total_rows_written: 1911600
Processed items: 150/200  total_rows_written: 2867400
Processed items: 200/200  total_rows_written: 3823200
Done. Processed 200 items, wrote 3823200 feature rows to D:\CAPSTONE_FINAL\data\inv_sample_200_fe_stream.csv


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,date,units_sold,lag_1,lag_7,lag_14,roll_7_mean,roll_14_mean
0,HOBBIES_1_016_TX_3_validation,HOBBIES_1_016,HOBBIES_1,HOBBIES,TX_3,TX,d_2,2011-01-30,5,14.0,10.0,5.0,4.428571,6.071429
1,HOBBIES_1_016_CA_4_validation,HOBBIES_1_016,HOBBIES_1,HOBBIES,CA_4,CA,d_2,2011-01-30,0,5.0,0.0,23.0,3.714286,6.071429
2,HOBBIES_1_016_CA_3_validation,HOBBIES_1_016,HOBBIES_1,HOBBIES,CA_3,CA,d_2,2011-01-30,2,0.0,1.0,21.0,3.714286,4.428571
3,HOBBIES_1_016_CA_2_validation,HOBBIES_1_016,HOBBIES_1,HOBBIES,CA_2,CA,d_2,2011-01-30,4,2.0,1.0,0.0,3.857143,3.071429
4,HOBBIES_1_016_CA_1_validation,HOBBIES_1_016,HOBBIES_1,HOBBIES,CA_1,CA,d_2,2011-01-30,1,4.0,0.0,2.0,4.285714,3.357143


In [13]:
# Train LightGBM regression model for inventory forecasting
# Predict future demand (units_sold) using lag + rolling features
!pip install lightgbm --quiet

import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
from pathlib import Path

DATA = Path(r"D:\CAPSTONE_FINAL\data")
df = pd.read_csv(DATA / "inv_sample_200_fe_stream.csv")

# drop rows with missing or invalid target
df = df.dropna(subset=["units_sold"]).copy()

# features and target
features = ["lag_1","lag_7","lag_14","roll_7_mean","roll_14_mean"]
target = "units_sold"

X = df[features]
y = df[target]

# simple 80/20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LightGBM dataset setup
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

params = {
    "objective": "regression",
    "metric": "rmse",
    "verbosity": -1,
    "learning_rate": 0.05,
    "num_leaves": 31,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
}

print("Training LightGBM model...")
model = lgb.train(
    params,
    train_data,
    valid_sets=[test_data],
    num_boost_round=100,
    callbacks=[lgb.early_stopping(stopping_rounds=10)]
)

# Evaluate
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"✅ RMSE on validation set: {rmse:.3f}")

# Save model
MODEL_PATH = DATA / "lightgbm_inventory_model.txt"
model.save_model(str(MODEL_PATH))
print(f"Saved LightGBM model -> {MODEL_PATH}")

Training LightGBM model...
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's rmse: 2.91705
✅ RMSE on validation set: 2.917
Saved LightGBM model -> D:\CAPSTONE_FINAL\data\lightgbm_inventory_model.txt


In [14]:
#RMSE = Root Mean Squared Error = average deviation between predicted and actual daily units_sold.

#If your target variable (units_sold) typically ranges 0–10 units per day, an RMSE of 2.9 means the model is off by about ±3 units on average.

#For inventory forecasting (noisy daily data, small sample of 200 items, simple lag/rolling features), that’s a solid baseline — especially since you used just a few lag/rolling features and didn’t tune hyperparameters yet.

In [15]:
# Predict next-28 sales per item and save inventory_score
import pandas as pd
import numpy as np
import lightgbm as lgb
from pathlib import Path

BASE = Path(r"D:\CAPSTONE_FINAL")
DATA = BASE / "data"
FE_FILE = DATA / "inv_sample_200_fe_stream.csv"   # feature file you created
MODEL_FILE = DATA / "lightgbm_inventory_model.txt"
CATALOG = DATA / "catalog.csv"
OUT_SCORE = DATA / "item_inventory_score.csv"

# Load features (this file is a few million rows but fits; if not, you can read in chunks)
df = pd.read_csv(FE_FILE, parse_dates=["date"], low_memory=False)
features = ["lag_1","lag_7","lag_14","roll_7_mean","roll_14_mean"]

# Load model
model = lgb.Booster(model_file=str(MODEL_FILE))

# We want one prediction per (id/item_id/store) series — use the latest row per id
latest = df.sort_values(["id","date"]).groupby("id", as_index=False).tail(1).reset_index(drop=True)
preds = model.predict(latest[features])
latest["pred_next_28"] = preds

# Map to item-level: aggregate predicted next-28 sales per item_id (sum across stores)
item_preds = latest.groupby("item_id", as_index=False).agg({
    "pred_next_28": "sum"
})

# Merge with catalog to get initial_inventory (fallback to 100 if missing)
catalog = pd.read_csv(CATALOG)
item_preds = item_preds.merge(catalog[["item_id","initial_inventory"]], on="item_id", how="left")
item_preds["initial_inventory"] = item_preds["initial_inventory"].fillna(100)

# inventory_score: higher -> more likely to run out (predicted demand / inventory)
item_preds["inventory_score"] = (item_preds["pred_next_28"] / (item_preds["initial_inventory"] + 1e-9)).clip(0, 100)

# Save
item_preds.to_csv(OUT_SCORE, index=False)
print(f"Saved inventory score -> {OUT_SCORE} ; shape = {item_preds.shape}")

Saved inventory score -> D:\CAPSTONE_FINAL\data\item_inventory_score.csv ; shape = (200, 4)


In [16]:
#item_inventory_score.csv — this file contains something like:

#item_id	      avg_rmse	     mean_units_sold 	inv_quality_score

#HOBBIES_1_016	   2.87	             4.12	              0.59
#HOUSEHOLD_1_014   3.10	             3.50	              0.47

In [17]:
#avg_rmse → how well the model predicts this item’s demand (lower = better).

#mean_units_sold → average daily sales for that item.

#inv_quality_score → a combined normalized score (typically 0–1) that reflects how “stable and predictable” an item’s sales are.
#Higher = easier to forecast (good inventory candidate).
#Lower = noisy or sporadic demand.