In [1]:
#Purpose: Build `events.csv` and `catalog.csv` used by the Two-Tower retriever model.

In [2]:
#imports and paths
from pathlib import Path
import pandas as pd
import numpy as np

BASE = Path(r"D:\CAPSTONE_FINAL")
DATA_DIR = BASE / "data"
REVIEWS = DATA_DIR / "reviews.csv"
INVENTORY = DATA_DIR / "inventory_history.csv"

print("Paths OK:", REVIEWS.exists(), INVENTORY.exists())


Paths OK: True True


In [3]:
# Build catalog.csv using unique items from reviews + average price from inventory
reviews = pd.read_csv(REVIEWS, usecols=["item_id"]).dropna()
items = reviews["item_id"].unique()

# Approximate price = random (since M5 inventory has no direct price mapping here)
np.random.seed(42)
prices = np.random.uniform(50, 500, len(items))

catalog = pd.DataFrame({
    "item_id": items,
    "title": [f"Item_{i}" for i in items],
    "category": "Electronics",
    "price": prices,
    "cost": prices * 0.6,
    "initial_inventory": np.random.randint(50, 500, len(items))
})
out_catalog = DATA_DIR / "catalog.csv"
catalog.to_csv(out_catalog, index=False)
print(f"Saved catalog -> {out_catalog} ; shape = {catalog.shape}")


Saved catalog -> D:\CAPSTONE_FINAL\data\catalog.csv ; shape = (8381, 6)


In [4]:
# Treat reviews as purchase events, generate synthetic views for each purchase (create events.csv)
reviews = pd.read_csv(REVIEWS, parse_dates=["timestamp"], low_memory=False)
reviews["ts"] = pd.to_datetime(reviews["timestamp"], unit="s", errors="coerce")
purchases = reviews[["user_id","item_id","ts"]].dropna().copy()
purchases["event_type"] = "purchase"
purchases["qty"] = 1

# add random earlier views
views = purchases.copy()
views["event_type"] = "view"
views["ts"] = views["ts"] - pd.to_timedelta(np.random.randint(1, 30, len(views)), unit="d")

events = pd.concat([views, purchases]).sort_values(["user_id","ts"])
events = events.merge(catalog[["item_id","price"]], on="item_id", how="left")
out_events = DATA_DIR / "events.csv"
events.to_csv(out_events, index=False)
print(f"Saved events -> {out_events} ; shape = {events.shape}")


  reviews = pd.read_csv(REVIEWS, parse_dates=["timestamp"], low_memory=False)
  reviews["ts"] = pd.to_datetime(reviews["timestamp"], unit="s", errors="coerce")


Saved events -> D:\CAPSTONE_FINAL\data\events.csv ; shape = (400000, 6)


In [5]:
#preview
print("events sample:")
display(pd.read_csv(out_events, nrows=5))
print("catalog sample:")
display(pd.read_csv(out_catalog, nrows=5))


events sample:


Unnamed: 0,user_id,item_id,ts,event_type,qty,price
0,A00472881KT6WR48K907X,B0000AZJZT,2013-02-05,view,1,371.323726
1,A00472881KT6WR48K907X,B0000AZJZT,2013-02-18,purchase,1,371.323726
2,A01036691ZFOFCXBLP2D1,B00066IJPQ,2012-10-17,view,1,398.518521
3,A01036691ZFOFCXBLP2D1,B00066IJPQ,2012-11-12,purchase,1,398.518521
4,A01036691ZFOFCXBLP2D1,B000BUIP6K,2013-12-30,view,1,294.29516


catalog sample:


Unnamed: 0,item_id,title,category,price,cost,initial_inventory
0,528881469,Item_0528881469,Electronics,218.543053,131.125832,94
1,594451647,Item_0594451647,Electronics,477.821438,286.692863,420
2,594481813,Item_0594481813,Electronics,379.397274,227.638364,392
3,972683275,Item_0972683275,Electronics,319.396318,191.637791,189
4,1400501466,Item_1400501466,Electronics,120.208388,72.125033,226
