In [1]:
#Purpose: Train a simple Two-Tower retriever (PyTorch) using `data/events.csv` + `data/catalog.csv`, save model + item embeddings for later pipeline steps.

In [2]:
#(imports, paths, small configs)
from pathlib import Path
import pandas as pd
import numpy as np
import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pickle, os

BASE = Path(r"D:\CAPSTONE_FINAL")
DATA_DIR = BASE / "data"
EVENTS_CSV = DATA_DIR / "events.csv"
CATALOG_CSV = DATA_DIR / "catalog.csv"
OUT_DIR = BASE / "models"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# quick config
EMB_DIM = 64
BATCH = 512
EPOCHS = 3
LR = 1e-3
TEMPERATURE = 0.07

print(EVENTS_CSV.exists(), CATALOG_CSV.exists())


True True


In [3]:
#(build mappings and train/test split — leave-one-out)
events = pd.read_csv(EVENTS_CSV, parse_dates=['ts'], low_memory=False)
catalog = pd.read_csv(CATALOG_CSV)
# map ids to indices
users = sorted(events['user_id'].dropna().unique())
items = sorted(catalog['item_id'].dropna().unique())
user2idx = {u:i for i,u in enumerate(users)}
item2idx = {it:i for i,it in enumerate(items)}
idx2item = {v:k for k,v in item2idx.items()}

# only keep interactions that map to our catalog items
events = events[events['item_id'].isin(item2idx)].copy()
events['user_idx'] = events['user_id'].map(user2idx)
events['item_idx'] = events['item_id'].map(item2idx)
events = events.sort_values(['user_idx','ts'])

# leave-one-out: last interaction per user -> test
last = events.groupby('user_idx')['ts'].idxmax()
test_df = events.loc[last].copy()
train_df = events.drop(last).copy().reset_index(drop=True)
print("users,items,train_rows,test_rows:", len(users), len(items), train_df.shape[0], test_df.shape[0])


users,items,train_rows,test_rows: 97809 8381 302191 97809


In [4]:
#(PyTorch Dataset + TwoTower model)
class RecDataset(Dataset):
    def __init__(self, df):
        self.u = df['user_idx'].astype(int).values
        self.i = df['item_idx'].astype(int).values
    def __len__(self): return len(self.u)
    def __getitem__(self, idx): return self.u[idx], self.i[idx]

class TwoTower(nn.Module):
    def __init__(self, n_users, n_items, emb_dim=64):
        super().__init__()
        self.user_emb = nn.Embedding(n_users, emb_dim)
        self.item_emb = nn.Embedding(n_items, emb_dim)
        self.user_mlp = nn.Sequential(nn.Linear(emb_dim, emb_dim), nn.ReLU(), nn.Linear(emb_dim, emb_dim))
        self.item_mlp = nn.Sequential(nn.Linear(emb_dim, emb_dim), nn.ReLU(), nn.Linear(emb_dim, emb_dim))
    def forward_user(self, u): return self.user_mlp(self.user_emb(u))
    def forward_item(self, i): return self.item_mlp(self.item_emb(i))


In [5]:
#(training loop with in-batch negatives + save artifacts)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
n_users = len(user2idx); n_items = len(item2idx)
model = TwoTower(n_users, n_items, emb_dim=EMB_DIM).to(device)
opt = torch.optim.AdamW(model.parameters(), lr=LR)
ce = nn.CrossEntropyLoss()

train_loader = DataLoader(RecDataset(train_df), batch_size=BATCH, shuffle=True, drop_last=True)

def compute_item_embeddings(model, device, batch=1024):
    model.eval()
    embs = []
    with torch.no_grad():
        for start in range(0, n_items, batch):
            ids = torch.arange(start, min(n_items, start+batch), dtype=torch.long, device=device)
            embs.append(model.forward_item(ids).cpu().numpy())
    return np.vstack(embs)

for epoch in range(1, EPOCHS+1):
    model.train()
    total_loss = 0.0
    for users_batch, items_batch in train_loader:
        users_batch = users_batch.to(device)
        items_batch = items_batch.to(device)
        u_vec = model.forward_user(users_batch)          # (B,D)
        i_vec = model.forward_item(items_batch)          # (B,D)
        logits = torch.matmul(u_vec, i_vec.t()) / TEMPERATURE    # (B,B)
        labels = torch.arange(logits.size(0), device=device)
        loss = ce(logits, labels)
        opt.zero_grad(); loss.backward(); opt.step()
        total_loss += loss.item()
    print(f"Epoch {epoch} avg_loss={total_loss/len(train_loader):.4f}")
    # checkpoint item embeddings
    item_embs = compute_item_embeddings(model, device)
    np.save(OUT_DIR / "item_embeddings.npy", item_embs)
    torch.save(model.state_dict(), OUT_DIR / "two_tower.pth")
    with open(OUT_DIR / "mappings.pkl", "wb") as f:
        pickle.dump({'user2idx':user2idx,'item2idx':item2idx,'idx2item':idx2item}, f)
    print("Saved checkpoint and item embeddings")


Epoch 1 avg_loss=6.5746
Saved checkpoint and item embeddings
Epoch 2 avg_loss=6.2184
Saved checkpoint and item embeddings
Epoch 3 avg_loss=6.1763
Saved checkpoint and item embeddings


In [6]:
# (quick retrieval demo using saved embeddings)
import numpy as np
item_embs = np.load(OUT_DIR / "item_embeddings.npy")
with open(OUT_DIR / "mappings.pkl","rb") as f: meta = pickle.load(f)
u_example = next(iter(meta['user2idx'].keys()))
u_idx = torch.LongTensor([meta['user2idx'][u_example]]).to(device)
model.load_state_dict(torch.load(OUT_DIR / "two_tower.pth", map_location=device))
model.to(device).eval()
with torch.no_grad():
    u_emb = model.forward_user(u_idx).cpu().numpy()[0]
scores = item_embs @ u_emb
topk = np.argsort(-scores)[:10]
print("Example user:", u_example, "Top-10 items:", [meta['idx2item'][int(i)] for i in topk])


Example user: A00472881KT6WR48K907X Top-10 items: ['B0002IQ1F8', 'B0002NUECY', 'B0002L2MMG', 'B00004RIPE', 'B0002ZPH7O', 'B0000TFGWS', 'B0006ZSQL4', 'B0007WW69O', 'B0001ZRLRE', 'B00091S0S4']


In [7]:
# evaluate_retrieval(It will evaluate how useful the loss values are in practice — loss alone won’t tell you if retrieval is good.)
import numpy as np, pickle, torch
from pathlib import Path
from tqdm import tqdm

BASE = Path(r"D:\CAPSTONE_FINAL")
OUT = BASE / "models"
DATA = BASE / "data"
# load artifacts
item_embs = np.load(OUT / "item_embeddings.npy")   # shape (n_items, D)
with open(OUT / "mappings.pkl","rb") as f: meta = pickle.load(f)
events = __import__("pandas").read_csv(DATA / "events.csv", parse_dates=["ts"], low_memory=False)

# load model skeleton (recreate TwoTower from your notebook) and weights
# paste the TwoTower class definition here if not importable; below assumes it's in the notebook session
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TwoTower(n_users=len(meta['user2idx']), n_items=len(meta['item2idx']), emb_dim=item_embs.shape[1])
model.load_state_dict(torch.load(OUT / "two_tower.pth", map_location=device))
model.to(device).eval()

# prepare test users: leave-one-out test (already created earlier as test_df in the notebook)
# if test_df not in session, rebuild quickly:
events = events[events['item_id'].isin(meta['item2idx'].keys())].copy()
events['user_idx'] = events['user_id'].map(meta['user2idx'])
events['item_idx'] = events['item_id'].map(meta['item2idx'])
events = events.sort_values(['user_idx','ts'])
last = events.groupby('user_idx')['ts'].idxmax()
test_df = events.loc[last]

K = 50
hits = 0
mrr_total = 0.0
total = 0

# normalize item_embs for cosine-like inner product
item_norm = item_embs / (np.linalg.norm(item_embs, axis=1, keepdims=True) + 1e-9)

for _, row in tqdm(test_df.iterrows(), total=len(test_df)):
    u_idx = torch.LongTensor([int(row['user_idx'])]).to(device)
    with torch.no_grad():
        u_emb = model.forward_user(u_idx).cpu().numpy()[0]
    scores = item_norm @ (u_emb / (np.linalg.norm(u_emb)+1e-9))
    topk = np.argsort(-scores)[:K]
    pos = int(row['item_idx'])
    total += 1
    if pos in topk:
        hits += 1
        rank = int(np.where(topk == pos)[0][0]) + 1
        mrr_total += 1.0 / rank

recall_at_k = hits / total
mrr = mrr_total / total
print(f"Recall@{K} = {recall_at_k:.4f}")
print(f"MRR = {mrr:.4f}")


100%|███████████████████████████████████████████████████████████████████████████| 97809/97809 [02:38<00:00, 615.78it/s]

Recall@50 = 0.0157
MRR = 0.0015





In [8]:
#Recall@50 = 0.0157 (1.6%) → the correct item appeared in the top-50 recommendations for ~1.6% of users.

#MRR = 0.0015 → on average, the correct item ranks very low (closer to bottom of top-K).

#This is normal for a first, un-tuned Two-Tower trained on synthetic data (views/purchases randomly generated). It confirms the pipeline works — now we can improve it later with better negative sampling, feature enrichment, or longer training.