In [1]:
import os, pickle, random, math
import numpy as np, pandas as pd
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [2]:
# Read the pre-computed train / validation / test interaction frames.
# NOTE(review): pickle.load can execute arbitrary code; only open files you trust.
with open("../models/train_valid_test.pkl", "rb") as split_file:
    train_df, valid_df, test_df = pickle.load(split_file)

# Contiguous integer indices for every user and item seen in the training split.
# Sorting keeps the mapping stable from run to run.
all_users = sorted(train_df['user_id'].unique())
all_items = sorted(train_df['item_id'].unique())
user2idx = dict(zip(all_users, range(len(all_users))))
item2idx = dict(zip(all_items, range(len(all_items))))
n_users = len(all_users)
n_items = len(all_items)
print("n_users, n_items:", n_users, n_items)

n_users, n_items: 943 1556


In [3]:
# user_id -> set of item_ids the user interacted with in the training split.
train_items_by_user = train_df.groupby("user_id")["item_id"]
train_user_pos = train_items_by_user.apply(set).to_dict()

def sample_negative(user_id, num_samples=1):
    """Draw items the user has not interacted with in the training split.

    Items are drawn uniformly (with replacement) from ``all_items`` by
    rejection sampling against ``train_user_pos[user_id]``.

    Args:
        user_id: Original (un-indexed) user id. Users missing from
            ``train_user_pos`` are treated as having no positives.
        num_samples: Number of negatives to return; values <= 0 yield ``[]``.

    Returns:
        A list of ``num_samples`` original item ids (may contain repeats).

    Raises:
        ValueError: If every item in ``all_items`` is a positive for the user,
            because rejection sampling could then never terminate.
    """
    pos = train_user_pos.get(user_id, set())
    if num_samples <= 0:
        return []
    # Cheap size check first; the full scan only runs in the suspicious case.
    if len(pos) >= len(all_items) and all(item in pos for item in all_items):
        raise ValueError(f"No negative items available for user {user_id!r}")
    negs = []
    while len(negs) < num_samples:
        cand = random.choice(all_items)
        if cand not in pos:
            negs.append(cand)
    return negs



In [4]:
class ImplicitPairDataset(Dataset):
    """Pairwise (BPR-style) training samples built from observed interactions.

    Each index yields ``(user_idx, pos_item_idx, neg_item_idx)`` as three
    ``LongTensor``s of shape ``(1,)``. The negative is re-sampled on every
    access via ``sample_negative``. Pointwise mode is declared but not
    implemented.
    """

    def __init__(self, interactions_df, num_negatives=4, pointwise=False):
        """Store the interactions and cache the (user_id, item_id) pairs.

        Args:
            interactions_df: Frame with ``user_id`` and ``item_id`` columns.
            num_negatives: Kept for interface compatibility; exactly one
                negative is returned per sample.
            pointwise: If True, ``__getitem__`` raises ``NotImplementedError``.
        """
        self.interactions = interactions_df.reset_index(drop=True)
        self.num_neg = num_negatives
        self.pointwise = pointwise
        # list of (user, item) in original-id space
        self.user_item_pairs = list(zip(self.interactions['user_id'], self.interactions['item_id']))

    def __len__(self):
        """Number of observed (user, item) interactions."""
        return len(self.user_item_pairs)

    def __getitem__(self, idx):
        """Return index tensors for one positive pair plus one sampled negative.

        Raises:
            NotImplementedError: In pointwise mode. The original silently
                returned ``None`` here, which only fails later at collation.
        """
        u, i = self.user_item_pairs[idx]
        if self.pointwise:
            raise NotImplementedError(
                "pointwise sampling is not implemented; construct with pointwise=False"
            )
        # Only one negative is consumed, so draw exactly one. This also avoids
        # an IndexError when num_negatives == 0.
        neg = sample_negative(u, 1)[0]
        return (
            torch.tensor([user2idx[u]], dtype=torch.long),
            torch.tensor([item2idx[i]], dtype=torch.long),
            torch.tensor([item2idx[neg]], dtype=torch.long),
        )

# Pairwise training set and a shuffled loader over it.
# drop_last=True discards a final batch smaller than batch_size.
train_dataset = ImplicitPairDataset(train_df, num_negatives=1, pointwise=False)
train_loader = DataLoader(
    train_dataset,
    batch_size=1024,
    shuffle=True,
    drop_last=True,
    num_workers=0,
)


In [5]:
class MF(nn.Module):
    """Matrix-factorisation scorer: dot product of a user and an item embedding."""

    def __init__(self, n_users, n_items, emb_dim=64):
        """Create one embedding table per side and initialise both."""
        super().__init__()
        self.user_emb = nn.Embedding(n_users, emb_dim)
        self.item_emb = nn.Embedding(n_items, emb_dim)
        self._init_weights()

    def _init_weights(self):
        """Draw both embedding tables from N(0, 0.01^2)."""
        for table in (self.user_emb, self.item_emb):
            nn.init.normal_(table.weight, std=0.01)

    def forward(self, u, i):
        """Score index tensors of shape (B, 1); returns raw scores of shape (B, 1)."""
        user_vec = self.user_emb(u).squeeze(1)   # (B, emb)
        item_vec = self.item_emb(i).squeeze(1)   # (B, emb)
        return torch.sum(user_vec * item_vec, dim=1, keepdim=True)

In [6]:
class NCF(nn.Module):
    """Neural collaborative filtering model with a GMF branch and an MLP branch.

    The GMF branch multiplies user and item embeddings element-wise. The MLP
    branch feeds the concatenated user/item embeddings through
    Linear -> ReLU -> Dropout(0.2) stacks. Both branch outputs are
    concatenated, projected to a single logit and squashed with a sigmoid.

    NOTE(review): ``train_bpr`` feeds these sigmoid outputs into ``bpr_loss``,
    so the pairwise difference is confined to (-1, 1). Confirm that is intended.
    """

    def __init__(self, n_users, n_items, emb_dim=32, mlp_layers=(64, 32)):
        """Build embeddings, the MLP tower and the output layer.

        Args:
            n_users: Number of rows in the user embedding tables.
            n_items: Number of rows in the item embedding tables.
            emb_dim: Embedding width used by both branches.
            mlp_layers: Hidden sizes of the MLP tower. May be empty, in which
                case the MLP branch passes the concatenated embeddings through.
        """
        super().__init__()
        # GMF embeddings
        self.gmf_user = nn.Embedding(n_users, emb_dim)
        self.gmf_item = nn.Embedding(n_items, emb_dim)
        # MLP embeddings
        self.mlp_user = nn.Embedding(n_users, emb_dim)
        self.mlp_item = nn.Embedding(n_items, emb_dim)
        # MLP layers; mlp_input tracks the width flowing out of the tower
        mlp_input = emb_dim * 2
        layers = []
        for h in mlp_layers:
            layers.append(nn.Linear(mlp_input, h))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(0.2))
            mlp_input = h
        self.mlp = nn.Sequential(*layers)
        # Final layer sees GMF (emb_dim) plus whatever the tower emits. With an
        # empty tower that is 2 * emb_dim, not emb_dim as previously assumed.
        final_size = emb_dim + mlp_input
        self.output = nn.Linear(final_size, 1)
        self.sig = nn.Sigmoid()
        self._init_weights()

    def _init_weights(self):
        """Initialise all four embedding tables from N(0, 0.01^2)."""
        nn.init.normal_(self.gmf_user.weight, std=0.01)
        nn.init.normal_(self.gmf_item.weight, std=0.01)
        nn.init.normal_(self.mlp_user.weight, std=0.01)
        nn.init.normal_(self.mlp_item.weight, std=0.01)

    def forward(self, u, i):
        """Score user/item index pairs.

        Args:
            u: User indices, shape (B, 1) or (B,).
            i: Item indices, shape (B, 1) or (B,).

        Returns:
            Tensor of shape (B, 1) with sigmoid scores in (0, 1).
        """
        u = u.reshape(-1)
        i = i.reshape(-1)
        gmf = self.gmf_user(u) * self.gmf_item(i)  # element-wise, (B, emb)
        mlp_in = torch.cat([self.mlp_user(u), self.mlp_item(i)], dim=1)
        mlp_out = self.mlp(mlp_in)
        x = torch.cat([gmf, mlp_out], dim=1)
        # self.output already yields (B, 1); no extra unsqueeze is needed.
        return self.sig(self.output(x))

In [7]:
def bpr_loss(pos_scores, neg_scores):
    """Bayesian Personalised Ranking loss: mean of -log(sigmoid(pos - neg)).

    Uses ``logsigmoid`` rather than ``log(sigmoid(x) + eps)``. The epsilon form
    caps the loss near 18.4 and drives the gradient to zero when the positive
    scores far below the negative, which is when a gradient matters most.

    Args:
        pos_scores: Scores of the positive items, e.g. shape (B, 1).
        neg_scores: Scores of the sampled negatives, same shape.

    Returns:
        Scalar tensor with the mean loss over all elements.
    """
    diff = pos_scores - neg_scores
    return -nn.functional.logsigmoid(diff).mean()


In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

def train_bpr(model, dataloader, optimizer, epochs=3):
    """Train ``model`` with the pairwise BPR objective.

    Args:
        model: Module called as ``model(user_idx, item_idx)`` returning scores.
        dataloader: Iterable of ``(user, pos_item, neg_item)`` index batches.
        optimizer: Optimizer already bound to ``model``'s parameters.
        epochs: Number of passes over ``dataloader``.

    Side effects:
        Moves the model to ``device``, leaves it in train mode, and prints the
        average loss per epoch.
    """
    model.to(device)
    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        n_batches = 0  # counted here so an empty loader cannot divide by zero
        for b_u, b_pos, b_neg in tqdm(dataloader):
            b_u = b_u.to(device)
            b_pos = b_pos.to(device)
            b_neg = b_neg.to(device)
            # forward pass: same users scored against positive and negative items
            pos_scores = model(b_u, b_pos)
            neg_scores = model(b_u, b_neg)
            loss = bpr_loss(pos_scores, neg_scores)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            n_batches += 1
        if n_batches == 0:
            # e.g. drop_last=True with fewer samples than one batch
            print(f"Epoch {epoch+1}/{epochs} skipped: dataloader yielded no batches")
            continue
        print(f"Epoch {epoch+1}/{epochs} avg loss: {total_loss/n_batches:.4f}")

Device: cpu


In [9]:
# Seed every RNG the training path touches so a re-run reproduces the same model:
# `random` drives negative sampling, torch drives weight init, dropout and
# DataLoader shuffling. NumPy is seeded as well for completeness.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Model hyper-parameters
emb_dim = 32
mlp_layers = [64,32]
ncf = NCF(n_users, n_items, emb_dim=emb_dim, mlp_layers=mlp_layers)
# Adam with a small L2 penalty (weight_decay) on all parameters
opt = torch.optim.Adam(ncf.parameters(), lr=1e-3, weight_decay=1e-5)
train_bpr(ncf, train_loader, opt, epochs=5)


  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 59/59 [00:00<00:00, 107.71it/s]


Epoch 1/5 avg loss: 0.6820


100%|██████████| 59/59 [00:00<00:00, 129.40it/s]


Epoch 2/5 avg loss: 0.5432


100%|██████████| 59/59 [00:00<00:00, 122.81it/s]


Epoch 3/5 avg loss: 0.4910


100%|██████████| 59/59 [00:00<00:00, 129.57it/s]


Epoch 4/5 avg loss: 0.4896


100%|██████████| 59/59 [00:00<00:00, 127.44it/s]

Epoch 5/5 avg loss: 0.4869





In [10]:
def score_all_items(model, orig_user_id, topk=10, filter_seen=True):
    """Score every indexed item for one user and return the top-k item ids.

    Args:
        model: Module called as ``model(user_idx, item_idx)`` with (B, 1) index
            tensors. Its output may be (B,), (B, 1) or (B, 1, 1); it is
            flattened to one score per item.
        orig_user_id: Original user id; unknown users yield ``[]``.
        topk: Number of item ids to return.
        filter_seen: Drop items in ``train_user_pos[orig_user_id]`` if True.

    Returns:
        Up to ``topk`` original item ids, highest score first.

    Side effects:
        Puts ``model`` in eval mode.
    """
    model.eval()
    if orig_user_id not in user2idx:
        return []
    u_idx = user2idx[orig_user_id]
    # score in batches to avoid OOM
    batch = 2048
    scores = []
    with torch.no_grad():
        for start in range(0, n_items, batch):
            stop = min(start + batch, n_items)
            it_tensor = torch.arange(start, stop, dtype=torch.long, device=device).unsqueeze(1)
            u_batch = torch.full_like(it_tensor, u_idx)
            out = model(u_batch, it_tensor)
            # Flatten so each score is a plain float whatever trailing
            # singleton dimensions the model emits.
            scores.extend(out.reshape(-1).cpu().tolist())
    # Item index j corresponds to all_items[j], so zipping maps back to ids.
    scored = zip(all_items, scores)
    if filter_seen:
        seen = train_user_pos.get(orig_user_id, set())
        scored = ((it, sc) for it, sc in scored if it not in seen)
    scored_sorted = sorted(scored, key=lambda pair: pair[1], reverse=True)[:topk]
    return [it for it, _ in scored_sorted]

In [11]:
def precision_at_k(recommended, true_items, k=10):
    """Share of the k recommendation slots occupied by relevant items.

    The denominator is always ``k``, even if fewer than k items were recommended.
    """
    hits = set(recommended[:k]).intersection(set(true_items))
    return len(hits) / k

def recall_at_k(recommended, true_items, k=10):
    """Share of the distinct relevant items that appear in the top-k list.

    Returns 0.0 when there are no relevant items.
    """
    if len(true_items) < 1:
        return 0.0
    relevant = set(true_items)
    found = relevant.intersection(set(recommended[:k]))
    return len(found) / len(relevant)

def ndcg_at_k(recommended, true_items, k=10):
    """Binary-relevance NDCG of the top-k recommendations.

    Relevant items are de-duplicated first, so repeated entries in
    ``true_items`` cannot inflate the ideal DCG. This matches the set
    semantics of ``precision_at_k`` and ``recall_at_k``.

    Args:
        recommended: Ranked item ids, best first.
        true_items: Iterable of relevant item ids.
        k: Cut-off rank.

    Returns:
        DCG / IDCG, or 0.0 when there are no relevant items or k < 1.
    """
    relevant = set(true_items)  # O(1) membership, duplicates removed
    ideal_hits = min(len(relevant), k)
    if ideal_hits <= 0:
        return 0.0
    dcg = sum(
        1.0 / math.log2(rank + 2)
        for rank, item in enumerate(recommended[:k])
        if item in relevant
    )
    idcg = sum(1.0 / math.log2(rank + 2) for rank in range(ideal_hits))
    return dcg / idcg


In [12]:
def evaluate_model_on_df(model, df, k=10):
    """Average precision@k, recall@k and NDCG@k over the users in ``df``.

    For every user in ``df``, the model's top-k unseen items (from
    ``score_all_items``) are compared with that user's items in ``df``.
    Returns a dict keyed ``precision@k``, ``recall@k``, ``ndcg@k``.
    """
    truth_by_user = df.groupby("user_id")["item_id"].apply(list).to_dict()
    per_user = {"precision": [], "recall": [], "ndcg": []}
    for uid, relevant in truth_by_user.items():
        top_items = score_all_items(model, uid, topk=k, filter_seen=True)
        per_user["precision"].append(precision_at_k(top_items, relevant, k))
        per_user["recall"].append(recall_at_k(top_items, relevant, k))
        per_user["ndcg"].append(ndcg_at_k(top_items, relevant, k))
    return {f"{metric}@{k}": np.mean(values) for metric, values in per_user.items()}

In [13]:

# Full-catalogue ranking metrics for the trained NCF model on the validation split.
print("Evaluating NCF on validation (this may take a while)...")
metrics_ncf = evaluate_model_on_df(model=ncf, df=valid_df, k=10)
print("NCF validation metrics:", metrics_ncf)

Evaluating NCF on validation (this may take a while)...
NCF validation metrics: {'precision@10': 0.0706256627783669, 'recall@10': 0.06430795008904644, 'ndcg@10': 0.07953139710253686}


In [14]:
# Persist the trained weights ...
torch.save(ncf.state_dict(), "../models/ncf_state.pt")

# ... and the id <-> index mappings needed to interpret them later.
ncf_meta = {
    "user2idx": user2idx,
    "item2idx": item2idx,
    "all_users": all_users,
    "all_items": all_items,
}
with open("../models/ncf_meta.pkl", "wb") as meta_file:
    pickle.dump(ncf_meta, meta_file)


In [15]:
# GMF item embedding table as a NumPy array (detached from autograd, on CPU).
gmf_item_weight = ncf.gmf_item.weight
item_emb = gmf_item_weight.detach().cpu().numpy()


In [16]:
import numpy as np
from numpy.linalg import norm

# GMF item embeddings as a NumPy matrix with one row per indexed item.
item_vecs = ncf.gmf_item.weight.detach().cpu().numpy()

# Scale every row to unit length so a dot product equals cosine similarity.
row_lengths = norm(item_vecs, axis=1, keepdims=True)
item_norm = item_vecs / row_lengths

def topk_similar_items(target_item_id, k=10):
    """Return up to ``k`` items most cosine-similar to ``target_item_id``.

    Similarity is the dot product of rows of ``item_norm``.

    Args:
        target_item_id: Original item id; unknown ids yield ``[]``.
        k: Maximum number of neighbours. Values larger than the catalogue are
            clamped instead of raising from ``np.argpartition``.

    Returns:
        Original item ids, most similar first, never including the target.
    """
    if target_item_id not in item2idx:
        return []
    idx = item2idx[target_item_id]
    v = item_norm[idx]                      # (emb_dim,)
    sims = item_norm @ v                    # (n_items,)
    n_total = sims.shape[0]
    # One extra slot because the target itself is normally its own best match.
    n_keep = min(k + 1, n_total)
    if n_keep <= 0:
        return []
    if n_keep < n_total:
        # Partition so the n_keep largest similarities come first (unordered).
        top_idx = np.argpartition(-sims, n_keep - 1)[:n_keep]
    else:
        top_idx = np.arange(n_total)
    top_idx = top_idx[top_idx != idx]       # exclude itself
    top = sorted(top_idx, key=lambda j: sims[j], reverse=True)[:k]
    return [all_items[j] for j in top]


In [17]:
def generate_candidates_for_user(user_id, num_items=200):
    """Build a candidate pool from items similar to the user's training items.

    Args:
        user_id: Original user id.
        num_items: Maximum number of candidates to return.

    Returns:
        A list of at most ``num_items`` unseen item ids. Users with no training
        history get a uniform random sample of the catalogue, clamped to the
        catalogue size so a large ``num_items`` no longer raises.
    """
    # Step 1: items the user has interacted with
    seen = train_user_pos.get(user_id, set())
    if len(seen) == 0:
        return random.sample(all_items, min(num_items, len(all_items)))

    # Step 2: neighbours of every seen item. Walk the catalogue in its fixed
    # order so the pool does not depend on set iteration order. Seen items that
    # are not indexed have no neighbours anyway.
    pool = {}
    for it in all_items:
        if it not in seen:
            continue
        for sim in topk_similar_items(it, k=20):
            # Step 3: drop already-seen items; the dict keeps first-seen order
            if sim not in seen:
                pool[sim] = None
    cand = list(pool)

    # Step 4: sample down to num_items
    if len(cand) > num_items:
        cand = random.sample(cand, num_items)

    return cand


In [18]:
from annoy import AnnoyIndex
import os

# The index dimensionality must equal the embedding width.
emb_dim = item_vecs.shape[1]
ann = AnnoyIndex(emb_dim, 'angular')

# Annoy ids are the item indices, i.e. row positions in item_vecs.
for i, item_vector in zip(range(n_items), item_vecs):
    ann.add_item(i, item_vector)

# Build with 20 trees, then persist the index to disk.
ann.build(20)
ann.save("../models/item_annoy.idx")


True

In [19]:
# Re-open the index written by the previous cell.
annoy_index_path = "../models/item_annoy.idx"
ann = AnnoyIndex(emb_dim, 'angular')
ann.load(annoy_index_path)

def annoy_similar_items(item_id, k=10):
    """Approximate nearest neighbours of ``item_id`` from the Annoy index.

    Args:
        item_id: Original item id; unknown ids yield ``[]``, matching
            ``topk_similar_items``.
        k: Maximum number of neighbours to return.

    Returns:
        Up to ``k`` original item ids in the order Annoy returned them, with
        the query item removed.
    """
    idx = item2idx.get(item_id)
    if idx is None:
        return []
    # Ask for one extra and filter by index. The query is not guaranteed to be
    # the first hit of an approximate search, so slicing off [0] is unsafe.
    neighbours = ann.get_nns_by_item(idx, k + 1)
    top = [j for j in neighbours if j != idx][:k]
    return [all_items[j] for j in top]


In [20]:
# Smoke test: candidate pool for a single user id.
candidates = generate_candidates_for_user(user_id='166', num_items=200)


In [21]:
def rank_candidates(user_id, candidates, topk=10):
    """Rank candidate items for a user with the global ``ncf`` model.

    All candidates are scored in one batched forward pass instead of one call
    per item.

    Args:
        user_id: Original user id; unknown users yield ``[]``.
        candidates: Iterable of original item ids. Ids missing from
            ``item2idx`` are skipped rather than raising ``KeyError``.
        topk: Number of item ids to return.

    Returns:
        Up to ``topk`` item ids, highest score first.

    Side effects:
        Puts ``ncf`` in eval mode.
    """
    ncf.eval()
    u_idx = user2idx.get(user_id)
    if u_idx is None:
        return []

    known = [it for it in candidates if it in item2idx]
    if not known:
        return []

    i_tensor = torch.tensor(
        [[item2idx[it]] for it in known], dtype=torch.long, device=device
    )
    u_tensor = torch.full_like(i_tensor, u_idx)
    with torch.no_grad():
        # reshape(-1) gives one float per candidate for any trailing singleton dims
        scores = ncf(u_tensor, i_tensor).reshape(-1).tolist()

    ranked = sorted(zip(known, scores), key=lambda pair: pair[1], reverse=True)
    return [it for it, _ in ranked[:topk]]


In [22]:
def evaluate_pipeline(df, k=10):
    """Print mean precision/recall/NDCG@k for the candidate-then-rank pipeline.

    For each user in ``df``, up to 200 candidates are generated, ranked with
    ``rank_candidates``, and the top-k are compared with the user's items in
    ``df``. Nothing is returned; results go to stdout.
    """
    truth_by_user = df.groupby("user_id")["item_id"].apply(list).to_dict()
    precision_vals, recall_vals, ndcg_vals = [], [], []

    for uid, relevant in truth_by_user.items():
        shortlist = generate_candidates_for_user(uid, num_items=200)
        top_items = rank_candidates(uid, shortlist, topk=k)

        precision_vals.append(precision_at_k(top_items, relevant, k))
        recall_vals.append(recall_at_k(top_items, relevant, k))
        ndcg_vals.append(ndcg_at_k(top_items, relevant, k))

    print("End-to-End Results:")
    print(f"Precision@{k}: {np.mean(precision_vals):.4f}")
    print(f"Recall@{k}: {np.mean(recall_vals):.4f}")
    print(f"NDCG@{k}: {np.mean(ndcg_vals):.4f}")


In [23]:
# End-to-end (candidate generation + NCF ranking) metrics on the validation split.
evaluate_pipeline(df=valid_df, k=10)


End-to-End Results:
Precision@10: 0.0680
Recall@10: 0.0386
NDCG@10: 0.0743


In [24]:
# Persist the raw (un-normalised) GMF item embeddings in .npy format.
np.save("../models/item_vecs.npy", item_vecs)
