In [1]:
import pandas as pd
from collections import Counter
import torch
from sklearn.model_selection import train_test_split
from networkx.readwrite.json_graph import adjacency
import random

import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader



In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
dataset_pd = pd.read_csv('D:\\VideoRecSystem\\MicroLens\\DataSet\\MicroLens-50k_pairs.csv')
# dataset_pd = pd.read_csv('MicroLens-50k_pairs.csv')

In [4]:
dataset_pd.head(10)

Unnamed: 0,user,item,timestamp
0,36121,9580,1583378629552
1,26572,9580,1583436719018
2,37550,9580,1584412681021
3,14601,9580,1584848802432
4,15061,9580,1585388171106
5,6364,9580,1585390736041
6,3542,9580,1585404918503
7,21038,9580,1590144594477
8,12538,14631,1634867362929
9,47592,14631,1634872254913


In [5]:
dataset_pd.count

<bound method DataFrame.count of          user   item      timestamp
0       36121   9580  1583378629552
1       26572   9580  1583436719018
2       37550   9580  1584412681021
3       14601   9580  1584848802432
4       15061   9580  1585388171106
...       ...    ...            ...
359703  48702   1363  1662984066647
359704  27203   7291  1662984082974
359705  29261  19649  1662984103874
359706  28341  19188  1662984123833
359707  38967   7254  1662984132429

[359708 rows x 3 columns]>

In [6]:
user_counts = dataset_pd['user'].value_counts()
item_counts = dataset_pd['item'].value_counts()
# valid_users = user_counts[user_counts > 3].index
# valid_items = item_counts[item_counts > 3].index
# filtered_df = dataset_pd[dataset_pd['user'].isin(valid_users) & dataset_pd['item'].isin(valid_items)]
# filtered_df.count

In [7]:
# order by user,timestamp 
filtered_df = dataset_pd.sort_values(by=["user", "timestamp"])


In [8]:
def split(df, user_col='user', item_col='item', time_col='timestamp'):

    df = df.sort_values(by=[user_col, time_col])  # 按用户时间排序

    # 获取每个用户的最后一条记录作为 test
    test_df = df.groupby(user_col).tail(1)
    train_df = df.drop(index=test_df.index)

    # 过滤 test 中那些 user/item 不在 train 中的
    train_users = set(train_df[user_col])
    train_items = set(train_df[item_col])

    test_df = test_df[
        test_df[user_col].isin(train_users) &
        test_df[item_col].isin(train_items)
    ]

    return train_df.reset_index(drop=True), test_df.reset_index(drop=True)

In [9]:

train_df, test_df = split(filtered_df,user_col='user', item_col='item', time_col='timestamp')

print(f"Train size: {len(train_df)}")
print(f"Test size: {len(test_df)}")


Train size: 309708
Test size: 49424


In [10]:
# maintain a map from new id to old id, new id for constructing matrix
user2id = {u: i for i, u in enumerate(filtered_df['user'].unique())}
item2id = {i: j for j, i in enumerate(filtered_df['item'].unique())}

# apply to train_df and test_df
train_df['user_id'] = train_df['user'].map(user2id)
train_df['item_id'] = train_df['item'].map(item2id)
test_df['user_id'] = test_df['user'].map(user2id)
test_df['item_id'] = test_df['item'].map(item2id)

num_users = len(user2id)
num_items = len(item2id)


In [11]:
# ---------- 超参数 ----------
MAX_SEQ_LEN   = 20          # 输入序列长度
EMBEDDING_DIM = 64          # item / user embedding 维度
HIDDEN_SIZE   = 64          # GRU 隐藏维度（可与 EMBEDDING_DIM 相同）
NEG_SAMPLE    = 5           # 训练时每个正样本采负样本数
BATCH_SIZE    = 256
EPOCHS        = 10
LR            = 1e-3
SEED          = 42
# ----------------------------

# ---------- 随机种子 ----------
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
# ----------------------------

# ---------- 常量 ----------
PAD_IDX  = num_items              # padding 专用 id（不与真实 item 冲突）
N_ITEMS  = num_items + 1          # Embedding 行数（含 PAD）
ALL_ITEM_IDS = np.arange(num_items, dtype=np.int64)  # 用于随机负采样
# ----------------------------


# ======== 数据集 ======== #
class GRU4RecBPRDataset(Dataset):
    """
    每条样本: (hist_seq, pos_item, neg_items)
    hist_seq 已左侧 padding 到长度 max_len
    """
    def __init__(self, df, max_len=MAX_SEQ_LEN, pad_idx=PAD_IDX, n_neg=NEG_SAMPLE):
        super().__init__()
        self.max_len = max_len
        self.pad_idx = pad_idx
        self.n_neg   = n_neg

        self.inputs, self.targets = [], []
        for _, user_hist in df.groupby("user_id"):
            seq = user_hist["item_id"].tolist()
            for i in range(1, len(seq)):
                hist = seq[max(0, i - max_len): i]
                hist = [pad_idx] * (max_len - len(hist)) + hist
                self.inputs.append(hist)
                self.targets.append(seq[i])  # 正样本

        self.inputs  = np.asarray(self.inputs,  dtype=np.int64)
        self.targets = np.asarray(self.targets, dtype=np.int64)

    def __len__(self):
        return len(self.targets)

    def _sample_neg(self, pos):
        """从全集随机采 n_neg 个与 pos 不同的负样本（无放回）。"""
        negs = np.random.choice(ALL_ITEM_IDS, size=self.n_neg, replace=False)
        while (negs == pos).any():
            dup = negs == pos
            negs[dup] = np.random.choice(ALL_ITEM_IDS, size=dup.sum(), replace=False)
        return negs

    def __getitem__(self, idx):
        hist = self.inputs[idx]
        pos  = self.targets[idx]
        negs = self._sample_neg(pos)
        return (
            torch.tensor(hist, dtype=torch.long),    # (T,)
            torch.tensor(pos,  dtype=torch.long),    # ()
            torch.tensor(negs, dtype=torch.long)     # (n_neg,)
        )



In [12]:

# ======== 载入 cover_emb128 LMDB 向量，并构建查找表 (128 维) ======== #
import lmdb

def load_lmdb_embeddings(lmdb_path, num_items, emb_dim=128):
    """读取 LMDB 中的 128 维 cover 向量，key 从 1 开始计数"""
    env = lmdb.open(lmdb_path, readonly=True, lock=False,subdir=False)
    cover_embs = np.zeros((num_items + 1, emb_dim), dtype=np.float32)  # index 0 保留给 PAD
    with env.begin() as txn:
        for idx in range(1, num_items + 1):
            val = txn.get(str(idx).encode('ascii'))
            if val is not None:
                cover_embs[idx] = np.frombuffer(val, dtype=np.float32)
    env.close()
    return torch.tensor(cover_embs)

COVER_EMB_PATH = r"D:/VideoRecSystem/MicroLens/cover_emb128.lmdb"
print(f"Loading cover embeddings from {COVER_EMB_PATH} ...")
COVER_EMBS = load_lmdb_embeddings(COVER_EMB_PATH, num_items=num_items, emb_dim=128)
print("COVER_EMBS shape:", COVER_EMBS.shape)  # (num_items+1, 128)
# =================================================================== #


Loading cover embeddings from D:/VideoRecSystem/MicroLens/cover_emb128.lmdb ...
COVER_EMBS shape: torch.Size([19221, 128])


In [13]:

# ======== 模型 (拼接 cover_emb128) ======== #
class GRU4RecBPR(nn.Module):
    def __init__(self, n_items=N_ITEMS,
                 embedding_dim=EMBEDDING_DIM, hidden_size=HIDDEN_SIZE,
                 num_layers=1, pad_idx=PAD_IDX):
        super().__init__()
        # 原始 ID embedding
        self.id_embedding = nn.Embedding(n_items, embedding_dim, padding_idx=pad_idx)
        # 冻结的 cover embedding (128 维，来自 LMDB)
        self.cover_embedding = nn.Embedding.from_pretrained(COVER_EMBS, freeze=True, padding_idx=pad_idx)

        # GRU 输入维度 = ID embedding + cover embedding
        input_dim = embedding_dim + 128
        self.gru = nn.GRU(input_dim, hidden_size,
                          num_layers=num_layers, batch_first=True)


    def forward(self, seq):
        """seq : (B, T) → 返回用户向量 h_hat : (B, D)"""
        id_emb    = self.id_embedding(seq)       # (B, T, D)
        cover_emb = self.cover_embedding(seq)    # (B, T, 128)
        emb       = torch.cat([id_emb, cover_emb], dim=-1)  # (B, T, D+128)

        out, _ = self.gru(emb)                   # (B, T, H)
        h      = out[:, -1, :]                   # (B, H)
        return h                     # (B, D)

    def score(self, h, items):
        """点积打分：<h, E_id[items]>, 只与原始 ID embedding 对比"""
        item_emb = self.id_embedding(items)
        return (h.unsqueeze(-2) * item_emb).sum(-1)


In [14]:
# ======== 损失 ======== #
def bpr_loss(pos_score, neg_score):
    """
    BPR  pairwise  loss
    pos_score : (B,)
    neg_score : (B, n_neg)
    """
    diff = pos_score.unsqueeze(-1) - neg_score
    return -torch.log(torch.sigmoid(diff) + 1e-8).mean()

In [15]:
# ======== 训练流程 ======== #
def train_bpr(train_df, test_df=None, device=None):
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    dataset = GRU4RecBPRDataset(train_df)
    loader  = DataLoader(dataset, batch_size=BATCH_SIZE,
                         shuffle=True, num_workers=0, pin_memory=True)

    model = GRU4RecBPR().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)

    for epoch in range(1, EPOCHS + 1):
        model.train()
        total_loss = 0.0
        for step, (hist, pos, neg) in enumerate(loader, 1):
            hist, pos, neg = hist.to(device), pos.to(device), neg.to(device)

            h = model(hist)                      # (B, D)
            pos_s = model.score(h, pos)          # (B,)
            neg_s = model.score(h, neg)          # (B, n_neg)

            loss = bpr_loss(pos_s, neg_s)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item() * hist.size(0)
            if step % 100 == 0 or step == 1:
             print(f"[Epoch {epoch}] Step {step}/{len(loader)}", flush=True)
        avg_loss = total_loss / len(dataset)
        msg = f"Epoch {epoch:02d} | BPR loss = {avg_loss:.4f}"
        print(msg)

    return model


In [16]:
# ======== 构建 (user → 历史张量) ======== #
def build_hist_tensors(df, max_len=MAX_SEQ_LEN, pad_idx=PAD_IDX):
    user_hist_t = {}
    for uid, items in df.groupby("user_id")["item_id"]:
        seq = items.tolist()[-max_len:]
        seq = [pad_idx] * (max_len - len(seq)) + seq
        user_hist_t[uid] = torch.tensor(seq, dtype=torch.long).unsqueeze(0)  # (1,T)
    return user_hist_t

In [17]:

def evaluate_ranking(
        test_df,              # DataFrame, 必含 user_id / item_id
        train_df,             # DataFrame, 用来构建用户→已交互物品集合
        score_fn,             # callable(users_tensor, items_tensor) → np.array
        num_items,            # 物品总数
        k=10,                 # Hit@K / NDCG@K
        num_neg=100,          # 每个正样本采多少负样本
        user_col='user_id',
        item_col='item_id',
        seed=42
    ):
    """
    不依赖具体模型，只要提供 score_fn 就能评估。
    score_fn: 接收 (user_tensor, item_tensor) 并返回同长度的 Numpy 分数向量。
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # 用户历史，用于采负样本 & 过滤
    train_user_dict = (
        train_df.groupby(user_col)[item_col].apply(set).to_dict()
    )

    hits, ndcgs = [], []

    for _, row in test_df.iterrows():
        u = int(row[user_col])
        pos_item = int(row[item_col])

        # ---------- 负采样 ----------
        neg_items = set()
        while len(neg_items) < num_neg:
            neg = random.randint(0, num_items - 1)
            if neg not in train_user_dict.get(u, set()) and neg != pos_item:
                neg_items.add(neg)

        item_candidates = list(neg_items) + [pos_item]

        # ---------- 评分 ----------
        users_t  = torch.LongTensor([u] * len(item_candidates))
        items_t  = torch.LongTensor(item_candidates)
        scores   = score_fn(users_t, items_t)        # ← 只依赖 score_fn
        rank_idx = np.argsort(scores)[::-1]          # 降序
        ranked_items = [item_candidates[i] for i in rank_idx]

        # ---------- 指标 ----------
        if pos_item in ranked_items[:k]:
            hits.append(1)
            rank_pos = ranked_items.index(pos_item)
            ndcgs.append(1 / np.log2(rank_pos + 2))
        else:
            hits.append(0)
            ndcgs.append(0)

    hit_rate = float(np.mean(hits))
    ndcg     = float(np.mean(ndcgs))
    return hit_rate, ndcg

In [18]:
    # ------------------ 训练 ------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model  = train_bpr(train_df, test_df, device=device)

[Epoch 1] Step 1/1015
[Epoch 1] Step 100/1015
[Epoch 1] Step 200/1015
[Epoch 1] Step 300/1015
[Epoch 1] Step 400/1015
[Epoch 1] Step 500/1015
[Epoch 1] Step 600/1015
[Epoch 1] Step 700/1015
[Epoch 1] Step 800/1015
[Epoch 1] Step 900/1015
[Epoch 1] Step 1000/1015
Epoch 01 | BPR loss = 0.7644
[Epoch 2] Step 1/1015
[Epoch 2] Step 100/1015
[Epoch 2] Step 200/1015
[Epoch 2] Step 300/1015
[Epoch 2] Step 400/1015
[Epoch 2] Step 500/1015
[Epoch 2] Step 600/1015
[Epoch 2] Step 700/1015
[Epoch 2] Step 800/1015
[Epoch 2] Step 900/1015
[Epoch 2] Step 1000/1015
Epoch 02 | BPR loss = 0.5990
[Epoch 3] Step 1/1015
[Epoch 3] Step 100/1015
[Epoch 3] Step 200/1015
[Epoch 3] Step 300/1015
[Epoch 3] Step 400/1015
[Epoch 3] Step 500/1015
[Epoch 3] Step 600/1015
[Epoch 3] Step 700/1015
[Epoch 3] Step 800/1015
[Epoch 3] Step 900/1015
[Epoch 3] Step 1000/1015
Epoch 03 | BPR loss = 0.5041
[Epoch 4] Step 1/1015
[Epoch 4] Step 100/1015
[Epoch 4] Step 200/1015
[Epoch 4] Step 300/1015
[Epoch 4] Step 400/1015
[Epoch

In [19]:
def make_popularity_score_fn(train_df, item_col='item_id'):
    item_cnt = Counter(train_df[item_col])
    default_score = min(item_cnt.values()) - 1  # 给没出现过的物品一个更低分
    def _score_fn(users_t, items_t):
        return np.array([item_cnt.get(int(i), default_score) for i in items_t])
    return _score_fn

In [20]:
def random_score_fn(users_t, items_t):
    # 随机给每个 items_t 一个分数；users_t 不使用，但必须接收
    return np.random.rand(len(items_t))

In [21]:
# ======== make_score_fn，用于 evaluate_ranking ======== #
def make_score_fn(model, hist_tensors, device="cpu"):
    model.eval()
    item_emb_table = model.id_embedding.weight.data  # 含 PAD
    @torch.no_grad()
    def score_fn(users_t, items_t):
        users_t = users_t.to(device)
        items_t = items_t.to(device)

        # 拼接用户历史 batch
        hist_batch = torch.cat([hist_tensors[int(u)] for u in users_t], dim=0).to(device)
        h = model(hist_batch)                       # (B,D)
        item_emb = item_emb_table[items_t]          # (B,D)
        scores = (h * item_emb).sum(-1)             # 点积
        return scores.cpu().numpy()
    return score_fn

In [22]:

# ---------------- GRU（或其他模型）------------
score_fn_gru = make_score_fn(model,hist_tensors=build_hist_tensors(train_df),device=device)
hit_gru, ndcg_gru = evaluate_ranking(
    test_df, train_df, score_fn_gru,
    num_items=num_items, k=10
)
print(f"GRU   Hit@10={hit_gru:.4f}  NDCG@10={ndcg_gru:.4f}")

# ---------------- baseline：Popular ----------------
pop_score_fn  = make_popularity_score_fn(train_df)
hit_pop, ndcg_pop = evaluate_ranking(
    test_df, train_df, pop_score_fn,
    num_items=num_items, k=10
)
print(f"Popular  Hit@10={hit_pop:.4f}  NDCG@10={ndcg_pop:.4f}")

# ---------------- baseline：Random -----------------
hit_rand, ndcg_rand = evaluate_ranking(
    test_df, train_df, random_score_fn,
    num_items=num_items, k=10
)
print(f"Random   Hit@10={hit_rand:.4f}  NDCG@10={ndcg_rand:.4f}")



GRU   Hit@10=0.2889  NDCG@10=0.1456
Popular  Hit@10=0.2528  NDCG@10=0.1262
Random   Hit@10=0.0996  NDCG@10=0.0455
