In [25]:
import pandas as pd
from collections import Counter
import torch
from sklearn.model_selection import train_test_split
from networkx.readwrite.json_graph import adjacency
import random, math, time, os
import torch.nn.functional as F
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader



In [26]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [27]:
dataset_pd = pd.read_csv('D:\\VideoRecSystem\\MicroLens\\DataSet\\MicroLens-50k_pairs.csv')
# dataset_pd = pd.read_csv('MicroLens-50k_pairs.csv')

In [28]:
dataset_pd.head(10)

Unnamed: 0,user,item,timestamp
0,36121,9580,1583378629552
1,26572,9580,1583436719018
2,37550,9580,1584412681021
3,14601,9580,1584848802432
4,15061,9580,1585388171106
5,6364,9580,1585390736041
6,3542,9580,1585404918503
7,21038,9580,1590144594477
8,12538,14631,1634867362929
9,47592,14631,1634872254913


In [29]:
dataset_pd.count

<bound method DataFrame.count of          user   item      timestamp
0       36121   9580  1583378629552
1       26572   9580  1583436719018
2       37550   9580  1584412681021
3       14601   9580  1584848802432
4       15061   9580  1585388171106
...       ...    ...            ...
359703  48702   1363  1662984066647
359704  27203   7291  1662984082974
359705  29261  19649  1662984103874
359706  28341  19188  1662984123833
359707  38967   7254  1662984132429

[359708 rows x 3 columns]>

In [30]:
user_counts = dataset_pd['user'].value_counts()
item_counts = dataset_pd['item'].value_counts()
# valid_users = user_counts[user_counts > 3].index
# valid_items = item_counts[item_counts > 3].index
# filtered_df = dataset_pd[dataset_pd['user'].isin(valid_users) & dataset_pd['item'].isin(valid_items)]
# filtered_df.count

In [31]:
# order by user,timestamp 
filtered_df = dataset_pd.sort_values(by=["user", "timestamp"])


In [32]:
def split(df, user_col='user', item_col='item', time_col='timestamp'):

    df = df.sort_values(by=[user_col, time_col])  # 按用户时间排序

    # 获取每个用户的最后一条记录作为 test
    test_df = df.groupby(user_col).tail(1)
    train_df = df.drop(index=test_df.index)

    # 过滤 test 中那些 user/item 不在 train 中的
    train_users = set(train_df[user_col])
    train_items = set(train_df[item_col])

    test_df = test_df[
        test_df[user_col].isin(train_users) &
        test_df[item_col].isin(train_items)
    ]

    return train_df.reset_index(drop=True), test_df.reset_index(drop=True)

In [33]:

train_df, test_df = split(filtered_df,user_col='user', item_col='item', time_col='timestamp')

print(f"Train size: {len(train_df)}")
print(f"Test size: {len(test_df)}")


Train size: 309708
Test size: 49424


In [34]:
# maintain a map from new id to old id, new id for constructing matrix
user2id = {u: i for i, u in enumerate(filtered_df['user'].unique())}
item2id = {i: j for j, i in enumerate(filtered_df['item'].unique())}

# apply to train_df and test_df
train_df['user_id'] = train_df['user'].map(user2id)
train_df['item_id'] = train_df['item'].map(item2id)
test_df['user_id'] = test_df['user'].map(user2id)
test_df['item_id'] = test_df['item'].map(item2id)

num_users = len(user2id)
num_items = len(item2id)


In [35]:
import lmdb

def load_lmdb_embeddings(lmdb_path, num_items, emb_dim=128):
    """读取 LMDB 中的 128 维 cover 向量，key 从 1 开始计数"""
    env = lmdb.open(lmdb_path, readonly=True, lock=False,subdir=False)
    cover_embs = np.zeros((num_items + 1, emb_dim), dtype=np.float32)  # index 0 保留给 PAD
    with env.begin() as txn:
        for idx in range(1, num_items + 1):
            val = txn.get(str(idx).encode('ascii'))
            if val is not None:
                cover_embs[idx] = np.frombuffer(val, dtype=np.float32)
    env.close()
    return torch.tensor(cover_embs)

COVER_EMB_PATH = r"D:/VideoRecSystem/MicroLens/cover_emb128.lmdb"
print(f"Loading cover embeddings from {COVER_EMB_PATH} ...")
COVER_EMBS = load_lmdb_embeddings(COVER_EMB_PATH, num_items=num_items, emb_dim=128)
print("COVER_EMBS shape:", COVER_EMBS.shape)  # (num_items+1, 128)

Loading cover embeddings from D:/VideoRecSystem/MicroLens/cover_emb128.lmdb ...
COVER_EMBS shape: torch.Size([19221, 128])


In [36]:
# ---------- 超参数 ----------
MAX_SEQ_LEN   = 20          # 输入序列长度
EMBEDDING_DIM = 64          # item & user 向量维度
NUM_BLOCK     = 3           # 残差层堆叠次数（每次含 dilation=[1,2,4,8]）
NEG_SAMPLE    = 5           # 训练时负采样个数
BATCH_SIZE    = 512         # 1650Ti 8G 推荐 512；显存紧张用 256
EPOCHS        = 10
LR            = 1e-3
SEED          = 42
DROPOUT       = 0.1
# ----------------------------

torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# ---------- 常量 ----------
PAD_IDX  = num_items              # padding 专用 id，不与真实 item 冲突
N_ITEMS  = num_items + 1          # Embedding 行数（含 PAD）
ALL_ITEM_IDS = np.arange(num_items, dtype=np.int64)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# ----------------------------

# ======== Dataset (同前) ======== #
class NextItNetBPRDataset(Dataset):
    def __init__(self, df, max_len=MAX_SEQ_LEN, pad_idx=PAD_IDX, n_neg=NEG_SAMPLE):
        self.max_len, self.pad_idx, self.n_neg = max_len, pad_idx, n_neg
        self.inputs, self.targets = [], []
        for _, user_hist in df.groupby('user_id'):
            seq = user_hist['item_id'].tolist()
            for i in range(1, len(seq)):
                hist = seq[max(0, i - max_len): i]
                hist = [pad_idx]*(max_len - len(hist)) + hist   # 左侧 pad
                self.inputs.append(hist)
                self.targets.append(seq[i])
        self.inputs  = np.asarray(self.inputs,  dtype=np.int64)
        self.targets = np.asarray(self.targets, dtype=np.int64)

    def __len__(self):
        return len(self.targets)

    def _neg_sample(self, pos):
        negs = np.random.choice(ALL_ITEM_IDS, size=self.n_neg, replace=False)
        while (negs == pos).any():
            dup = negs == pos
            negs[dup] = np.random.choice(ALL_ITEM_IDS, size=dup.sum(), replace=False)
        return negs

    def __getitem__(self, idx):
        hist = torch.tensor(self.inputs[idx], dtype=torch.long)
        pos  = torch.tensor(self.targets[idx], dtype=torch.long)
        negs = torch.tensor(self._neg_sample(self.targets[idx]), dtype=torch.long)
        return hist, pos, negs

In [37]:
# ======== NextItNet 模型 ======== #
class DilatedResidualBlock(nn.Module):
    """
    一组 (dilated conv → ReLU → dilated conv) + residual
    """
    def __init__(self, d, dilation, dropout=DROPOUT):
        super().__init__()
        self.conv1 = nn.Conv1d(d, d, kernel_size=3, padding=dilation, dilation=dilation)
        self.conv2 = nn.Conv1d(d, d, kernel_size=3, padding=1, dilation=1)
        self.dropout = nn.Dropout(dropout)
        self.layernorm1 = nn.LayerNorm(d)
        self.layernorm2 = nn.LayerNorm(d)

    def forward(self, x):
        """
        x: (B, T, D)  → conv 需要 (B, D, T)
        """
        residual = x
        x = self.layernorm1(x)
        x = F.relu(self.conv1(x.transpose(1,2)).transpose(1,2))
        x = self.dropout(x)
        x = self.layernorm2(x)
        x = F.relu(self.conv2(x.transpose(1,2)).transpose(1,2))
        x = self.dropout(x)
        return x + residual


class NextItNet(nn.Module):
    def __init__(
        self, n_items=N_ITEMS, dim=EMBEDDING_DIM,
        cover_embs=None, pad_idx=PAD_IDX,
        n_blocks=NUM_BLOCK, dropout=DROPOUT
    ):
        super().__init__()
        # ① ID embedding（可训练）
        self.id_emb   = nn.Embedding(n_items, dim, padding_idx=pad_idx)
        # ② cover embedding（冻结）
        self.cover_emb = nn.Embedding.from_pretrained(cover_embs, freeze=True)

        # ③ 将拼接后的 (dim+128) → dim 的 1×1 线性层
        self.in_proj = nn.Linear(dim + 128, dim, bias=False)

        # ④ Dilated-Residual blocks（保持原参数）
        dilations = [1, 2, 4, 8]
        blocks = []
        for _ in range(n_blocks):
            for d_in in dilations:
                blocks.append(DilatedResidualBlock(dim, dilation=d_in, dropout=dropout))
        self.net = nn.ModuleList(blocks)

    def forward(self, seq):                      # seq: (B, T)
        id_vec   = self.id_emb(seq)              # (B, T, D)
        cover_vec= self.cover_emb(seq)           # (B, T, 128)
        x = torch.cat([id_vec, cover_vec], -1)   # (B, T, D+128)
        x = self.in_proj(x)                      # (B, T, D) —— 轻量压缩

        for block in self.net:                   # Dilated CNN 堆叠
            x = block(x)
        h = x[:, -1, :]                          # (B, D)
        return h

    def score(self, h, items):
        # item embedding 拼接后投影
        item_id_vec = self.id_emb(items)         # (B, D)
        item_cover_vec = self.cover_emb(items)   # (B, 128)
        item_vec = torch.cat([item_id_vec, item_cover_vec], dim=-1)
        item_vec = self.in_proj(item_vec)        # (B, D)

        # === 加入 L2 规范化 ===
        h = F.normalize(h, p=2, dim=-1)               # (B, D)
        item_vec = F.normalize(item_vec, p=2, dim=-1) # (B, D)

        return (h.unsqueeze(-2) * item_vec).sum(-1)   # cosine similarity



In [38]:
# ======== BPR 损失 ======== #
def bpr_loss(pos_s, neg_s):
    return -torch.log(torch.sigmoid(pos_s.unsqueeze(-1) - neg_s) + 1e-8).mean()



In [39]:
# ======== 训练函数 ======== #
def train_nextitnet_bpr(train_df, test_df=None):
    ds = NextItNetBPRDataset(train_df)
    loader = DataLoader(ds, batch_size=BATCH_SIZE, shuffle=True,
                        num_workers=0, pin_memory=True)

    model = NextItNet(cover_embs=COVER_EMBS).to(DEVICE)
    optim = torch.optim.Adam(model.parameters(), lr=LR)

    for epoch in range(1, EPOCHS+1):
        model.train()
        total_loss, t0 = 0.0, time.time()
        for step, (hist, pos, neg) in enumerate(loader, 1):
            hist, pos, neg = hist.to(DEVICE), pos.to(DEVICE), neg.to(DEVICE)
            h = model(hist)
            pos_s = model.score(h, pos)
            neg_s = model.score(h, neg)
            loss  = bpr_loss(pos_s, neg_s)

            optim.zero_grad()
            loss.backward()
            optim.step()

            total_loss += loss.item() * hist.size(0)
            if step % 100 == 0 or step == 1:
                print(f"[Epoch {epoch}] step {step}/{len(loader)} | loss {loss.item():.4f}", flush=True)

        print(f"Epoch {epoch} finished | avg loss {total_loss/len(ds):.4f} | time {time.time()-t0:.1f}s", flush=True)

    return model


In [40]:
# ======== 构建 (user→历史张量) ======== #
def build_hist_tensors(df):
    cache = {}
    for uid, items in df.groupby('user_id')['item_id']:
        seq = items.tolist()[-MAX_SEQ_LEN:]
        seq = [PAD_IDX]*(MAX_SEQ_LEN-len(seq)) + seq
        cache[uid] = torch.tensor(seq, dtype=torch.long).unsqueeze(0)
    return cache

In [41]:

def evaluate_ranking(
        test_df,              # DataFrame, 必含 user_id / item_id
        train_df,             # DataFrame, 用来构建用户→已交互物品集合
        score_fn,             # callable(users_tensor, items_tensor) → np.array
        num_items,            # 物品总数
        k=10,                 # Hit@K / NDCG@K
        num_neg=100,          # 每个正样本采多少负样本
        user_col='user_id',
        item_col='item_id',
        seed=42
    ):
    """
    不依赖具体模型，只要提供 score_fn 就能评估。
    score_fn: 接收 (user_tensor, item_tensor) 并返回同长度的 Numpy 分数向量。
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # 用户历史，用于采负样本 & 过滤
    train_user_dict = (
        train_df.groupby(user_col)[item_col].apply(set).to_dict()
    )

    hits, ndcgs = [], []

    for _, row in test_df.iterrows():
        u = int(row[user_col])
        pos_item = int(row[item_col])

        # ---------- 负采样 ----------
        neg_items = set()
        while len(neg_items) < num_neg:
            neg = random.randint(0, num_items - 1)
            if neg not in train_user_dict.get(u, set()) and neg != pos_item:
                neg_items.add(neg)

        item_candidates = list(neg_items) + [pos_item]

        # ---------- 评分 ----------
        users_t  = torch.LongTensor([u] * len(item_candidates))
        items_t  = torch.LongTensor(item_candidates)
        scores   = score_fn(users_t, items_t)        # ← 只依赖 score_fn
        rank_idx = np.argsort(scores)[::-1]          # 降序
        ranked_items = [item_candidates[i] for i in rank_idx]

        # ---------- 指标 ----------
        if pos_item in ranked_items[:k]:
            hits.append(1)
            rank_pos = ranked_items.index(pos_item)
            ndcgs.append(1 / np.log2(rank_pos + 2))
        else:
            hits.append(0)
            ndcgs.append(0)

    hit_rate = float(np.mean(hits))
    ndcg     = float(np.mean(ndcgs))
    return hit_rate, ndcg

In [42]:
    # ------------------ 训练 ------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model  = train_nextitnet_bpr(train_df, test_df)

[Epoch 1] step 1/508 | loss 0.6959
[Epoch 1] step 100/508 | loss 0.6901
[Epoch 1] step 200/508 | loss 0.6749
[Epoch 1] step 300/508 | loss 0.6415
[Epoch 1] step 400/508 | loss 0.6115
[Epoch 1] step 500/508 | loss 0.5787
Epoch 1 finished | avg loss 0.6541 | time 211.6s
[Epoch 2] step 1/508 | loss 0.5917
[Epoch 2] step 100/508 | loss 0.5716
[Epoch 2] step 200/508 | loss 0.5491
[Epoch 2] step 300/508 | loss 0.5347
[Epoch 2] step 400/508 | loss 0.5346
[Epoch 2] step 500/508 | loss 0.5471
Epoch 2 finished | avg loss 0.5497 | time 213.7s
[Epoch 3] step 1/508 | loss 0.5171
[Epoch 3] step 100/508 | loss 0.4950
[Epoch 3] step 200/508 | loss 0.5053
[Epoch 3] step 300/508 | loss 0.5140
[Epoch 3] step 400/508 | loss 0.5256
[Epoch 3] step 500/508 | loss 0.4976
Epoch 3 finished | avg loss 0.5172 | time 208.4s
[Epoch 4] step 1/508 | loss 0.5034
[Epoch 4] step 100/508 | loss 0.5153
[Epoch 4] step 200/508 | loss 0.5231
[Epoch 4] step 300/508 | loss 0.4897
[Epoch 4] step 400/508 | loss 0.5205
[Epoch 4] 

In [43]:
def make_popularity_score_fn(train_df, item_col='item_id'):
    item_cnt = Counter(train_df[item_col])
    default_score = min(item_cnt.values()) - 1  # 给没出现过的物品一个更低分
    def _score_fn(users_t, items_t):
        return np.array([item_cnt.get(int(i), default_score) for i in items_t])
    return _score_fn

In [44]:
def random_score_fn(users_t, items_t):
    # 随机给每个 items_t 一个分数；users_t 不使用，但必须接收
    return np.random.rand(len(items_t))

In [49]:
def make_score_fn(model, hist_cache):
    """
    评估阶段：user_t 可能是长度 m 的重复用户 id 向量，
    items_t 是长度 m 的候选 item 列表。
    只取 user_t[0] 来生成单个用户向量，再与全部候选做点积。
    """
    model.eval()
    device = next(model.parameters()).device

    # —— 预生成 “ID+cover→投影→L2” 的 item 表 —— #
    with torch.no_grad():
        id_mat    = model.id_emb.weight.data          # (n_items+1, D)
        cover_mat = model.cover_emb.weight.data       # (n_items+1, 128)
        fused_mat = model.in_proj(
            torch.cat([id_mat, cover_mat], dim=1)
        )
        fused_mat = F.normalize(fused_mat, p=2, dim=-1).to(device)

    @torch.no_grad()
    def score_fn(user_t, item_t):
        """
        user_t : 1-D ndarray / tensor (长度 m，全是同一个用户 id)
        item_t : 1-D tensor  (长度 m，候选 item ids)
        return : 1-D numpy  (长度 m，分数)
        """
        # ---- 1. 取该用户的历史序列，算用户向量 ----
        uid = int(user_t[0])                  # 只需一个
        hist_seq = hist_cache[uid].to(device) # shape (1, T)
        h = model(hist_seq)                   # (1, D)
        h = F.normalize(h, p=2, dim=-1)       # (1, D)

        # ---- 2. 取候选 item 融合后表征 ----
        item_vec = fused_mat[item_t.to(device)]   # (m, D)  已归一化

        # ---- 3. cosine 相似度 ----
        scores = (h * item_vec).sum(-1)           # → (m,)
        return scores.cpu().numpy()               # 1-D，长度 = m

    return score_fn


In [50]:

# ----------------NextItNet （或其他模型）------------
score_fn_nextItNet = make_score_fn(model,hist_cache=build_hist_tensors(train_df))
hit_nextItNet, ndcg_nextItNet = evaluate_ranking(
    test_df, train_df, score_fn_nextItNet,
    num_items=num_items, k=10
)
print(f"NextItNet   Hit@10={hit_nextItNet:.4f}  NDCG@10={ndcg_nextItNet:.4f}")

# ---------------- baseline：Popular ----------------
pop_score_fn  = make_popularity_score_fn(train_df)
hit_pop, ndcg_pop = evaluate_ranking(
    test_df, train_df, pop_score_fn,
    num_items=num_items, k=10
)
print(f"Popular  Hit@10={hit_pop:.4f}  NDCG@10={ndcg_pop:.4f}")

# ---------------- baseline：Random -----------------
hit_rand, ndcg_rand = evaluate_ranking(
    test_df, train_df, random_score_fn,
    num_items=num_items, k=10
)
print(f"Random   Hit@10={hit_rand:.4f}  NDCG@10={ndcg_rand:.4f}")



NextItNet   Hit@10=0.2905  NDCG@10=0.1443
Popular  Hit@10=0.2528  NDCG@10=0.1262
Random   Hit@10=0.0996  NDCG@10=0.0455
