In [1]:
import random
import pandas as pd
from collections import Counter
import torch
from sklearn.model_selection import train_test_split
from networkx.readwrite.json_graph import adjacency

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
dataset_pd = pd.read_csv('D:\\VideoRecSystem\\MicroLens\\DataSet\\MicroLens-50k_pairs.csv')
# dataset_pd = pd.read_csv('MicroLens-50k_pairs.csv')

In [4]:
dataset_pd.head(10)

Unnamed: 0,user,item,timestamp
0,36121,9580,1583378629552
1,26572,9580,1583436719018
2,37550,9580,1584412681021
3,14601,9580,1584848802432
4,15061,9580,1585388171106
5,6364,9580,1585390736041
6,3542,9580,1585404918503
7,21038,9580,1590144594477
8,12538,14631,1634867362929
9,47592,14631,1634872254913


In [5]:
dataset_pd.count

<bound method DataFrame.count of          user   item      timestamp
0       36121   9580  1583378629552
1       26572   9580  1583436719018
2       37550   9580  1584412681021
3       14601   9580  1584848802432
4       15061   9580  1585388171106
...       ...    ...            ...
359703  48702   1363  1662984066647
359704  27203   7291  1662984082974
359705  29261  19649  1662984103874
359706  28341  19188  1662984123833
359707  38967   7254  1662984132429

[359708 rows x 3 columns]>

In [6]:
user_counts = dataset_pd['user'].value_counts()
item_counts = dataset_pd['item'].value_counts()
# valid_users = user_counts[user_counts > 3].index
# valid_items = item_counts[item_counts > 3].index
# filtered_df = dataset_pd[dataset_pd['user'].isin(valid_users) & dataset_pd['item'].isin(valid_items)]
# filtered_df.count

In [7]:
# order by user,timestamp 
filtered_df = dataset_pd.sort_values(by=["user", "timestamp"])


In [8]:
def split(df, user_col='user', item_col='item', time_col='timestamp'):

    df = df.sort_values(by=[user_col, time_col])  # 按用户时间排序

    # 获取每个用户的最后一条记录作为 test
    test_df = df.groupby(user_col).tail(1)
    train_df = df.drop(index=test_df.index)

    # 过滤 test 中那些 user/item 不在 train 中的
    train_users = set(train_df[user_col])
    train_items = set(train_df[item_col])

    test_df = test_df[
        test_df[user_col].isin(train_users) &
        test_df[item_col].isin(train_items)
    ]

    return train_df.reset_index(drop=True), test_df.reset_index(drop=True)

In [9]:

train_df, test_df = split(filtered_df,user_col='user', item_col='item', time_col='timestamp')

print(f"Train size: {len(train_df)}")
print(f"Test size: {len(test_df)}")


Train size: 309708
Test size: 49424


In [10]:
# main tain a map from new id to old id, new id for constructing matrix
user2id = {u: i for i, u in enumerate(filtered_df['user'].unique())}
item2id = {i: j for j, i in enumerate(filtered_df['item'].unique())}

# apply to train_df and test_df
train_df['user_id'] = train_df['user'].map(user2id)
train_df['item_id'] = train_df['item'].map(item2id)
test_df['user_id'] = test_df['user'].map(user2id)
test_df['item_id'] = test_df['item'].map(item2id)

num_users = len(user2id)
num_items = len(item2id)


In [11]:

# ========== Load cover_emb128.lmdb and build cover_matrix ==========

import lmdb
import numpy as np
import os

def _decode_key(key_bytes: bytes) -> int:
    # Assume key stored as ascii string of integer
    return int(key_bytes.decode('ascii'))

def load_cover_matrix(lmdb_path: str, num_items: int, vector_dim: int = 128, dtype=np.float32):
    """
    加载封面向量 LMDB，key 从 1 到 num_items，对应行号。
    第 0 行保留为 padding，全 0。

    返回:
        np.ndarray, shape = (num_items + 1, vector_dim)
    """
    import lmdb
    import numpy as np

    mat = np.zeros((num_items + 1, vector_dim), dtype=dtype)  # index 0 is reserved padding
    env = lmdb.open(lmdb_path, readonly=True, subdir=False, lock=False, readahead=False)

    with env.begin() as txn:
        for idx in range(1, num_items + 1):
            key = str(idx).encode()
            val = txn.get(key)
            if val is None:
                continue
            vec = np.frombuffer(val, dtype=dtype)
            if vec.size != vector_dim:
                raise ValueError(f"Item {idx} vector dim {vec.size} != {vector_dim}")
            mat[idx] = vec

    env.close()
    return mat

# ----- Path to LMDB -----
lmdb_path = r"D:/VideoRecSystem/MicroLens/cover_emb128.lmdb"
cover_matrix = load_cover_matrix(lmdb_path, num_items=num_items, vector_dim=128)
print('cover_matrix shape:', cover_matrix.shape)


cover_matrix shape: (19221, 128)


In [12]:

# DSSM implementation that directly concatenates ID embedding with cover embedding

import torch
import torch.nn as nn
import torch.nn.functional as F

class DSSM(nn.Module):
    def __init__(
        self,
        num_users: int,
        num_items: int,
        id_emb_dim: int,
        cover_matrix,
        hidden_dims=(128, 64),
        dropout: float = 0.2,
        padding_idx: int = 0,
        freeze_cover: bool = True,
    ):
        super().__init__()

        # User ID embedding
        self.user_emb = nn.Embedding(num_users + 1, id_emb_dim, padding_idx=padding_idx)
        nn.init.xavier_uniform_(self.user_emb.weight)

        # Item ID embedding
        self.item_id_emb = nn.Embedding(num_items + 1, id_emb_dim, padding_idx=padding_idx)
        nn.init.xavier_uniform_(self.item_id_emb.weight)

        # Cover embedding lookup
        cover_tensor = torch.tensor(cover_matrix, dtype=torch.float32)
        self.cover_emb = nn.Embedding.from_pretrained(
            cover_tensor,
            freeze=freeze_cover,
            padding_idx=padding_idx
        )
        self.item_input_dim = id_emb_dim + cover_tensor.shape[1]

        def mlp_block(in_dim):
            layers = []
            for h in hidden_dims:
                layers.append(nn.Linear(in_dim, h))
                layers.append(nn.ReLU())
                layers.append(nn.Dropout(dropout))
                in_dim = h
            return nn.Sequential(*layers)

        self.mlp_user = mlp_block(id_emb_dim)
        self.mlp_item = mlp_block(self.item_input_dim)

    def forward(self, user_ids, item_ids, l2_norm: bool = True):
        u_emb = self.user_emb(user_ids)
        id_vec = self.item_id_emb(item_ids)
        cover_vec = self.cover_emb(item_ids)
        i_emb = torch.cat([id_vec, cover_vec], dim=-1)

        u_vec = self.mlp_user(u_emb)
        i_vec = self.mlp_item(i_emb)

        if l2_norm:
            u_vec = F.normalize(u_vec, p=2, dim=1)
            i_vec = F.normalize(i_vec, p=2, dim=1)

        score = (u_vec * i_vec).sum(dim=1)
        return score, u_vec, i_vec
    def get_item_vec(self, item_ids):
        id_vec = self.item_id_emb(item_ids)
        cover_vec = self.cover_emb(item_ids)
        combined = torch.cat([id_vec, cover_vec], dim=-1)
        return self.mlp_item(combined)
    def get_embedding(self, user_id, item_id):
        return self.forward(user_id, item_id)[0]


In [13]:
# -----------------------------------------------------------
# bpr_loss: 直接接受分数，内部求 mean，返回标量 Tensor
# -----------------------------------------------------------
def bpr_loss(pos_scores, neg_scores):
    """
    pos_scores, neg_scores: shape = (B,)
    返回标量 Tensor
    """
    return -torch.log(torch.sigmoid(pos_scores - neg_scores)).mean()


In [14]:
# Negative sampling function
users = train_df['user_id'].unique()
user_pos_dict = train_df.groupby('user_id')['item_id'].apply(set).to_dict()

def sample_batch(num_items, batch_size):

    batch_users = np.random.choice(users, size=batch_size)

    user_ids, pos_ids, neg_ids = [], [], []
    for u in batch_users:
        pos_items = list(user_pos_dict[u])
        pos = random.choice(pos_items)
        while True:
            neg = random.randint(0, num_items - 1)
            if neg not in user_pos_dict[u]:
                break
        user_ids.append(u)
        pos_ids.append(pos)
        neg_ids.append(neg)

    return torch.LongTensor(user_ids), torch.LongTensor(pos_ids), torch.LongTensor(neg_ids)

In [15]:
import torch
import numpy as np

# --------- 采样函数保持不变 (示意) ------------
# def sample_batch(train_df, num_items, batch_size): ...

# --------- BPR 损失保持不变 -------------------
# def bpr_loss(user_vec, pos_vec, neg_vec): ...

# -----------------------------------------------------------

# -----------------------------------------------------------
def train_model(model,
                train_df,
                num_items,
                epochs=10,
                batch_size=1024,
                lr=1e-3,
                print_every=1,
                max_grad_norm=None,
                device=None):

    if device is None:
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(1, epochs + 1):
        model.train()         # train model

        # —— epoch 内建议跑多个 mini-batch ——
        #   下面示例：每个 epoch 跑 len(train_df)//batch_size 个 batch
        num_steps = max(1, len(train_df) // batch_size)

        epoch_loss = 0.0
        for _ in range(num_steps):
            # 1.负采样
            user_ids, pos_ids, neg_ids = sample_batch( num_items, batch_size)

            # 2.搬设备
            user_ids = user_ids.to(device)   # 已经是 LongTensor
            pos_ids  = pos_ids.to(device)
            neg_ids  = neg_ids.to(device)

            # 只跑一次 user 塔
            u_vec = model.mlp_user(model.user_emb(user_ids))          # (B,d)
            pos_vec = model.get_item_vec(pos_ids)
            neg_vec = model.get_item_vec(neg_ids)


            # 归一化（与 forward 保持一致）
            u_vec = F.normalize(u_vec, p=2, dim=1)
            pos_vec = F.normalize(pos_vec, p=2, dim=1)
            neg_vec = F.normalize(neg_vec, p=2, dim=1)

            loss = bpr_loss((u_vec * pos_vec).sum(-1), (u_vec * neg_vec).sum(-1))

            # 5.反向 & 更新
            optimizer.zero_grad()
            loss.backward()
            if max_grad_norm is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()

            epoch_loss += loss.item()

        # —— epoch 结束，打印日志 ——
        avg_loss = epoch_loss / num_steps
        if epoch % print_every == 0:
            print(f"[Epoch {epoch:02d}/{epochs}]  avg BPR Loss = {avg_loss:.4f}")

    return



In [16]:

def evaluate_ranking(
        test_df,              # DataFrame, 必含 user_id / item_id
        train_df,             # DataFrame, 用来构建用户→已交互物品集合
        score_fn,             # callable(users_tensor, items_tensor) → np.array
        num_items,            # 物品总数
        k=10,                 # Hit@K / NDCG@K
        num_neg=100,          # 每个正样本采多少负样本
        user_col='user_id',
        item_col='item_id',
        seed=42
    ):
    """
    不依赖具体模型，只要提供 score_fn 就能评估。
    score_fn: 接收 (user_tensor, item_tensor) 并返回同长度的 Numpy 分数向量。
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # 用户历史，用于采负样本 & 过滤
    train_user_dict = (
        train_df.groupby(user_col)[item_col].apply(set).to_dict()
    )

    hits, ndcgs = [], []

    for _, row in test_df.iterrows():
        u = int(row[user_col])
        pos_item = int(row[item_col])

        # ---------- 负采样 ----------
        neg_items = set()
        while len(neg_items) < num_neg:
            neg = random.randint(0, num_items - 1)
            if neg not in train_user_dict.get(u, set()) and neg != pos_item:
                neg_items.add(neg)

        item_candidates = list(neg_items) + [pos_item]

        # ---------- 评分 ----------
        users_t  = torch.LongTensor([u] * len(item_candidates))
        items_t  = torch.LongTensor(item_candidates)
        scores   = score_fn(users_t, items_t)        # ← 只依赖 score_fn
        rank_idx = np.argsort(scores)[::-1]          # 降序
        ranked_items = [item_candidates[i] for i in rank_idx]

        # ---------- 指标 ----------
        if pos_item in ranked_items[:k]:
            hits.append(1)
            rank_pos = ranked_items.index(pos_item)
            ndcgs.append(1 / np.log2(rank_pos + 2))
        else:
            hits.append(0)
            ndcgs.append(0)

    hit_rate = float(np.mean(hits))
    ndcg     = float(np.mean(ndcgs))
    return hit_rate, ndcg

In [17]:
# Top-K recommendation for a user
def recommend_top_k(model, user_id, train_df, k=10):
    model.eval()
    user_emb, item_emb = model.get_embedding()
    seen_items = set(train_df[train_df['user_id'] == user_id]['item_id'])
    all_items = torch.arange(model.num_items)
    scores = model.predict(torch.LongTensor([user_id] * model.num_items), all_items).detach().numpy()

    ranked_items = np.argsort(scores)[::-1]
    recommended = [i for i in ranked_items if i not in seen_items][:k]
    return recommended

In [18]:
# Save and load model

def save_model(model, path):
    torch.save(model.state_dict(), path)

def load_model(model, path):
    model.load_state_dict(torch.load(path))
    model.eval()

In [19]:

# ----- Instantiate DSSM with cover embeddings -----
model = DSSM(
    num_users=num_users,
    num_items=num_items,
    id_emb_dim=64,            # keep same ID embedding dim as before
    cover_matrix=cover_matrix,
)
model.to(device)

train_model(model=model, epochs=30, train_df=train_df, num_items=num_items)


[Epoch 01/30]  avg BPR Loss = 0.6109
[Epoch 02/30]  avg BPR Loss = 0.5879
[Epoch 03/30]  avg BPR Loss = 0.5839
[Epoch 04/30]  avg BPR Loss = 0.5801
[Epoch 05/30]  avg BPR Loss = 0.5762
[Epoch 06/30]  avg BPR Loss = 0.5702
[Epoch 07/30]  avg BPR Loss = 0.5667
[Epoch 08/30]  avg BPR Loss = 0.5639
[Epoch 09/30]  avg BPR Loss = 0.5622
[Epoch 10/30]  avg BPR Loss = 0.5597
[Epoch 11/30]  avg BPR Loss = 0.5597
[Epoch 12/30]  avg BPR Loss = 0.5582
[Epoch 13/30]  avg BPR Loss = 0.5573
[Epoch 14/30]  avg BPR Loss = 0.5574
[Epoch 15/30]  avg BPR Loss = 0.5565
[Epoch 16/30]  avg BPR Loss = 0.5559
[Epoch 17/30]  avg BPR Loss = 0.5562
[Epoch 18/30]  avg BPR Loss = 0.5557
[Epoch 19/30]  avg BPR Loss = 0.5563
[Epoch 20/30]  avg BPR Loss = 0.5554
[Epoch 21/30]  avg BPR Loss = 0.5546
[Epoch 22/30]  avg BPR Loss = 0.5546
[Epoch 23/30]  avg BPR Loss = 0.5538
[Epoch 24/30]  avg BPR Loss = 0.5539
[Epoch 25/30]  avg BPR Loss = 0.5546
[Epoch 26/30]  avg BPR Loss = 0.5543
[Epoch 27/30]  avg BPR Loss = 0.5542
[

In [20]:
def make_popularity_score_fn(train_df, item_col='item_id'):
    item_cnt = Counter(train_df[item_col])
    default_score = min(item_cnt.values()) - 1  # 给没出现过的物品一个更低分
    def _score_fn(users_t, items_t):
        return np.array([item_cnt.get(int(i), default_score) for i in items_t])
    return _score_fn

In [21]:
def random_score_fn(users_t, items_t):
    # 随机给每个 items_t 一个分数；users_t 不使用，但必须接收
    return np.random.rand(len(items_t))

In [22]:
def make_DSSM_score_fn(model):
    # model.eval()
    # with torch.no_grad():
    #     score = model.get_embedding(users_t,items_t)
    #     user_emb = user_emb.to("cuda" if torch.cuda.is_available() else "cpu")
    #     item_emb = item_emb.to("cuda" if torch.cuda.is_available() else "cpu")

    def score_fn(users_t, items_t):
        model.eval()
        with torch.no_grad():
         users_t = users_t.to(device)
         items_t = items_t.to(device)
         scores = model.get_embedding(users_t,items_t)
         return scores.cpu().detach().numpy()
        # u_vec = user_emb[users_t]
        # i_vec = item_emb[items_t]
        # return torch.sum(u_vec * i_vec, dim=1).detach().cpu().numpy()
    return score_fn

In [23]:

# ---------------- DSSM（或其他模型）------------
score_fn_dssm = make_DSSM_score_fn(model)
hit_dssm, ndcg_dssm = evaluate_ranking(
    test_df, train_df, score_fn_dssm,
    num_items=num_items, k=10
)
print(f"DSSM   Hit@10={hit_dssm:.4f}  NDCG@10={ndcg_dssm:.4f}")

# ---------------- baseline：Popular ----------------
pop_score_fn  = make_popularity_score_fn(train_df)
hit_pop, ndcg_pop = evaluate_ranking(
    test_df, train_df, pop_score_fn,
    num_items=num_items, k=10
)
print(f"Popular  Hit@10={hit_pop:.4f}  NDCG@10={ndcg_pop:.4f}")

# ---------------- baseline：Random -----------------
hit_rand, ndcg_rand = evaluate_ranking(
    test_df, train_df, random_score_fn,
    num_items=num_items, k=10
)
print(f"Random   Hit@10={hit_rand:.4f}  NDCG@10={ndcg_rand:.4f}")



DSSM   Hit@10=0.2697  NDCG@10=0.1236
Popular  Hit@10=0.2528  NDCG@10=0.1262
Random   Hit@10=0.0996  NDCG@10=0.0455
