In [1]:
import random
import pandas as pd
from collections import Counter
import torch
from sklearn.model_selection import train_test_split
from networkx.readwrite.json_graph import adjacency

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
dataset_pd = pd.read_csv('D:\\VideoRecSystem\\MicroLens\\DataSet\\MicroLens-50k_pairs.csv')
# dataset_pd = pd.read_csv('MicroLens-50k_pairs.csv')

In [4]:
dataset_pd.head(10)

Unnamed: 0,user,item,timestamp
0,36121,9580,1583378629552
1,26572,9580,1583436719018
2,37550,9580,1584412681021
3,14601,9580,1584848802432
4,15061,9580,1585388171106
5,6364,9580,1585390736041
6,3542,9580,1585404918503
7,21038,9580,1590144594477
8,12538,14631,1634867362929
9,47592,14631,1634872254913


In [5]:
dataset_pd.count

<bound method DataFrame.count of          user   item      timestamp
0       36121   9580  1583378629552
1       26572   9580  1583436719018
2       37550   9580  1584412681021
3       14601   9580  1584848802432
4       15061   9580  1585388171106
...       ...    ...            ...
359703  48702   1363  1662984066647
359704  27203   7291  1662984082974
359705  29261  19649  1662984103874
359706  28341  19188  1662984123833
359707  38967   7254  1662984132429

[359708 rows x 3 columns]>

In [6]:
user_counts = dataset_pd['user'].value_counts()
item_counts = dataset_pd['item'].value_counts()
valid_users = user_counts[user_counts > 3].index
valid_items = item_counts[item_counts > 3].index
filtered_df = dataset_pd[dataset_pd['user'].isin(valid_users) & dataset_pd['item'].isin(valid_items)]
filtered_df.count

<bound method DataFrame.count of          user   item      timestamp
0       36121   9580  1583378629552
1       26572   9580  1583436719018
2       37550   9580  1584412681021
3       14601   9580  1584848802432
4       15061   9580  1585388171106
...       ...    ...            ...
359703  48702   1363  1662984066647
359704  27203   7291  1662984082974
359705  29261  19649  1662984103874
359706  28341  19188  1662984123833
359707  38967   7254  1662984132429

[352649 rows x 3 columns]>

In [7]:
dataset_pd = filtered_df.count

In [8]:
# order by user,timestamp 
filtered_df = filtered_df.sort_values(by=["user", "timestamp"])


In [9]:
def split(df, user_col='user', item_col='item', time_col='timestamp'):

    df = df.sort_values(by=[user_col, time_col])  # 按用户时间排序

    # 获取每个用户的最后一条记录作为 test
    test_df = df.groupby(user_col).tail(1)
    train_df = df.drop(index=test_df.index)

    # 过滤 test 中那些 user/item 不在 train 中的
    train_users = set(train_df[user_col])
    train_items = set(train_df[item_col])

    test_df = test_df[
        test_df[user_col].isin(train_users) &
        test_df[item_col].isin(train_items)
    ]

    return train_df.reset_index(drop=True), test_df.reset_index(drop=True)

In [10]:

train_df, test_df = split(filtered_df,user_col='user', item_col='item', time_col='timestamp')

print(f"Train size: {len(train_df)}")
print(f"Test size: {len(test_df)}")


Train size: 302650
Test size: 49681


In [11]:
# main tain a map from new id to old id, new id for constructing matrix
user2id = {u: i for i, u in enumerate(filtered_df['user'].unique())}
item2id = {i: j for j, i in enumerate(filtered_df['item'].unique())}

# apply to train_df and test_df
train_df['user_id'] = train_df['user'].map(user2id)
train_df['item_id'] = train_df['item'].map(item2id)
test_df['user_id'] = test_df['user'].map(user2id)
test_df['item_id'] = test_df['item'].map(item2id)

num_users = len(user2id)
num_items = len(item2id)


In [12]:
import numpy as np
import scipy.sparse as sp
import torch

def build_adj_matrix(df, num_users, num_items):
    rows = df['user_id'].values
    cols = df['item_id'].values
    data = np.ones(len(df))
    # set interaction of user-item as 1, other as 0
    R = sp.coo_matrix((data, (rows, cols)), shape=(num_users, num_items))

    # construct symetric matrix A
    upper = sp.hstack([sp.csr_matrix((num_users, num_users)), R])
    lower = sp.hstack([R.T, sp.csr_matrix((num_items, num_items))])
    A = sp.vstack([upper, lower])

    # normalization A → Ĥ = D^{-1/2} A D^{-1/2}
    rowsum = np.array(A.sum(1)).flatten()
    d_inv_sqrt = np.power(rowsum, -0.5)
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    D_inv_sqrt = sp.diags(d_inv_sqrt)
    A_norm = D_inv_sqrt @ A @ D_inv_sqrt

    # transform to torch.sparse
    A_norm = A_norm.tocoo()
    indices = torch.LongTensor([A_norm.row, A_norm.col])
    values = torch.FloatTensor(A_norm.data)
    return torch.sparse_coo_tensor(indices, values, A_norm.shape)


In [13]:
adj_torch = build_adj_matrix(train_df, num_users, num_items)

  d_inv_sqrt = np.power(rowsum, -0.5)
  indices = torch.LongTensor([A_norm.row, A_norm.col])


In [15]:

# ====== Load 128‑d cover embeddings from LMDB and align to item2id ======
import lmdb, numpy as np, torch

lmdb_path = r"D:\VideoRecSystem\MicroLens\cover_emb128.lmdb"   # ← 修改为实际路径（可 Pathlib）

def build_image_matrix(lmdb_path: str, item2id: dict, dtype=np.float32):
    env = lmdb.open(lmdb_path, subdir=False, readonly=True, lock=False, readahead=False)
    img_dim = 128
    num_items = len(item2id)
    mat = np.zeros((num_items, img_dim), dtype=dtype)
    with env.begin() as txn:
        for orig_id, new_id in item2id.items():
            key = str(orig_id).encode()
            val = txn.get(key)
            if val is None:
                continue          # 若缺失则保持 0
            vec = np.frombuffer(val, dtype=dtype)
            if vec.size != img_dim:
                raise ValueError(f"Item {orig_id} vec dim {vec.size} != 128")
            mat[new_id] = vec
    env.close()
    return torch.from_numpy(mat)

image_tensor = build_image_matrix(lmdb_path, item2id)          # shape = (num_items,128)


In [16]:

# ====== LightGCN WITH Image Embedding ======
import torch
import torch.nn as nn
import torch.nn.functional as F

class LightGCN(nn.Module):
    """LightGCN that concatenates id embedding with pre‑computed 128‑d image vectors."""
    def __init__(self, num_users, num_items, embedding_dim, n_layers, adjacency,
                 image_tensor=None, freeze_image=True):
        super().__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.id_dim     = embedding_dim
        self.img_dim    = 0 if image_tensor is None else image_tensor.size(1)
        self.embedding_dim = self.id_dim + self.img_dim
        self.n_layers   = n_layers
        self.adjacency  = adjacency  # torch.sparse

        # --- user embedding (id part + optional img‑pad) ---
        self.embedding_user = nn.Embedding(num_users, self.embedding_dim)
        nn.init.xavier_uniform_(self.embedding_user.weight)

        # --- item id embedding ---
        self.embedding_item_id = nn.Embedding(num_items, self.id_dim)
        nn.init.xavier_uniform_(self.embedding_item_id.weight)

        # --- item image embedding ---
        if image_tensor is not None:
            self.embedding_item_img = nn.Embedding.from_pretrained(
                image_tensor, freeze=freeze_image
            )
        else:
            self.embedding_item_img = None

    # --------------------------------------------------------------
    def propagate(self, all_emb, adj_mat):
        emb_list = [all_emb]
        for _ in range(self.n_layers):
            all_emb = torch.sparse.mm(adj_mat, all_emb)
            emb_list.append(all_emb)
        return torch.stack(emb_list, dim=1).mean(dim=1)

    # --------------------------------------------------------------
    def forward(self):
        # ----- build initial concatenated vector -----
        if self.embedding_item_img is not None:
            item_emb = torch.cat([self.embedding_item_id.weight,
                                  self.embedding_item_img.weight], dim=1)
        else:
            zeros = torch.zeros(self.num_items, self.img_dim, device=self.embedding_item_id.weight.device)
            item_emb = torch.cat([self.embedding_item_id.weight, zeros], dim=1)

        all_emb_0 = torch.cat([self.embedding_user.weight, item_emb], dim=0)
        all_emb_L = self.propagate(all_emb_0, self.adjacency)

        user_emb_final, item_emb_final = torch.split(
            all_emb_L, [self.num_users, self.num_items]
        )
        return user_emb_final, item_emb_final

    # --------------------------------------------------------------
    def get_embedding(self):
        return self.forward()

    def predict(self, user_indices, item_indices):
        u_e, i_e = self.forward()
        return (u_e[user_indices] * i_e[item_indices]).sum(dim=1)


In [17]:
def bpr_loss(user_emb, pos_item_emb, neg_item_emb):
    pos_scores = (user_emb * pos_item_emb).sum(dim=1)
    neg_scores = (user_emb * neg_item_emb).sum(dim=1)
    loss = -torch.log(torch.sigmoid(pos_scores - neg_scores)).mean()
    return loss

In [18]:
# Negative sampling function
def sample_batch(train_df, num_items, batch_size):
    users = train_df['user_id'].unique()
    batch_users = np.random.choice(users, size=batch_size)
    user_pos_dict = train_df.groupby('user_id')['item_id'].apply(set).to_dict()

    user_ids, pos_ids, neg_ids = [], [], []
    for u in batch_users:
        pos_items = list(user_pos_dict[u])
        pos = random.choice(pos_items)
        while True:
            neg = random.randint(0, num_items - 1)
            if neg not in user_pos_dict[u]:
                break
        user_ids.append(u)
        pos_ids.append(pos)
        neg_ids.append(neg)

    return torch.LongTensor(user_ids), torch.LongTensor(pos_ids), torch.LongTensor(neg_ids)

In [19]:
import torch
import numpy as np

# --------- 采样函数保持不变 (示意) ------------
# def sample_batch(train_df, num_items, batch_size): ...

# --------- BPR 损失保持不变 -------------------
# def bpr_loss(user_vec, pos_vec, neg_vec): ...

def train_model(model,
                train_df,
                num_items,
                epochs=10,
                batch_size=1024,
                lr=1e-3,
                print_every=1,
                max_grad_norm=None,          # =None 时不裁剪
                device=None):
    """
    训练 LightGCN (或其它 BPR 模型) 的通用函数
    ------------------------------------------------
    • train_df      : pandas DataFrame，含 user_id / item_id
    • num_items     : 物品总数
    • device        : torch.device；默认为 'cuda' (若可用) 否则 'cpu'
    • max_grad_norm : 梯度裁剪阈值；避免梯度爆炸，可选
    """
    # -------- 设备 ----------
    if device is None:
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    model = model.to(device)
    if hasattr(model, "adjacency"):               # adjacency 可能是稀疏张量
        model.adjacency = model.adjacency.to(device)

    # -------- 优化器 ----------
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # -------- 训练循环 ----------
    for epoch in range(1, epochs + 1):
        model.train()

        # ==== 采样一个 mini-batch ====
        user_ids, pos_ids, neg_ids = sample_batch(train_df, num_items, batch_size)

        # ==== Tensor 化并搬设备 ====
        user_ids = torch.LongTensor(user_ids).to(device)
        pos_ids  = torch.LongTensor(pos_ids).to(device)
        neg_ids  = torch.LongTensor(neg_ids).to(device)

        # ==== 前向、计算损失 ====
        user_emb, item_emb = model.get_embedding()          # already on device
        loss = bpr_loss(user_emb[user_ids],
                        item_emb[pos_ids],
                        item_emb[neg_ids])

        # ==== 反向 ====
        optimizer.zero_grad()
        loss.backward()

        # （可选）梯度裁剪
        if max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        optimizer.step()

        # ==== 打印 ====
        if epoch % print_every == 0:
            print(f"[Epoch {epoch}/{epochs}]  BPR Loss = {loss.item():.4f}")

    # -------- 训练完返回最终 loss (方便日志) --------
    return float(loss.item())


In [20]:




def evaluate_ranking(
        test_df,              # DataFrame, 必含 user_id / item_id
        train_df,             # DataFrame, 用来构建用户→已交互物品集合
        score_fn,             # callable(users_tensor, items_tensor) → np.array
        num_items,            # 物品总数
        k=10,                 # Hit@K / NDCG@K
        num_neg=100,          # 每个正样本采多少负样本
        user_col='user_id',
        item_col='item_id',
        seed=42
    ):
    """
    不依赖具体模型，只要提供 score_fn 就能评估。
    score_fn: 接收 (user_tensor, item_tensor) 并返回同长度的 Numpy 分数向量。
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # 用户历史，用于采负样本 & 过滤
    train_user_dict = (
        train_df.groupby(user_col)[item_col].apply(set).to_dict()
    )

    hits, ndcgs = [], []

    for _, row in test_df.iterrows():
        u = int(row[user_col])
        pos_item = int(row[item_col])

        # ---------- 负采样 ----------
        neg_items = set()
        while len(neg_items) < num_neg:
            neg = random.randint(0, num_items - 1)
            if neg not in train_user_dict.get(u, set()) and neg != pos_item:
                neg_items.add(neg)

        item_candidates = list(neg_items) + [pos_item]

        # ---------- 评分 ----------
        users_t  = torch.LongTensor([u] * len(item_candidates))
        items_t  = torch.LongTensor(item_candidates)
        scores   = score_fn(users_t, items_t)        # ← 只依赖 score_fn
        rank_idx = np.argsort(scores)[::-1]          # 降序
        ranked_items = [item_candidates[i] for i in rank_idx]

        # ---------- 指标 ----------
        if pos_item in ranked_items[:k]:
            hits.append(1)
            rank_pos = ranked_items.index(pos_item)
            ndcgs.append(1 / np.log2(rank_pos + 2))
        else:
            hits.append(0)
            ndcgs.append(0)

    hit_rate = float(np.mean(hits))
    ndcg     = float(np.mean(ndcgs))
    return hit_rate, ndcg

In [21]:
# Top-K recommendation for a user
def recommend_top_k(model, user_id, train_df, k=10):
    model.eval()
    user_emb, item_emb = model.get_embedding()
    seen_items = set(train_df[train_df['user_id'] == user_id]['item_id'])
    all_items = torch.arange(model.num_items)
    scores = model.predict(torch.LongTensor([user_id] * model.num_items), all_items).detach().numpy()

    ranked_items = np.argsort(scores)[::-1]
    recommended = [i for i in ranked_items if i not in seen_items][:k]
    return recommended

In [22]:
# Save and load model

def save_model(model, path):
    torch.save(model.state_dict(), path)

def load_model(model, path):
    model.load_state_dict(torch.load(path))
    model.eval()

In [23]:

# ===== Instantiate LightGCN with image vectors =====
model = LightGCN(num_users, num_items,
                 embedding_dim=128,       # id embedding dim
                 n_layers=2,
                 adjacency=adj_torch,
                 image_tensor=image_tensor,
                 freeze_image=True)
model.to(device)
train_model(model=model, train_df=train_df, epochs=300,
            batch_size=1024, lr=0.005, num_items=model.num_items)


[Epoch 1/300]  BPR Loss = 0.6930
[Epoch 2/300]  BPR Loss = 0.6928
[Epoch 3/300]  BPR Loss = 0.6925
[Epoch 4/300]  BPR Loss = 0.6921
[Epoch 5/300]  BPR Loss = 0.6915
[Epoch 6/300]  BPR Loss = 0.6907
[Epoch 7/300]  BPR Loss = 0.6904
[Epoch 8/300]  BPR Loss = 0.6891
[Epoch 9/300]  BPR Loss = 0.6874
[Epoch 10/300]  BPR Loss = 0.6864
[Epoch 11/300]  BPR Loss = 0.6850
[Epoch 12/300]  BPR Loss = 0.6835
[Epoch 13/300]  BPR Loss = 0.6812
[Epoch 14/300]  BPR Loss = 0.6778
[Epoch 15/300]  BPR Loss = 0.6775
[Epoch 16/300]  BPR Loss = 0.6744
[Epoch 17/300]  BPR Loss = 0.6746
[Epoch 18/300]  BPR Loss = 0.6695
[Epoch 19/300]  BPR Loss = 0.6643
[Epoch 20/300]  BPR Loss = 0.6608
[Epoch 21/300]  BPR Loss = 0.6640
[Epoch 22/300]  BPR Loss = 0.6590
[Epoch 23/300]  BPR Loss = 0.6557
[Epoch 24/300]  BPR Loss = 0.6549
[Epoch 25/300]  BPR Loss = 0.6470
[Epoch 26/300]  BPR Loss = 0.6460
[Epoch 27/300]  BPR Loss = 0.6462
[Epoch 28/300]  BPR Loss = 0.6412
[Epoch 29/300]  BPR Loss = 0.6305
[Epoch 30/300]  BPR Los

0.10271526128053665

In [24]:
def random_score_fn(users_t, items_t):
    # 随机给每个 items_t 一个分数；users_t 不使用，但必须接收
    return np.random.rand(len(items_t))

In [25]:
def make_popularity_score_fn(train_df, item_col='item_id'):
    item_cnt = Counter(train_df[item_col])
    default_score = min(item_cnt.values()) - 1  # 给没出现过的物品一个更低分
    def _score_fn(users_t, items_t):
        return np.array([item_cnt.get(int(i), default_score) for i in items_t])
    return _score_fn

In [26]:
def make_lightgcn_score_fn(model):
    model.eval()
    with torch.no_grad():
        user_emb, item_emb = model.get_embedding()
        user_emb = user_emb.to("cuda" if torch.cuda.is_available() else "cpu")
        item_emb = item_emb.to("cuda" if torch.cuda.is_available() else "cpu")

    def score_fn(users_t, items_t):
        u_vec = user_emb[users_t]
        i_vec = item_emb[items_t]
        return torch.sum(u_vec * i_vec, dim=1).cpu().numpy()
    return score_fn


In [27]:
pop_score_fn  = make_popularity_score_fn(train_df)
# ---------------- baseline：Popular ----------------
hit_pop, ndcg_pop = evaluate_ranking(
    test_df, train_df, pop_score_fn,
    num_items=model.num_items, k=10
)
print(f"Popular  Hit@10={hit_pop:.4f}  NDCG@10={ndcg_pop:.4f}")

# ---------------- baseline：Random -----------------
hit_rand, ndcg_rand = evaluate_ranking(
    test_df, train_df, random_score_fn,
    num_items=model.num_items, k=10
)
print(f"Random   Hit@10={hit_rand:.4f}  NDCG@10={ndcg_rand:.4f}")

# ---------------- LightGCN（或其他模型）------------
score_fn_gcn = make_lightgcn_score_fn(model)
hit_gcn, ndcg_gcn = evaluate_ranking(
    test_df, train_df, score_fn_gcn,
    num_items=model.num_items, k=10
)
print(f"LightGCN Hit@10={hit_gcn:.4f}  NDCG@10={ndcg_gcn:.4f}")

Popular  Hit@10=0.2166  NDCG@10=0.1081
Random   Hit@10=0.0996  NDCG@10=0.0455
LightGCN Hit@10=0.4949  NDCG@10=0.2983
