In [1]:
from google.colab import drive
import shutil
import os
def copy_from_drive(src_path, dst_path):

    if os.path.exists(dst_path):
        print(f"skip:{dst_path} exists")
        return

    if os.path.isdir(src_path):
        shutil.copytree(src_path, dst_path)
    elif os.path.isfile(src_path):
        shutil.copy(src_path, dst_path)

drive.mount('/content/drive')
copy_from_drive('/content/drive/MyDrive/tool', '/content/tool')
copy_from_drive('/content/drive/MyDrive/MicroLens-50k_pairs.csv','/content/MicroLens-50k_pairs.csv')

Mounted at /content/drive


In [2]:
from datetime import datetime
from tool import evaluate
from tool import preprocess
from tool import customdataset
!pip install faiss-cpu
import faiss
import scipy.sparse as sp
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import random


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m64.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
# dataset_pd = pd.read_csv('D:\\VideoRecSystem\\MicroLens\\DataSet\\MicroLens-50k_pairs.csv')
path = 'MicroLens-50k_pairs.csv'
user = 'user'
item = 'item'
user_id = 'user_id'
item_id = 'item_id'
timestamp = 'timestamp'
save_dir = './embeddings'
top_k = 10
num_workers = 10
k_neg = 10
# path = pd.read_csv('MicroLens-50k_pairs.csv')
# dataset_pd = pd.read_csv('MicroLens-50k_pairs.csv')

In [5]:
dataset_pd,num_users,num_items = preprocess.openAndSort(path,user_id=user,item_id=item,timestamp='timestamp')

dataset base information：
- number of users：50000
- number of items：19220
- number of rows：359708


In [6]:

train_df, test_df = preprocess.split(dataset_pd,user, item, timestamp)
print(f"Train size: {len(train_df)}")
print(f"Test size: {len(test_df)}")

Train size: 309708
Test size: 49424


In [7]:
# maintain a map from new id to old id, new id for constructing matrix
user2id = {u: i for i, u in enumerate(dataset_pd[user].unique())}
item2id = {i: j for j, i in enumerate(dataset_pd[item].unique())}

# apply to train_df and test_df
train_df[user_id] = train_df[user].map(user2id)
train_df[item_id] = train_df[item].map(item2id)
test_df[user_id] = test_df[user].map(user2id)
test_df[item_id] = test_df[item].map(item2id)

In [8]:

def build_adj_matrix(df, num_users, num_items ,user_id, item_id):
    rows = df[user_id].values
    cols = df[item_id].values
    data = np.ones(len(df))
    # set interaction of user-item as 1, other as 0
    R = sp.coo_matrix((data, (rows, cols)), shape=(num_users, num_items))

    # construct symetric matrix A
    upper = sp.hstack([sp.csr_matrix((num_users, num_users)), R])
    lower = sp.hstack([R.T, sp.csr_matrix((num_items, num_items))])
    A = sp.vstack([upper, lower])

    # normalization A → Ĥ = D^{-1/2} A D^{-1/2}
    rowsum = np.array(A.sum(1)).flatten()
    d_inv_sqrt = np.power(rowsum, -0.5)
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    D_inv_sqrt = sp.diags(d_inv_sqrt)
    A_norm = D_inv_sqrt @ A @ D_inv_sqrt

    # transform to torch.sparse
    A_norm = A_norm.tocoo()
    indices = torch.LongTensor([A_norm.row, A_norm.col])
    values = torch.FloatTensor(A_norm.data)
    return torch.sparse_coo_tensor(indices, values, A_norm.shape)


In [9]:
adj_torch = build_adj_matrix(train_df, num_users, num_items ,user_id, item_id)

  d_inv_sqrt = np.power(rowsum, -0.5)
  indices = torch.LongTensor([A_norm.row, A_norm.col])


In [10]:
#  LightGCN implementation in PyTorch
class LightGCN(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim, n_layers, adjacency):
        super(LightGCN, self).__init__()
        self.user_emb = None
        self.item_emb = None
        self.num_users = num_users
        self.num_items = num_items
        self.embedding_dim = embedding_dim
        self.n_layers = n_layers
        self.adjacency = adjacency  # torch.sparse format

        self.embedding_user = nn.Embedding(num_users, embedding_dim)
        self.embedding_item = nn.Embedding(num_items, embedding_dim)

        nn.init.xavier_uniform_(self.embedding_user.weight)
        nn.init.xavier_uniform_(self.embedding_item.weight)


    def forward(self):
        all_embeddings = torch.cat([self.embedding_user.weight, self.embedding_item.weight], dim=0)
        embeddings_list = [all_embeddings]

        for _ in range(self.n_layers):
            all_embeddings = torch.sparse.mm(self.adjacency, all_embeddings)
            embeddings_list.append(all_embeddings)

        final_embedding = torch.stack(embeddings_list, dim=1).mean(dim=1)
        user_embedding, item_embedding = torch.split(final_embedding, [self.num_users, self.num_items])

        self.user_emb = user_embedding.detach()
        self.item_emb = item_embedding.detach()

        return user_embedding, item_embedding


    def get_users_embedding(self,user_ids,l2_norm=True):
        u_vec = self.user_emb[user_ids]          # (B, emb_dim)
        if l2_norm:
            u_vec = F.normalize(u_vec, p=2, dim=1)
        return u_vec

    def get_items_embedding(self,item_ids,l2_norm=True):
        i_vec = self.item_emb[item_ids]          # (B, emb_dim)
        if l2_norm:
            i_vec = F.normalize(i_vec, p=2, dim=1)
        return i_vec

    def save_embeddings(self, num_users, num_items, device, save_dir='./embeddings'):
        import os
        import faiss
        os.makedirs(save_dir, exist_ok=True)

        self.eval()
        self.to(device)

        user_ids = torch.arange(num_users, dtype=torch.long, device=device)
        item_ids = torch.arange(num_items, dtype=torch.long, device=device)

        with torch.no_grad():
            user_embeds = self.get_users_embedding(user_ids, l2_norm=True)
            item_embeds = self.get_items_embedding(item_ids, l2_norm=True)

        user_embeds = user_embeds.cpu().numpy().astype(np.float32)
        item_embeds = item_embeds.cpu().numpy().astype(np.float32)

        # 保存向量
        np.save(f"{save_dir}/user_embeddings.npy", user_embeds)
        np.save(f"{save_dir}/item_embeddings.npy", item_embeds)

        # 构建 FAISS index（使用内积）
        dim = item_embeds.shape[1]
        index = faiss.IndexFlatIP(dim)
        index.add(item_embeds)

        faiss.write_index(index, f"{save_dir}/item_index.faiss")
        print("Saved user/item embeddings and FAISS index.")


In [11]:
import torch
import numpy as np

def train_model(model,
                train_df,
                num_items,
                epochs=50,
                batch_size=1024,
                lr=1e-3,
                device=None):
    """
    训练 LightGCN (或其它 BPR 模型) 的通用函数
    ------------------------------------------------
    • train_df      : pandas DataFrame，含 user_id / item_id
    • num_items     : 物品总数
    • device        : torch.device；默认为 'cuda' (若可用) 否则 'cpu'
    • max_grad_norm : 梯度裁剪阈值；避免梯度爆炸，可选
    """
    # -------- 设备 ----------
    if device is None:
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    model = model.to(device)
    if hasattr(model, "adjacency"):               # adjacency 可能是稀疏张量
        model.adjacency = model.adjacency.to(device)

    # -------- 优化器 ----------
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    train_loader = customdataset.build_train_loader_inbatch(train_df, batch_size=batch_size,user_col=user_id, item_col=item_id)

    # -------- 训练循环 ----------
    for epoch in range(1, epochs + 1):
        model.train()
        dt_start = datetime.now()
        epoch_loss = 0.0

        for batch in train_loader:
            user_ids, pos_item_ids = batch
            user_ids = user_ids.to(device).long()
            pos_item_ids = pos_item_ids.to(device).long()


            # 1. 前向传播（返回 user / item 向量）
            user_emb, item_emb = model()
            u_vec = user_emb[user_ids]
            i_vec = item_emb[pos_item_ids]

            # 2. 得分矩阵：每个 user 对所有正 item 的打分
            logits = torch.matmul(u_vec, i_vec.T)  # shape: (B, B)

            # 3. 构造标签：每个 user 的正确 item 在对角线（即位置 i）
            labels = torch.arange(logits.size(0), device=device)  # [0, 1, ..., B-1]

            # 4. Cross Entropy Loss
            loss = F.cross_entropy(logits, labels)

            # 5. 反向传播
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        # 日志
        avg_loss = epoch_loss / len(train_loader)
        dt_end = datetime.now()
        dt = dt_end - dt_start

        print(f"[Epoch {epoch:02d}/{epochs}] avg InBatch Softmax Loss = {avg_loss:.4f}, time = {dt.total_seconds():.2f}s")

    return


In [26]:
model = LightGCN(num_users,num_items,embedding_dim=128,n_layers=2,adjacency=adj_torch)
model.to(device)
train_model(model=model,epochs=10, train_df=train_df,num_items=num_items,batch_size=1024)

[Epoch 01/10] avg InBatch Softmax Loss = 6.5553, time = 3.02s
[Epoch 02/10] avg InBatch Softmax Loss = 5.7569, time = 2.95s
[Epoch 03/10] avg InBatch Softmax Loss = 5.3165, time = 2.96s
[Epoch 04/10] avg InBatch Softmax Loss = 4.9698, time = 2.98s
[Epoch 05/10] avg InBatch Softmax Loss = 4.6297, time = 2.97s
[Epoch 06/10] avg InBatch Softmax Loss = 4.2736, time = 2.99s
[Epoch 07/10] avg InBatch Softmax Loss = 3.8969, time = 2.95s
[Epoch 08/10] avg InBatch Softmax Loss = 3.5051, time = 3.00s
[Epoch 09/10] avg InBatch Softmax Loss = 3.1122, time = 2.96s
[Epoch 10/10] avg InBatch Softmax Loss = 2.7353, time = 2.98s


In [27]:
model.save_embeddings(num_users=num_users,num_items=num_items,device=device,save_dir=save_dir)

Saved user/item embeddings and FAISS index.


In [28]:
test_loader = customdataset.build_test_loader(test_df, num_items ,user_col = user_id, item_col = item_id, batch_size=1024, num_workers=num_workers)
item_pool = list(range(num_items))
faiss_index = faiss.read_index(f"{save_dir}/item_index.faiss")

In [29]:
hr_r, ndcg_r = evaluate.evaluate_random(test_loader, item_pool ,top_k=top_k)
print(f"Random HR@{top_k} = {hr_r:.4f}, nDCG@{top_k} = {ndcg_r:.4f}")
hr_p, ndcg_p = evaluate.evaluate_popular(test_loader, train_df,top_k=top_k)
print(f"Popular HR@{top_k} = {hr_p:.4f}, nDCG@{top_k} = {ndcg_p:.4f}")
hr_m, ndcg_m = evaluate.evaluate_model(test_loader, model, faiss_index, device,top_k=top_k)
print(f"Model   HR@{top_k} = {hr_m:.4f}, nDCG@{top_k} = {ndcg_m:.4f}")


Random HR@10 = 0.0006, nDCG@10 = 0.0003
Popular HR@10 = 0.0029, nDCG@10 = 0.0014
Model   HR@10 = 0.0322, nDCG@10 = 0.0121
