In [1]:
from google.colab import drive
import shutil
import os
def copy_from_drive(src_path, dst_path):
    """Copy a file or a directory tree from ``src_path`` to ``dst_path``.

    The copy is skipped (with a message) when ``dst_path`` already exists, so
    re-running the cell does not copy the data again. A source that is neither
    a file nor a directory is reported instead of being silently ignored.

    Args:
        src_path: Path of the file or directory to copy.
        dst_path: Destination path. Nothing is copied if it already exists.

    Returns:
        None.
    """
    if os.path.exists(dst_path):
        print(f"skip:{dst_path} exists")
        return

    if os.path.isdir(src_path):
        shutil.copytree(src_path, dst_path)
    elif os.path.isfile(src_path):
        # shutil.copy does not create missing parent directories
        # (copytree does), so create them here.
        parent_dir = os.path.dirname(dst_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        shutil.copy(src_path, dst_path)
    else:
        # Surface the problem here rather than failing later with a confusing
        # import or file-not-found error in another cell.
        print(f"warning:{src_path} not found, nothing copied")

# Mount Google Drive, then stage the `tool` package, the pairs CSV and the two
# LMDB embedding stores onto the local Colab disk.
drive.mount('/content/drive')

_DRIVE_ASSETS = [
    ('/content/drive/MyDrive/tool', '/content/tool'),
    ('/content/drive/MyDrive/MicroLens-50k_pairs.csv', '/content/MicroLens-50k_pairs.csv'),
    ('/content/drive/MyDrive/cover_emb128.lmdb', '/content/cover_emb128.lmdb'),
    ('/content/drive/MyDrive/title_emb1024.lmdb', '/content/title_emb1024.lmdb'),
]
for _src, _dst in _DRIVE_ASSETS:
    copy_from_drive(_src, _dst)


Mounted at /content/drive


In [2]:
!pip install faiss-cpu
!pip install lmdb
from tool import preprocess
from tool import customdataset
from tool import evaluate
import faiss
import torch.nn as nn
import numpy as np
import torch
import torch.nn.functional as F
import random
from datetime import datetime

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m73.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0.post1
Collecting lmdb
  Downloading lmdb-1.7.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.3 kB)
Downloading lmdb-1.7.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (299 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m299.6/299.6 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lmdb
Successfully installed lmdb-1.7.3


In [3]:
# Fix the seed through the project helper. Which RNGs it covers is defined in
# tool/preprocess (not visible here); torch, numpy and `random` are seeded
# again explicitly in the hyperparameter cell further down.
preprocess.set_seed(42)

In [4]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [5]:
# --- Input / output locations (relative to the working directory) ---
path = 'MicroLens-50k_pairs.csv'
cover_lmdb_path = 'cover_emb128.lmdb'
title_lmdb_path = 'title_emb1024.lmdb'
save_dir = './embeddings'

# --- Column names ---
# Columns passed to the loading / splitting helpers.
user, item, timestamp = 'user', 'item', 'timestamp'
# Columns that receive the contiguous integer ids built later in the notebook.
user_id, item_id = 'user_id', 'item_id'

# --- Evaluation / loading settings ---
top_k = 10        # cutoff used for HR@k / NDCG@k
num_workers = 10  # worker count handed to the loader builders
k_neg = 10        # not referenced by any of the cells shown here

In [6]:
# Load the interaction CSV through the project helper, which also returns the
# user and item counts. Every column name comes from the config cell so that a
# rename only has to be made in one place (the timestamp column used to be a
# hard-coded literal here).
dataset_pd, num_users, num_items = preprocess.openAndSort(
    path, user_id=user, item_id=item, timestamp=timestamp
)

dataset base information：
- number of users：50000
- number of items：19220
- number of rows：359708


In [7]:
# Split the interactions into train / test with the project helper (the split
# rule itself lives in tool/preprocess and is not visible here).
# NOTE(review): the recorded output gives 309,708 + 49,424 = 359,132 rows,
# 576 fewer than the 359,708 rows reported at load time. Presumably the helper
# drops some held-out rows; confirm that this is intended.
train_df, test_df = preprocess.split(dataset_pd,user, item, timestamp)
print(f"Train size: {len(train_df)}")
print(f"Test size: {len(test_df)}")

Train size: 309708
Test size: 49424


In [8]:
# Map raw user / item identifiers to contiguous integer indices (0..N-1),
# numbered in order of first appearance in dataset_pd. These indices are what
# the embedding tables and the history matrix are addressed with.
user2id = {raw: idx for idx, raw in enumerate(dataset_pd[user].unique())}
item2id = {raw: idx for idx, raw in enumerate(dataset_pd[item].unique())}

# Attach the index columns to both splits. assign() builds new frames instead
# of writing into the objects returned by preprocess.split, which avoids
# SettingWithCopy problems if those are slices of dataset_pd and keeps this
# cell safe to re-run.
train_df = train_df.assign(**{
    user_id: train_df[user].map(user2id),
    item_id: train_df[item].map(item2id),
})
test_df = test_df.assign(**{
    user_id: test_df[user].map(user2id),
    item_id: test_df[item].map(item2id),
})

# Inverse lookup: contiguous item index -> raw item identifier. It is derived
# from item2id, so it covers every item in dataset_pd (not only train_df).
item_id_to_item = {idx: raw for raw, idx in item2id.items()}

In [9]:
# ---------- Hyperparameters ----------
MAX_SEQ_LEN   = 20          # input sequence length
EMBEDDING_DIM = 64          # item / user embedding dimension
HIDDEN_SIZE   = 64          # GRU hidden size (may be the same as EMBEDDING_DIM)
NEG_SAMPLE    = 5           # negatives sampled per positive during training
                            # NOTE(review): not referenced by the training loop shown in this notebook
BATCH_SIZE    = 1024
EPOCHS        = 45
LR            = 1e-3
SEED          = 42
NUM_LAYERS      = 1

# Per-modality settings:
#   LMDB_DIM    - width of the stored modal vectors
#   HIDDEN_SIZE - list of layer widths for the item/modal fusion MLP
#   DROPOUT     - dropout rate used inside that MLP
MODAL = {'COVER':{"LMDB_DIM":128, "HIDDEN_SIZE":[EMBEDDING_DIM],"DROPOUT":0.2} , 'TITLE':{"LMDB_DIM":1024,"HIDDEN_SIZE":[EMBEDDING_DIM],"DROPOUT":0.2}
         ,'COVER-TITLE': {"LMDB_DIM":128+1024, "HIDDEN_SIZE":[EMBEDDING_DIM],"DROPOUT":0.2}}
# Modality used for this run; must be one of the keys of MODAL.
CURRENT_MODAL = "COVER"
MODAL_CONFIG = MODAL[CURRENT_MODAL]
MODAL_HIDDEN_SIZE = MODAL_CONFIG.get('HIDDEN_SIZE')
LMDB_DIM = MODAL_CONFIG.get('LMDB_DIM')
MODAL_DROPOUT = MODAL_CONFIG.get('DROPOUT')
# When True the user vector returned by the model is L2-normalised; it is also
# the default for the item vectors exported by save_embeddings.
L2_NORM =False
# Fusion strategy; the model accepts 'none', 'early' or 'late'.
FUSION_MODE = "late"
# ----------------------------

# ---------- Random seeds ----------
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
# ----------------------------

# ---------- Constants ----------
PAD_IDX  = num_items              # dedicated padding id (does not collide with a real item id)
N_ITEMS  = num_items + 1          # number of embedding rows (including PAD)
# ----------------------------



In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

class GRU4RecBPR(nn.Module):
    """GRU sequence encoder with optional fusion of frozen modal item features.

    The user side encodes a sequence of item ids with a GRU and returns one
    vector per sequence. The item side produces candidate vectors according to
    ``fusion_mode``:

    * ``'none'``  - the item vector is the ID embedding.
    * ``'early'`` - ID embedding and modal vector are concatenated and passed
      through an MLP; the same fused embedding also feeds the GRU.
    * ``'late'``  - the item vector is ``alpha * id + (1 - alpha) * fused``
      with a learnable scalar ``alpha = sigmoid(alpha_param)``; the GRU sees
      ID embeddings only.

    The modal vectors are loaded from LMDB according to the notebook-level
    ``CURRENT_MODAL`` and stored as a buffer (not a trainable parameter).

    Note: the keyword defaults below are bound when this class definition is
    executed, so later edits to the hyperparameter cell only take effect after
    this cell is re-run or when the values are passed explicitly.
    """

    def __init__(self, n_items=N_ITEMS,
                 embedding_dim=EMBEDDING_DIM, hidden_size=HIDDEN_SIZE,
                 modal_hidden_size=MODAL_HIDDEN_SIZE, modal_dropout=MODAL_DROPOUT, lmdb_dim=LMDB_DIM,
                 num_layers=NUM_LAYERS, pad_idx=PAD_IDX,
                 fusion_mode=FUSION_MODE):  # 'none' | 'early' | 'late'
        """Build the ID embedding, the GRU encoder and the modal fusion parts.

        Args:
            n_items: Number of rows of the ID embedding table, PAD row
                included. The default is ``N_ITEMS`` (``num_items + 1``) so
                that the default ``pad_idx`` is a valid row; the previous
                default ``num_items`` made ``padding_idx`` out of range.
            embedding_dim: Width of the item / user vectors.
            hidden_size: GRU hidden size; a bias-free linear projection maps it
                back to ``embedding_dim`` when the two differ.
            modal_hidden_size: Layer widths of the item/modal fusion MLP.
            modal_dropout: Dropout rate inside that MLP.
            lmdb_dim: Width of the modal vectors loaded from LMDB.
            num_layers: Number of stacked GRU layers.
            pad_idx: Item id reserved for padding.
            fusion_mode: ``'none'``, ``'early'`` or ``'late'``.

        Raises:
            AssertionError: If ``fusion_mode`` is not one of the three modes.
            ValueError: If the notebook-level ``CURRENT_MODAL`` is not a
                supported modality.
        """
        super().__init__()
        assert fusion_mode in {'none', 'early', 'late'}
        self.fusion_mode = fusion_mode

        # ---------- Item embedding (ID) ----------
        self.embedding = nn.Embedding(n_items, embedding_dim, padding_idx=pad_idx)
        nn.init.normal_(self.embedding.weight, mean=0.0, std=0.05)
        with torch.no_grad():
            # normal_ overwrote the PAD row as well; reset it to zero.
            self.embedding.weight[self.embedding.padding_idx].zero_()  # PAD -> 0

        # ---------- User encoder ----------
        self.gru = nn.GRU(embedding_dim, hidden_size, num_layers=num_layers, batch_first=True)
        self.proj = (nn.Linear(hidden_size, embedding_dim, bias=False)
                     if hidden_size != embedding_dim else nn.Identity())

        # ---------- Modal vectors (frozen) ----------
        if CURRENT_MODAL == 'COVER':
            modal_emb_tensor = preprocess.load_tensor_from_lmdb(
                cover_lmdb_path, num_items, item_id_to_item, lmdb_dim
            )
        elif CURRENT_MODAL == 'TITLE':
            modal_emb_tensor = preprocess.load_tensor_from_lmdb(
                title_lmdb_path, num_items, item_id_to_item, lmdb_dim
            )
        elif CURRENT_MODAL == 'COVER-TITLE':
            cover_emb_tensor = preprocess.load_tensor_from_lmdb(
                cover_lmdb_path, num_items, item_id_to_item, 128
            )
            title_emb_tensor = preprocess.load_tensor_from_lmdb(
                title_lmdb_path, num_items, item_id_to_item, 1024
            )
            modal_emb_tensor = torch.cat([cover_emb_tensor, title_emb_tensor], dim=-1)
        else:
            # Previously this fell through with a None tensor and failed on the
            # indexing below with an unrelated-looking error.
            raise ValueError(
                f"Unsupported CURRENT_MODAL {CURRENT_MODAL!r}; "
                "expected 'COVER', 'TITLE' or 'COVER-TITLE'"
            )
        # NOTE(review): this needs the loaded tensor to have a row at index
        # pad_idx - presumably the loader returns num_items + 1 rows; confirm.
        modal_emb_tensor[pad_idx].zero_()
        self.register_buffer('frozen_extra_emb', modal_emb_tensor)

        # ---------- Early-fusion projection: [id_emb; modal] -> emb_dim ----------
        self.mlp_item_modal = self.build_mlp(embedding_dim + lmdb_dim, modal_hidden_size, modal_dropout)

        # ---------- Late-fusion alpha (global scalar) ----------
        # sigmoid(0) = 0.5, so both branches start with equal weight; a larger
        # initial value (e.g. 1.0) would bias the mix towards the ID branch.
        self.alpha_param = nn.Parameter(torch.tensor(0.0))

    def build_mlp(self, input_dim, hidden_sizes, dropout):
        """Return an ``nn.Sequential`` of Linear -> LayerNorm -> Tanh -> Dropout
        stages, one stage per entry of ``hidden_sizes``."""
        layers = []
        for h in hidden_sizes:
            layers += [nn.Linear(input_dim, h), nn.LayerNorm(h), nn.Tanh(), nn.Dropout(dropout)]
            input_dim = h
        return nn.Sequential(*layers)

    # ===================== User side (sequence) =====================
    def _seq_emb_id_only(self, seq):
        """Use only the ID embedding as GRU input."""
        return self.embedding(seq)  # (B,T,D)

    def _seq_emb_early(self, seq):
        """Early fusion: concatenate the modal vector at every time step and
        map the result back to the embedding width."""
        modal = self.frozen_extra_emb.to(seq.device)[seq]          # (B,T,lmdb_dim)
        emb   = self.embedding(seq)                                 # (B,T,D)
        emb   = torch.cat([emb, modal], dim=-1)                     # (B,T,D+lmdb_dim)
        emb   = self.mlp_item_modal(emb)                            # (B,T,D)
        return emb

    def forward(self, seq):
        """Encode item-id sequences into user vectors.

        Args:
            seq: Item ids of shape (B, T).

        Returns:
            User vectors of shape (B, D). Only the user side is computed here;
            candidate item vectors come from ``get_items_embedding``.

        The output of the last time step is used, so real items are expected
        at the right end of each sequence (left padding), as
        ``build_hist_matrix`` produces. The notebook-level ``L2_NORM`` flag
        decides whether the result is L2-normalised.
        """
        if self.fusion_mode == 'early':
            emb = self._seq_emb_early(seq)
        else:
            # 'none' and 'late' both encode the user state from ID-only
            # sequences (least intrusive, numerically stable).
            emb = self._seq_emb_id_only(seq)

        out, _ = self.gru(emb)             # (B,T,H)
        h = out[:, -1, :]                  # (B,H)
        u_vec = self.proj(h)               # (B,D)

        if L2_NORM:
            u_vec = F.normalize(u_vec, p=2, dim=1)
        return u_vec

    # ===================== Item side (candidate vectors) =====================
    def _item_vec_id_only(self, item_ids, l2_norm=False):
        """Item vector = ID embedding, optionally L2-normalised."""
        i_vec = self.embedding(item_ids)   # (B,D)
        if l2_norm:
            i_vec = F.normalize(i_vec, p=2, dim=1)
        return i_vec

    def _item_vec_early(self, item_ids, l2_norm=False):
        """Item vector = MLP([ID embedding; modal vector]), optionally
        L2-normalised."""
        modal = self.frozen_extra_emb.to(item_ids.device)[item_ids]  # (B,lmdb_dim)
        i_vec = self.embedding(item_ids)                              # (B,D)
        i_vec = torch.cat([i_vec, modal], dim=-1)                     # (B,D+lmdb_dim)
        i_vec = self.mlp_item_modal(i_vec)                            # (B,D)
        if l2_norm:
            i_vec = F.normalize(i_vec, p=2, dim=1)
        return i_vec

    def _item_vec_late(self, item_ids, l2_norm=False):
        """Vector-level late fusion: ``i = alpha * i_id + (1 - alpha) * i_mm``.

        ``i_mm`` is the early-fusion vector, so it also contains the ID
        embedding. Normalisation, if requested, is applied after mixing.
        """
        i_id = self._item_vec_id_only(item_ids, l2_norm=False)   # (B,D)
        i_mm = self._item_vec_early(item_ids, l2_norm=False)     # (B,D)
        alpha = torch.sigmoid(self.alpha_param)                  # scalar
        i_vec = alpha * i_id + (1.0 - alpha) * i_mm
        if l2_norm:
            i_vec = F.normalize(i_vec, p=2, dim=1)
        return i_vec

    def get_items_embedding(self, item_ids, l2_norm=False):
        """Return candidate item vectors for scoring / ANN retrieval, computed
        according to ``fusion_mode``."""
        if self.fusion_mode == 'none':
            return self._item_vec_id_only(item_ids, l2_norm=l2_norm)
        elif self.fusion_mode == 'early':
            return self._item_vec_early(item_ids, l2_norm=l2_norm)
        else:  # 'late'
            return self._item_vec_late(item_ids, l2_norm=l2_norm)

    # ===================== Export / retrieval =====================
    def save_embeddings(self, num_users, num_items, device, save_dir='./embeddings', l2_norm=L2_NORM):
        """Export the vectors of items ``0 .. num_items - 1`` and a FAISS index.

        Writes ``item_embeddings.npy`` and an inner-product ``item_index.faiss``
        into ``save_dir`` (created if missing). The PAD row is not exported
        because only ids below ``num_items`` are requested.

        Args:
            num_users: Accepted for call compatibility; not used.
            num_items: Number of real items to export.
            device: Device the model is moved to for the export.
            save_dir: Output directory.
            l2_norm: Whether to L2-normalise the item vectors before saving.

        Side effects:
            Leaves the model in eval mode on ``device``.
        """
        import os, faiss
        os.makedirs(save_dir, exist_ok=True)
        self.eval().to(device)

        item_ids = torch.arange(num_items, dtype=torch.long, device=device)
        with torch.no_grad():
            item_embeds = self.get_items_embedding(item_ids, l2_norm=l2_norm)

        item_embeds = item_embeds.cpu().numpy().astype(np.float32)
        np.save(f"{save_dir}/item_embeddings.npy", item_embeds)

        dim = item_embeds.shape[1]
        index = faiss.IndexFlatIP(dim)
        index.add(item_embeds)
        faiss.write_index(index, f"{save_dir}/item_index.faiss")
        print("Saved item embeddings and FAISS index.")


In [11]:
# ======== Training loop ======== #
def train_model(model, train_df, epochs, lr, batch_size, test_df=None, device=None):
    """Train ``model`` with an in-batch softmax objective.

    For every batch the user vectors are scored against the positive items of
    all users in the batch; the matching item sits on the diagonal of the
    score matrix and the other positives in the batch act as negatives.

    Args:
        model: Model exposing ``forward(hist)``, ``get_items_embedding`` and
            ``fusion_mode`` (plus ``alpha_param`` in ``'late'`` mode).
        train_df: Training interactions handed to
            ``customdataset.build_seq_loader``.
        epochs: Number of passes over the training loader.
        lr: Adam learning rate.
        batch_size: Batch size of the training loader.
        test_df: Accepted for call compatibility; not used.
        device: Target device. ``None`` selects CUDA when available, else CPU.

    Returns:
        None. One log line is printed per epoch.

    Raises:
        ValueError: If the training loader yields no batches.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # The batches below are sent to `device`, so the model has to live there
    # too. This is a no-op when the caller already moved it; it matters when
    # the device was chosen here.
    model.to(device)

    # Worker count comes from the notebook config instead of a second
    # hard-coded 10.
    train_loader = customdataset.build_seq_loader(
        train_df, batch_size=batch_size, shuffle=True, num_workers=num_workers,
        pad_idx=PAD_IDX, max_len=MAX_SEQ_LEN, user_id=user_id, item_id=item_id)

    n_batches = len(train_loader)
    if n_batches == 0:
        # Fail early with a clear message instead of a ZeroDivisionError when
        # the epoch average is computed.
        raise ValueError("train_loader is empty: nothing to train on")

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(1, epochs + 1):
        model.train()
        dt_start = datetime.now()
        epoch_loss = 0.0

        for hist, pos in train_loader:
            hist, pos = hist.to(device), pos.to(device)

            # 1. Forward pass: user vectors and the vectors of their positives.
            predict = model(hist)                                   # (B, D)
            i_vec = model.get_items_embedding(pos, l2_norm=False)   # (B, D)

            # 2. Score matrix: every user against every positive in the batch.
            logits = torch.matmul(predict, i_vec.T)                 # (B, B)

            # 3. The correct item for user i is column i (the diagonal).
            labels = torch.arange(logits.size(0), device=device)    # [0, 1, ..., B-1]

            # 4. Cross-entropy over the in-batch candidates.
            loss = F.cross_entropy(logits, labels)

            # 5. Backward pass and parameter update.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        # Per-epoch log.
        avg_loss = epoch_loss / n_batches
        dt = datetime.now() - dt_start

        print(f"[Epoch {epoch:02d}/{epochs}] avg InBatch Softmax Loss = {avg_loss:.4f}, time = {dt.total_seconds():.2f}s")
        if model.fusion_mode == 'late':
            print('alpha:', torch.sigmoid(model.alpha_param).item())
    return


In [12]:
def build_hist_matrix(df,
                      num_users,
                      max_len=MAX_SEQ_LEN,
                      pad_idx=PAD_IDX,
                      user_col=user_id,
                      item_col=item_id):
    """Build a right-aligned, left-padded history matrix.

    Args:
        df: Interaction frame. The last ``max_len`` rows of each user's group
            are kept in frame order, so the frame has to be time-sorted
            already for them to be the most recent interactions.
        num_users: Number of rows of the result.
        max_len: Number of columns (history length).
        pad_idx: Fill value for positions without an interaction.
        user_col: Column with the integer user index; it is used directly as
            the row index, so values must lie in ``[0, num_users)``.
        item_col: Column with the integer item index.

    Returns:
        LongTensor of shape ``(num_users, max_len)``. Row ``i`` holds user
        ``i``'s history at the right end; users without history are all
        ``pad_idx``.
    """
    # Start from an all-PAD matrix.
    hist = torch.full((num_users, max_len), pad_idx, dtype=torch.long)
    if max_len == 0:
        # `[-0:]` would select the whole history instead of nothing.
        return hist

    # Fill in the interactions of every user present in df.
    for uid, items in df.groupby(user_col)[item_col]:
        seq = items.to_numpy()[-max_len:]             # last max_len entries
        if len(seq) == 0:
            # An empty group would turn `-len(seq):` into `-0:`, i.e. the
            # whole row, and the assignment would fail on a shape mismatch.
            continue
        hist[uid, -len(seq):] = torch.as_tensor(seq, dtype=torch.long)

    return hist    # (U, T)


In [13]:
    # ------------------ Training ------------------
# Pass the modal / fusion settings explicitly: the class bound its keyword
# defaults when its cell was executed, so relying on them would silently
# ignore later edits to the hyperparameter cell.
model = GRU4RecBPR(
    n_items=N_ITEMS,
    embedding_dim=EMBEDDING_DIM,
    hidden_size=HIDDEN_SIZE,
    modal_hidden_size=MODAL_HIDDEN_SIZE,
    modal_dropout=MODAL_DROPOUT,
    lmdb_dim=LMDB_DIM,
    num_layers=NUM_LAYERS,
    pad_idx=PAD_IDX,
    fusion_mode=FUSION_MODE,
)
model = model.to(device)
train_model(
    model=model,
    train_df=train_df,
    epochs=EPOCHS,
    lr=LR,
    batch_size=BATCH_SIZE,
    test_df=test_df,
    device=device,
)

[Epoch 01/45] avg InBatch Softmax Loss = 6.6739, time = 3.59s
[Epoch 02/45] avg InBatch Softmax Loss = 6.3684, time = 2.53s
[Epoch 03/45] avg InBatch Softmax Loss = 6.2792, time = 2.53s
[Epoch 04/45] avg InBatch Softmax Loss = 6.2231, time = 2.46s
[Epoch 05/45] avg InBatch Softmax Loss = 6.1756, time = 2.53s
[Epoch 06/45] avg InBatch Softmax Loss = 6.1284, time = 2.58s
[Epoch 07/45] avg InBatch Softmax Loss = 6.0741, time = 2.51s
[Epoch 08/45] avg InBatch Softmax Loss = 6.0137, time = 2.49s
[Epoch 09/45] avg InBatch Softmax Loss = 5.9425, time = 2.54s
[Epoch 10/45] avg InBatch Softmax Loss = 5.8641, time = 2.51s
[Epoch 11/45] avg InBatch Softmax Loss = 5.7799, time = 2.60s
[Epoch 12/45] avg InBatch Softmax Loss = 5.7000, time = 2.34s
[Epoch 13/45] avg InBatch Softmax Loss = 5.6233, time = 2.53s
[Epoch 14/45] avg InBatch Softmax Loss = 5.5559, time = 2.49s
[Epoch 15/45] avg InBatch Softmax Loss = 5.5009, time = 2.54s
[Epoch 16/45] avg InBatch Softmax Loss = 5.4537, time = 2.62s
[Epoch 1

In [14]:
# Export the item vectors and the FAISS inner-product index into save_dir.
# NOTE(review): the recorded output below ("Saved user/item embeddings ...")
# differs from the message the current save_embeddings prints, so the output
# looks stale; re-run the notebook top to bottom before sharing it.
model.save_embeddings(num_users=num_users,num_items=num_items,device=device,save_dir=save_dir)

Saved user/item embeddings and FAISS index.


In [15]:
# Evaluation inputs: the test loader, the full candidate pool, the FAISS index
# written by save_embeddings, and each user's padded training history.
test_loader = customdataset.build_test_loader(
    test_df,
    num_items,
    user_col=user_id,
    item_col=item_id,
    batch_size=1024,
    num_workers=num_workers,
)
item_pool = list(range(num_items))
faiss_index = faiss.read_index(f"{save_dir}/item_index.faiss")
hist_tensors = build_hist_matrix(
    train_df,
    num_users=num_users,
    max_len=MAX_SEQ_LEN,
    pad_idx=PAD_IDX,
).to(device)

In [16]:
# Compare two baselines (random and popularity, as named by the project
# helpers) with the trained sequence model at the same cutoff top_k.
# NOTE(review): the recorded output below prints "nDCG@" while these f-strings
# print "NDCG@", so the output predates the current code; re-run to refresh.
hr_r, ndcg_r = evaluate.evaluate_random(test_loader, item_pool ,top_k=top_k)
print(f"Random HR@{top_k} = {hr_r:.4f}, NDCG@{top_k} = {ndcg_r:.4f}")
hr_p, ndcg_p = evaluate.evaluate_popular(test_loader, train_df,top_k=top_k)
print(f"Popular HR@{top_k} = {hr_p:.4f}, NDCG@{top_k} = {ndcg_p:.4f}")
# The sequence model retrieves candidates through the FAISS index, using the
# padded training histories as model input.
hr_m, ndcg_m = evaluate.evaluate_seq_model(test_loader, model, faiss_index, device,top_k=top_k,hist_tensors=hist_tensors)
print(f"Model   HR@{top_k} = {hr_m:.4f}, NDCG@{top_k} = {ndcg_m:.4f}")

Random HR@10 = 0.0005, nDCG@10 = 0.0002
Popular HR@10 = 0.0029, nDCG@10 = 0.0014
Model   HR@10 = 0.0763, nDCG@10 = 0.0379
