In [12]:
from tkinter.constants import HIDDEN

from google.colab import drive
import shutil
import os


def copy_from_drive(src_path, dst_path):
    """Copy a file or a directory tree from ``src_path`` to ``dst_path`` once.

    When ``dst_path`` already exists nothing is copied and a ``skip:`` message
    is printed, so re-running the cell does not repeat the copy.

    Args:
        src_path: Path of the file or directory to copy.
        dst_path: Destination path; must not exist for the copy to happen.

    Returns:
        None.

    Raises:
        FileNotFoundError: If ``dst_path`` does not exist yet and ``src_path``
            is neither an existing directory nor an existing regular file.
    """
    if os.path.exists(dst_path):
        print(f"skip:{dst_path} exists")
        return

    if os.path.isdir(src_path):
        shutil.copytree(src_path, dst_path)
    elif os.path.isfile(src_path):
        shutil.copy(src_path, dst_path)
    else:
        # A missing source used to be ignored silently, which only surfaced
        # later as an unrelated error when the asset was opened. Fail here.
        raise FileNotFoundError(f"source not found: {src_path}")

# Mount Google Drive, then stage every required asset onto the local Colab
# disk. Each entry is (source on Drive, destination under /content).
drive.mount('/content/drive')
_ASSET_COPIES = (
    ('/content/drive/MyDrive/tool', '/content/tool'),
    ('/content/drive/MyDrive/MicroLens-50k_pairs.csv', '/content/MicroLens-50k_pairs.csv'),
    ('/content/drive/MyDrive/cover_emb128.lmdb', '/content/cover_emb128.lmdb'),
    ('/content/drive/MyDrive/title_emb1024.lmdb', '/content/title_emb1024.lmdb'),
)
for _src, _dst in _ASSET_COPIES:
    copy_from_drive(_src, _dst)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
skip:/content/tool exists
skip:/content/MicroLens-50k_pairs.csv exists


In [13]:
!pip install faiss-cpu
!pip install lmdb
from tool import preprocess
from tool import customdataset
from tool import evaluate
import faiss
import torch.nn as nn
import numpy as np
import torch
import torch.nn.functional as F
from datetime import datetime



In [14]:
# Single named seed so the value used for this run is visible and reusable.
# Presumably set_seed seeds the RNGs used by the project - confirm in tool.preprocess.
SEED = 42
preprocess.set_seed(SEED)

In [15]:
# Use the default CUDA device when PyTorch can see one, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [16]:
# --- Data locations -------------------------------------------------------
path = 'MicroLens-50k_pairs.csv'
cover_lmdb_path = 'cover_emb128.lmdb'
title_lmdb_path = 'title_emb1024.lmdb'
save_dir = './embeddings'

# --- Column names ---------------------------------------------------------
# user/item/timestamp are the raw columns; user_id/item_id are the names of
# the re-indexed integer columns added to the train/test frames later on.
user, item, timestamp = 'user', 'item', 'timestamp'
user_id, item_id = 'user_id', 'item_id'

# --- Evaluation / loader settings -----------------------------------------
top_k = 10
num_workers = 10
k_neg = 10  # not referenced by the cells visible in this notebook

# --- Model / training hyper-parameters ------------------------------------
L2_NORM = False
EPOCHS = 30
BATCH_SIZE = 1024
EMBEDDING_DIM = 256
HIDDEN_SIZE = [256, 128, 64]
DROPOUT = 0.2

# --- Modal (side-feature) settings ----------------------------------------
# One entry per supported modality: width of the stored LMDB vectors, hidden
# sizes of the fusion MLP, and its dropout rate.
MODAL = {
    'COVER': {
        "LMDB_DIM": 128,
        "HIDDEN_SIZE": [EMBEDDING_DIM],
        "DROPOUT": 0.2,
    },
    'TITLE': {
        "LMDB_DIM": 1024,
        "HIDDEN_SIZE": [EMBEDDING_DIM],
        "DROPOUT": 0.2,
    },
    'COVER-TITLE': {
        "LMDB_DIM": 128 + 1024,
        "HIDDEN_SIZE": [EMBEDDING_DIM],
        "DROPOUT": 0.2,
    },
}
CURRENT_MODAL = "COVER"
MODAL_CONFIG = MODAL[CURRENT_MODAL]
LMDB_DIM = MODAL_CONFIG.get('LMDB_DIM')
MODAL_HIDDEN_SIZE = MODAL_CONFIG.get('HIDDEN_SIZE')
MODAL_DROPOUT = MODAL_CONFIG.get('DROPOUT')
FUSION_MODE = 'late'

In [17]:
# Load the interaction log through the project helper, which also returns the
# user and item counts. All column names come from the config cell so this
# call stays in sync with the later split, which uses the same variables.
dataset_pd, num_users, num_items = preprocess.openAndSort(
    path, user_id=user, item_id=item, timestamp=timestamp
)

dataset base information：
- number of users：50000
- number of items：19220
- number of rows：359708


In [18]:

# Split the interaction frame into train and test frames with the project helper.
# NOTE(review): the recorded output shows 309708 + 49424 = 359132 rows, fewer than
# the 359708 rows reported at load time - presumably split() drops some rows;
# confirm against tool.preprocess.
train_df, test_df = preprocess.split(dataset_pd,user, item, timestamp)
print(f"Train size: {len(train_df)}")
print(f"Test size: {len(test_df)}")


Train size: 309708
Test size: 49424


In [19]:
# Map every raw user/item value to a contiguous integer index (0..n-1), in
# order of first appearance in the full dataset. These indices address the
# rows of the embedding tables.
user2id = {raw: idx for idx, raw in enumerate(dataset_pd[user].unique())}
item2id = {raw: idx for idx, raw in enumerate(dataset_pd[item].unique())}

# Attach the integer id columns. .assign returns new frames instead of writing
# into the objects returned by split(), so the cell can be re-run safely and
# never writes through a slice of dataset_pd (no SettingWithCopyWarning).
train_df = train_df.assign(**{
    user_id: train_df[user].map(user2id),
    item_id: train_df[item].map(item2id),
})
test_df = test_df.assign(**{
    user_id: test_df[user].map(user2id),
    item_id: test_df[item].map(item2id),
})

# Reverse lookup: integer item index -> raw item value (inverse of item2id).
item_id_to_item = {idx: raw for raw, idx in item2id.items()}

In [20]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

class DSSM(nn.Module):
    """Two-tower DSSM with optional fusion of frozen modal item features.

    The user tower is ``user_emb -> mlp_user``. The item tower depends on
    ``fusion_mode``:

    - ``'none'``  : ID only, ``item_emb -> mlp_item``.
    - ``'early'`` : ``[item_emb ; modal] -> mlp_item_modal -> mlp_item``.
    - ``'late'``  : ``alpha * id_only + (1 - alpha) * early`` where
      ``alpha = sigmoid(alpha_param)`` is one learned scalar. Both terms share
      ``item_emb`` and ``mlp_item``; the second term is the full early-fusion
      path, not a modal-only encoder.

    The modal vectors are loaded from LMDB according to the notebook-level
    ``CURRENT_MODAL`` setting and stored as a buffer (moved with the module,
    not a parameter).

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        emb_dim: Width of the ID embeddings.
        mlp_hidden_size: Hidden sizes of the user/item MLPs; the last entry is
            the width of the returned vectors.
        dropout: Dropout rate of the user/item MLPs.
        modal_hidden_size: Hidden sizes of the fusion MLP; its last entry
            feeds ``mlp_item``.
        modal_dropout: Dropout rate of the fusion MLP.
        lmdb_dim: Width of the modal vectors.
        fusion_mode: One of ``'none'``, ``'early'``, ``'late'``.

    Raises:
        AssertionError: If ``fusion_mode`` is not a supported value.
        ValueError: If a fusing mode is requested but no modal tensor could be
            loaded for ``CURRENT_MODAL``, or its width differs from ``lmdb_dim``.
    """

    def __init__(self, num_users, num_items,
                 emb_dim=EMBEDDING_DIM,
                 mlp_hidden_size=HIDDEN_SIZE,
                 dropout=DROPOUT,
                 modal_hidden_size=MODAL_HIDDEN_SIZE,
                 modal_dropout=MODAL_DROPOUT,
                 lmdb_dim=LMDB_DIM,
                 fusion_mode=FUSION_MODE):
        super().__init__()
        assert fusion_mode in {'none','early','late'}, f"unknown fusion_mode: {fusion_mode!r}"
        self.fusion_mode = fusion_mode

        # NOTE: sub-modules are created in a fixed order so that a seeded run
        # draws the same initial weights as before.
        self.user_emb = nn.Embedding(num_users, emb_dim)
        self.item_emb = nn.Embedding(num_items, emb_dim)
        nn.init.xavier_uniform_(self.user_emb.weight)
        nn.init.xavier_uniform_(self.item_emb.weight)

        # Frozen modal vectors, one row per item index.
        modal_emb_tensor = self._load_modal_tensor(num_items, lmdb_dim)
        if fusion_mode != 'none':
            # Fail at construction instead of deep inside the first forward pass.
            if modal_emb_tensor is None:
                raise ValueError(
                    f"fusion_mode={fusion_mode!r} needs modal features, but "
                    f"CURRENT_MODAL={CURRENT_MODAL!r} is not one of "
                    "'COVER', 'TITLE', 'COVER-TITLE'"
                )
            if modal_emb_tensor.shape[-1] != lmdb_dim:
                raise ValueError(
                    f"modal feature width {modal_emb_tensor.shape[-1]} does not "
                    f"match lmdb_dim={lmdb_dim}"
                )
        self.register_buffer('frozen_extra_emb', modal_emb_tensor)

        # Shared towers.
        self.mlp_user = self.build_mlp(emb_dim, mlp_hidden_size, dropout)
        self.mlp_item = self.build_mlp(emb_dim, mlp_hidden_size, dropout)

        # Fusion MLP: [item_emb ; modal] -> modal_hidden_size[-1], which is then
        # fed to mlp_item (so it has to equal emb_dim).
        self.mlp_item_modal = self.build_mlp(emb_dim + lmdb_dim, modal_hidden_size, modal_dropout)

        # Late-fusion mixing weight; sigmoid(0) = 0.5 at start.
        self.alpha_param = nn.Parameter(torch.tensor(0.0))

    def _load_modal_tensor(self, num_items, lmdb_dim):
        """Load the modal tensor selected by ``CURRENT_MODAL``; ``None`` if unrecognised."""
        if CURRENT_MODAL == 'COVER':
            return preprocess.load_tensor_from_lmdb(
                cover_lmdb_path, num_items, item_id_to_item, lmdb_dim
            )
        if CURRENT_MODAL == 'TITLE':
            return preprocess.load_tensor_from_lmdb(
                title_lmdb_path, num_items, item_id_to_item, lmdb_dim
            )
        if CURRENT_MODAL == 'COVER-TITLE':
            cover_emb_tensor = preprocess.load_tensor_from_lmdb(
                cover_lmdb_path, num_items, item_id_to_item, 128
            )
            title_emb_tensor = preprocess.load_tensor_from_lmdb(
                title_lmdb_path, num_items, item_id_to_item, 1024
            )
            # Concatenate the two modalities along the feature axis.
            return torch.cat([cover_emb_tensor, title_emb_tensor], dim=-1)
        return None

    def build_mlp(self, input_dim, hidden_sizes, dropout):
        """Stack ``Linear -> BatchNorm1d -> Tanh -> Dropout`` once per hidden size.

        An empty ``hidden_sizes`` yields an empty ``nn.Sequential``.
        """
        layers = []
        for h in hidden_sizes:
            layers += [nn.Linear(input_dim, h), nn.BatchNorm1d(h), nn.Tanh(), nn.Dropout(dropout)]
            input_dim = h
        return nn.Sequential(*layers)

    def _item_vec_id_only(self, item_id):
        """ID-only item vector: ``item_emb -> mlp_item``."""
        i_id = self.item_emb(item_id)      # (B, emb_dim)
        return self.mlp_item(i_id)         # (B, d)

    def _item_vec_early(self, item_id):
        """Early-fusion item vector: ``[item_emb ; modal] -> mlp_item_modal -> mlp_item``."""
        i_id = self.item_emb(item_id)
        # The buffer lives on the module's device, which is also where item_id
        # must be for the embedding lookup above, so no transfer is needed.
        modal = self.frozen_extra_emb[item_id]
        i_cat = torch.cat([i_id, modal], dim=-1)
        i_emb = self.mlp_item_modal(i_cat)
        return self.mlp_item(i_emb)          # (B, d)

    def _item_vec_late(self, item_id):
        """Late fusion: convex mix of the ID-only and early-fusion vectors."""
        i_vec_id = self._item_vec_id_only(item_id)   # (B, d)
        i_vec_mm = self._item_vec_early(item_id)     # (B, d)
        alpha = torch.sigmoid(self.alpha_param)      # scalar in (0, 1)
        return alpha * i_vec_id + (1.0 - alpha) * i_vec_mm

    def _item_vec(self, item_id):
        """Dispatch to the item path selected by ``self.fusion_mode``."""
        if self.fusion_mode == 'none':
            return self._item_vec_id_only(item_id)
        if self.fusion_mode == 'early':
            return self._item_vec_early(item_id)
        return self._item_vec_late(item_id)

    def forward(self, user_id, item_id, l2_norm=L2_NORM):
        """Return ``(u_vec, i_vec)`` for aligned user and item index tensors.

        With ``l2_norm`` true both outputs are L2-normalised along dim 1.
        """
        u_vec = self.get_users_embedding(user_id, l2_norm=l2_norm)
        i_vec = self.get_items_embedding(item_id, l2_norm=l2_norm)
        return u_vec, i_vec

    def get_users_embedding(self, user_ids, l2_norm=L2_NORM):
        """User-tower output for ``user_ids``, optionally L2-normalised."""
        u = self.user_emb(user_ids)
        u_vec = self.mlp_user(u)
        if l2_norm: u_vec = F.normalize(u_vec, p=2, dim=1)
        return u_vec

    def get_items_embedding(self, item_ids, l2_norm=L2_NORM):
        """Item-tower output for ``item_ids`` under the configured fusion mode."""
        i_vec = self._item_vec(item_ids)
        if l2_norm: i_vec = F.normalize(i_vec, p=2, dim=1)
        return i_vec

    def save_embeddings(self, num_users, num_items, device, save_dir='./embeddings', l2_norm=L2_NORM):
        """Export all user/item vectors and an inner-product FAISS index.

        Writes ``user_embeddings.npy``, ``item_embeddings.npy`` and
        ``item_index.faiss`` into ``save_dir`` (created if missing).

        Side effect: the module is switched to eval mode and moved to
        ``device``; it is left in that state on return.
        """
        import os, faiss
        os.makedirs(save_dir, exist_ok=True)
        self.eval().to(device)
        user_ids = torch.arange(num_users, dtype=torch.long, device=device)
        item_ids = torch.arange(num_items, dtype=torch.long, device=device)
        with torch.no_grad():
            user_embeds = self.get_users_embedding(user_ids, l2_norm=l2_norm)
            item_embeds = self.get_items_embedding(item_ids, l2_norm=l2_norm)
        user_embeds = user_embeds.cpu().numpy().astype(np.float32)
        item_embeds = item_embeds.cpu().numpy().astype(np.float32)
        np.save(f"{save_dir}/user_embeddings.npy", user_embeds)
        np.save(f"{save_dir}/item_embeddings.npy", item_embeds)
        # Flat inner-product index over all item vectors, matching the dot
        # product used as the training score.
        dim = item_embeds.shape[1]
        index = faiss.IndexFlatIP(dim)
        index.add(item_embeds)
        faiss.write_index(index, f"{save_dir}/item_index.faiss")
        print("Saved user/item embeddings and FAISS index.")


In [21]:

def train_model(model,train_df,
                epochs=10,
                batch_size=64,
                lr=1e-3,
                device=None):
    """Train ``model`` with an in-batch softmax loss and Adam.

    For every batch the user vectors are scored against all item vectors of
    the same batch; the matching item sits on the diagonal of the score matrix
    and is the cross-entropy target, the other items act as negatives.

    Args:
        model: Module whose call returns ``(user_vecs, item_vecs)`` and which
            exposes ``fusion_mode`` (and ``alpha_param`` when it is ``'late'``).
        train_df: Training frame handed to the project's in-batch loader.
        epochs: Number of passes over the loader.
        batch_size: Batch size given to the loader.
        lr: Adam learning rate.
        device: Target device; defaults to ``cuda:0`` when available, else CPU.

    Returns:
        None. Progress is printed once per epoch.
    """
    if device is None:
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Loader presumably yields (user index, positive item index) batches
    # without explicit negatives - confirm in tool.customdataset.
    train_loader = customdataset.build_train_loader_inbatch(
        train_df,
        batch_size=batch_size,
        user_col=user_id,
        item_col=item_id,
    )

    for epoch in range(1, epochs + 1):
        model.train()
        started_at = datetime.now()
        running_loss = 0.0

        for users, positives in train_loader:
            users = users.to(device)
            positives = positives.to(device)

            # Tower outputs for the aligned (user, positive item) pairs.
            user_vecs, item_vecs = model(users, positives, l2_norm=L2_NORM)

            # (B, B) scores: row i holds user i against every item of the batch.
            scores = user_vecs @ item_vecs.T

            # The positive of row i is column i.
            targets = torch.arange(scores.size(0), device=device)
            batch_loss = F.cross_entropy(scores, targets)

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            running_loss += batch_loss.item()

        # Per-epoch report.
        avg_loss = running_loss / len(train_loader)
        dt = datetime.now() - started_at

        print(f"[Epoch {epoch:02d}/{epochs}] avg InBatch Softmax Loss = {avg_loss:.4f}, time = {dt.total_seconds():.2f}s")
        if model.fusion_mode == 'late':
            print('alpha:', torch.sigmoid(model.alpha_param).item())




In [22]:
# Build the model on the notebook's device and train it there. Passing
# device explicitly keeps train_model from falling back to its own default
# ('cuda:0' / 'cpu'), so a single setting controls placement.
model = DSSM(num_users, num_items).to(device)
train_model(model=model, train_df=train_df, epochs=EPOCHS, batch_size=BATCH_SIZE, device=device)

[Epoch 01/30] avg InBatch Softmax Loss = 9.0778, time = 4.97s
[Epoch 02/30] avg InBatch Softmax Loss = 7.3072, time = 4.05s
[Epoch 03/30] avg InBatch Softmax Loss = 6.7081, time = 4.00s
[Epoch 04/30] avg InBatch Softmax Loss = 6.3262, time = 4.01s
[Epoch 05/30] avg InBatch Softmax Loss = 6.0367, time = 4.03s
[Epoch 06/30] avg InBatch Softmax Loss = 5.7931, time = 4.02s
[Epoch 07/30] avg InBatch Softmax Loss = 5.6063, time = 4.05s
[Epoch 08/30] avg InBatch Softmax Loss = 5.4759, time = 3.96s
[Epoch 09/30] avg InBatch Softmax Loss = 5.3760, time = 4.10s
[Epoch 10/30] avg InBatch Softmax Loss = 5.2945, time = 4.07s
[Epoch 11/30] avg InBatch Softmax Loss = 5.2217, time = 4.01s
[Epoch 12/30] avg InBatch Softmax Loss = 5.1566, time = 4.06s
[Epoch 13/30] avg InBatch Softmax Loss = 5.0971, time = 4.03s
[Epoch 14/30] avg InBatch Softmax Loss = 5.0420, time = 3.99s
[Epoch 15/30] avg InBatch Softmax Loss = 4.9961, time = 4.05s
[Epoch 16/30] avg InBatch Softmax Loss = 4.9591, time = 4.02s
[Epoch 1

In [23]:
# Export user/item vectors and the FAISS item index into save_dir.
# Note: save_embeddings switches the model to eval mode and leaves it there.
model.save_embeddings(num_users=num_users,num_items=num_items,device=device,save_dir=save_dir)

Saved user/item embeddings and FAISS index.


In [24]:
# Test-set loader built by the project helper from the re-indexed id columns.
test_loader = customdataset.build_test_loader(test_df, num_items ,user_col = user_id, item_col = item_id, batch_size=1024, num_workers=num_workers)
# Candidate pool: every item index 0..num_items-1.
item_pool = list(range(num_items))
# Reload the item index that save_embeddings wrote to save_dir.
faiss_index = faiss.read_index(f"{save_dir}/item_index.faiss")

In [25]:
# Report HR@top_k / NDCG@top_k for a random baseline, a popularity baseline
# (given train_df), and the trained model queried through the FAISS index.
# NOTE(review): the recorded output below prints "nDCG@" while these f-strings
# print "NDCG@" - the stored output appears to come from an earlier version of
# this cell; re-run to refresh it.
hr_r, ndcg_r = evaluate.evaluate_random(test_loader, item_pool ,top_k=top_k)
print(f"Random HR@{top_k} = {hr_r:.4f}, NDCG@{top_k} = {ndcg_r:.4f}")
hr_p, ndcg_p = evaluate.evaluate_popular(test_loader, train_df,top_k=top_k)
print(f"Popular HR@{top_k} = {hr_p:.4f}, NDCG@{top_k} = {ndcg_p:.4f}")
hr_m, ndcg_m = evaluate.evaluate_model(test_loader, model, faiss_index, device,top_k=top_k)
print(f"Model   HR@{top_k} = {hr_m:.4f}, NDCG@{top_k} = {ndcg_m:.4f}")


Random HR@10 = 0.0008, nDCG@10 = 0.0004
Popular HR@10 = 0.0029, nDCG@10 = 0.0014
Model   HR@10 = 0.0244, nDCG@10 = 0.0103
