In [207]:
from google.colab import drive
import shutil
import os
def copy_from_drive(src_path, dst_path):

    if os.path.exists(dst_path):
        print(f"skip:{dst_path} exists")
        return

    if os.path.isdir(src_path):
        shutil.copytree(src_path, dst_path)
    elif os.path.isfile(src_path):
        shutil.copy(src_path, dst_path)

drive.mount('/content/drive')
copy_from_drive('/content/drive/MyDrive/tool', '/content/tool')
copy_from_drive('/content/drive/MyDrive/MicroLens-50k_pairs.csv','/content/MicroLens-50k_pairs.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
skip:/content/tool exists
skip:/content/MicroLens-50k_pairs.csv exists


In [208]:
!pip install faiss-cpu
!pip install lmdb
import os
from tool import preprocess
from tool import customdataset
from tool import evaluate
import faiss
import torch.nn as nn
import numpy as np
import torch
import torch.nn.functional as F
from datetime import datetime




In [None]:
preprocess.set_seed(42)

In [209]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [210]:
path = 'MicroLens-50k_pairs.csv'
user = 'user'
item = 'item'
user_id = 'user_id'
item_id = 'item_id'
timestamp = 'timestamp'
save_dir = './embeddings'
top_k = 10
num_workers = 10
k_neg = 10
# path = pd.read_csv('MicroLens-50k_pairs.csv')

In [211]:
dataset_pd,num_users,num_items = preprocess.openAndSort(path,user_id=user,item_id=item,timestamp='timestamp')

dataset base information：
- number of users：50000
- number of items：19220
- number of rows：359708


In [212]:
train_df, test_df = preprocess.split(dataset_pd,user, item, timestamp)
print(f"Train size: {len(train_df)}")
print(f"Test size: {len(test_df)}")

Train size: 309708
Test size: 49424


In [213]:
# maintain a map from new id to old id, new id for constructing matrix
user2id = {u: i for i, u in enumerate(dataset_pd[user].unique())}
item2id = {i: j for j, i in enumerate(dataset_pd[item].unique())}

# apply to train_df and test_df
train_df[user_id] = train_df[user].map(user2id)
train_df[item_id] = train_df[item].map(item2id)
test_df[user_id] = test_df[user].map(user2id)
test_df[item_id] = test_df[item].map(item2id)



In [214]:
# ---------- 超参数 ----------
MAX_SEQ_LEN   = 3          # 序列长度
EMBEDDING_DIM = 64          # item / user embedding 维度
N_HEADS       = 1           # Multi-Head Attention 头数
N_LAYERS      = 1           # Transformer block 层数
DROPOUT       = 0.1
NEG_SAMPLE    = 5
BATCH_SIZE    = 1024
EPOCHS        = 10
LR            = 2e-3
SEED          = 42
# ----------------------------

torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# ---------- 常量（来自你已有变量） ----------
PAD_IDX = num_items          # 专用 padding id
N_ITEMS = num_items + 1      # Embedding 行数（含 PAD）

# -------------------------------------------


In [215]:
# ======== SASRec 模型 ======== #
class SASRec(nn.Module):
    def __init__(self,
                 n_items=N_ITEMS,
                 embedding_dim=EMBEDDING_DIM,
                 n_heads=N_HEADS,
                 n_layers=N_LAYERS,
                 max_len=MAX_SEQ_LEN,
                 pad_idx=PAD_IDX,
                 dropout=DROPOUT):
        super().__init__()
        self.embedding = nn.Embedding(n_items, embedding_dim, padding_idx=pad_idx)
        # self.pos_emb  = nn.Embedding(max_len, embedding_dim)
        self.pos_emb = nn.Embedding(max_len + 1, embedding_dim, padding_idx=0)

        self.dropout  = nn.Dropout(dropout)

        enc_layer = nn.TransformerEncoderLayer(
            d_model=embedding_dim,
            nhead=n_heads,
            dim_feedforward=embedding_dim*4,
            dropout=dropout,
            batch_first=True,
            activation='gelu')
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=n_layers)

        # 为 causal mask 预生成上三角矩阵
        self.register_buffer(
            "mask", torch.triu(torch.ones(max_len, max_len), diagonal=1).bool())

        nn.init.normal_(self.embedding.weight, mean=0.0, std=0.05)
        nn.init.normal_(self.pos_emb.weight, mean=0.0, std=0.05)
    def forward(self, seq):
        """
        seq: (B, T)  用户的历史 item 序列（左侧 padding）
        返回每个用户的最终表征向量: (B, D)
        """

        B, T = seq.size()
        device = seq.device

        # 1. item embedding
        item_emb = self.embedding(seq)  # (B, T, D)

        # 2. 位置编码 —— PAD 位的位置设为 0，其余为 1..T
        pos_ids = torch.arange(1, T + 1, device=device).unsqueeze(0).expand(B, -1)  # (B, T)
        pos_ids = pos_ids * (seq != self.embedding.padding_idx)  # pad → 0
        pos_emb = self.pos_emb(pos_ids)

        x = item_emb + pos_emb
        x = self.dropout(x)

        # 3. causal attention mask
        causal_mask = self.mask[:T, :T]  # (T, T)
        key_padding_mask = (seq == self.embedding.padding_idx)  # (B, T)

        x = self.encoder(x, mask=causal_mask, src_key_padding_mask=key_padding_mask)  # (B, T, D)

        # 4. 取每个序列中最后一个非 PAD 的位置的输出
        seq_lens = (seq != self.embedding.padding_idx).sum(dim=1).clamp(min=1)  # (B,)
        last_idx = (seq_lens - 1).unsqueeze(1).unsqueeze(2).expand(-1, 1, x.size(-1))  # (B, 1, D)
        h = x.gather(dim=1, index=last_idx).squeeze(1)  # (B, D)
        h = F.normalize(h, p=2, dim=1)
        return h


    def get_items_embedding(self,item_ids,l2_norm=True):
        i_vec = self.embedding(item_ids)          # (B, emb_dim)
        if l2_norm:
            i_vec = F.normalize(i_vec, p=2, dim=1)
        return i_vec

    def save_embeddings(self, num_users, num_items, device, save_dir='./embeddings'):
        import os
        import faiss
        os.makedirs(save_dir, exist_ok=True)

        self.eval()
        self.to(device)

        item_ids = torch.arange(num_items, dtype=torch.long, device=device)

        with torch.no_grad():
            item_embeds = self.get_items_embedding(item_ids, l2_norm=True)

        item_embeds = item_embeds.cpu().numpy().astype(np.float32)

        # 保存向量
        np.save(f"{save_dir}/item_embeddings.npy", item_embeds)

        # 构建 FAISS index（使用内积）
        dim = item_embeds.shape[1]
        index = faiss.IndexFlatIP(dim)
        index.add(item_embeds)

        faiss.write_index(index, f"{save_dir}/item_index.faiss")
        print("Saved user/item embeddings and FAISS index.")



In [216]:
# ======== 训练流程 ======== #
from transformers import get_cosine_schedule_with_warmup

def train_model(model, train_df, epochs,lr , batch_size, test_df=None, device=None):
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


    train_loader  = customdataset.build_seq_loader(train_df, batch_size=batch_size,
                         shuffle=True, num_workers=10,pad_idx=PAD_IDX,max_len=MAX_SEQ_LEN,user_id=user,item_id=item_id)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)


    for epoch in range(1, epochs + 1):
        model.train()
        dt_start = datetime.now()
        epoch_loss = 0.0

        for batch in train_loader:
            hist, pos = batch
            hist, pos = hist.to(device), pos.to(device)

            # 1. 前向传播（返回预测向量）
            predict = model(hist)                      # (B, D)
            i_vec = model.get_items_embedding(pos,l2_norm=True)

            # 2. 得分矩阵：每个 user 对所有正 item 的打分
            logits = torch.matmul(predict, i_vec.T)  # shape: (B, B)

            # 3. 构造标签：每个 user 的正确 item 在对角线（即位置 i）
            labels = torch.arange(logits.size(0), device=device)  # [0, 1, ..., B-1]

            # 4. Cross Entropy Loss
            loss = F.cross_entropy(logits, labels)

            # 5. 反向传播
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        # 日志
        avg_loss = epoch_loss / len(train_loader)
        dt_end = datetime.now()
        dt = dt_end - dt_start

        print(f"[Epoch {epoch:02d}/{epochs}] avg InBatch Softmax Loss = {avg_loss:.4f}, time = {dt.total_seconds():.2f}s")

    return


In [217]:
def build_hist_matrix(df,
                      num_users,
                      max_len=MAX_SEQ_LEN,
                      pad_idx=PAD_IDX,
                      user_col=user_id,
                      item_col=item_id):
    """
    返回形状为 (num_users, max_len) 的 LongTensor。
    第 i 行是用户 i 的历史序列，左侧 PAD，右对齐。
    不存在历史的用户整行都是 pad_idx。
    """
    # 先全部填 PAD
    hist = torch.full((num_users, max_len), pad_idx, dtype=torch.long)

    # groupby 遍历每个用户已有交互
    for uid, items in df.groupby(user_col)[item_col]:
        seq = items.to_numpy()[-max_len:]             # 取最近 max_len 条
        hist[uid, -len(seq):] = torch.as_tensor(seq, dtype=torch.long)

    return hist    # (U, T)


In [218]:
model = SASRec(n_items=N_ITEMS,
                 embedding_dim=EMBEDDING_DIM,
                 pad_idx=PAD_IDX)
model = model.to(device)
train_model(model=model,epochs=50, train_df=train_df,batch_size=1024,lr=LR,test_df=test_df,device=device)



[Epoch 01/50] avg InBatch Softmax Loss = 6.8118, time = 2.46s
[Epoch 02/50] avg InBatch Softmax Loss = 6.6064, time = 2.48s
[Epoch 03/50] avg InBatch Softmax Loss = 6.5186, time = 2.51s
[Epoch 04/50] avg InBatch Softmax Loss = 6.4771, time = 2.41s
[Epoch 05/50] avg InBatch Softmax Loss = 6.4550, time = 2.31s
[Epoch 06/50] avg InBatch Softmax Loss = 6.4426, time = 2.42s
[Epoch 07/50] avg InBatch Softmax Loss = 6.4356, time = 2.22s
[Epoch 08/50] avg InBatch Softmax Loss = 6.4302, time = 2.17s
[Epoch 09/50] avg InBatch Softmax Loss = 6.4266, time = 2.28s
[Epoch 10/50] avg InBatch Softmax Loss = 6.4235, time = 2.16s
[Epoch 11/50] avg InBatch Softmax Loss = 6.4209, time = 2.19s
[Epoch 12/50] avg InBatch Softmax Loss = 6.4187, time = 2.28s
[Epoch 13/50] avg InBatch Softmax Loss = 6.4168, time = 2.15s
[Epoch 14/50] avg InBatch Softmax Loss = 6.4148, time = 2.20s
[Epoch 15/50] avg InBatch Softmax Loss = 6.4129, time = 2.16s
[Epoch 16/50] avg InBatch Softmax Loss = 6.4114, time = 2.20s
[Epoch 1

In [219]:
model.save_embeddings(num_users=num_users,num_items=num_items,device=device,save_dir=save_dir)


Saved user/item embeddings and FAISS index.


In [220]:
test_loader = customdataset.build_test_loader(test_df, num_items ,user_col = user_id, item_col = item_id, batch_size=1024, num_workers=num_workers)
item_pool = list(range(num_items))
faiss_index = faiss.read_index(f"{save_dir}/item_index.faiss")
hist_tensors = build_hist_matrix(train_df, max_len=MAX_SEQ_LEN, pad_idx=PAD_IDX,num_users=num_users).to(device)

In [221]:
hr_r, ndcg_r = evaluate.evaluate_random(test_loader, item_pool ,top_k=top_k)
print(f"Random HR@{top_k} = {hr_r:.4f}, nDCG@{top_k} = {ndcg_r:.4f}")
hr_p, ndcg_p = evaluate.evaluate_popular(test_loader, train_df,top_k=top_k)
print(f"Popular HR@{top_k} = {hr_p:.4f}, nDCG@{top_k} = {ndcg_p:.4f}")
hr_m, ndcg_m = evaluate.evaluate_seq_model(test_loader, model, faiss_index, device,top_k=top_k,hist_tensors=hist_tensors)
print(f"Model   HR@{top_k} = {hr_m:.4f}, nDCG@{top_k} = {ndcg_m:.4f}")

Random HR@10 = 0.0005, nDCG@10 = 0.0002
Popular HR@10 = 0.0029, nDCG@10 = 0.0014
Model   HR@10 = 0.0487, nDCG@10 = 0.0245
