In [None]:
from google.colab import drive
import shutil
import os
def copy_from_drive(src_path, dst_path):
    """Copy a file or a directory tree to ``dst_path`` unless it already exists.

    The copy is skipped when ``dst_path`` exists, so re-running the cell is
    cheap and ``shutil.copytree`` is never asked to write into an existing
    directory.

    Args:
        src_path: Path of the file or directory to copy.
        dst_path: Path the copy is written to.

    Returns:
        None. The outcome is reported with ``print`` when the copy is skipped
        or when the source cannot be found.
    """
    if os.path.exists(dst_path):
        print(f"skip:{dst_path} exists")
        return

    if os.path.isdir(src_path):
        # copytree creates dst_path (and missing parents) itself.
        shutil.copytree(src_path, dst_path)
    elif os.path.isfile(src_path):
        # shutil.copy does not create missing parent directories, so do it here.
        parent_dir = os.path.dirname(dst_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        shutil.copy(src_path, dst_path)
    else:
        # A missing source used to be ignored without any message, which only
        # surfaced later as an unrelated import or read error.
        print(f"warning:{src_path} not found, nothing copied to {dst_path}")

# Mount Google Drive first, then copy the helper package (`tool`) and the
# interaction CSV into /content. Each copy is skipped by copy_from_drive when
# its destination already exists.
drive.mount('/content/drive')
copy_from_drive('/content/drive/MyDrive/tool', '/content/tool')
copy_from_drive('/content/drive/MyDrive/MicroLens-50k_pairs.csv','/content/MicroLens-50k_pairs.csv')

In [1]:
import pandas as pd
from collections import Counter
import torch
from sklearn.model_selection import train_test_split
from networkx.readwrite.json_graph import adjacency
import random, math, time, os
import torch.nn.functional as F
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import random
from tool import preprocess
from tool import customdataset
from tool import evaluate
!pip install faiss-cpu
import faiss
import torch.nn as nn
import numpy as np
import torch
import torch.nn.functional as F
from datetime import datetime


In [2]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [3]:
# ---- Input data ----
path = 'MicroLens-50k_pairs.csv'    # interaction CSV, relative to the working directory

# ---- Column names ----
user, item, timestamp = 'user', 'item', 'timestamp'    # columns present in the CSV
user_id, item_id = 'user_id', 'item_id'                # columns added later for the re-indexed ids

# ---- Output location and evaluation / loader settings ----
save_dir = './embeddings'    # where item embeddings and the FAISS index are written
top_k = 10                   # cutoff used for HR@k / nDCG@k
num_workers = 10             # worker count handed to the test data loader
k_neg = 10                   # presumably negatives per positive; TODO confirm where it is consumed

In [4]:
# Load the interaction log through the project helper. The three returned
# values are unpacked as the interaction frame and (presumably) the number of
# distinct users and items; verify against tool.preprocess.openAndSort.
dataset_pd, num_users, num_items = preprocess.openAndSort(
    path,
    user_id=user,
    item_id=item,
    timestamp=timestamp,
)

Unnamed: 0,user,item,timestamp
0,36121,9580,1583378629552
1,26572,9580,1583436719018
2,37550,9580,1584412681021
3,14601,9580,1584848802432
4,15061,9580,1585388171106
5,6364,9580,1585390736041
6,3542,9580,1585404918503
7,21038,9580,1590144594477
8,12538,14631,1634867362929
9,47592,14631,1634872254913


In [None]:
# Split the interactions into a train and a test frame; the split strategy is
# implemented in tool.preprocess.split and is not visible in this notebook.
train_df, test_df = preprocess.split(dataset_pd,user, item, timestamp)
# Report how many interactions ended up on each side of the split.
print(f"Train size: {len(train_df)}")
print(f"Test size: {len(test_df)}")

In [5]:
# Re-index raw ids to contiguous integers starting at 0 so they can be used as
# row indices (embedding table, history matrix). Both dictionaries map
# raw id -> new id, numbered in the order returned by .unique().
user2id = {raw_user: new_idx for new_idx, raw_user in enumerate(dataset_pd[user].unique())}
item2id = {raw_item: new_idx for new_idx, raw_item in enumerate(dataset_pd[item].unique())}

# Attach the re-indexed ids to both splits as new columns.
train_df[user_id] = train_df[user].map(user2id)
test_df[user_id] = test_df[user].map(user2id)
train_df[item_id] = train_df[item].map(item2id)
test_df[item_id] = test_df[item].map(item2id)



<bound method DataFrame.count of          user   item      timestamp
0       36121   9580  1583378629552
1       26572   9580  1583436719018
2       37550   9580  1584412681021
3       14601   9580  1584848802432
4       15061   9580  1585388171106
...       ...    ...            ...
359703  48702   1363  1662984066647
359704  27203   7291  1662984082974
359705  29261  19649  1662984103874
359706  28341  19188  1662984123833
359707  38967   7254  1662984132429

[359708 rows x 3 columns]>

In [11]:
# ---------- Hyperparameters ----------
SEED          = 42
MAX_SEQ_LEN   = 20      # sequence length
EMBEDDING_DIM = 64      # item / user embedding dimension
N_HEADS       = 2       # number of heads in multi-head attention
N_LAYERS      = 2       # number of Transformer blocks
DROPOUT       = 0.2
NEG_SAMPLE    = 5
BATCH_SIZE    = 1024
EPOCHS        = 10
LR            = 1e-3
# -------------------------------------

# Seed every random number generator used by the notebook.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# ---------- Constants derived from variables defined earlier ----------
N_ITEMS = num_items + 1      # number of embedding rows (items plus the PAD row)
PAD_IDX = num_items          # dedicated padding id
# -----------------------------------------------------------------------


In [12]:
# ======== SASRec model ======== #
class SASRec(nn.Module):
    """Transformer-encoder sequence model over item-id histories.

    An item embedding plus a learned position embedding is fed through a
    causally masked ``nn.TransformerEncoder``; the hidden state at the last
    sequence position is returned as the representation of the sequence.

    Args:
        n_items: Number of rows in the item embedding table (including PAD).
        embedding_dim: Width of item and position embeddings.
        n_heads: Attention heads per encoder layer.
        n_layers: Number of encoder layers.
        max_len: Longest sequence the model accepts (size of the position
            table and of the causal mask).
        pad_idx: Item id used for padding; its embedding row is the
            ``padding_idx`` of the table and it is masked out as attention key.
        dropout: Dropout probability for the input and the encoder layers.
    """

    def __init__(self,
                 n_items=N_ITEMS,
                 embedding_dim=EMBEDDING_DIM,
                 n_heads=N_HEADS,
                 n_layers=N_LAYERS,
                 max_len=MAX_SEQ_LEN,
                 pad_idx=PAD_IDX,
                 dropout=DROPOUT):
        super().__init__()
        # Remember the padding id so forward() masks with the value this
        # instance was built with rather than with a notebook-level global.
        self.pad_idx = pad_idx
        self.embedding = nn.Embedding(n_items, embedding_dim, padding_idx=pad_idx)
        self.pos_emb  = nn.Embedding(max_len, embedding_dim)
        self.dropout  = nn.Dropout(dropout)

        enc_layer = nn.TransformerEncoderLayer(
            d_model=embedding_dim,
            nhead=n_heads,
            dim_feedforward=embedding_dim*4,
            dropout=dropout,
            batch_first=True,
            activation='gelu')
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=n_layers)

        # Pre-computed causal mask: True strictly above the diagonal, i.e. a
        # position may not attend to later positions.
        self.register_buffer(
            "mask", torch.triu(torch.ones(max_len, max_len), diagonal=1).bool())

    def forward(self, seq):                       # seq : (B,T)
        """Encode a batch of item-id sequences.

        Args:
            seq: Long tensor of shape (B, T) holding item ids, with
                ``pad_idx`` at padded positions. T must not exceed ``max_len``.

        Returns:
            Tensor of shape (B, D): the encoder output at the last position.
        """
        B, T = seq.size()
        item_e = self.embedding(seq)               # (B,T,D)
        pos_ids = torch.arange(T, device=seq.device).unsqueeze(0).expand(B, -1)
        x = item_e + self.pos_emb(pos_ids)        # add the position encoding
        x = self.dropout(x)

        # Causal attention mask, cropped to the actual sequence length.
        causal_mask = self.mask[:T, :T]
        # NOTE(review): with left-padded input the leading positions have every
        # key masked (causal + padding); confirm that the installed PyTorch
        # version produces finite outputs for such rows.
        x = self.encoder(x, src_key_padding_mask=(seq == self.pad_idx), mask=causal_mask)
        h = x[:, -1, :]                           # last-position vector (B,D)
        return h

    def get_items_embedding(self,item_ids,l2_norm=True):
        """Look up item embeddings, optionally L2-normalised.

        Args:
            item_ids: Long tensor of item ids of any shape, e.g. (B,) or (B, k).
            l2_norm: When True, scale every embedding vector to unit L2 norm.

        Returns:
            Tensor of shape ``item_ids.shape + (D,)``.
        """
        i_vec = self.embedding(item_ids)          # (..., emb_dim)
        if l2_norm:
            # Normalise over the embedding axis. dim=1 is only correct for 2-D
            # output; for (B, k, D) it would normalise across the k items.
            i_vec = F.normalize(i_vec, p=2, dim=-1)
        return i_vec

    def save_embeddings(self, num_users, num_items, device, save_dir='./embeddings'):
        """Export normalised item embeddings and a FAISS inner-product index.

        Writes ``item_embeddings.npy`` and ``item_index.faiss`` into
        ``save_dir`` (created if missing). The model is switched to eval mode
        and moved to ``device`` as a side effect.

        Args:
            num_users: Unused; kept for interface compatibility.
            num_items: Embeddings for ids ``0 .. num_items - 1`` are exported.
            device: Device used for the embedding lookup.
            save_dir: Output directory.
        """
        import os
        import faiss
        os.makedirs(save_dir, exist_ok=True)

        self.eval()
        self.to(device)

        item_ids = torch.arange(num_items, dtype=torch.long, device=device)

        with torch.no_grad():
            item_embeds = self.get_items_embedding(item_ids, l2_norm=True)

        item_embeds = item_embeds.cpu().numpy().astype(np.float32)

        # Save the raw vectors.
        np.save(f"{save_dir}/item_embeddings.npy", item_embeds)

        # Build the FAISS index (inner product on unit vectors).
        dim = item_embeds.shape[1]
        index = faiss.IndexFlatIP(dim)
        index.add(item_embeds)

        faiss.write_index(index, f"{save_dir}/item_index.faiss")
        # Only item vectors are written; the old message also claimed user embeddings.
        print("Saved item embeddings and FAISS index.")



In [14]:
# ======== Training loop ======== #
def train_model(model, train_df, epochs,lr , batch_size, test_df=None, device=None, k_neg=1):
    """Train ``model`` with a pairwise BPR loss on sampled negatives.

    Args:
        model: Module whose ``forward(hist_ids)`` returns a (B, d) sequence
            vector and which offers ``get_items_embedding(ids)``.
        train_df: Training interactions handed to the sequence/negative loader.
        epochs: Number of passes over the loader.
        lr: Adam learning rate.
        batch_size: Batch size for the loader.
        test_df: Accepted for interface compatibility; not used here.
        device: Target device; defaults to CUDA when available, else CPU.
        k_neg: Negatives sampled per positive, forwarded to the loader.
            Defaults to 1, the value that used to be hard-coded.

    Returns:
        None. One progress line is printed per epoch.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Make sure parameters live on the same device the batches are moved to.
    model.to(device)

    # NOTE(review): the loader receives the raw `user` column as user key but
    # the re-indexed `item_id` column for items — confirm this is intended.
    train_loader  = customdataset.build_seq_neg_loader(train_df, batch_size=batch_size,
                         shuffle=True, num_workers=10,pad_idx=PAD_IDX,max_len=MAX_SEQ_LEN,user_id=user,item_id=item_id,k_neg=k_neg,neg_sampling='pop',alpha=0.75,num_items=num_items)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(1, epochs + 1):
        model.train()
        dt_start = datetime.now()
        epoch_loss = 0.0

        for batch in train_loader:

            # 1. Unpack history, positive target and sampled negatives.
            hist_ids, pos_ids, neg_ids = batch
            # 2. Move the batch to the target device.
            hist_ids = hist_ids.to(device)
            pos_ids  = pos_ids.to(device)
            neg_ids  = neg_ids.to(device)

            # 3. Forward pass.
            user_emb = model(hist_ids)                          # (B, d)
            pos_emb  = model.get_items_embedding(pos_ids)       # (B, d)
            pos_score = (user_emb * pos_emb).sum(-1)            # (B,)

            # Accept negatives as (B,) or (B, k) — assumes the loader yields
            # one of these layouts; TODO confirm. Embeddings are looked up on
            # the flattened ids so each vector is normalised on its own.
            if neg_ids.dim() == 1:
                neg_ids = neg_ids.unsqueeze(1)                  # (B, 1)
            n_rows, n_neg = neg_ids.shape
            neg_emb = model.get_items_embedding(neg_ids.reshape(-1))
            neg_emb = neg_emb.reshape(n_rows, n_neg, -1)        # (B, k, d)
            neg_score = (user_emb.unsqueeze(1) * neg_emb).sum(-1)   # (B, k)

            # 4. BPR (pairwise) loss, averaged over all (positive, negative)
            # pairs. The loss no longer branches on the notebook-level `k_neg`
            # global, which could disagree with what the loader sampled.
            loss = -F.logsigmoid(pos_score.unsqueeze(1) - neg_score).mean()

            # 5. Backward pass and parameter update.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        # Epoch log; guard against an empty loader.
        avg_loss = epoch_loss / max(len(train_loader), 1)
        dt_end = datetime.now()
        dt = dt_end - dt_start

        # The optimised objective is BPR, so label it as such.
        print(f"[Epoch {epoch:02d}/{epochs}] avg BPR Loss = {avg_loss:.4f}, time = {dt.total_seconds():.2f}s")

    return


In [15]:
def build_hist_matrix(df,
                      num_users,
                      max_len=MAX_SEQ_LEN,
                      pad_idx=PAD_IDX,
                      user_col=user_id,
                      item_col=item_id):
    """
    Build a LongTensor of shape (num_users, max_len) with per-user histories.

    Row i holds the most recent ``max_len`` items of user i in the order they
    appear in ``df``, right-aligned and padded on the left with ``pad_idx``.
    Users without any interaction in ``df`` keep a row of ``pad_idx`` only.
    """
    # Start from an all-PAD matrix and overwrite the tail of each known row.
    padded = torch.empty((num_users, max_len), dtype=torch.long).fill_(pad_idx)

    per_user_items = df.groupby(user_col)[item_col]
    for row_idx, interactions in per_user_items:
        recent = interactions.to_numpy()[-max_len:]     # keep the latest max_len entries
        n_recent = len(recent)
        padded[row_idx, -n_recent:] = torch.as_tensor(recent, dtype=torch.long)

    return padded    # (U, T)


In [25]:
# Build the model from the notebook-level sizes, place it on the target
# device, and train it.
model = SASRec(
    n_items=N_ITEMS,
    embedding_dim=EMBEDDING_DIM,
    pad_idx=PAD_IDX,
).to(device)
train_model(
    model=model,
    train_df=train_df,
    test_df=test_df,
    epochs=50,
    batch_size=1024,
    lr=LR,
    device=device,
)

50000

In [17]:
    # ------------------ Training ------------------
# NOTE(review): this cell repeats the training call of the previous cell on
# the same `model` object (it is not re-created here), so running the notebook
# top to bottom trains for a further 50 epochs. Confirm this is intended or
# drop one of the two cells.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
train_model(model=model,epochs=50, train_df=train_df,batch_size=1024,lr=LR,test_df=test_df,device=device)


[Ep 1] step 1/508 | loss 4.4320
[Ep 1] step 100/508 | loss 3.3622
[Ep 1] step 200/508 | loss 2.3144
[Ep 1] step 300/508 | loss 1.7428
[Ep 1] step 400/508 | loss 1.2451
[Ep 1] step 500/508 | loss 0.9118
Ep 1 done | avg loss 2.1889 | 197.6s

[Ep 2] step 1/508 | loss 0.9144
[Ep 2] step 100/508 | loss 0.7561
[Ep 2] step 200/508 | loss 0.7230
[Ep 2] step 300/508 | loss 0.6795
[Ep 2] step 400/508 | loss 0.6865
[Ep 2] step 500/508 | loss 0.6798
Ep 2 done | avg loss 0.7240 | 190.1s

[Ep 3] step 1/508 | loss 0.6465
[Ep 3] step 100/508 | loss 0.6473
[Ep 3] step 200/508 | loss 0.6133
[Ep 3] step 300/508 | loss 0.6335
[Ep 3] step 400/508 | loss 0.6117
[Ep 3] step 500/508 | loss 0.5873
Ep 3 done | avg loss 0.6296 | 227.0s

[Ep 4] step 1/508 | loss 0.6314
[Ep 4] step 100/508 | loss 0.6080
[Ep 4] step 200/508 | loss 0.5872
[Ep 4] step 300/508 | loss 0.5316
[Ep 4] step 400/508 | loss 0.5851
[Ep 4] step 500/508 | loss 0.5617
Ep 4 done | avg loss 0.5716 | 205.8s

[Ep 5] step 1/508 | loss 0.5435
[Ep 5] s

In [None]:
# Export the L2-normalised item embeddings (.npy) and a FAISS inner-product
# index into save_dir; the evaluation cell below reads the index back.
model.save_embeddings(num_users=num_users,num_items=num_items,device=device,save_dir=save_dir)


In [None]:
# Inputs shared by the evaluation runs: the test loader, the candidate item
# ids, the FAISS index written by save_embeddings, and the per-user training
# histories (one padded row per user) on the target device.
test_loader = customdataset.build_test_loader(
    test_df,
    num_items,
    user_col=user_id,
    item_col=item_id,
    batch_size=1024,
    num_workers=num_workers,
)
item_pool = [*range(num_items)]
faiss_index = faiss.read_index(f"{save_dir}/item_index.faiss")
hist_tensors = build_hist_matrix(
    train_df,
    num_users=num_users,
    max_len=MAX_SEQ_LEN,
    pad_idx=PAD_IDX,
).to(device)

In [30]:
# Baseline 1: random recommendations drawn from item_pool.
hr_r, ndcg_r = evaluate.evaluate_random(test_loader, item_pool ,top_k=top_k)
print(f"Random HR@{top_k} = {hr_r:.4f}, nDCG@{top_k} = {ndcg_r:.4f}")
# Baseline 2: popularity-based recommendations; train_df is passed in,
# presumably as the source of the popularity counts (see tool.evaluate).
hr_p, ndcg_p = evaluate.evaluate_popular(test_loader, train_df,top_k=top_k)
print(f"Popular HR@{top_k} = {hr_p:.4f}, nDCG@{top_k} = {ndcg_p:.4f}")
# Trained sequence model: receives the per-user histories and the FAISS item
# index; how they are combined is implemented in tool.evaluate.
hr_m, ndcg_m = evaluate.evaluate_seq_model(test_loader, model, faiss_index, device,top_k=top_k,hist_tensors=hist_tensors)
print(f"Model   HR@{top_k} = {hr_m:.4f}, nDCG@{top_k} = {ndcg_m:.4f}")

SASRec   Hit@10=0.9847  NDCG@10=0.9845


KeyboardInterrupt: 