In [None]:
from google.colab import drive
import shutil
import os
def copy_from_drive(src_path, dst_path):
    """Copy a file or a directory tree from ``src_path`` to ``dst_path`` once.

    The copy is skipped when ``dst_path`` already exists, so re-running the
    cell does not overwrite (or fail on) a previous copy.

    Args:
        src_path: Path of the file or directory to copy.
        dst_path: Destination path. For a single file, missing parent
            directories are created first.

    Returns:
        None. Progress and problems are reported with ``print``.
    """
    if os.path.exists(dst_path):
        print(f"skip:{dst_path} exists")
        return

    if os.path.isdir(src_path):
        # copytree creates dst_path (and any missing parents) itself.
        shutil.copytree(src_path, dst_path)
    elif os.path.isfile(src_path):
        # shutil.copy does not create directories, so make the parent first.
        parent_dir = os.path.dirname(dst_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        shutil.copy(src_path, dst_path)
    else:
        # Previously a missing source was ignored silently, which hid mount
        # failures and typos until a later import / read blew up.
        print(f"warning:{src_path} not found, nothing copied to {dst_path}")

# Mount Google Drive, then pull the helper package and the dataset into the
# local Colab filesystem (each copy is skipped when the target already exists).
drive.mount('/content/drive')
for _src, _dst in (
    ('/content/drive/MyDrive/tool', '/content/tool'),
    ('/content/drive/MyDrive/MicroLens-50k_pairs.csv', '/content/MicroLens-50k_pairs.csv'),
):
    copy_from_drive(_src, _dst)

In [1]:
import pandas as pd
from collections import Counter
import torch
from sklearn.model_selection import train_test_split
from networkx.readwrite.json_graph import adjacency
import random

import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tool import preprocess
from tool import customdataset
from tool import evaluate
!pip install faiss-cpu
import faiss
import torch.nn as nn
import numpy as np
import torch
import torch.nn.functional as F
from datetime import datetime




In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# Interaction log (CSV), resolved relative to the current working directory.
path = 'MicroLens-50k_pairs.csv'

# Column names of the raw identifiers and the event time in the CSV.
user, item, timestamp = 'user', 'item', 'timestamp'
# Column names of the contiguous, re-indexed ids added to the train/test frames.
user_id, item_id = 'user_id', 'item_id'

# Directory that receives the exported item embeddings and the FAISS index.
save_dir = './embeddings'

# top_k: cutoff for HR@k / nDCG@k; num_workers: DataLoader worker processes;
# k_neg: NOTE(review) not referenced by any visible cell - confirm it is still needed.
top_k = num_workers = k_neg = 10

In [4]:
# Load the interaction log through the project helper, which also returns the
# user and item counts. The timestamp column is passed via the `timestamp`
# config constant (as the split cell already does) instead of a duplicated
# string literal, so the column name is defined in exactly one place.
dataset_pd, num_users, num_items = preprocess.openAndSort(
    path, user_id=user, item_id=item, timestamp=timestamp
)

Unnamed: 0,user,item,timestamp
0,36121,9580,1583378629552
1,26572,9580,1583436719018
2,37550,9580,1584412681021
3,14601,9580,1584848802432
4,15061,9580,1585388171106
5,6364,9580,1585390736041
6,3542,9580,1585404918503
7,21038,9580,1590144594477
8,12538,14631,1634867362929
9,47592,14631,1634872254913


In [6]:
# Split the interactions into train / test frames with the project helper.
# NOTE(review): the split strategy lives in tool.preprocess and is not visible
# here - presumably a per-user chronological hold-out; confirm.
train_df, test_df = preprocess.split(dataset_pd,user, item, timestamp)
print(f"Train size: {len(train_df)}")
print(f"Test size: {len(test_df)}")

In [7]:
# Map raw (old) ids -> contiguous 0-based (new) ids; the new ids are what the
# model uses as item indices. The maps are built from the full dataset so that
# train and test share the same numbering.
user2id = {raw: new for new, raw in enumerate(dataset_pd[user].unique())}
item2id = {raw: new for new, raw in enumerate(dataset_pd[item].unique())}

# Attach the new ids with .assign(), which returns fresh frames. Writing the
# columns in place could hit a slice of `dataset_pd` (SettingWithCopyWarning)
# depending on how the split was produced; this form is also safe to re-run.
train_df = train_df.assign(**{
    user_id: train_df[user].map(user2id),
    item_id: train_df[item].map(item2id),
})
test_df = test_df.assign(**{
    user_id: test_df[user].map(user2id),
    item_id: test_df[item].map(item2id),
})

# An unmapped raw id becomes NaN and silently turns the id column into floats,
# which only fails much later inside the model - so check it here.
assert train_df[[user_id, item_id]].notna().all().all(), "unmapped ids in train_df"
assert test_df[[user_id, item_id]].notna().all().all(), "unmapped ids in test_df"



In [11]:
# ---------- Hyperparameters ----------
MAX_SEQ_LEN   = 20          # length of the input item sequence
EMBEDDING_DIM = 64          # item / user embedding dimension
HIDDEN_SIZE   = 64          # GRU hidden size (may equal EMBEDDING_DIM)
NEG_SAMPLE    = 5           # negatives sampled per positive during training
                            # NOTE(review): not referenced by the in-batch softmax loop in train_model
# NOTE(review): the training cell passes batch_size=1024 and epochs=50 explicitly,
# so BATCH_SIZE and EPOCHS are not the values actually used there.
BATCH_SIZE    = 256
EPOCHS        = 10
LR            = 1e-3
SEED          = 42
# -------------------------------------

# ---------- Random seeds ----------
# Seed torch, NumPy and the stdlib RNG with the same value.
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
# ----------------------------------

# ---------- Constants ----------
# NOTE(review): assumes real item ids lie in 0..num_items-1 - confirm against preprocess.openAndSort.
PAD_IDX  = num_items              # dedicated padding id (meant not to clash with real item ids)
N_ITEMS  = num_items + 1          # number of Embedding rows (including PAD)
# -------------------------------



In [12]:
# ======== 模型 ======== #
class GRU4RecBPR(nn.Module):
    """GRU sequence encoder that maps an item-id history to a user vector.

    Users and items share one vector space: ``forward`` returns a vector of
    size ``embedding_dim`` that is meant to be scored (dot product) against
    rows of ``self.embedding``.

    Args:
        n_items: Number of rows of the embedding table, padding row included.
        embedding_dim: Size of an item embedding and of the returned user vector.
        hidden_size: GRU hidden size. When it differs from ``embedding_dim`` a
            bias-free linear layer projects the GRU state back to
            ``embedding_dim``; otherwise the state is returned unchanged.
        num_layers: Number of stacked GRU layers.
        pad_idx: Id of the padding item (passed to ``nn.Embedding`` as
            ``padding_idx``, so that row receives no gradient).
    """

    def __init__(self, n_items=N_ITEMS,
                 embedding_dim=EMBEDDING_DIM, hidden_size=HIDDEN_SIZE,
                 num_layers=1, pad_idx=PAD_IDX):
        super().__init__()
        self.embedding = nn.Embedding(n_items, embedding_dim, padding_idx=pad_idx)
        self.gru = nn.GRU(embedding_dim, hidden_size,
                          num_layers=num_layers, batch_first=True)
        self.proj = (nn.Linear(hidden_size, embedding_dim, bias=False)
                     if hidden_size != embedding_dim else nn.Identity())

    def forward(self, seq):
        """Encode a batch of item-id sequences into user vectors.

        Args:
            seq: Long tensor of item ids, shape (B, T).

        Returns:
            Tensor of shape (B, embedding_dim).
        """
        emb = self.embedding(seq)             # (B, T, D)
        out, _ = self.gru(emb)                # (B, T, H)
        # The last time step is used as the summary, so sequences are expected
        # to be left-padded (most recent item in the last position).
        h = out[:, -1, :]                     # (B, H)
        return self.proj(h)                   # (B, D)

    def get_items_embedding(self, item_ids, l2_norm=True):
        """Look up item vectors in the space ``forward`` projects users into.

        Args:
            item_ids: Long tensor of item ids.
            l2_norm: When True, L2-normalise each vector along the last axis.

        Returns:
            Tensor of shape ``item_ids.shape + (embedding_dim,)``.
        """
        # The model owns a single item table (`self.embedding`) and no item
        # tower; the previous code referenced `self.item_emb` / `self.mlp_item`,
        # which are never defined and raised AttributeError.
        item_vec = self.embedding(item_ids)
        if l2_norm:
            item_vec = F.normalize(item_vec, p=2, dim=-1)
        return item_vec

    def save_embeddings(self, num_users, num_items, device,
                        save_dir='./embeddings', l2_norm=True):
        """Export item embeddings and an inner-product FAISS index.

        Writes ``item_embeddings.npy`` and ``item_index.faiss`` into
        ``save_dir``. Row ``i`` of both corresponds to item id ``i`` for
        ``i`` in ``range(num_items)``.

        Args:
            num_users: Unused; kept for interface compatibility.
            num_items: Number of item ids to export, starting at 0.
            device: Device the model is moved to for the lookup.
            save_dir: Output directory (created if missing).
            l2_norm: Whether to L2-normalise the exported vectors. With
                normalised items the index ranks by cosine-like scores;
                NOTE(review): if training scores with raw dot products, pass
                ``l2_norm=False`` so retrieval ranks the same way.

        Side effects:
            Leaves the model in eval mode and on ``device``.
        """
        import os
        import faiss
        os.makedirs(save_dir, exist_ok=True)

        self.eval()
        self.to(device)

        item_ids = torch.arange(num_items, dtype=torch.long, device=device)

        with torch.no_grad():
            item_embeds = self.get_items_embedding(item_ids, l2_norm=l2_norm)

        # FAISS expects contiguous float32 arrays.
        item_embeds = np.ascontiguousarray(item_embeds.cpu().numpy(), dtype=np.float32)

        # Save the raw vectors.
        np.save(os.path.join(save_dir, "item_embeddings.npy"), item_embeds)

        # Build a flat (exact) inner-product index.
        dim = item_embeds.shape[1]
        index = faiss.IndexFlatIP(dim)
        index.add(item_embeds)

        faiss.write_index(index, os.path.join(save_dir, "item_index.faiss"))
        # Only item vectors are exported; the old message also claimed user embeddings.
        print("Saved item embeddings and FAISS index.")


In [13]:
# # ======== 损失 ======== #
# def bpr_loss(pos_score, neg_score):
#     """
#     BPR  pairwise  loss
#     pos_score : (B,)
#     neg_score : (B, n_neg)
#     """
#     diff = pos_score.unsqueeze(-1) - neg_score
#     return -torch.log(torch.sigmoid(diff) + 1e-8).mean()

In [14]:
# ======== 训练流程 ======== #
def train_model(model, train_df, epochs,lr , batch_size, test_df=None, device=None, num_workers=10):
    """Train ``model`` with an in-batch softmax objective.

    For every batch the user vector of row ``i`` is scored against the
    positive-item vectors of all rows; row ``i``'s own positive (the
    diagonal) is the target class and the other rows' positives act as
    negatives.

    Args:
        model: Sequence model; ``model(hist)`` must return (B, D) user vectors
            and ``model.embedding`` must be the item embedding table.
        train_df: Training interactions handed to ``customdataset.build_seq_loader``.
        epochs: Number of passes over the loader.
        lr: Adam learning rate.
        batch_size: Loader batch size.
        test_df: Unused; accepted for interface compatibility.
        device: Target device; defaults to CUDA when available, else CPU.
        num_workers: DataLoader worker processes (previously hard-coded to 10).

    Returns:
        None. One log line is printed per epoch.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Batches are moved to `device` below, so the parameters must live there too.
    model.to(device)

    train_loader = customdataset.build_seq_loader(
        train_df, batch_size=batch_size, shuffle=True, num_workers=num_workers,
        pad_idx=PAD_IDX, max_len=MAX_SEQ_LEN, user_id=user, item_id=item_id)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # Guard the average against an empty loader.
    n_batches = max(len(train_loader), 1)

    for epoch in range(1, epochs + 1):
        model.train()
        dt_start = datetime.now()
        epoch_loss = 0.0

        for hist, pos in train_loader:
            hist, pos = hist.to(device), pos.to(device)

            # 1. Forward pass: one user vector per row.
            h = model(hist)                                   # (B, D)

            # 2. Positive-item vectors. The loader is expected to yield item
            #    ids, which must be looked up in the embedding table before
            #    scoring; multiplying by the raw ids (as before) fails on dtype
            #    and shape. Already-embedded float input is used unchanged.
            if torch.is_floating_point(pos):
                pos_vec = pos                                 # (B, D)
            else:
                pos_vec = model.embedding(pos.reshape(-1))    # (B, D)

            # 3. Score every user against every positive in the batch.
            logits = torch.matmul(h, pos_vec.T)               # (B, B)

            # 4. The correct item of row i sits on the diagonal.
            #    NOTE(review): an item that appears twice in a batch is also
            #    treated as a negative for its duplicate - consider masking.
            labels = torch.arange(logits.size(0), device=logits.device)

            # 5. Cross-entropy over the in-batch candidates.
            loss = F.cross_entropy(logits, labels)

            # 6. Backward pass and parameter update.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        # Per-epoch log.
        avg_loss = epoch_loss / n_batches
        dt = datetime.now() - dt_start

        print(f"[Epoch {epoch:02d}/{epochs}] avg InBatch Softmax Loss = {avg_loss:.4f}, time = {dt.total_seconds():.2f}s")

    return


In [15]:
# ======== 构建 (user → 历史张量) ======== #
def build_hist_tensors(df, max_len=MAX_SEQ_LEN, pad_idx=PAD_IDX,user_id=user_id,item_id=item_id):
    """Build one fixed-length, left-padded history tensor per user.

    For every group of ``df`` keyed by the ``user_id`` column, the last
    ``max_len`` values of the ``item_id`` column (in the row order of ``df``)
    are kept and padded on the left with ``pad_idx``.
    NOTE(review): presumably ``df`` is already in chronological order - verify.

    Returns:
        dict mapping each user key to a long tensor of shape (1, max_len).
    """
    def _to_padded_row(item_series):
        recent = item_series.tolist()[-max_len:]
        n_pad = max_len - len(recent)
        # Wrapping the list once more yields the leading batch axis of size 1.
        return torch.tensor([[pad_idx] * n_pad + recent], dtype=torch.long)

    per_user_items = df.groupby(user_id)[item_id]
    return {uid: _to_padded_row(items) for uid, items in per_user_items}

In [16]:

# def evaluate_ranking(
#         test_df,              # DataFrame, 必含 user_id / item_id
#         train_df,             # DataFrame, 用来构建用户→已交互物品集合
#         score_fn,             # callable(users_tensor, items_tensor) → np.array
#         num_items,            # 物品总数
#         k=10,                 # Hit@K / NDCG@K
#         num_neg=100,          # 每个正样本采多少负样本
#         user_col='user_id',
#         item_col='item_id',
#         seed=42
#     ):
#     """
#     不依赖具体模型，只要提供 score_fn 就能评估。
#     score_fn: 接收 (user_tensor, item_tensor) 并返回同长度的 Numpy 分数向量。
#     """
#     random.seed(seed)
#     np.random.seed(seed)
#     torch.manual_seed(seed)
#
#     # 用户历史，用于采负样本 & 过滤
#     train_user_dict = (
#         train_df.groupby(user_col)[item_col].apply(set).to_dict()
#     )
#
#     hits, ndcgs = [], []
#
#     for _, row in test_df.iterrows():
#         u = int(row[user_col])
#         pos_item = int(row[item_col])
#
#         # ---------- 负采样 ----------
#         neg_items = set()
#         while len(neg_items) < num_neg:
#             neg = random.randint(0, num_items - 1)
#             if neg not in train_user_dict.get(u, set()) and neg != pos_item:
#                 neg_items.add(neg)
#
#         item_candidates = list(neg_items) + [pos_item]
#
#         # ---------- 评分 ----------
#         users_t  = torch.LongTensor([u] * len(item_candidates))
#         items_t  = torch.LongTensor(item_candidates)
#         scores   = score_fn(users_t, items_t)        # ← 只依赖 score_fn
#         rank_idx = np.argsort(scores)[::-1]          # 降序
#         ranked_items = [item_candidates[i] for i in rank_idx]
#
#         # ---------- 指标 ----------
#         if pos_item in ranked_items[:k]:
#             hits.append(1)
#             rank_pos = ranked_items.index(pos_item)
#             ndcgs.append(1 / np.log2(rank_pos + 2))
#         else:
#             hits.append(0)
#             ndcgs.append(0)
#
#     hit_rate = float(np.mean(hits))
#     ndcg     = float(np.mean(ndcgs))
#     return hit_rate, ndcg

In [17]:
    # ------------------ 训练 ------------------
model = GRU4RecBPR(n_items=N_ITEMS,
                 embedding_dim=EMBEDDING_DIM, hidden_size=HIDDEN_SIZE,
                 num_layers=1, pad_idx=PAD_IDX)
# nn.Module.to() returns the module itself, so its result must be bound to
# `model`, not `device`. Rebinding `device` to the model made every later
# `.to(device)` / `device=device` receive an nn.Module instead of a device.
model = model.to(device)
train_model(model=model,epochs=50, train_df=train_df,batch_size=1024,lr=LR,test_df=test_df,device=device)

[Epoch 1] Step 1/1015
[Epoch 1] Step 100/1015
[Epoch 1] Step 200/1015
[Epoch 1] Step 300/1015
[Epoch 1] Step 400/1015
[Epoch 1] Step 500/1015
[Epoch 1] Step 600/1015
[Epoch 1] Step 700/1015
[Epoch 1] Step 800/1015
[Epoch 1] Step 900/1015
[Epoch 1] Step 1000/1015
Epoch 01 | BPR loss = 0.7656
[Epoch 2] Step 1/1015
[Epoch 2] Step 100/1015
[Epoch 2] Step 200/1015
[Epoch 2] Step 300/1015
[Epoch 2] Step 400/1015
[Epoch 2] Step 500/1015
[Epoch 2] Step 600/1015
[Epoch 2] Step 700/1015
[Epoch 2] Step 800/1015
[Epoch 2] Step 900/1015
[Epoch 2] Step 1000/1015
Epoch 02 | BPR loss = 0.6004
[Epoch 3] Step 1/1015
[Epoch 3] Step 100/1015
[Epoch 3] Step 200/1015
[Epoch 3] Step 300/1015
[Epoch 3] Step 400/1015
[Epoch 3] Step 500/1015
[Epoch 3] Step 600/1015
[Epoch 3] Step 700/1015
[Epoch 3] Step 800/1015
[Epoch 3] Step 900/1015
[Epoch 3] Step 1000/1015
Epoch 03 | BPR loss = 0.5054
[Epoch 4] Step 1/1015
[Epoch 4] Step 100/1015
[Epoch 4] Step 200/1015
[Epoch 4] Step 300/1015
[Epoch 4] Step 400/1015
[Epoch

In [None]:
# Export the trained item embeddings and the FAISS index into `save_dir`
# (item_embeddings.npy and item_index.faiss, see GRU4RecBPR.save_embeddings).
model.save_embeddings(num_users=num_users,num_items=num_items,device=device,save_dir=save_dir)

In [None]:
# Inputs shared by the evaluation runs below.
# Test batches are built by the project helper from the re-indexed id columns.
test_loader = customdataset.build_test_loader(test_df, num_items ,user_col = user_id, item_col = item_id, batch_size=1024, num_workers=num_workers)
# All real item ids (0..num_items-1); passed to the random baseline.
item_pool = list(range(num_items))
# Index file written by model.save_embeddings().
faiss_index = faiss.read_index(f"{save_dir}/item_index.faiss")
# Per-user left-padded training histories, keyed by the re-indexed user id.
hist_tensors = build_hist_tensors(train_df, max_len=MAX_SEQ_LEN, pad_idx=PAD_IDX)

In [None]:
# Report HR@top_k and nDCG@top_k for two baselines (random, popularity) and for
# the trained sequence model, all on the same test loader.
# NOTE(review): the metric definitions live in tool.evaluate and are not visible here.
hr_r, ndcg_r = evaluate.evaluate_random(test_loader, item_pool ,top_k=top_k)
print(f"Random HR@{top_k} = {hr_r:.4f}, nDCG@{top_k} = {ndcg_r:.4f}")
hr_p, ndcg_p = evaluate.evaluate_popular(test_loader, train_df,top_k=top_k)
print(f"Popular HR@{top_k} = {hr_p:.4f}, nDCG@{top_k} = {ndcg_p:.4f}")
# The model is evaluated by retrieving from the FAISS item index with the
# per-user history tensors built from the training data.
hr_m, ndcg_m = evaluate.evaluate_seq_model(test_loader, model, faiss_index, device,top_k=top_k,hist_tensors=hist_tensors)
print(f"Model   HR@{top_k} = {hr_m:.4f}, nDCG@{top_k} = {ndcg_m:.4f}")

In [18]:
# def make_popularity_score_fn(train_df, item_col='item_id'):
#     item_cnt = Counter(train_df[item_col])
#     default_score = min(item_cnt.values()) - 1  # 给没出现过的物品一个更低分
#     def _score_fn(users_t, items_t):
#         return np.array([item_cnt.get(int(i), default_score) for i in items_t])
#     return _score_fn

In [19]:
# def random_score_fn(users_t, items_t):
#     # 随机给每个 items_t 一个分数；users_t 不使用，但必须接收
#     return np.random.rand(len(items_t))

In [20]:
# # ======== make_score_fn，用于 evaluate_ranking ======== #
# def make_score_fn(model, hist_tensors, device="cpu"):
#     model.eval()
#     item_emb_table = model.embedding.weight.data  # 含 PAD
#     @torch.no_grad()
#     def score_fn(users_t, items_t):
#         users_t = users_t.to(device)
#         items_t = items_t.to(device)
#
#         # 拼接用户历史 batch
#         hist_batch = torch.cat([hist_tensors[int(u)] for u in users_t], dim=0).to(device)
#         h = model(hist_batch)                       # (B,D)
#         item_emb = item_emb_table[items_t]          # (B,D)
#         scores = (h * item_emb).sum(-1)             # 点积
#         return scores.cpu().numpy()
#     return score_fn

In [21]:

# # ---------------- GRU（或其他模型）------------
# score_fn_gru = make_score_fn(model,hist_tensors=build_hist_tensors(train_df),device=device)
# hit_gru, ndcg_gru = evaluate_ranking(
#     test_df, train_df, score_fn_gru,
#     num_items=num_items, k=10
# )
# print(f"GRU   Hit@10={hit_gru:.4f}  NDCG@10={ndcg_gru:.4f}")
#
# # ---------------- baseline：Popular ----------------
# pop_score_fn  = make_popularity_score_fn(train_df)
# hit_pop, ndcg_pop = evaluate_ranking(
#     test_df, train_df, pop_score_fn,
#     num_items=num_items, k=10
# )
# print(f"Popular  Hit@10={hit_pop:.4f}  NDCG@10={ndcg_pop:.4f}")
#
# # ---------------- baseline：Random -----------------
# hit_rand, ndcg_rand = evaluate_ranking(
#     test_df, train_df, random_score_fn,
#     num_items=num_items, k=10
# )
# print(f"Random   Hit@10={hit_rand:.4f}  NDCG@10={ndcg_rand:.4f}")



GRU   Hit@10=0.2895  NDCG@10=0.1460
Popular  Hit@10=0.2528  NDCG@10=0.1262
Random   Hit@10=0.0996  NDCG@10=0.0455
