In [1]:
import pandas as pd
from collections import Counter
import torch
from sklearn.model_selection import train_test_split
from networkx.readwrite.json_graph import adjacency
import random, math, time, os
import torch.nn.functional as F
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader



In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
dataset_pd = pd.read_csv('D:\\VideoRecSystem\\MicroLens\\DataSet\\MicroLens-50k_pairs.csv')
# dataset_pd = pd.read_csv('MicroLens-50k_pairs.csv')

In [4]:
dataset_pd.head(10)

Unnamed: 0,user,item,timestamp
0,36121,9580,1583378629552
1,26572,9580,1583436719018
2,37550,9580,1584412681021
3,14601,9580,1584848802432
4,15061,9580,1585388171106
5,6364,9580,1585390736041
6,3542,9580,1585404918503
7,21038,9580,1590144594477
8,12538,14631,1634867362929
9,47592,14631,1634872254913


In [5]:
dataset_pd.count

<bound method DataFrame.count of          user   item      timestamp
0       36121   9580  1583378629552
1       26572   9580  1583436719018
2       37550   9580  1584412681021
3       14601   9580  1584848802432
4       15061   9580  1585388171106
...       ...    ...            ...
359703  48702   1363  1662984066647
359704  27203   7291  1662984082974
359705  29261  19649  1662984103874
359706  28341  19188  1662984123833
359707  38967   7254  1662984132429

[359708 rows x 3 columns]>

In [6]:
user_counts = dataset_pd['user'].value_counts()
item_counts = dataset_pd['item'].value_counts()
# valid_users = user_counts[user_counts > 3].index
# valid_items = item_counts[item_counts > 3].index
# filtered_df = dataset_pd[dataset_pd['user'].isin(valid_users) & dataset_pd['item'].isin(valid_items)]
# filtered_df.count

In [7]:
# order by user,timestamp 
filtered_df = dataset_pd.sort_values(by=["user", "timestamp"])


In [8]:
def split(df, user_col='user', item_col='item', time_col='timestamp'):

    df = df.sort_values(by=[user_col, time_col])  # 按用户时间排序

    # 获取每个用户的最后一条记录作为 test
    test_df = df.groupby(user_col).tail(1)
    train_df = df.drop(index=test_df.index)

    # 过滤 test 中那些 user/item 不在 train 中的
    train_users = set(train_df[user_col])
    train_items = set(train_df[item_col])

    test_df = test_df[
        test_df[user_col].isin(train_users) &
        test_df[item_col].isin(train_items)
    ]

    return train_df.reset_index(drop=True), test_df.reset_index(drop=True)

In [9]:

train_df, test_df = split(filtered_df,user_col='user', item_col='item', time_col='timestamp')

print(f"Train size: {len(train_df)}")
print(f"Test size: {len(test_df)}")


Train size: 309708
Test size: 49424


In [10]:
# maintain a map from new id to old id, new id for constructing matrix
user2id = {u: i for i, u in enumerate(filtered_df['user'].unique())}
item2id = {i: j for j, i in enumerate(filtered_df['item'].unique())}

# apply to train_df and test_df
train_df['user_id'] = train_df['user'].map(user2id)
train_df['item_id'] = train_df['item'].map(item2id)
test_df['user_id'] = test_df['user'].map(user2id)
test_df['item_id'] = test_df['item'].map(item2id)

num_users = len(user2id)
num_items = len(item2id)


In [11]:
# ---------- 超参数 ----------
MAX_SEQ_LEN   = 20          # 输入序列长度
EMBEDDING_DIM = 64          # item & user 向量维度
NUM_BLOCK     = 3           # 残差层堆叠次数（每次含 dilation=[1,2,4,8]）
NEG_SAMPLE    = 5           # 训练时负采样个数
BATCH_SIZE    = 512         # 1650Ti 8G 推荐 512；显存紧张用 256
EPOCHS        = 10
LR            = 1e-3
SEED          = 42
DROPOUT       = 0.1
# ----------------------------

torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

# ---------- 常量 ----------
PAD_IDX  = num_items              # padding 专用 id，不与真实 item 冲突
N_ITEMS  = num_items + 1          # Embedding 行数（含 PAD）
ALL_ITEM_IDS = np.arange(num_items, dtype=np.int64)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# ----------------------------

# ======== Dataset (同前) ======== #
class NextItNetBPRDataset(Dataset):
    def __init__(self, df, max_len=MAX_SEQ_LEN, pad_idx=PAD_IDX, n_neg=NEG_SAMPLE):
        self.max_len, self.pad_idx, self.n_neg = max_len, pad_idx, n_neg
        self.inputs, self.targets = [], []
        for _, user_hist in df.groupby('user_id'):
            seq = user_hist['item_id'].tolist()
            for i in range(1, len(seq)):
                hist = seq[max(0, i - max_len): i]
                hist = [pad_idx]*(max_len - len(hist)) + hist   # 左侧 pad
                self.inputs.append(hist)
                self.targets.append(seq[i])
        self.inputs  = np.asarray(self.inputs,  dtype=np.int64)
        self.targets = np.asarray(self.targets, dtype=np.int64)

    def __len__(self):
        return len(self.targets)

    def _neg_sample(self, pos):
        negs = np.random.choice(ALL_ITEM_IDS, size=self.n_neg, replace=False)
        while (negs == pos).any():
            dup = negs == pos
            negs[dup] = np.random.choice(ALL_ITEM_IDS, size=dup.sum(), replace=False)
        return negs

    def __getitem__(self, idx):
        hist = torch.tensor(self.inputs[idx], dtype=torch.long)
        pos  = torch.tensor(self.targets[idx], dtype=torch.long)
        negs = torch.tensor(self._neg_sample(self.targets[idx]), dtype=torch.long)
        return hist, pos, negs

In [12]:
# ======== NextItNet 模型 ======== #
class DilatedResidualBlock(nn.Module):
    """
    一组 (dilated conv → ReLU → dilated conv) + residual
    """
    def __init__(self, d, dilation, dropout=DROPOUT):
        super().__init__()
        self.conv1 = nn.Conv1d(d, d, kernel_size=3, padding=dilation, dilation=dilation)
        self.conv2 = nn.Conv1d(d, d, kernel_size=3, padding=1, dilation=1)
        self.dropout = nn.Dropout(dropout)
        self.layernorm1 = nn.LayerNorm(d)
        self.layernorm2 = nn.LayerNorm(d)

    def forward(self, x):
        """
        x: (B, T, D)  → conv 需要 (B, D, T)
        """
        residual = x
        x = self.layernorm1(x)
        x = F.relu(self.conv1(x.transpose(1,2)).transpose(1,2))
        x = self.dropout(x)
        x = self.layernorm2(x)
        x = F.relu(self.conv2(x.transpose(1,2)).transpose(1,2))
        x = self.dropout(x)
        return x + residual


class NextItNet(nn.Module):
    """
    Embedding → N × [dilated residual blocks] → 取最后位置 hidden → (B, D)
    """
    def __init__(self, n_items=N_ITEMS, dim=EMBEDDING_DIM, n_blocks=NUM_BLOCK, pad_idx=PAD_IDX):
        super().__init__()
        self.embedding = nn.Embedding(n_items, dim, padding_idx=pad_idx)

        blocks = []
        dilations = [1,2,4,8]  # 每轮堆叠 1-2-4-8
        for _ in range(n_blocks):
            for d in dilations:
                blocks.append(DilatedResidualBlock(dim, dilation=d))
        self.net = nn.ModuleList(blocks)

    def forward(self, seq):
        """
        seq: (B, T) → 返回用户向量 (B, D)
        """
        x = self.embedding(seq)              # (B, T, D)
        for block in self.net:
            x = block(x)
        h = x[:, -1, :]                      # 最后一个时间步
        return h

    def score(self, h, items):
        """
        点积打分：<h, E[item]>
        items: (B,) or (B,k)
        """
        item_emb = self.embedding(items)
        return (h.unsqueeze(-2) * item_emb).sum(-1)




In [13]:
# ======== BPR 损失 ======== #
def bpr_loss(pos_s, neg_s):
    return -torch.log(torch.sigmoid(pos_s.unsqueeze(-1) - neg_s) + 1e-8).mean()



In [14]:
# ======== 训练函数 ======== #
def train_nextitnet_bpr(train_df, test_df=None):
    ds = NextItNetBPRDataset(train_df)
    loader = DataLoader(ds, batch_size=BATCH_SIZE, shuffle=True,
                        num_workers=0, pin_memory=True)

    model = NextItNet().to(DEVICE)
    optim = torch.optim.Adam(model.parameters(), lr=LR)

    for epoch in range(1, EPOCHS+1):
        model.train()
        total_loss, t0 = 0.0, time.time()
        for step, (hist, pos, neg) in enumerate(loader, 1):
            hist, pos, neg = hist.to(DEVICE), pos.to(DEVICE), neg.to(DEVICE)
            h = model(hist)
            pos_s = model.score(h, pos)
            neg_s = model.score(h, neg)
            loss  = bpr_loss(pos_s, neg_s)

            optim.zero_grad()
            loss.backward()
            optim.step()

            total_loss += loss.item() * hist.size(0)
            if step % 100 == 0 or step == 1:
                print(f"[Epoch {epoch}] step {step}/{len(loader)} | loss {loss.item():.4f}", flush=True)

        print(f"Epoch {epoch} finished | avg loss {total_loss/len(ds):.4f} | time {time.time()-t0:.1f}s", flush=True)

    return model


In [15]:
# ======== 构建 (user→历史张量) ======== #
def build_hist_tensors(df):
    cache = {}
    for uid, items in df.groupby('user_id')['item_id']:
        seq = items.tolist()[-MAX_SEQ_LEN:]
        seq = [PAD_IDX]*(MAX_SEQ_LEN-len(seq)) + seq
        cache[uid] = torch.tensor(seq, dtype=torch.long).unsqueeze(0)
    return cache

In [16]:

def evaluate_ranking(
        test_df,              # DataFrame, 必含 user_id / item_id
        train_df,             # DataFrame, 用来构建用户→已交互物品集合
        score_fn,             # callable(users_tensor, items_tensor) → np.array
        num_items,            # 物品总数
        k=10,                 # Hit@K / NDCG@K
        num_neg=100,          # 每个正样本采多少负样本
        user_col='user_id',
        item_col='item_id',
        seed=42
    ):
    """
    不依赖具体模型，只要提供 score_fn 就能评估。
    score_fn: 接收 (user_tensor, item_tensor) 并返回同长度的 Numpy 分数向量。
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # 用户历史，用于采负样本 & 过滤
    train_user_dict = (
        train_df.groupby(user_col)[item_col].apply(set).to_dict()
    )

    hits, ndcgs = [], []

    for _, row in test_df.iterrows():
        u = int(row[user_col])
        pos_item = int(row[item_col])

        # ---------- 负采样 ----------
        neg_items = set()
        while len(neg_items) < num_neg:
            neg = random.randint(0, num_items - 1)
            if neg not in train_user_dict.get(u, set()) and neg != pos_item:
                neg_items.add(neg)

        item_candidates = list(neg_items) + [pos_item]

        # ---------- 评分 ----------
        users_t  = torch.LongTensor([u] * len(item_candidates))
        items_t  = torch.LongTensor(item_candidates)
        scores   = score_fn(users_t, items_t)        # ← 只依赖 score_fn
        rank_idx = np.argsort(scores)[::-1]          # 降序
        ranked_items = [item_candidates[i] for i in rank_idx]

        # ---------- 指标 ----------
        if pos_item in ranked_items[:k]:
            hits.append(1)
            rank_pos = ranked_items.index(pos_item)
            ndcgs.append(1 / np.log2(rank_pos + 2))
        else:
            hits.append(0)
            ndcgs.append(0)

    hit_rate = float(np.mean(hits))
    ndcg     = float(np.mean(ndcgs))
    return hit_rate, ndcg

In [17]:
    # ------------------ 训练 ------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model  = train_nextitnet_bpr(train_df, test_df)

[Epoch 1] step 1/508 | loss 6.8156
[Epoch 1] step 100/508 | loss 4.1198
[Epoch 1] step 200/508 | loss 3.6768
[Epoch 1] step 300/508 | loss 3.3334
[Epoch 1] step 400/508 | loss 3.3411
[Epoch 1] step 500/508 | loss 2.8915
Epoch 1 finished | avg loss 3.6849 | time 219.5s
[Epoch 2] step 1/508 | loss 2.7976
[Epoch 2] step 100/508 | loss 2.5741
[Epoch 2] step 200/508 | loss 2.3416
[Epoch 2] step 300/508 | loss 2.2341
[Epoch 2] step 400/508 | loss 2.0840
[Epoch 2] step 500/508 | loss 1.9960
Epoch 2 finished | avg loss 2.3734 | time 215.2s
[Epoch 3] step 1/508 | loss 1.7842
[Epoch 3] step 100/508 | loss 1.6016
[Epoch 3] step 200/508 | loss 1.5918
[Epoch 3] step 300/508 | loss 1.4118
[Epoch 3] step 400/508 | loss 1.4312
[Epoch 3] step 500/508 | loss 1.4476
Epoch 3 finished | avg loss 1.5983 | time 205.0s
[Epoch 4] step 1/508 | loss 1.2418
[Epoch 4] step 100/508 | loss 1.3542
[Epoch 4] step 200/508 | loss 1.4371
[Epoch 4] step 300/508 | loss 1.2469
[Epoch 4] step 400/508 | loss 1.2142
[Epoch 4] 

In [18]:
def make_popularity_score_fn(train_df, item_col='item_id'):
    item_cnt = Counter(train_df[item_col])
    default_score = min(item_cnt.values()) - 1  # 给没出现过的物品一个更低分
    def _score_fn(users_t, items_t):
        return np.array([item_cnt.get(int(i), default_score) for i in items_t])
    return _score_fn

In [19]:
def random_score_fn(users_t, items_t):
    # 随机给每个 items_t 一个分数；users_t 不使用，但必须接收
    return np.random.rand(len(items_t))

In [20]:
# ======== make_score_fn (给 evaluate_ranking 用) ======== #
def make_score_fn(model, hist_cache):
    model.eval()
    item_emb_table = model.embedding.weight.data               # 含 PAD

    @torch.no_grad()
    def score_fn(user_t, item_t):
        hist_batch = torch.cat([hist_cache[int(u)] for u in user_t], dim=0).to(DEVICE)
        h = model(hist_batch)
        item_emb = item_emb_table[item_t.to(DEVICE)]
        scores = (h * item_emb).sum(-1)
        return scores.cpu().numpy()
    return score_fn

In [21]:

# ----------------NextItNet （或其他模型）------------
score_fn_nextItNet = make_score_fn(model,hist_cache=build_hist_tensors(train_df))
hit_nextItNet, ndcg_nextItNet = evaluate_ranking(
    test_df, train_df, score_fn_nextItNet,
    num_items=num_items, k=10
)
print(f"NextItNet   Hit@10={hit_nextItNet:.4f}  NDCG@10={ndcg_nextItNet:.4f}")

# ---------------- baseline：Popular ----------------
pop_score_fn  = make_popularity_score_fn(train_df)
hit_pop, ndcg_pop = evaluate_ranking(
    test_df, train_df, pop_score_fn,
    num_items=num_items, k=10
)
print(f"Popular  Hit@10={hit_pop:.4f}  NDCG@10={ndcg_pop:.4f}")

# ---------------- baseline：Random -----------------
hit_rand, ndcg_rand = evaluate_ranking(
    test_df, train_df, random_score_fn,
    num_items=num_items, k=10
)
print(f"Random   Hit@10={hit_rand:.4f}  NDCG@10={ndcg_rand:.4f}")



NextItNet   Hit@10=0.2551  NDCG@10=0.1269
Popular  Hit@10=0.2528  NDCG@10=0.1262
Random   Hit@10=0.0996  NDCG@10=0.0455
