### 导包

In [39]:
import paddle
import paddle.nn as nn
import numpy as np
import pandas as pd
import random
from tqdm import tqdm
from paddle.io import Dataset, DataLoader
from sklearn.model_selection import KFold,GroupKFold

### 模型

In [40]:

class PMLAM(nn.Layer):
    def __init__(self, num_users, num_items, embed_dim):
        super(PMLAM, self).__init__()
        self.embed_dim = embed_dim

        # 用户和物品的高斯分布参数
        self.user_mu = nn.Embedding(num_users, embed_dim)
        self.user_sigma = nn.Embedding(num_users, embed_dim)
        self.item_mu = nn.Embedding(num_items, embed_dim)
        self.item_sigma = nn.Embedding(num_items, embed_dim)

        # 自适应边距生成网络
        self.margin_net = nn.Sequential(
            nn.Linear(3 * embed_dim, 64),  # 输入：用户-正样本-负样本的差异
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Softplus()  # 确保边距 > 0
        )

        # 初始化参数
        self._init_weights()

    def _init_weights(self):
        # 高斯分布参数初始化
        nn.initializer.Normal(std=0.01)(self.user_mu.weight)
        nn.initializer.Normal(std=0.01)(self.user_sigma.weight)
        nn.initializer.Normal(std=0.01)(self.item_mu.weight)
        nn.initializer.Normal(std=0.01)(self.item_sigma.weight)

    def forward(self, users, pos_items, neg_items):
        # 获取高斯分布参数
        u_mu = self.user_mu(users)
        u_sigma = paddle.exp(self.user_sigma(users))  # 保证方差为正
        pos_mu = self.item_mu(pos_items)
        pos_sigma = paddle.exp(self.item_sigma(pos_items))
        neg_mu = self.item_mu(neg_items)
        neg_sigma = paddle.exp(self.item_sigma(neg_items))

        # 计算Wasserstein距离
        def wasserstein_dist(mu1, sigma1, mu2, sigma2):
            return paddle.sum((mu1 - mu2)**2, axis=1) + paddle.sum((paddle.sqrt(sigma1) - paddle.sqrt(sigma2))**2, axis=1)

        pos_dist = wasserstein_dist(u_mu, u_sigma, pos_mu, pos_sigma)
        neg_dist = wasserstein_dist(u_mu, u_sigma, neg_mu, neg_sigma)

        # 自适应边距生成
        s_ij = (u_mu - pos_mu)**2  # 用户-正样本差异
        s_ik = (u_mu - neg_mu)**2  # 用户-负样本差异
        s_input = paddle.concat([s_ij, s_ik, s_ij*s_ik], axis=1)  # Eq. 11
        margin = self.margin_net(s_input)

        return pos_dist, neg_dist, margin

    def predict(self, users):
        # 为指定用户生成所有物品的Wasserstein距离
        u_mu = self.user_mu(users)
        u_sigma = paddle.exp(self.user_sigma(users))
        all_items_mu = self.item_mu.weight
        all_items_sigma = paddle.exp(self.item_sigma.weight)

        # 计算用户与所有物品的距离
        dist = paddle.sum((u_mu.unsqueeze(1) - all_items_mu)**2, axis=2) + \
            paddle.sum((paddle.sqrt(u_sigma).unsqueeze(1) -
                       paddle.sqrt(all_items_sigma))**2, axis=2)
        return dist

### 功能函数(类)

In [41]:
class RecDataset(Dataset):
    def __init__(self, user_item_pairs, num_users, num_items, user_pos_items):
        self.user_neg_items = {} 
        self.user_item_pairs = user_item_pairs
        self.num_users = num_users
        self.num_items = num_items
        # 记录每个用户的正样本
        self.user_pos_items = user_pos_items  
        all_items = set(range(num_items))
        for user, pos_list in self.user_pos_items.items():
            pos_set = set(pos_list)
            self.user_neg_items[user] = list(all_items - pos_set)

    def __len__(self):
        return len(self.user_item_pairs)

    def __getitem__(self, idx):
        user, pos_item = self.user_item_pairs[idx]
        # 随机负样本采样
        neg_item = random.choice(self.user_neg_items[user])
        return user, pos_item, neg_item



In [42]:
def load_and_split_data(csv_path, min_rating=4.0, n_splits=5):
    df = pd.read_csv(csv_path, names=["item","user",  "rate", "timestamp"])
    df["rate"]=pd.to_numeric(df["rate"],errors='coerce')
    df = df[df["rate"] >= min_rating]
    
    
    # 编码用户和物品ID
    user_ids = df["user"].astype("category").cat.codes.values
    item_ids = df["item"].astype("category").cat.codes.values
    num_users, num_items = len(df["user"].unique()), len(df["item"].unique())
    
    # 记录每个用户的所有正样本
    user_pos_items = {}
    for u, i in zip(user_ids, item_ids):
        if u not in user_pos_items:
            user_pos_items[u] = []
        user_pos_items[u].append(i)
    
    # 5折分割：按用户分组，确保同一用户的所有交互在同一fold中
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    folds = []
    for train_idx, test_idx in kf.split(df):
        train_pairs = list(zip(user_ids[train_idx], item_ids[train_idx]))
        test_pairs = list(zip(user_ids[test_idx], item_ids[test_idx]))
        folds.append((train_pairs, test_pairs))
    
    return folds, num_users, num_items, user_pos_items

In [43]:
def evaluate(model, test_pairs, num_users, num_items, user_pos_items, k=10):
    model.eval()
    recalls = []

    for user in range(num_users):
        # 获取用户所有正样本（包括训练集和测试集）
        all_pos_items = user_pos_items.get(user, [])

        # 获取该用户的测试集正样本
        test_pos_items = [i for (u, i) in test_pairs if u == user]
        if not test_pos_items:
            continue

        # 训练集中该用户的正样本（即从 all_pos_items 中去掉 test ）
        train_pos_items = list(set(all_pos_items) - set(test_pos_items))

        # 模型预测
        dist = model.predict(paddle.to_tensor([user]))  # [1, num_items]
        dist = dist.squeeze()  # [num_items]

        # 创建mask
        mask = paddle.ones([num_items], dtype='float32')
        if train_pos_items:
            train_pos_tensor = paddle.to_tensor(train_pos_items, dtype='int64')
            zero_tensor = paddle.zeros_like(train_pos_tensor, dtype='float32')
            mask = paddle.scatter(mask, train_pos_tensor, zero_tensor)

        # 给训练集正样本加大值
        dist = dist + mask * 1e9

        # 取 top-k 最小的 item（即推荐的）
        _, topk = paddle.topk(dist, k=k, largest=False)

        # 计算 recall
        hit = len(set(topk.numpy()) & set(test_pos_items))
        recalls.append(hit / len(test_pos_items))

    return np.mean(recalls) if recalls else 0.0

In [None]:
def train_and_evaluate(folds, num_users, num_items, user_pos_items, embed_dim=50, epochs=10):
    fold_metrics = []
    for fold_id, (train_pairs, test_pairs) in enumerate(folds):
        print(f"\n=== Fold {fold_id + 1} ===")

        # 初始化模型
        model = PMLAM(num_users, num_items, embed_dim)

        # 分组参数
        margin_params = [p for name,
                         p in model.named_parameters() if "margin_net" in name]
        other_params = [p for name, p in model.named_parameters()
                        if "margin_net" not in name]

        # 初始化优化器
        inner_optim = paddle.optimizer.Adam(
            parameters=other_params, learning_rate=0.001)
        outer_optim = paddle.optimizer.Adam(
            parameters=margin_params, learning_rate=0.001)

        train_dataset = RecDataset(
            train_pairs, num_users, num_items, user_pos_items)
        test_dataset = RecDataset(
            test_pairs, num_users, num_items, user_pos_items)
        train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

        for epoch in range(epochs):
            model.train()
            total_loss = 0
            for batch in train_loader:
                users, pos_items, neg_items = batch

                # 内层优化
                inner_optim.clear_grad()
                users = paddle.cast(users, dtype='int64')
                pos_items = paddle.cast(pos_items, dtype='int64')
                neg_items = paddle.cast(neg_items, dtype='int64')
                pos_dist, neg_dist, margin = model(users, pos_items, neg_items)
                loss = paddle.mean(paddle.nn.functional.relu(
                    pos_dist - neg_dist + margin))
                loss.backward()
                inner_optim.step()

                # 外层优化
                outer_optim.clear_grad()
                with paddle.no_grad():
                    pos_dist_fixed, neg_dist_fixed, _ = model(
                        users, pos_items, neg_items)
                    loss_outer = paddle.mean(paddle.nn.functional.relu(
                        pos_dist_fixed - neg_dist_fixed + 1))
                loss_outer.backward()
                outer_optim.step()

                total_loss += loss.item()
            print(
                f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader):.4f}")
        # 验证
        recall = evaluate(model, test_pairs, num_users,
                          num_items, user_pos_items)
        fold_metrics.append(recall)
        print(f"Fold {fold_id + 1} Recall@10: {recall:.4f}")

    print(
        f"\nMean Recall@10: {np.mean(fold_metrics):.4f} (±{np.std(fold_metrics):.4f})")

### 代码执行

In [None]:
csv_path='./dataset/CDs_and_Vinyl.csv'
folds, num_users, num_items, user_items = load_and_split_data(csv_path)
# 5折训练与验证
train_and_evaluate(folds, num_users, num_items, user_items, embed_dim=50, epochs=40)


=== Fold 1 ===
Epoch 1, Loss: 0.6489
Epoch 2, Loss: 0.4037
Epoch 3, Loss: 0.2645
Epoch 4, Loss: 0.2069
Epoch 5, Loss: 0.1715
Epoch 6, Loss: 0.1376
Epoch 7, Loss: 0.1100
Epoch 8, Loss: 0.0914
Epoch 9, Loss: 0.0766
Epoch 10, Loss: 0.0657
Epoch 11, Loss: 0.0574
Epoch 12, Loss: 0.0501
Epoch 13, Loss: 0.0424
Epoch 14, Loss: 0.0376
Epoch 15, Loss: 0.0360
Epoch 16, Loss: 0.0317
Epoch 17, Loss: 0.0283
Epoch 18, Loss: 0.0262
Epoch 19, Loss: 0.0207
Epoch 20, Loss: 0.0194
Epoch 21, Loss: 0.0199
Epoch 22, Loss: 0.0176
Epoch 23, Loss: 0.0148
Epoch 24, Loss: 0.0143
Epoch 25, Loss: 0.0135
Epoch 26, Loss: 0.0118
Epoch 27, Loss: 0.0103
Epoch 28, Loss: 0.0104
Epoch 29, Loss: 0.0103
Epoch 30, Loss: 0.0094
Epoch 31, Loss: 0.0081
Epoch 32, Loss: 0.0072
Epoch 33, Loss: 0.0073
Epoch 34, Loss: 0.0063
Epoch 35, Loss: 0.0054
Epoch 36, Loss: 0.0060
Epoch 37, Loss: 0.0057
Epoch 38, Loss: 0.0058
Epoch 39, Loss: 0.0043
Epoch 40, Loss: 0.0050
Fold 1 Recall@10: 0.0699

=== Fold 2 ===
Epoch 1, Loss: 0.6513
Epoch 2, L

In [None]:
csv_path='./dataset/Books.csv'
folds, num_users, num_items, user_items = load_and_split_data(csv_path)
# 5折训练与验证
train_and_evaluate(folds, num_users, num_items, user_items, embed_dim=50, epochs=40)


=== Fold 1 ===
Epoch 1, Loss: 0.5114
Epoch 2, Loss: 0.2355
Epoch 3, Loss: 0.1813
Epoch 4, Loss: 0.1565
Epoch 5, Loss: 0.1334
Epoch 6, Loss: 0.1112
Epoch 7, Loss: 0.0929
Epoch 8, Loss: 0.0744
Epoch 9, Loss: 0.0657
Epoch 10, Loss: 0.0547
Epoch 11, Loss: 0.0499
Epoch 12, Loss: 0.0408
Epoch 13, Loss: 0.0351
Epoch 14, Loss: 0.0310
Epoch 15, Loss: 0.0269
Epoch 16, Loss: 0.0231
Epoch 17, Loss: 0.0190
Epoch 18, Loss: 0.0161
Epoch 19, Loss: 0.0155
Epoch 20, Loss: 0.0133
Epoch 21, Loss: 0.0116
Epoch 22, Loss: 0.0091
Epoch 23, Loss: 0.0080
Epoch 24, Loss: 0.0080
Epoch 25, Loss: 0.0073
Epoch 26, Loss: 0.0067
Epoch 27, Loss: 0.0060
Epoch 28, Loss: 0.0058
Epoch 29, Loss: 0.0051
Epoch 30, Loss: 0.0045
Epoch 31, Loss: 0.0040
Epoch 32, Loss: 0.0038
Epoch 33, Loss: 0.0034
Epoch 34, Loss: 0.0029
Epoch 35, Loss: 0.0026
Epoch 36, Loss: 0.0029
Epoch 37, Loss: 0.0024
Epoch 38, Loss: 0.0025
Epoch 39, Loss: 0.0022
Epoch 40, Loss: 0.0020
Fold 1 Recall@10: 0.0842

=== Fold 2 ===
Epoch 1, Loss: 0.5143
Epoch 2, L