In [1]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import ndcg_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 按用户分组计算NDCG
def compute_ndcg(group):
    true_ratings = group["true"].tolist()
    pred_ratings = group["pred"].tolist()
    return ndcg_score([true_ratings], [pred_ratings], k=50)

### 对实验数据进行预处理

In [3]:
import os

DATA_PATH = os.path.dirname("../data/")
PREPROCESSED_DATA_PATH = os.path.join(DATA_PATH, "lightGCN")
BOOK_DATA_PATH = os.path.join(PREPROCESSED_DATA_PATH, "book")
# MOVIE_DATA_PATH = os.path.join(PREPROCESSED_DATA_PATH, "movie")

In [4]:
# 加载原始书籍数据
loaded_data = pd.read_csv(os.path.join(DATA_PATH, "book_score.csv"))
loaded_data

Unnamed: 0,User,Book,Rate,Time,Tag
0,1398478,1467022,0,2011-03-29T12:48:35+08:00,
1,1398478,1777823,0,2011-02-02T21:58:55+08:00,
2,1398478,1902628,0,2011-01-31T15:57:58+08:00,
3,1398478,1878708,0,2011-01-26T11:27:59+08:00,
4,1398478,4238362,0,2011-01-21T13:04:15+08:00,
...,...,...,...,...,...
637249,4507957,1125186,4,2009-07-04T08:02:13+08:00,"张爱玲,半生缘,爱情"
637250,4507957,1002299,5,2009-07-04T08:01:28+08:00,"金庸,武侠,笑傲江湖"
637251,4507957,1001136,4,2009-07-04T07:55:17+08:00,"彼得・潘,童话"
637252,4507957,1021615,5,2009-07-04T07:53:54+08:00,"小王子,童话,经典"


In [5]:
class BookRatingDataset(Dataset):
    def __init__(self, data, user_to_idx, book_to_idx):
        self.data = data
        self.user_to_idx = user_to_idx
        self.book_to_idx = book_to_idx

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        user = self.user_to_idx[row["User"]]
        book = self.book_to_idx[row["Book"]]
        rating = row["Rate"].astype("float32")
        return user, book, rating
    
def create_id_mapping(id_list):
    # 从ID列表中删除重复项并创建一个排序的列表
    unique_ids = sorted(set(id_list))

    # 创建将原始ID映射到连续索引的字典
    id_to_idx = {id: idx for idx, id in enumerate(unique_ids, start=1)}

    return id_to_idx

In [6]:
user_ids = loaded_data["User"].unique()
book_ids = loaded_data["Book"].unique()

user_to_idx = create_id_mapping(user_ids)
book_to_idx = create_id_mapping(book_ids)

In [7]:
loaded_data["user_map"] = loaded_data["User"].map(user_to_idx)
loaded_data["book_map"] = loaded_data["Book"].map(book_to_idx)

In [8]:
# 划分数据集
train_data, test_data = train_test_split(loaded_data, test_size=0.5, random_state=42)

# 过滤掉训练集中 Rate < 4 的数据
train_data = train_data[train_data["Rate"] >= 4]

test_dataloader = DataLoader(
    BookRatingDataset(test_data, user_to_idx, book_to_idx),
    batch_size=4096,
    shuffle=False,
    drop_last=True
)

# 将 train_data, test_data 转换为 user -> item 的字典
train_data = train_data.groupby("user_map")["book_map"].apply(list).to_dict()
test_data = test_data.groupby("user_map")["book_map"].apply(list).to_dict()

# 写入 train.txt, test.txt
# 每行格式为 user_id item_id_1 item_id_2 ...
with open(os.path.join(BOOK_DATA_PATH, "train.txt"), "w") as f:
    for user, items in train_data.items():
        f.write(f"{user} {' '.join(map(str, items))}\n")

with open(os.path.join(BOOK_DATA_PATH, "test.txt"), "w") as f:
    for user, items in test_data.items():
        f.write(f"{user} {' '.join(map(str, items))}\n")

### 加载 LightGCN 的 Data Loader

In [9]:
from lightGCN_model import LightGCN, Loader, BPRLoss

In [10]:
CONFIG = {
    "bpr_batch_size": 2048,
    "latent_dim_rec": 64,
    "lightGCN_n_layers": 3,
    "dropout": 0,
    "keep_prob": 0.6,
    "A_n_fold": 100,
    "test_u_batch_size": 100,
    "multicore": 0,
    "lr": 0.001,
    "decay": 1e-4,
    "pretrain": 0,
    "A_split": False,
    "bigdata": False,
}

data_loader = Loader(CONFIG, BOOK_DATA_PATH, device=device)

[0;30;43mloading [../data/lightGCN/book][0m
148151 interactions for training
318627 interactions for testing
Dataset Sparsity : 0.08793162560611256
Dataset is ready to go


### 训练 LightGCN 模型

In [19]:
model = LightGCN(CONFIG, data_loader).to(device)

loading adjacency matrix
lgn is already to go(dropout:0)


In [12]:
# utils for training
def uniform_sample(dataset: Loader):
    user_num = dataset.trainDataSize
    users = np.random.randint(0, dataset.n_users, user_num)
    all_pos = dataset.allPos
    S = []
    for _, user in enumerate(users):
        pos_for_user = all_pos[user]
        if len(pos_for_user) == 0:
            continue
        pos_index = np.random.randint(0, len(pos_for_user))
        pos_item = pos_for_user[pos_index]
        while True:
            neg_item = np.random.randint(0, dataset.m_items)
            if neg_item not in pos_for_user:
                break
        S.append([user, pos_item, neg_item])
    return np.asarray(S)


def shuffle(*arrays, **kwargs):
    require_indices = kwargs.get("require_indices", False)

    if len(set(len(x) for x in arrays)) != 1:
        raise ValueError("All inputs to shuffle must have the same length.")

    shuffle_indices = np.random.permutation(len(arrays[0]))
    np.random.shuffle(shuffle_indices)

    if len(arrays) == 1:
        result = arrays[0][shuffle_indices]
    else:
        result = tuple(x[shuffle_indices] for x in arrays)

    if require_indices:
        return result, shuffle_indices
    else:
        return result


def minibatch(*tensors, **kwargs):
    batch_size = kwargs.get("batch_size", CONFIG["bpr_batch_size"])

    if len(tensors) == 1:
        tensor = tensors[0]
        for i in range(0, len(tensor), batch_size):
            yield tensor[i : i + batch_size]
    else:
        for i in range(0, len(tensors[0]), batch_size):
            yield tuple(x[i : i + batch_size] for x in tensors)

In [20]:
NUM_EPOCHS = 40

for epoch in tqdm(range(NUM_EPOCHS)):
    bpr = BPRLoss(model, CONFIG)
    S = uniform_sample(data_loader)

    users = torch.Tensor(S[:, 0]).long().to(device)
    pos_items = torch.Tensor(S[:, 1]).long().to(device)
    neg_items = torch.Tensor(S[:, 2]).long().to(device)

    users, pos_items, neg_items = shuffle(users, pos_items, neg_items)

    total_batch = len(users) // CONFIG["bpr_batch_size"] + 1

    avg_loss = 0.0

    for _, (batch_users, batch_pos, batch_neg) in enumerate(
        minibatch(users, pos_items, neg_items)
    ):
        cri = bpr.stageOne(batch_users, batch_pos, batch_neg)
        avg_loss += cri
    avg_loss /= total_batch

    model.eval()

    if (epoch + 1) % 5 == 0:
        print(f"Epoch #{epoch + 1}, Average Loss: {avg_loss}")
        # results = []
        # with torch.no_grad():
        #     for user_ids, item_ids, true_ratings in test_dataloader:
        #         user_ids = user_ids.to(device)
        #         item_ids = item_ids.to(device)
        #         true_ratings = true_ratings.to(device)

        #         ratings = model.getUsersRating(user_ids)
        #         pred_ratings = ratings[torch.arange(len(ratings)), item_ids]

        #         user_ids_np = user_ids.long().cpu().numpy().reshape(-1, 1)
        #         pred_ratings_np = pred_ratings.cpu().numpy().reshape(-1, 1)
        #         true_ratings_np = true_ratings.cpu().numpy().reshape(-1, 1)

        #         results.append(
        #             np.column_stack((user_ids_np, pred_ratings_np, true_ratings_np))
        #         )
                
        #     results_df = pd.DataFrame(
        #         np.vstack(results), columns=["user", "pred", "true"]
        #     )
        #     results_df["user"] = results_df["user"].astype(int)

        #     ndcg_scores = results_df.groupby("user").apply(compute_ndcg)
        #     avg_ndcg = ndcg_scores.mean()

        #     print(f"Epoch #{epoch}, Average Loss: {avg_loss}, Average NDCG: {avg_ndcg}")

 12%|█▎        | 5/40 [00:25<02:49,  4.86s/it]

Epoch #5, Average Loss: 0.593530935900552


 25%|██▌       | 10/40 [00:46<02:10,  4.35s/it]

Epoch #10, Average Loss: 0.5475959283964974


 38%|███▊      | 15/40 [01:08<01:48,  4.36s/it]

Epoch #15, Average Loss: 0.497956639954022


 50%|█████     | 20/40 [01:31<01:33,  4.68s/it]

Epoch #20, Average Loss: 0.45337486777986796


 62%|██████▎   | 25/40 [01:53<01:07,  4.48s/it]

Epoch #25, Average Loss: 0.4211744810853686


 75%|███████▌  | 30/40 [02:16<00:46,  4.61s/it]

Epoch #30, Average Loss: 0.3941047808953694


 88%|████████▊ | 35/40 [02:42<00:25,  5.03s/it]

Epoch #35, Average Loss: 0.37099391094275885


100%|██████████| 40/40 [03:05<00:00,  4.64s/it]

Epoch #40, Average Loss: 0.3550679794379643





In [21]:
torch.save(model, f"../model/lightgcn_{NUM_EPOCHS}.pt")

### 预测

In [15]:
epochs = 40
model = torch.load(f"../model/lightgcn_{epochs}.pt")

In [22]:
test_results = []

with torch.no_grad():
    for user_ids, item_ids, true_ratings in tqdm(
        test_dataloader, total=len(test_dataloader)
    ):
        user_ids = user_ids.to(device)
        item_ids = item_ids.to(device)
        true_ratings = true_ratings.to(device)

        ratings = model.getUsersRating(user_ids)
        pred_ratings = ratings[torch.arange(len(ratings)), item_ids]

        user_ids_np = user_ids.long().cpu().numpy().reshape(-1, 1)
        pred_ratings_np = pred_ratings.cpu().numpy().reshape(-1, 1)
        true_ratings_np = true_ratings.cpu().numpy().reshape(-1, 1)

        test_results.append(
            np.column_stack((user_ids_np, pred_ratings_np, true_ratings_np))
        )

test_results_df = pd.DataFrame(
    np.vstack(test_results), columns=["user", "pred", "true"]
)
test_results_df["user"] = test_results_df["user"].astype(int)

ndcg_scores = test_results_df.groupby("user").apply(compute_ndcg)
avg_ndcg = ndcg_scores.mean()

print(f"Average NDCG: {avg_ndcg}")

  0%|          | 0/77 [00:00<?, ?it/s]

100%|██████████| 77/77 [00:24<00:00,  3.16it/s]


Average NDCG: 0.7216618278765373


: 

In [17]:
test_results_df.to_csv("../output/lightgcn_results.csv", index=False)
test_results_df

Unnamed: 0,user,pred,true
0,3326,0.888533,4.0
1,1393,0.934420,4.0
2,1522,0.652645,4.0
3,2534,0.761625,0.0
4,3671,0.818293,0.0
...,...,...,...
315387,229,0.989655,4.0
315388,4304,0.871210,0.0
315389,2268,0.870233,0.0
315390,2880,0.980854,4.0


In [18]:
# 读取预测结果并计算平均 NDCG
loaded_results = pd.read_csv("../output/lightgcn_results.csv")
ndcg_scores = loaded_results.groupby("user").apply(compute_ndcg)
avg_ndcg = ndcg_scores.mean()

print(f"Average NDCG: {avg_ndcg}")

Average NDCG: 0.7339513305770557
