### Process data from csv

In [1]:
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
import os
import sys

# run this block once only
project_root = os.path.abspath("../../")  # 根据文件层级调整路径
os.chdir(project_root)

if project_root not in sys.path:
    sys.path.insert(0, project_root)

In [3]:
import pandas as pd
import numpy as np

# 读取数据
anime_data = pd.read_csv('./data/anime_info/anime_data.csv')
user_ratings = pd.read_csv('./data/user_animelist/anime_info.csv')

# 清理 anime_data 数据
anime_data["popularity"] = anime_data["popularity"].str.replace("#", "").fillna(0).astype(int)
anime_data["members"] = anime_data["members"].str.replace(",", "").fillna(0).astype(float).astype(int)
anime_data["favorites"] = anime_data["favorites"].str.replace(",", "").fillna(0).astype(float).astype(int)

# 为番剧分配唯一 ID
anime_data["anime_id"] = anime_data.index

# 处理 genres 列，将每个类别分配一个整数 ID
all_genres = set(g for genre_list in anime_data["genres"].dropna() for g in genre_list.split(", "))
genre_to_id = {genre: idx for idx, genre in enumerate(all_genres)}
anime_data["genre_ids"] = anime_data["genres"].apply(
    lambda x: [genre_to_id[g] for g in x.split(", ")] if pd.notna(x) else []
)

# 填充缺失值
anime_data = anime_data.fillna({
    "popularity": 0,
    "members": 0,
    "favorites": 0,
    "genres": ""
})

# 清理 user_ratings 数据
user_ratings = user_ratings[user_ratings["rating"] != "-"]
user_ratings["rating"] = user_ratings["rating"].astype(float)

# 为用户分配唯一 ID
user_to_id = {user: idx for idx, user in enumerate(user_ratings["username"].unique())}
user_ratings["user_id"] = user_ratings["username"].map(user_to_id)

# 为番剧名映射对应的 anime_id
anime_name_to_id = dict(zip(anime_data["title"], anime_data["anime_id"]))
user_ratings["anime_id"] = user_ratings["anime"].map(anime_name_to_id)

# 移除无效的番剧映射
user_ratings = user_ratings.dropna(subset=["anime_id"])
user_ratings["anime_id"] = user_ratings["anime_id"].astype(int)

# 准备模型输入
model_input = user_ratings.merge(anime_data, on="anime_id", how="left")

# 检查并移除缺失值
model_input = model_input.dropna()

# 确保 `genre_ids` 无空值
model_input["genre_ids"] = model_input["genre_ids"].apply(lambda x: x if isinstance(x, list) else [])

# 提取用户特征和番剧特征
X = {
    "user_id": model_input["user_id"].values,
    "anime_id": model_input["anime_id"].values,
    "anime_meta": model_input[["score", "members", "favorites"]].values,
    "genre_id": model_input["genre_ids"].values
}

# 提取目标变量 (评分)
y = model_input["rating"].values

In [4]:
X

{'user_id': array([  1,   1,   1, ..., 559, 559, 559]),
 'anime_id': array([2881, 3445, 4462, ...,  295, 3316, 2478]),
 'anime_meta': array([[7.28000e+00, 3.91550e+04, 4.70000e+01],
        [7.18000e+00, 1.30833e+05, 2.37000e+02],
        [6.97000e+00, 1.20138e+05, 1.56000e+02],
        ...,
        [8.28000e+00, 3.02886e+05, 2.70500e+03],
        [7.20000e+00, 1.19583e+05, 5.73000e+02],
        [7.36000e+00, 6.52970e+04, 3.19000e+02]]),
 'genre_id': array([list([16, 2, 49, 46, 22, 39, 75, 64]), list([16, 45, 35, 49, 64]),
        list([16, 45, 49, 64]), ..., list([21, 2, 35, 46]),
        list([16, 35, 56, 58, 47]), list([16, 36, 56, 38, 47])],
       dtype=object)}

In [5]:
y

array([7., 9., 3., ..., 8., 7., 7.])

In [6]:
len(y)

15420

In [7]:
from itertools import chain

# 计算用户总数
num_users = len(set(X["user_id"]))

# 计算动漫总数
num_animes = len(set(X["anime_id"]))

# 计算类型总数
num_genres = len(set(chain.from_iterable(X["genre_id"])))

# 嵌入维度（超参数，可以调整）
embed_dim = 32  # 或者其他合适的值，如 64

print(f"num_users = {num_users}")
print(f"num_animes = {num_animes}")
print(f"num_genres = {num_genres}")
print(f"embed_dim = {embed_dim}")

num_users = 528
num_animes = 973
num_genres = 72
embed_dim = 32


### Data Loader

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Assume X and y are already defined
unique_user_ids = np.unique(X["user_id"])
unique_anime_ids = np.unique(X["anime_id"])

user_id_to_idx = {user_id: idx for idx, user_id in enumerate(unique_user_ids)}
anime_id_to_idx = {anime_id: idx for idx, anime_id in enumerate(unique_anime_ids)}

X["user_id"] = np.array([user_id_to_idx[uid] for uid in X["user_id"]])
X["anime_id"] = np.array([anime_id_to_idx[aid] for aid in X["anime_id"]])

num_users = len(unique_user_ids)
num_anime = len(unique_anime_ids)
max_genre_id = max([max(genre) for genre in X["genre_id"]])
num_genres = max_genre_id + 1

class AnimeDataset(Dataset):
    def __init__(self, X, y):
        self.user_ids = torch.tensor(X["user_id"], dtype=torch.long)
        self.anime_ids = torch.tensor(X["anime_id"], dtype=torch.long)
        self.anime_meta = torch.tensor(X["anime_meta"], dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
        self.genre_ids = [torch.tensor(genre, dtype=torch.long) for genre in X["genre_id"]]
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return (
            self.user_ids[idx],
            self.anime_ids[idx],
            self.anime_meta[idx],
            self.genre_ids[idx],
            self.y[idx]
        )

def collate_fn(batch):
    user_ids = torch.stack([item[0] for item in batch])
    anime_ids = torch.stack([item[1] for item in batch])
    anime_meta = torch.stack([item[2] for item in batch])
    ratings = torch.stack([item[4] for item in batch])
    
    genre_ids = [item[3] for item in batch]
    max_len = max([g.shape[0] for g in genre_ids])
    padded_genre_ids = torch.full((len(batch), max_len), fill_value=-1, dtype=torch.long)
    for i, g in enumerate(genre_ids):
        padded_genre_ids[i, :g.shape[0]] = g
    
    return user_ids, anime_ids, anime_meta, padded_genre_ids, ratings

dataset = AnimeDataset(X, y)


In [9]:
from torch.utils.data import random_split

# 数据集划分比例
train_ratio = 0.8
test_ratio = 0.2

# 数据集总大小
dataset_size = len(dataset)

# 计算训练集和测试集大小
train_size = int(train_ratio * dataset_size)
test_size = dataset_size - train_size

# 随机划分数据集
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# 创建训练集和测试集的 DataLoader
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Train dataset size: 12336
Test dataset size: 3084


### model

In [30]:
import torch
import torch.nn as nn

class AnimeRecommendationModel(nn.Module):
    def __init__(self, num_users, num_animes, num_genres, embed_dim=64):
        super(AnimeRecommendationModel, self).__init__()
        
        # Embedding layers for user and anime
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=embed_dim)
        self.anime_embedding = nn.Embedding(num_embeddings=num_animes, embedding_dim=embed_dim)
        
        # Linear layer for anime meta features
        self.anime_meta_fc = nn.Linear(3, 16)  # 处理动漫元数据
        
        # Fully connected layers
        self.fc1 = nn.Linear(embed_dim * 2 + 16 + num_genres, 128)  # 第一层全连接
        self.fc2 = nn.Linear(128, 64)  # 第二层全连接
        self.output = nn.Linear(64, 1)  # 输出评分

    def forward(self, user_id, anime_id, anime_meta, genre_ids):
        # User embedding
        user_embedded = self.user_embedding(user_id)
        
        # Anime embedding
        anime_embedded = self.anime_embedding(anime_id)
        
        # Anime meta features
        anime_meta_processed = F.relu(self.anime_meta_fc(anime_meta))
        
        # Genre processing (one-hot encoding and pooling)
        mask = (genre_ids != -1)
        genre_ids = genre_ids * mask
        genre_embedded = F.one_hot(genre_ids, num_classes=num_genres).float()
        genre_embedded = genre_embedded * mask.unsqueeze(-1)
        genre_embedded = torch.mean(genre_embedded, dim=1)
        
        # Concatenate all features
        x = torch.cat([user_embedded, anime_embedded, anime_meta_processed, genre_embedded], dim=1)
        
        # Fully connected layers
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.output(x)
        return x.squeeze(1)

In [31]:
model = AnimeRecommendationModel(num_users=num_users, num_animes=num_animes, num_genres=num_genres).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [32]:
print(model)

AnimeRecommendationModel(
  (user_embedding): Embedding(528, 64)
  (anime_embedding): Embedding(973, 64)
  (anime_meta_fc): Linear(in_features=3, out_features=16, bias=True)
  (fc1): Linear(in_features=220, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (output): Linear(in_features=64, out_features=1, bias=True)
)


In [33]:
num_epochs = 100

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0

    for batch in train_loader:
        user_id, anime_id, anime_meta, genre_ids, ratings = batch
        user_id = user_id.to(device)
        anime_id = anime_id.to(device)
        anime_meta = anime_meta.to(device)
        genre_ids = genre_ids.to(device)
        ratings = ratings.to(device)
        
        outputs = model(user_id, anime_id, anime_meta, genre_ids)
        loss = criterion(outputs.squeeze(), ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {train_loss / len(train_loader):.4f}")

Epoch [1/100], Loss: 393068.2491
Epoch [2/100], Loss: 68.5266
Epoch [3/100], Loss: 2455.5193
Epoch [4/100], Loss: 124517.8724
Epoch [5/100], Loss: 4192.9556
Epoch [6/100], Loss: 90.2329
Epoch [7/100], Loss: 64.2079
Epoch [8/100], Loss: 5088.7036
Epoch [9/100], Loss: 2969.5515
Epoch [10/100], Loss: 38227.3479
Epoch [11/100], Loss: 858.1166
Epoch [12/100], Loss: 68.3244
Epoch [13/100], Loss: 7104.4008
Epoch [14/100], Loss: 4357.4459
Epoch [15/100], Loss: 7031.1282
Epoch [16/100], Loss: 120367.4816
Epoch [17/100], Loss: 24.6030
Epoch [18/100], Loss: 24.1746
Epoch [19/100], Loss: 22.8505
Epoch [20/100], Loss: 22.8234
Epoch [21/100], Loss: 22.7108
Epoch [22/100], Loss: 21.8290
Epoch [23/100], Loss: 20.2231
Epoch [24/100], Loss: 20.3541
Epoch [25/100], Loss: 17.8030
Epoch [26/100], Loss: 16.2849
Epoch [27/100], Loss: 18.1154
Epoch [28/100], Loss: 44.0035
Epoch [29/100], Loss: 21.5900
Epoch [30/100], Loss: 29.7406
Epoch [31/100], Loss: 27.2587
Epoch [32/100], Loss: 27.6242
Epoch [33/100], Los

In [34]:
def evaluate_model(model, test_loader, criterion):
    model.eval()  # 设置模型为评估模式
    test_loss = 0.0
    all_predictions = []
    all_targets = []

    with torch.no_grad():  # 禁用梯度计算
        for batch in test_loader:
            user_id, anime_id, anime_meta, genre_ids, ratings = batch
            user_id = user_id.to(device)
            anime_id = anime_id.to(device)
            anime_meta = anime_meta.to(device)
            genre_ids = genre_ids.to(device)
            ratings = ratings.to(device)
            
            # 模型预测
            outputs = model(user_id, anime_id, anime_meta, genre_ids)
            loss = criterion(outputs.squeeze(), ratings)
            test_loss += loss.item()
            
            # 保存预测值和真实值，用于后续计算指标
            all_predictions.extend(outputs.squeeze().cpu().numpy())
            all_targets.extend(ratings.cpu().numpy())
    
    # 计算平均损失
    avg_loss = test_loss / len(test_loader)
    
    # 转换为 NumPy 数组
    all_predictions = np.array(all_predictions)
    all_targets = np.array(all_targets)
    
    # 计算其他评估指标
    mae = np.mean(np.abs(all_predictions - all_targets))  # 平均绝对误差
    mse = np.mean((all_predictions - all_targets)**2)     # 均方误差
    r2 = 1 - (np.sum((all_targets - all_predictions)**2) / np.sum((all_targets - np.mean(all_targets))**2))  # R²

    return avg_loss, mae, mse, r2

In [36]:
# 训练完成后，评估模型
test_loss, mae, mse, r2 = evaluate_model(model, test_loader, criterion)

print(f"Test Loss (MSE): {test_loss:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R² Score: {r2:.4f}")

Test Loss (MSE): 15.6292
Mean Absolute Error (MAE): 3.5041
Mean Squared Error (MSE): 15.6238
R² Score: -5.8245
