### Process data from csv

In [1]:
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np

In [2]:
import os
import sys

# run this block once only
project_root = os.path.abspath("../../")  # 根据文件层级调整路径
os.chdir(project_root)

if project_root not in sys.path:
    sys.path.insert(0, project_root)

### Loading data

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# 检查是否可以使用 CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 1. 加载数据
anime_data = pd.read_csv('./data/anime_info/anime_data.csv')
user_ratings = pd.read_csv('./data/user_animelist/anime_info.csv')

# 清理数据
anime_data['score'] = anime_data['score'].replace('-', np.nan).astype(float)  # 如果有缺失值处理
user_ratings['rating'] = user_ratings['rating'].replace('-', np.nan).astype(float)

# 去掉没有评分的条目
user_ratings = user_ratings.dropna(subset=['rating'])

# 格式化数值
anime_data['members'] = anime_data['members'].str.replace(',', '').astype(float)
anime_data['favorites'] = anime_data['favorites'].str.replace(',', '').astype(float)
anime_data['popularity'] = anime_data['popularity'].str.replace('#', '').astype(float)
anime_data['ranked'] = anime_data['ranked'].str.replace('#', '').astype(float)

# 2. 提取番剧特征
def preprocess_genres(genres_series):
    """将 genres 列转为 multi-hot 编码"""
    genres_series = genres_series.fillna('')  # 填充空值
    genres_list = genres_series.str.split(', ')
    mlb = MultiLabelBinarizer()
    genres_encoded = mlb.fit_transform(genres_list)
    return genres_encoded, mlb.classes_

# 处理 genres 列
genres_encoded, genres_classes = preprocess_genres(anime_data['genres'])

# 将 genres 编码加入 anime_data
anime_data = anime_data.join(pd.DataFrame(genres_encoded, columns=genres_classes))

# 特征列
anime_features = ['score', 'ranked', 'popularity', 'members', 'favorites'] + list(genres_classes)

# 标准化数值特征
scaler = StandardScaler()
anime_data[anime_features] = scaler.fit_transform(anime_data[anime_features])

# 将特征从 DataFrame 转换为 GPU 上的张量
anime_tensor = torch.tensor(anime_data[anime_features].values, dtype=torch.float32).to(device)

# 将标题映射到 GPU 上
titles = anime_data['title'].values
title_to_index = {title: idx for idx, title in enumerate(titles)}

# 3. 构建用户-番剧评分的训练数据
def construct_training_data(merged_data, anime_tensor, title_to_index):
    """
    构建训练数据 (X, y)，并将所有操作移到 GPU
    """
    X = []
    y = []
    
    grouped = merged_data.groupby('username')  # 按用户名分组
    
    # 使用 tqdm 包装分组数据，显示进度条
    for username, group in tqdm(grouped, desc="Processing Users", total=len(grouped)):
        user_ratings = group[['anime', 'rating']].set_index('anime')['rating'].to_dict()  # 用户的评分历史
        
        for _, row in group.iterrows():
            # 当前目标番剧
            target_anime = row['anime']
            target_rating = row['rating']
            
            # 跳过如果没有评分的目标番剧
            if target_anime not in user_ratings:
                continue
            
            # 构建输入特征
            user_history = {k: v for k, v in user_ratings.items() if k != target_anime}  # 除去目标番剧的历史
            history_features = []
            
            for anime, rating in user_history.items():
                if anime in title_to_index:
                    anime_idx = title_to_index[anime]
                    anime_feature = anime_tensor[anime_idx]
                    history_features.append(anime_feature * rating)  # 特征加权
            
            # 如果用户历史为空，则跳过
            if len(history_features) == 0:
                continue
            
            # 聚合历史特征（例如求平均值）
            history_features = torch.stack(history_features).mean(dim=0)
            
            # 目标番剧的特征
            target_idx = title_to_index[target_anime]
            target_features = anime_tensor[target_idx]
            
            # 拼接特征
            input_features = torch.cat([history_features, target_features])
            
            # 添加到训练集
            X.append(input_features)
            y.append(target_rating)
    
    return torch.stack(X), torch.tensor(y, dtype=torch.float32).to(device)

# 按照 anime title 合并 user_ratings 和 anime_data
merged_data = user_ratings.merge(anime_data, left_on='anime', right_on='title', how='inner')

# 构建训练数据
X, y = construct_training_data(merged_data, anime_tensor, title_to_index)

# 分割训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 转换为 PyTorch 数据集
class AnimeDataset(Dataset):
    def __init__(self, X, y):
        self.X = X  # 已经在 GPU 上，无需再次转换
        self.y = y
    
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# 构建训练和测试数据集
train_dataset = AnimeDataset(X_train, y_train)
test_dataset = AnimeDataset(X_test, y_test)

# 使用 DataLoader 加载数据
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=0)

# 输出数据形状
print(f"训练集 X 形状: {X_train.shape}, y 形状: {y_train.shape}")
print(f"测试集 X 形状: {X_test.shape}, y 形状: {y_test.shape}")

Using device: cuda


Processing Users: 100%|██████████| 528/528 [00:14<00:00, 37.52it/s]

训练集 X 形状: torch.Size([12320, 162]), y 形状: torch.Size([12320])
测试集 X 形状: torch.Size([3080, 162]), y 形状: torch.Size([3080])





In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

# 模型定义
class AnimeRatingPredictor(nn.Module):
    def __init__(self, input_size, hidden_sizes=[512, 256, 128], dropout=0.3):
        """
        全连接神经网络，适配输入维度
        :param input_size: 输入特征的维度
        :param hidden_sizes: 隐藏层每层的神经元数量
        :param dropout: Dropout 概率
        """
        super(AnimeRatingPredictor, self).__init__()

        # 定义全连接层
        self.fc1 = nn.Linear(input_size, hidden_sizes[0])
        self.bn1 = nn.BatchNorm1d(hidden_sizes[0])  # Batch Normalization
        
        self.fc2 = nn.Linear(hidden_sizes[0], hidden_sizes[1])
        self.bn2 = nn.BatchNorm1d(hidden_sizes[1])
        
        self.fc3 = nn.Linear(hidden_sizes[1], hidden_sizes[2])
        self.bn3 = nn.BatchNorm1d(hidden_sizes[2])
        
        # 输出层
        self.output = nn.Linear(hidden_sizes[2], 1)
        
        # Dropout 层
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        """
        前向传播
        :param x: 输入特征
        :return: 预测评分，范围 [1, 10]
        """
        # 第一层
        x = F.relu(self.bn1(self.fc1(x)))
        x = self.dropout(x)
        
        # 第二层
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        
        # 第三层
        x = F.relu(self.bn3(self.fc3(x)))
        x = self.dropout(x)
        
        # 输出层 (限制范围在 [1, 10])
        x = self.output(x)
        x = torch.sigmoid(x) * 9 + 1  # 映射到 [1, 10]
        return x

In [5]:
# 初始化设备和模型
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_size = X_train.shape[1]  # 输入特征的维度
model = AnimeRatingPredictor(input_size=input_size).to(device)

# 定义损失函数和优化器
criterion = nn.MSELoss()  # 均方误差损失，用于回归任务
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [6]:
print(model)

AnimeRatingPredictor(
  (fc1): Linear(in_features=162, out_features=512, bias=True)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=256, out_features=128, bias=True)
  (bn3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (output): Linear(in_features=128, out_features=1, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)


In [7]:
# 训练模型
def train_model(model, train_loader, criterion, optimizer, device, epochs):
    model.train()  # 设置为训练模式
    for epoch in range(epochs):
        total_loss = 0
        for batch_idx, (inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            
            # 前向传播
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), targets)  # 注意 squeeze() 将输出从 (batch_size, 1) 变为 (batch_size)
            
            # 反向传播
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader)}")

# 评估模型
def evaluate_model(model, test_loader, criterion, device):
    model.eval()  # 设置为评估模式
    total_loss = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), targets)
            total_loss += loss.item()
    print(f"Test Loss: {total_loss/len(test_loader)}")

# 检查是否有 GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 模型训练
train_model(model, train_loader, criterion, optimizer, device, epochs=50)

# 模型评估
evaluate_model(model, test_loader, criterion, device)

Using device: cuda
Epoch 1/50, Loss: 2.4568634162912715
Epoch 2/50, Loss: 1.8493364098158525
Epoch 3/50, Loss: 1.730912304912824
Epoch 4/50, Loss: 1.6471173071490668
Epoch 5/50, Loss: 1.5842269516979475
Epoch 6/50, Loss: 1.5407539816100362
Epoch 7/50, Loss: 1.517898058335398
Epoch 8/50, Loss: 1.4890731628694682
Epoch 9/50, Loss: 1.468345111824688
Epoch 10/50, Loss: 1.4518538719631848
Epoch 11/50, Loss: 1.427175509806124
Epoch 12/50, Loss: 1.403290565458604
Epoch 13/50, Loss: 1.3704640386635776
Epoch 14/50, Loss: 1.3415191247673233
Epoch 15/50, Loss: 1.3300476265694812
Epoch 16/50, Loss: 1.3118404282807068
Epoch 17/50, Loss: 1.296394645241258
Epoch 18/50, Loss: 1.2637625716510832
Epoch 19/50, Loss: 1.2699650933705464
Epoch 20/50, Loss: 1.2640947146736896
Epoch 21/50, Loss: 1.2392362736049711
Epoch 22/50, Loss: 1.2094996003289298
Epoch 23/50, Loss: 1.2010153893362054
Epoch 24/50, Loss: 1.1711129147771726
Epoch 25/50, Loss: 1.1686045323628835
Epoch 26/50, Loss: 1.1315312311439316
Epoch 27

In [8]:
def evaluate_model(model, test_loader, criterion):
    model.eval()  # 设置模型为评估模式
    test_loss = 0.0
    all_predictions = []
    all_targets = []

    with torch.no_grad():  # 禁用梯度计算
        for X_batch, y_batch in test_loader:  # 解包数据
            # 将数据加载到设备
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)
            
            # 模型预测
            outputs = model(X_batch)
            loss = criterion(outputs.squeeze(), y_batch)
            test_loss += loss.item()
            
            # 保存预测值和真实值，用于后续计算指标
            all_predictions.extend(outputs.squeeze().cpu().numpy())
            all_targets.extend(y_batch.cpu().numpy())
    
    # 计算平均损失
    avg_loss = test_loss / len(test_loader)
    
    # 转换为 NumPy 数组
    all_predictions = np.array(all_predictions)
    all_targets = np.array(all_targets)
    
    # 计算其他评估指标
    mae = np.mean(np.abs(all_predictions - all_targets))  # 平均绝对误差
    mse = np.mean((all_predictions - all_targets)**2)     # 均方误差
    r2 = 1 - (np.sum((all_targets - all_predictions)**2) / np.sum((all_targets - np.mean(all_targets))**2))

    return avg_loss, mae, mse, r2

In [9]:
# 训练完成后，评估模型
test_loss, mae, mse, r2 = evaluate_model(model, test_loader, criterion)

print(f"Test Loss (MSE): {test_loss:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R² Score: {r2:.4f}")

Test Loss (MSE): 1.6835
Mean Absolute Error (MAE): 0.9887
Mean Squared Error (MSE): 1.6645
R² Score: 0.2630


In [None]:
torch.save(model, "recommenders_DNN/model/model.pth")