### Process data from csv

In [1]:
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np

In [2]:
import os
import sys

# run this block once only
project_root = os.path.abspath("../../")  # 根据文件层级调整路径
os.chdir(project_root)

if project_root not in sys.path:
    sys.path.insert(0, project_root)

### Loading data

In [3]:
def load_data(save_path='processed_data/'):
    """
    从文件中加载训练数据和测试数据
    """
    X = np.load(save_path + 'X.npy')
    y = np.load(save_path + 'y.npy')
    X_train = np.load(save_path + 'X_train.npy')
    X_test = np.load(save_path + 'X_test.npy')
    y_train = np.load(save_path + 'y_train.npy')
    y_test = np.load(save_path + 'y_test.npy')
    
    print(f"数据从 {save_path} 加载完成")
    return X, y, X_train, X_test, y_train, y_test
X, y, X_train, X_test, y_train, y_test = load_data()

数据从 processed_data/ 加载完成


In [8]:
X.shape

(15657, 164)

In [7]:
import numpy as np

# 示例数组 X (替换为你的数据)
# 假设 X 是你的数组

# 找出包含 NaN 的行
nan_rows = np.isnan(X).any(axis=1)  # 布尔数组，True 表示该行包含 NaN

# 统计包含 NaN 的行数
num_nan_rows = np.sum(nan_rows)

# 获取包含 NaN 的行号
nan_row_indices = np.where(nan_rows)[0]

print(f"包含 NaN 的行数: {num_nan_rows}")
print(f"包含 NaN 的行号: {nan_row_indices}")

包含 NaN 的行数: 9187
包含 NaN 的行号: [    0     1     2 ... 15654 15655 15656]


In [5]:
import numpy as np

# 删除包含 NaN 的行
def remove_nan_rows(X, y):
    """
    删除 X 和 y 中包含 NaN 的行
    :param X: 特征矩阵
    :param y: 标签数组
    :return: 删除 NaN 后的特征矩阵和标签
    """
    mask = ~np.isnan(X).any(axis=1)  # 找到所有不包含 NaN 的行
    X_clean = X[mask]
    y_clean = y[mask]
    return X_clean, y_clean

# 清理数据
X_clean, y_clean = remove_nan_rows(X, y)

# 对训练集和测试集分别清理
X_train_clean, y_train_clean = remove_nan_rows(X_train, y_train)
X_test_clean, y_test_clean = remove_nan_rows(X_test, y_test)

print(f"清理后数据集形状: X_train: {X_train_clean.shape}, y_train: {y_train_clean.shape}")
print(f"清理后数据集形状: X_test: {X_test_clean.shape}, y_test: {y_test_clean.shape}")

清理后数据集形状: X_train: (5203, 164), y_train: (5203,)
清理后数据集形状: X_test: (1267, 164), y_test: (1267,)


In [13]:
import torch
from torch.utils.data import Dataset, DataLoader

# 自定义 Dataset 类
class AnimeDataset(Dataset):
    def __init__(self, X, y):
        """
        初始化数据集
        :param X: 特征数据
        :param y: 标签数据
        """
        self.X = torch.tensor(X, dtype=torch.float32)  # 转换为 PyTorch 张量
        self.y = torch.tensor(y, dtype=torch.float32)  # 转换为 PyTorch 张量

    def __len__(self):
        """
        返回数据集大小
        """
        return len(self.X)

    def __getitem__(self, idx):
        """
        根据索引返回数据和标签
        :param idx: 索引
        :return: (输入特征, 标签)
        """
        return self.X[idx], self.y[idx]

# 数据集封装
train_dataset = AnimeDataset(X_train_clean, y_train_clean)
test_dataset = AnimeDataset(X_test_clean, y_test_clean)

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class AnimeRatingPredictor(nn.Module):
    def __init__(self, input_size, hidden_sizes=[512, 256, 128], dropout=0.3):
        """
        初始化网络结构
        :param input_size: 输入特征的维度 (历史特征 + 目标特征)
        :param hidden_sizes: 隐藏层每层的神经元数量
        :param dropout: Dropout 概率，防止过拟合
        """
        super(AnimeRatingPredictor, self).__init__()
        
        # 定义全连接层
        self.fc1 = nn.Linear(input_size, hidden_sizes[0])
        self.bn1 = nn.BatchNorm1d(hidden_sizes[0])  # Batch Normalization
        
        self.fc2 = nn.Linear(hidden_sizes[0], hidden_sizes[1])
        self.bn2 = nn.BatchNorm1d(hidden_sizes[1])
        
        self.fc3 = nn.Linear(hidden_sizes[1], hidden_sizes[2])
        self.bn3 = nn.BatchNorm1d(hidden_sizes[2])
        
        # 输出层
        self.output = nn.Linear(hidden_sizes[2], 1)
        
        # Dropout 层
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        """
        前向传播
        :param x: 输入特征
        :return: 预测评分
        """
        # 第一层
        x = F.relu(self.bn1(self.fc1(x)))
        x = self.dropout(x)
        
        # 第二层
        x = F.relu(self.bn2(self.fc2(x)))
        x = self.dropout(x)
        
        # 第三层
        x = F.relu(self.bn3(self.fc3(x)))
        x = self.dropout(x)
        
        # 输出层
        x = self.output(x)  # 不使用激活函数，直接输出评分
        return x

In [15]:
# 初始化设备和模型
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_size = X_train_clean.shape[1]  # 输入特征的维度
model = AnimeRatingPredictor(input_size=input_size).to(device)

# 定义损失函数和优化器
criterion = nn.MSELoss()  # 均方误差损失，用于回归任务
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 训练和测试数据加载器
batch_size = 32
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)


In [16]:
print(model)

AnimeRatingPredictor(
  (fc1): Linear(in_features=164, out_features=512, bias=True)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=256, out_features=128, bias=True)
  (bn3): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (output): Linear(in_features=128, out_features=1, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)


In [24]:
# 训练模型
def train_model(model, train_loader, criterion, optimizer, device, epochs):
    model.train()  # 设置为训练模式
    for epoch in range(epochs):
        total_loss = 0
        for batch_idx, (inputs, targets) in enumerate(train_loader):
            inputs, targets = inputs.to(device), targets.to(device)
            
            # 前向传播
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), targets)  # 注意 squeeze() 将输出从 (batch_size, 1) 变为 (batch_size)
            
            # 反向传播
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader)}")

# 评估模型
def evaluate_model(model, test_loader, criterion, device):
    model.eval()  # 设置为评估模式
    total_loss = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), targets)
            total_loss += loss.item()
    print(f"Test Loss: {total_loss/len(test_loader)}")

# 检查是否有 GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 模型训练
train_model(model, train_loader, criterion, optimizer, device, epochs=100)

# 模型评估
evaluate_model(model, test_loader, criterion, device)

Using device: cpu
Epoch 1/100, Loss: 0.6168649694853765
Epoch 2/100, Loss: 0.5814767253179491
Epoch 3/100, Loss: 0.6094565961989888
Epoch 4/100, Loss: 0.5914040827678025
Epoch 5/100, Loss: 0.5821005039237028
Epoch 6/100, Loss: 0.5835066090332218
Epoch 7/100, Loss: 0.6077589553558022
Epoch 8/100, Loss: 0.5909883197099884
Epoch 9/100, Loss: 0.577293900807211
Epoch 10/100, Loss: 0.6094248099195445
Epoch 11/100, Loss: 0.608347619603748
Epoch 12/100, Loss: 0.5662950504959726
Epoch 13/100, Loss: 0.5825339462493826
Epoch 14/100, Loss: 0.5614465537246751
Epoch 15/100, Loss: 0.5812684133008945
Epoch 16/100, Loss: 0.5701143582540056
Epoch 17/100, Loss: 0.5699604184715295
Epoch 18/100, Loss: 0.6064769447398332
Epoch 19/100, Loss: 0.5758252833152842
Epoch 20/100, Loss: 0.5667128643375233
Epoch 21/100, Loss: 0.5668161240823429
Epoch 22/100, Loss: 0.5888598177330625
Epoch 23/100, Loss: 0.5667562388752135
Epoch 24/100, Loss: 0.5651646723593671
Epoch 25/100, Loss: 0.560118645338193
Epoch 26/100, Loss:

In [25]:
def evaluate_model(model, test_loader, criterion):
    model.eval()  # 设置模型为评估模式
    test_loss = 0.0
    all_predictions = []
    all_targets = []

    with torch.no_grad():  # 禁用梯度计算
        for X_batch, y_batch in test_loader:  # 解包数据
            # 将数据加载到设备
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)
            
            # 模型预测
            outputs = model(X_batch)
            loss = criterion(outputs.squeeze(), y_batch)
            test_loss += loss.item()
            
            # 保存预测值和真实值，用于后续计算指标
            all_predictions.extend(outputs.squeeze().cpu().numpy())
            all_targets.extend(y_batch.cpu().numpy())
    
    # 计算平均损失
    avg_loss = test_loss / len(test_loader)
    
    # 转换为 NumPy 数组
    all_predictions = np.array(all_predictions)
    all_targets = np.array(all_targets)
    
    # 计算其他评估指标
    mae = np.mean(np.abs(all_predictions - all_targets))  # 平均绝对误差
    mse = np.mean((all_predictions - all_targets)**2)     # 均方误差
    r2 = 1 - (np.sum((all_targets - all_predictions)**2) / np.sum((all_targets - np.mean(all_targets))**2))

    return avg_loss, mae, mse, r2

In [26]:
# 训练完成后，评估模型
test_loss, mae, mse, r2 = evaluate_model(model, test_loader, criterion)

print(f"Test Loss (MSE): {test_loss:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"R² Score: {r2:.4f}")

Test Loss (MSE): 1.7745
Mean Absolute Error (MAE): 1.0132
Mean Squared Error (MSE): 1.7679
R² Score: 0.1918


In [2]:
import torch

In [3]:
torch.save(model.state_dict(), "model_weights.pth")

NameError: name 'model' is not defined