In [2]:
import torch

# Report every CUDA device PyTorch can see: name, compute capability, total memory.
if not torch.cuda.is_available():
    print("没有可用的CUDA设备")
else:
    # Number of devices visible to this process.
    num_devices = torch.cuda.device_count()
    print(f"共有 {num_devices} 个CUDA设备")

    bytes_per_gb = 1024 ** 3  # total_memory is divided by this to print GB

    # Print one summary per device index.
    for dev_idx in range(num_devices):
        name = torch.cuda.get_device_name(dev_idx)
        major, minor = torch.cuda.get_device_capability(dev_idx)
        total_gb = torch.cuda.get_device_properties(dev_idx).total_memory / bytes_per_gb
        print(f"设备 {dev_idx}:")
        print(f"  名称: {name}")
        print(f"  计算能力: {major}.{minor}")
        print(f"  总内存: {total_gb:.2f} GB")


共有 8 个CUDA设备
设备 0:
  名称: NVIDIA GeForce RTX 3090
  计算能力: 8.6
  总内存: 23.69 GB
设备 1:
  名称: NVIDIA GeForce RTX 3090
  计算能力: 8.6
  总内存: 23.69 GB
设备 2:
  名称: NVIDIA GeForce RTX 3090
  计算能力: 8.6
  总内存: 23.69 GB
设备 3:
  名称: NVIDIA GeForce RTX 3090
  计算能力: 8.6
  总内存: 23.69 GB
设备 4:
  名称: NVIDIA GeForce RTX 3090
  计算能力: 8.6
  总内存: 23.69 GB
设备 5:
  名称: NVIDIA GeForce RTX 3090
  计算能力: 8.6
  总内存: 23.69 GB
设备 6:
  名称: NVIDIA GeForce RTX 3090
  计算能力: 8.6
  总内存: 23.69 GB
设备 7:
  名称: NVIDIA GeForce RTX 3090
  计算能力: 8.6
  总内存: 23.69 GB


## Train the model

In [1]:
from net import MnistNet
import os
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader, TensorDataset
import torchvision.transforms as transforms
import numpy as np
from dataloader import MNISTDataloader
import torch.optim as optim
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hyperparameters.
# NOTE(review): in the visible cells the DataLoaders use a literal batch size of 64
# and the training loop a literal 10 epochs, so these two values look unused — confirm.
batch_size = 512
epochs = 20

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device_torch = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the network, move it to the selected device, and optimise all of its
# parameters with Adam using the optimizer's default settings.
net = MnistNet().to(device_torch)
optimizer = optim.Adam(net.parameters())


In [7]:
# Training helper
def train(model, device, train_loader, optimizer):
    """Run one pass over ``train_loader`` and update ``model`` in place.

    Args:
        model: Network to optimise; called as ``model(data)``. Its output is fed
            to ``F.nll_loss``, so it is assumed to produce log-probabilities
            (TODO confirm against the model definition).
        device: Device that each batch is moved to before the forward pass.
        train_loader: Iterable yielding ``(data, target)`` batches.
        optimizer: Optimizer holding ``model``'s parameters.

    Returns:
        None. The loss is printed for every 100th batch (starting at batch 0).
    """
    # Explicitly switch to training mode. The evaluation helper puts the model
    # in eval mode, so without this call every epoch after the first would
    # train with layers such as dropout / batch-norm in inference behaviour.
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        # Move inputs and labels to the same device as the model.
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()  # clear gradients left over from the previous step
        output = model(data)  # forward pass via __call__ so module hooks run
        loss = F.nll_loss(output, target)  # negative log-likelihood loss
        loss.backward()  # back-propagate
        optimizer.step()  # apply the parameter update
        if batch_idx % 100 == 0:
            print(f'Batch: {batch_idx}, Loss: {loss.item()}')

def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model.forward(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            pred = output.max(1, keepdim=True)[1] # 找到概率最大的下标
            correct += pred.eq(target.view_as(pred)).sum().item()
 
    test_loss /= len(test_loader.dataset)
    acc = correct / len(test_loader.dataset) * 100.
    print('\n验证集: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct,
        len(test_loader.dataset), acc))


In [4]:

# Load the MNIST parquet files, preprocess the images, and build the
# train/test DataLoaders.

# Dataset location. Defaults to the original absolute path; set the MNIST_ROOT
# environment variable to point at a different copy without editing this cell.
dataset_root_path = os.environ.get(
    "MNIST_ROOT",
    "/comp_robot/yangyuqin/workplace/startup/dataset/MNIST/raw",
)
dataset_train_path = os.path.join(dataset_root_path, 'ylecun/mnist/mnist/train-00000-of-00001.parquet')
dataset_test_path = os.path.join(dataset_root_path, 'ylecun/mnist/mnist/test-00000-of-00001.parquet')

# Image preprocessing: convert to a tensor, then normalise.
# NOTE(review): 0.1307 / 0.3081 are presumably the MNIST mean and std — confirm.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

# Load both splits from the local parquet files.
dataset = load_dataset('parquet', data_files={'train': dataset_train_path, 'test': dataset_test_path})

# Pull the image and label columns out as NumPy arrays.
train_images = np.array(dataset['train']['image'])
train_labels = np.array(dataset['train']['label'])

test_images = np.array(dataset['test']['image'])
test_labels = np.array(dataset['test']['label'])

# Apply the preprocessing pipeline to every image and stack the results.
train_images = np.stack([transform(image) for image in train_images])
test_images = np.stack([transform(image) for image in test_images])

# Convert to PyTorch tensors (float32 images, int64 labels).
train_images_tensor = torch.tensor(train_images, dtype=torch.float32)
train_labels_tensor = torch.tensor(train_labels, dtype=torch.long)

test_images_tensor = torch.tensor(test_images, dtype=torch.float32)
test_labels_tensor = torch.tensor(test_labels, dtype=torch.long)

# Wrap the tensors in datasets and data loaders.
train_dataset = TensorDataset(train_images_tensor, train_labels_tensor)
test_dataset = TensorDataset(test_images_tensor, test_labels_tensor)

# NOTE(review): the batch size is the literal 64 here, not the `batch_size`
# value set in the configuration cell — confirm which one is intended.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Sanity check: print the number of batches in each loader.
print("Train DataLoader:", len(train_loader))
print("Test DataLoader:", len(test_loader))


# Alternative: build the loaders with the project helper instead.
# train_loader, test_loader = MNISTDataloader(dataset)    



  train_images = np.array(dataset['train']['image'])
  train_images = np.array(dataset['train']['image'])
  test_images = np.array(dataset['test']['image'])
  test_images = np.array(dataset['test']['image'])


Train DataLoader: 938
Test DataLoader: 157


In [8]:
# Train, evaluate, and checkpoint once per epoch.
# NOTE(review): the epoch count is the literal 10 here; the `epochs = 20`
# setting from the configuration cell is not used — confirm which is intended.

# torch.save does not create missing parent directories, so make sure the
# checkpoint directory exists before the first save.
os.makedirs("logs", exist_ok=True)

for epoch in range(10):
    print(f"Epoch {epoch+1}")
    train(net, device_torch, train_loader, optimizer)
    test(net, device_torch, test_loader)
    # File names use the zero-based epoch index (epoch0 ... epoch9).
    model_save_path = f"logs/model_weights_epoch{epoch}.pth"
    torch.save(net.state_dict(), model_save_path)
    print(f'Model weights saved to {model_save_path}')


Epoch 1
Batch: 0, Loss: 0.00018155212455894798
Batch: 4, Loss: 0.0014149120543152094
Batch: 8, Loss: 0.0045260777696967125
Batch: 12, Loss: 0.011960601434111595
Batch: 16, Loss: 0.004531370475888252
Batch: 20, Loss: 0.012264830060303211
Batch: 24, Loss: 0.024474620819091797
Batch: 28, Loss: 0.039758533239364624
Batch: 32, Loss: 7.626802835147828e-05
Batch: 36, Loss: 0.0010706980247050524
Batch: 40, Loss: 0.0003390436468180269
Batch: 44, Loss: 0.045172642916440964
Batch: 48, Loss: 0.0013178333174437284
Batch: 52, Loss: 0.01633531227707863
Batch: 56, Loss: 0.023964939638972282
Batch: 60, Loss: 0.002593002747744322
Batch: 64, Loss: 0.0017826100811362267
Batch: 68, Loss: 0.00036938406992703676
Batch: 72, Loss: 0.0021321058738976717
Batch: 76, Loss: 0.00036207467201165855
Batch: 80, Loss: 0.0005656335270032287
Batch: 84, Loss: 0.003970462828874588
Batch: 88, Loss: 0.0001058786510839127
Batch: 92, Loss: 0.0019468284444883466
Batch: 96, Loss: 0.00023128108296077698
Batch: 100, Loss: 0.0038149

In [14]:
# Display the loader object; this only shows its default repr (type and address).
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7efc8a8c57f0>