In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

In [2]:
device = torch.device("cpu") 
input_dim = 784
hidden_dim = 128
output_dim = 10
batch_size = 2048
lr = 0.1
epochs = 10

In [6]:
transform = transforms.Compose([transforms.ToTensor()])
train_loader = DataLoader(
    datasets.MNIST("./data", train=True, download=True, transform=transform),
    batch_size=batch_size, shuffle=True
)
test_loader = DataLoader(
    datasets.MNIST("./data", train=False, transform=transform),
    batch_size=batch_size, shuffle=False
)


In [7]:
W1 = nn.Parameter(torch.randn(input_dim, hidden_dim) * 0.01, requires_grad=True).to(device)
b1 = nn.Parameter(torch.zeros(hidden_dim), requires_grad=True).to(device)
W2 = nn.Parameter(torch.randn(hidden_dim, output_dim) * 0.01, requires_grad=True).to(device)
b2 = nn.Parameter(torch.zeros(output_dim), requires_grad=True).to(device)

trainable_params = [p for p in [W1, b1, W2, b2] if p.requires_grad]
print(f"可训练参数数量: {len(trainable_params)}")  # 输出4
for p in trainable_params:
    print(f"参数: {p.shape}, requires_grad: {p.requires_grad}, is_leaf: {p.is_leaf}")  # 均为True

可训练参数数量: 4
参数: torch.Size([784, 128]), requires_grad: True, is_leaf: True
参数: torch.Size([128]), requires_grad: True, is_leaf: True
参数: torch.Size([128, 10]), requires_grad: True, is_leaf: True
参数: torch.Size([10]), requires_grad: True, is_leaf: True


In [8]:
def forward(x):
    x_flat = x.view(x.shape[0], -1)  # 展平：(batch,1,28,28)→(batch,784)
    hidden = torch.matmul(x_flat, W1) + b1  # 手动全连接1
    hidden_act = F.relu(hidden)             # ReLU激活
    output = torch.matmul(hidden_act, W2) + b2  # 手动全连接2
    return output

In [10]:
optimizer = optim.SGD(trainable_params, lr=lr)
train_losses, test_losses, test_accs = [], [], []

for epoch in range(epochs):
    # 训练阶段
    total_train_loss = 0.0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)  # 数据移至设备
        optimizer.zero_grad()  # 清零梯度（必做）
        outputs = forward(images)
        loss = F.cross_entropy(outputs, labels)
        loss.backward()  # 反向传播求梯度
        optimizer.step()  # 更新参数
        total_train_loss += loss.item() * images.size(0)
    avg_train_loss = total_train_loss / len(train_loader.dataset)
    train_losses.append(avg_train_loss)

    # 测试阶段
    total_test_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():  # 关闭梯度计算
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = forward(images)
            loss = F.cross_entropy(outputs, labels)
            total_test_loss += loss.item() * images.size(0)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    avg_test_loss = total_test_loss / len(test_loader.dataset)
    test_acc = 100 * correct / total
    test_losses.append(avg_test_loss)
    test_accs.append(test_acc)

print(f"Epoch {epoch+1}/{epochs} | 训练损失: {avg_train_loss:.4f} | 测试损失: {avg_test_loss:.4f} | 测试准确率: {test_acc:.2f}%")

Epoch 10/10 | 训练损失: 0.3193 | 测试损失: 0.3051 | 测试准确率: 91.42%
