The highest should be around 99.40-99.50%, and takes about 55 minutes to run.

Set the random seed:
although a seed is setted, due to problems like cpu running, batchnorm,OneCycleLR and so on, there still remains slight fluxuation among different trainings.

In [1]:
#the highest could reach 99.34%!!Epoch 114: Val Acc: 99.34%
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Subset
from torch.optim.lr_scheduler import OneCycleLR
import os

# ==================== 确定性设置 ====================
SEED = 42

def set_deterministic(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.use_deterministic_algorithms(True, warn_only=True)

set_deterministic(SEED)

getting data, and also reinforcing the data

In [2]:
def get_dataloaders():
    # 增强训练集变换
    transform_train = transforms.Compose([
        transforms.RandomAffine(
            degrees=3,
            translate=(0.05, 0.05),
            shear=5,
        ),
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
        transforms.Lambda(lambda x: x.view(-1))
    ])
    
    # 测试集使用基础变换
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
        transforms.Lambda(lambda x: x.view(-1))
    ])

    # 固定数据集划分
    full_train = datasets.MNIST(root='./data', train=True, download=True, transform=transform_train)
    train_indices = list(range(55000))
    val_indices = list(range(55000, 60000))
    
    train_dataset = Subset(full_train, train_indices)
    val_dataset = Subset(
        datasets.MNIST(root='./data', train=True, download=True, transform=transform_test),
        val_indices
    )
    test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform_test)

    # 确定性DataLoader
    train_loader = DataLoader(
        train_dataset, batch_size=64, shuffle=False,
        num_workers=0, worker_init_fn=lambda _: np.random.seed(SEED)
    )
    val_loader = DataLoader(val_dataset, batch_size=1000, shuffle=False, num_workers=0)
    test_loader = DataLoader(test_dataset, batch_size=1000, shuffle=False, num_workers=0)
    
    return train_loader, val_loader, test_loader

I used swish as an activateion function, it is smoother than relu and has a better stablilty than others. I changed the cross entropy loss into the focal loss,which is better at finding the complex samples. I increased the learning depth to 2048, so I also had to add the batchnorm to  help it converge.

In [3]:
# ==================== 模型定义 ====================
class Swish(nn.Module):
    def __init__(self, beta=1.0):
        super().__init__()
        self.beta = nn.Parameter(torch.tensor(float(beta)))
        
    def forward(self, x):
        return x * torch.sigmoid(self.beta * x)

class FocalLoss(nn.Module):
    def __init__(self, gamma=1.5, alpha=0.25):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        return (self.alpha * (1 - pt) ** self.gamma * ce_loss).mean()

class UltimateMLP(nn.Module):
    def __init__(self, hidden_dim=2048):
        super().__init__()
        self.layers = nn.Sequential(
            self._make_block(28 * 28, hidden_dim, 0.4),
            self._make_block(hidden_dim, hidden_dim//2, 0.35),
            self._make_block(hidden_dim//2, hidden_dim//4, 0.3),
            nn.Linear(hidden_dim//4, 10)
        )
        self._init_weights()

    def _make_block(self, in_dim, out_dim, dropout):
        return nn.Sequential(
            nn.Linear(in_dim, out_dim),
            nn.BatchNorm1d(out_dim),
            Swish(),
            nn.Dropout(dropout)
        )

    def _init_weights(self):
        # Swish激活函数的增益值近似为1.1（根据经验值）
        gain = 1.1
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu')
                nn.init.constant_(m.bias, 0.01)

    def forward(self, x):
        return self.layers(x)

The final training: I added an optimizer, schedualed the learning rate, and increased the number of training to 250 and choose the best one. 

In [4]:
# ==================== 训练流程 ====================
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_loader, val_loader, test_loader = get_dataloaders()
    
    # 模型配置
    model = UltimateMLP(hidden_dim=2048).to(device)
    
    # 优化器配置
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=0.005,
        weight_decay=5e-4,
        betas=(0.95, 0.99)
    )
    
    # 学习率调度
    scheduler = OneCycleLR(
        optimizer,
        max_lr=0.02,
        pct_start=0.25,
        div_factor=25,
        final_div_factor=1e4,
        total_steps=len(train_loader)*250
    )
    
    criterion = FocalLoss()

    # 训练循环
    best_acc = 0.0
    for epoch in range(250):
        model.train()
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            
            optimizer.zero_grad(set_to_none=True)
            outputs = model(images)
            loss = criterion(outputs, labels)
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        # varify 
        model.eval()
        with torch.no_grad():
            correct = total = 0
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
            
            val_acc = 100 * correct / total
            print(f"Epoch {epoch+1}: Val Acc: {val_acc:.2f}%")
            
            if val_acc > best_acc:
                best_acc = val_acc
                torch.save(model.state_dict(), 'best_model.pth')

    # 测试最佳模型
    model.load_state_dict(torch.load('best_model.pth'))
    model.eval()
    with torch.no_grad():
        correct = total = 0
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    print(f"\n最终测试准确率: {100 * correct / total:.2f}%")

if __name__ == "__main__":
    main()

Epoch 1: Val Acc: 96.84%
Epoch 2: Val Acc: 97.40%
Epoch 3: Val Acc: 97.98%
Epoch 4: Val Acc: 97.92%
Epoch 5: Val Acc: 97.96%
Epoch 6: Val Acc: 98.26%
Epoch 7: Val Acc: 98.28%
Epoch 8: Val Acc: 97.58%
Epoch 9: Val Acc: 98.04%
Epoch 10: Val Acc: 98.18%
Epoch 11: Val Acc: 98.08%
Epoch 12: Val Acc: 97.98%
Epoch 13: Val Acc: 98.18%
Epoch 14: Val Acc: 98.24%
Epoch 15: Val Acc: 98.24%
Epoch 16: Val Acc: 98.12%
Epoch 17: Val Acc: 98.48%
Epoch 18: Val Acc: 98.38%
Epoch 19: Val Acc: 98.48%
Epoch 20: Val Acc: 98.12%
Epoch 21: Val Acc: 98.20%
Epoch 22: Val Acc: 98.40%
Epoch 23: Val Acc: 98.78%
Epoch 24: Val Acc: 98.26%
Epoch 25: Val Acc: 98.56%
Epoch 26: Val Acc: 98.50%
Epoch 27: Val Acc: 98.44%
Epoch 28: Val Acc: 98.58%
Epoch 29: Val Acc: 98.56%
Epoch 30: Val Acc: 98.22%
Epoch 31: Val Acc: 98.44%
Epoch 32: Val Acc: 98.46%
Epoch 33: Val Acc: 98.14%
Epoch 34: Val Acc: 98.72%
Epoch 35: Val Acc: 98.60%
Epoch 36: Val Acc: 98.30%
Epoch 37: Val Acc: 98.22%
Epoch 38: Val Acc: 98.44%
Epoch 39: Val Acc: 98

  model.load_state_dict(torch.load('best_model.pth'))



最终测试准确率: 99.24%
