# Homework

Apply `BatchNorm`, `Dropout` and `Residual` on MLP networks for CIFAR-10 classification.

For BatchNorm and Dropout, design models with `BatchNorm Layer`, `Dropout Layer` and both the layers. Compare the results with a plain MLP, and with each other.

For Residual, design a simple `Redisual Block` based on a deeper MLP. Compare the results and see whether adding residual works.

Follow the pipeline in your Homework 2 to finish model designing, training and testing.

### Step 1: Load Dataset

In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
import torch.optim as optim

# 设置 matplotlib 显示格式
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0)  # 设置默认图像大小
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# CIFAR-10 数据集的下载和加载
transform = transforms.Compose(
    [transforms.ToTensor(),  # 转换为 Tensor
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]  # 归一化
)

In [105]:
from torch.utils.data import Dataset, DataLoader
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
class OursDatasetwithTransforms(Dataset):
    def __init__(self, data, labels, transforms):
        """
        初始化数据集
        data: 输入数据，例如一个 NumPy 数组或 PyTorch 张量
        labels: 对应的标签
        """
        self.data = data
        self.labels = labels
        self.transforms = transforms

    def __len__(self):
        # 每个Dataset都必须写，返回数据集的大小
        return len(self.data)

    def __getitem__(self, idx):
        # 每个Dataset都必须写，获取指定索引idx的数据和标签
        sample = self.data[idx]
        label = self.labels[idx]
        sample = self.transforms(sample)
        return torch.tensor(sample, dtype=torch.float32), torch.tensor(label, dtype=torch.long)
        
transform = transforms.Compose(
    [transforms.ToTensor(),  # [0, 255], PIL Image / ndarray --> [0, 1], torch.tensor
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]  # 归一化, (input[channel] - mean[channel]) / std[channel]
)
traindata = trainset.data[:5000]
trainlabels = trainset.targets[:5000]
testdata = testset.data[:500]
testlabels = testset.targets[:500]

# 初始化自定义数据集
trainset = OursDatasetwithTransforms(traindata, trainlabels, transform)
testset = OursDatasetwithTransforms(testdata, testlabels, transform)
trainloader = DataLoader(trainset, batch_size=256, shuffle=False, drop_last=False)
testloader = DataLoader(testset, batch_size=256, shuffle=False, drop_last=False)

Files already downloaded and verified
Files already downloaded and verified


In [106]:
# Helper functions for plotting the loss values
from typing import List
def plot_loss(num_epochs: int, train_losses: List, test_losses: List) -> None:
    plt.figure(figsize=(10, 5))
    plt.plot(range(1, num_epochs + 1), train_losses, label='Training Loss', marker='o')
    plt.plot(range(1, num_epochs + 1), test_losses, label='Testing Loss', marker='x')
    plt.title('Training and Testing Loss over Epochs')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid()
    plt.show()

### Step 2: Design a 3-Layer MLP with BatchNorm and Dropout

In [107]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Here is the simple version of A 3-layer MLP
class SimpleMLP(nn.Module):
    def __init__(self, input_size=32*32*3, hidden_size=512):
        super(SimpleMLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)  # For CIFAR-10, input size is 32x32x3
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, 10)  # Output size is 10 classes for CIFAR-10
    
    def forward(self, x):
        x = x.view(-1, 32 * 32 * 3)  # Flatten the image
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Implementation of MLP with BatchNorm only
# Use nn.BatchNorm1d
class MLPWithBatchNorm(nn.Module):
    def __init__(self, input_size=32*32*3, hidden_size=512):
        # your code here
        super(MLPWithBatchNorm, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.batchnorm1 = nn.BatchNorm1d(hidden_size)  # BatchNorm layer1
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.batchnorm2 = nn.BatchNorm1d(hidden_size)  # BatchNorm layer2
        self.fc3 = nn.Linear(hidden_size, 10)
        

    def forward(self, x):
        x = x.view(-1, 32 * 32 * 3)
        x = self.fc1(x)
        x = self.batchnorm1(x)  # use BatchNorm
        x = F.relu(x)
        x = self.fc2(x)
        x = self.batchnorm2(x)  # use BatchNorm
        x = F.relu(x)
        x = self.fc3(x)
        return x

# Implementation of MLP with Dropout only 
# Use nn.Dropout
class MLPWithDropout(nn.Module):
    def __init__(self, input_size=32*32*3, hidden_size=512, dropout_prob=0.5):
        # your code here
        super(MLPWithDropout, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, 10)
        self.dropout = nn.Dropout(p=dropout_prob)  # Dropout layer
    
    def forward(self, x):
        x = x.view(-1, 32 * 32 * 3)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)  # use Dropout
        x = F.relu(self.fc2(x))
        x = self.dropout(x)  # use Dropout
        x = self.fc3(x)
        return x
        
    
# Implementation of MLP with both BatchNorm and Dropout
# Use nn.BatchNorm1d and nn.Dropout  
class MLPWithBatchNormDropout(nn.Module):
    def __init__(self, input_size=32*32*3, hidden_size=512, dropout_prob=0.5):
        # your code here
        super(MLPWithBatchNormDropout, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.batchnorm1 = nn.BatchNorm1d(hidden_size)  # BatchNorm layer1
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.batchnorm2 = nn.BatchNorm1d(hidden_size)  # BatchNorm layer2
        self.fc3 = nn.Linear(hidden_size, 10)
        self.dropout = nn.Dropout(p=dropout_prob)  # Dropout layer

    def forward(self, x):
        x = x.view(-1, 32 * 32 * 3)
        x = self.fc1(x)
        x = self.batchnorm1(x)  # use BatchNorm
        x = F.relu(x)
        x = self.dropout(x)  # use Dropout
        x = self.fc2(x)
        x = self.batchnorm2(x)  # use BatchNorm
        x = F.relu(x)
        x = self.dropout(x)  # use Dropout
        x = self.fc3(x)
        return x
   

In [108]:
# Training and Testing Loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def train(model, train_loader):
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()
    
    train_loss = running_loss / len(train_loader)
    train_acc = 100. * correct / total
    return train_loss, train_acc

def test(model, test_loader):
    criterion = nn.CrossEntropyLoss()
    
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            
            outputs = model(images)
            loss = criterion(outputs, labels)
            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
    
    test_loss = running_loss / len(test_loader)
    test_acc = 100. * correct / total
    return test_loss, test_acc

### Step 3: Train the networks

In [109]:
# Hyper parameters
num_epochs = 10

In [110]:
# Train SimpleMLP
# your code here
def train_with_test(model, trainloader, testloader, num_epochs):
    for epoch in range(num_epochs):
        train_loss, train_acc = train(model, trainloader)
        test_loss, test_acc = test(model, testloader)
        print(f'Epoch {epoch + 1}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%,',
            f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc}%')
    
model_simple = SimpleMLP().to(device)
train_with_test(model_simple, trainloader, testloader, num_epochs)

  return torch.tensor(sample, dtype=torch.float32), torch.tensor(label, dtype=torch.long)


Epoch 1, Train Loss: 1.6402, Train Acc: 42.03%, Test Loss: 1.4854, Test Acc: 48.4%
Epoch 2, Train Loss: 1.4112, Train Acc: 50.60%, Test Loss: 1.3871, Test Acc: 52.8%
Epoch 3, Train Loss: 1.2760, Train Acc: 55.45%, Test Loss: 1.3653, Test Acc: 54.6%
Epoch 4, Train Loss: 1.1606, Train Acc: 59.49%, Test Loss: 1.3706, Test Acc: 53.8%
Epoch 5, Train Loss: 1.0544, Train Acc: 63.14%, Test Loss: 1.3841, Test Acc: 52.8%
Epoch 6, Train Loss: 0.9507, Train Acc: 66.95%, Test Loss: 1.4440, Test Acc: 51.4%
Epoch 7, Train Loss: 0.8582, Train Acc: 70.28%, Test Loss: 1.4713, Test Acc: 51.6%
Epoch 8, Train Loss: 0.7781, Train Acc: 73.35%, Test Loss: 1.5333, Test Acc: 52.8%
Epoch 9, Train Loss: 0.7145, Train Acc: 75.54%, Test Loss: 1.6688, Test Acc: 50.4%
Epoch 10, Train Loss: 0.6499, Train Acc: 77.74%, Test Loss: 1.7774, Test Acc: 48.8%


In [111]:
# Train MLPWithBatchNorm
model_with_BN = MLPWithBatchNorm().to(device)
train_with_test(model_with_BN, trainloader, testloader, num_epochs)

  return torch.tensor(sample, dtype=torch.float32), torch.tensor(label, dtype=torch.long)


Epoch 1, Train Loss: 1.5909, Train Acc: 43.49%, Test Loss: 1.4309, Test Acc: 49.2%
Epoch 2, Train Loss: 1.3737, Train Acc: 51.31%, Test Loss: 1.3606, Test Acc: 52.8%
Epoch 3, Train Loss: 1.2425, Train Acc: 56.04%, Test Loss: 1.3414, Test Acc: 52.8%
Epoch 4, Train Loss: 1.1370, Train Acc: 59.90%, Test Loss: 1.3258, Test Acc: 53.4%
Epoch 5, Train Loss: 1.0407, Train Acc: 63.52%, Test Loss: 1.3432, Test Acc: 53.0%
Epoch 6, Train Loss: 0.9485, Train Acc: 67.09%, Test Loss: 1.3591, Test Acc: 53.6%
Epoch 7, Train Loss: 0.8579, Train Acc: 70.53%, Test Loss: 1.3855, Test Acc: 54.8%
Epoch 8, Train Loss: 0.7727, Train Acc: 73.72%, Test Loss: 1.4505, Test Acc: 53.0%
Epoch 9, Train Loss: 0.6964, Train Acc: 76.76%, Test Loss: 1.5497, Test Acc: 52.2%
Epoch 10, Train Loss: 0.6284, Train Acc: 79.15%, Test Loss: 1.6411, Test Acc: 52.0%


In [112]:
# Training MLPWithDropout
# your code here
model_with_Dropout = MLPWithDropout().to(device)
train_with_test(model_with_Dropout, trainloader, testloader, num_epochs)

  return torch.tensor(sample, dtype=torch.float32), torch.tensor(label, dtype=torch.long)


Epoch 1, Train Loss: 1.8065, Train Acc: 35.53%, Test Loss: 1.5994, Test Acc: 44.4%
Epoch 2, Train Loss: 1.6568, Train Acc: 41.06%, Test Loss: 1.5426, Test Acc: 44.8%
Epoch 3, Train Loss: 1.6018, Train Acc: 43.50%, Test Loss: 1.4990, Test Acc: 47.0%
Epoch 4, Train Loss: 1.5621, Train Acc: 44.94%, Test Loss: 1.4766, Test Acc: 51.2%
Epoch 5, Train Loss: 1.5308, Train Acc: 45.91%, Test Loss: 1.4633, Test Acc: 49.8%
Epoch 6, Train Loss: 1.5114, Train Acc: 46.84%, Test Loss: 1.4397, Test Acc: 48.6%
Epoch 7, Train Loss: 1.4801, Train Acc: 47.90%, Test Loss: 1.4349, Test Acc: 48.2%
Epoch 8, Train Loss: 1.4618, Train Acc: 48.35%, Test Loss: 1.4153, Test Acc: 49.2%
Epoch 9, Train Loss: 1.4443, Train Acc: 48.89%, Test Loss: 1.4274, Test Acc: 50.6%
Epoch 10, Train Loss: 1.4255, Train Acc: 49.70%, Test Loss: 1.4048, Test Acc: 50.2%


In [113]:
# Training MLPWithBatchNormDropout
# your code here
model_with_BN_Dropout = MLPWithBatchNormDropout().to(device)
train_with_test(model_with_BN_Dropout, trainloader, testloader, num_epochs)

  return torch.tensor(sample, dtype=torch.float32), torch.tensor(label, dtype=torch.long)


Epoch 1, Train Loss: 1.7823, Train Acc: 35.85%, Test Loss: 1.5633, Test Acc: 44.6%
Epoch 2, Train Loss: 1.6027, Train Acc: 42.60%, Test Loss: 1.4736, Test Acc: 48.2%
Epoch 3, Train Loss: 1.5246, Train Acc: 45.50%, Test Loss: 1.4158, Test Acc: 50.0%
Epoch 4, Train Loss: 1.4729, Train Acc: 47.28%, Test Loss: 1.3857, Test Acc: 51.2%
Epoch 5, Train Loss: 1.4282, Train Acc: 48.96%, Test Loss: 1.3694, Test Acc: 50.8%
Epoch 6, Train Loss: 1.3981, Train Acc: 50.14%, Test Loss: 1.3248, Test Acc: 53.6%
Epoch 7, Train Loss: 1.3688, Train Acc: 51.08%, Test Loss: 1.3226, Test Acc: 52.8%
Epoch 8, Train Loss: 1.3413, Train Acc: 52.02%, Test Loss: 1.3009, Test Acc: 52.8%
Epoch 9, Train Loss: 1.3166, Train Acc: 52.89%, Test Loss: 1.2798, Test Acc: 52.2%
Epoch 10, Train Loss: 1.2878, Train Acc: 54.06%, Test Loss: 1.2813, Test Acc: 52.4%


#### Question: Analysis the results and find out whether any of the above techniques is useful for training this MLP for CIFAR-10 classification. Explain why/how they work?

#### Your answer:

Regard `SimpleMLP()` as the baseline, it can be seen from the training output above that the three advanced models did   improve the classification result.

- As for `MLPWithBatchNorm()` , 2 `BatchNorm` layers are incorporated. By standardizing the output of the hidden layer, training was accelerated and the model stability can be improved. We can see that after 10 epochs of training, both Train and Test loss were lower, both accuracy were higher compared to the `SimpleMLP()` (those improvements were slight, though).
- As for `MLPWithBatchNorm()` , `Dropout` is used after activation. By randomly discarding some neurons, overfitting can be effectively reduced and the model's generalization ability can be increased. We can see that the Train accuracy was obviously lower than `SimpleMLP()` , indicating less overfitting. Also, the Test accuracy was higher, slightly though.
- As for `MLPWithBatchNormDropout()` , it combines the advantages of `MLPWithBatchNorm()` and `MLPWithBatchNorm()` , so its training result showed a better generalization ability on Test dataloader and a lower overfitting level on Train dataloader.

### Step4: Design a MLP that has deeper layers with Residual Blocks

In [114]:
# A deeper MLP with 50 layers
class DeepMLP(nn.Module):
    def __init__(self, input_size=32*32*3, num_classes=10, hidden_size=512, num_layers=50):
        super(DeepMLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.bn1 = nn.BatchNorm1d(hidden_size)
        
        # Stack multiple hidden layers
        self.hidden_layers = nn.ModuleList([
            nn.Linear(hidden_size, hidden_size) for _ in range(num_layers)
        ])
        self.bn_layers = nn.ModuleList([
            nn.BatchNorm1d(hidden_size) for _ in range(num_layers)
        ])
        
        self.fc_out = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        
        x = x.view(-1, 32*32*3)
        x = F.relu(self.bn1(self.fc1(x)))

        # Pass through deep hidden layers
        for i in range(len(self.hidden_layers)):
            x = F.relu(self.bn_layers[i](self.hidden_layers[i](x)))
        
        # Output layer for classification
        x = self.fc_out(x)
        
        return x

In [115]:
# Design a residual block
# input -> Linear -> batchnorm -> activation -> dropout -> Linear -> batchnorm -> skip connection -> activation -> output
class ResidualMLPBlock(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_prob=0.5):
        # your code here
        pass

# Design MLP with residual blocks
# Use nn.ModuleList 
# input -> Linear -> batchnorm -> activation -> residual layers -> classifacation layer
class DeepResidualMLP(nn.Module):
    def __init__(self, input_size=32*32*3, num_classes=10, hidden_size=512, num_layers=50, dropout_prob=0.5):
        # your code here
        pass


In [116]:
# Training DeepMLP
# your code here


# Save the losses during training and testing and plot them using plot_loss()
# your code here

In [117]:
# Training DeepMLP
# your code here


# Save the losses during training and testing and plot them using plot_loss()
# your code here

#### Bonus: Try to see the gradient flow when training the above networks