In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torch.nn.functional as F
import matplotlib.pyplot as plt

# Step 1: Load CIFAR-10 and wrap both splits in dataloaders

# Settings shared by the training and test pipelines
channel_mean = (0.5, 0.5, 0.5)
channel_std = (0.5, 0.5, 0.5)
loader_kwargs = dict(batch_size=128, num_workers=2)

# Training pipeline: augmentation first, then tensor conversion and normalization
train_transforms = transforms.Compose([
    transforms.RandomCrop(32, padding=4),              # pad 4 px per side, then take a random 32x32 crop
    transforms.RandomHorizontalFlip(),                 # random horizontal flip (library-default probability)
    transforms.ToTensor(),                             # image -> PyTorch tensor
    transforms.Normalize(channel_mean, channel_std),   # per-channel (x - mean) / std
])

# Test pipeline: no augmentation, same tensor conversion and normalization
test_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
])

# Both splits are stored under ./data and downloaded when not already present
trainset = datasets.CIFAR10(root='./data', train=True, download=True, transform=train_transforms)
testset = datasets.CIFAR10(root='./data', train=False, download=True, transform=test_transforms)

# Only the training loader shuffles; the test loader keeps the dataset order
trainloader = torch.utils.data.DataLoader(trainset, shuffle=True, **loader_kwargs)
testloader = torch.utils.data.DataLoader(testset, shuffle=False, **loader_kwargs)

In [None]:
class Block(nn.Module):
    """Attention-weighted sum of ``k`` parallel 3x3 convolutions.

    A per-sample coefficient vector is computed from the globally
    average-pooled input (linear layer followed by ReLU). It is split into
    ``k`` chunks of ``out_channels`` values; each chunk scales the output of
    one convolution channel-wise, and the ``k`` scaled outputs are summed.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels produced by every convolution.
        k: Number of parallel convolutions (and of coefficient chunks).
    """

    def __init__(self, in_channels, out_channels, k):
        super(Block, self).__init__()
        # Linear layer producing out_channels coefficients for each of the k convolutions
        self.linear = nn.Linear(in_channels, out_channels*k)
        # k parallel 3x3 convolutions; padding=1 keeps the spatial size
        self.convs = nn.ModuleList([nn.Conv2d(in_channels, out_channels, 3, padding=1) for _ in range(k)])
        # ReLU applied to the raw coefficients
        self.activation = nn.ReLU()
        # Global average pooling down to a 1x1 map per channel
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

    def forward(self, x):
        """Apply the block to a batched input of shape (N, in_channels, H, W).

        Returns:
            Tensor of shape (N, out_channels, H, W).
        """
        # Pool to (N, C, 1, 1) and flatten to (N, C). flatten(1) is used instead
        # of squeeze() because squeeze() would also drop the batch dimension for
        # a batch of one (or the channel dimension when in_channels == 1).
        pooled = self.avgpool(x).flatten(1)
        # Coefficients of shape (N, out_channels * k)
        a = self.activation(self.linear(pooled))
        k = len(self.convs)
        # Split the coefficients into k chunks along dim 1; each chunk is
        # broadcast over H and W and scales the matching convolution's output.
        o = sum(ai.unsqueeze(2).unsqueeze(3) * conv(x) for ai, conv in zip(a.chunk(k, 1), self.convs))
        return o

class Net(nn.Module):
    """Stack of attention ``Block`` stages followed by an MLP classifier.

    Each stage applies, in this order: Block -> ReLU -> 2x2 max pooling ->
    BatchNorm2d. The output of the last stage is globally average-pooled,
    flattened and passed through the classifier
    (Linear -> ReLU -> Linear -> ReLU -> Dropout -> Linear).

    Args:
        num_blocks: Number of stages to build. Only the first ``num_blocks``
            entries of ``block_out_channels`` are used.
        in_channels: Number of channels of the network input.
        out_channels: Number of output classes (size of the returned logits).
        block_out_channels: Output channel count of each stage.
        block_k: Number of parallel convolutions inside every Block.
        dropout_rate: Dropout probability used in the classifier.

    Raises:
        IndexError: If ``num_blocks`` exceeds ``len(block_out_channels)``.
    """

    def __init__(self, num_blocks=3, in_channels=3, out_channels=10, block_out_channels=[256, 1024, 4096], block_k=4, dropout_rate=0.2):
        super(Net, self).__init__()
        # Number of blocks in the network
        self.num_blocks = num_blocks
        # Number of input channels for the first block
        self.in_channels = in_channels
        # Number of output classes
        self.out_channels = out_channels
        # Output channels per block. Copied so the instance never aliases the
        # shared default list (or a list owned by the caller).
        self.block_out_channels = list(block_out_channels)
        # Value of k for each block
        self.block_k = block_k
        # Dropout rate for the classifier
        self.dropout_rate = dropout_rate
        # Blocks: the first consumes the network input, every later one the previous block's output
        self.blocks = nn.ModuleList([Block(self.in_channels if i==0 else self.block_out_channels[i-1], self.block_out_channels[i], self.block_k) for i in range(num_blocks)])
        # One batch normalization layer per block
        self.bn = nn.ModuleList([nn.BatchNorm2d(self.block_out_channels[i]) for i in range(num_blocks)])
        # One ReLU per block
        self.relu = nn.ModuleList([nn.ReLU() for _ in range(num_blocks)])
        # One 2x2 max pooling layer per block
        self.pool = nn.ModuleList([nn.MaxPool2d(2, 2) for _ in range(num_blocks)])
        # The classifier must match the width of the last block that is actually
        # built, which is not block_out_channels[-1] when num_blocks is smaller
        # than the list. With no blocks at all the pooled input itself is classified.
        feature_dim = self.block_out_channels[num_blocks - 1] if num_blocks > 0 else self.in_channels
        self.classifier = nn.Sequential(
            nn.Linear(feature_dim, 1024),
            nn.ReLU(),
            nn.Linear(1024, 256),
            nn.ReLU(),
            nn.Dropout(self.dropout_rate),
            nn.Linear(256, self.out_channels)
        )

    def forward(self, x):
        """Compute class logits for a batched input of shape (N, in_channels, H, W).

        Returns:
            Tensor of shape (N, out_channels).
        """
        # Apply each stage: block, activation, spatial down-sampling, normalization
        for i in range(self.num_blocks):
            x = self.blocks[i](x)
            x = self.relu[i](x)
            x = self.pool[i](x)
            x = self.bn[i](x)
        # Global average pooling to a 1x1 map per channel
        x = F.adaptive_avg_pool2d(x, (1,1))
        # Flatten to (N, channels)
        x = x.view(x.size(0), -1)
        # Apply the classifier layers
        out = self.classifier(x)
        return out

# Instantiate the network, the classification loss and the Adam optimizer
model = Net()
loss = nn.CrossEntropyLoss()
optimizer = optim.Adam(
    params=model.parameters(),
    lr=1e-3,
    weight_decay=1e-4,
)

def train(net, train_iter, test_iter, loss, num_epochs, optimizer, device):
    """Train a model and evaluate it on the test data after every epoch.

    Args:
        net: Model to train; it is moved to ``device``.
        train_iter: Iterable yielding ``(X, y)`` training batches.
        test_iter: Iterable yielding ``(X, y)`` batches, passed to
            ``evaluate_accuracy`` after every epoch.
        loss: Callable ``loss(y_hat, y)`` returning a scalar tensor.
        num_epochs: Number of passes over ``train_iter``.
        optimizer: Optimizer that updates the parameters of ``net``.
        device: Device on which the model and the batches are placed.

    Returns:
        Tuple ``(train_loss_list, train_acc_list, test_acc_list)`` with one
        entry per epoch. The training loss of an epoch is the mean of its
        per-batch loss values.

    Raises:
        ValueError: If ``train_iter`` yields no batches during an epoch.
    """
    # Move the model to the specified device
    net.to(device)
    # Per-epoch histories
    train_loss_list = []
    train_acc_list = []
    test_acc_list = []
    for epoch in range(num_epochs):
        # evaluate_accuracy() leaves the model in eval mode, so training mode
        # has to be re-enabled at the start of every epoch (once is enough).
        net.train()
        train_loss = 0.0
        train_correct = 0.0
        train_total = 0.0
        num_batches = 0
        for X, y in train_iter:
            optimizer.zero_grad()
            # Move the data and labels to the specified device
            X, y = X.to(device), y.to(device)
            # Forward pass and loss
            y_hat = net(X)
            batch_loss = loss(y_hat, y)
            # Backward pass and parameter update
            batch_loss.backward()
            optimizer.step()
            # Running statistics; .item() returns plain Python numbers
            train_loss += batch_loss.item()
            train_correct += (y_hat.argmax(dim=1) == y).sum().item()
            train_total += y.size(0)
            num_batches += 1
        # Counting batches explicitly avoids relying on a loop variable that
        # does not exist when the loader is empty.
        if num_batches == 0:
            raise ValueError("train_iter yielded no batches; cannot train on an empty data loader")
        train_loss /= num_batches
        train_acc = train_correct / train_total
        train_loss_list.append(train_loss)
        train_acc_list.append(train_acc)
        # Compute and record the test accuracy for this epoch
        test_acc = evaluate_accuracy(net, test_iter, device)
        test_acc_list.append(test_acc)
        # Print the epoch number and the loss and accuracy values
        print(f"epoch {epoch + 1}, train loss {train_loss:.3f}, train acc {train_acc:.3f}, test acc {test_acc:.3f}")
    return train_loss_list, train_acc_list, test_acc_list

def evaluate_accuracy(net, data_iter, device=None):
    """Return the fraction of samples in ``data_iter`` that ``net`` classifies correctly.

    The predicted class of a sample is the argmax over dim 1 of ``net(X)``.

    Args:
        net: Model to evaluate. It is switched to evaluation mode and left in
            that mode when the function returns.
        data_iter: Iterable yielding ``(X, y)`` batches.
        device: Device the batches are moved to. When ``None`` and ``net`` is
            an ``nn.Module`` with at least one parameter, the device of its
            first parameter is used; otherwise the batches are not moved.

    Returns:
        Accuracy as a float: correct predictions divided by number of samples.

    Raises:
        ZeroDivisionError: If ``data_iter`` yields no samples.
    """
    # Use the device of the model if not specified. Only the first parameter
    # is inspected, and a parameter-free module simply leaves device as None.
    if device is None and isinstance(net, torch.nn.Module):
        first_param = next(net.parameters(), None)
        if first_param is not None:
            device = first_param.device
    # Set the model to evaluation mode
    net.eval()
    correct, total = 0.0, 0.0
    with torch.no_grad():
        for X, y in data_iter:
            # Move the data and labels to the target device, when one is known
            if device is not None:
                X, y = X.to(device), y.to(device)
            # Count the predictions that match the labels
            correct += (net(X).argmax(dim=1) == y).float().sum().item()
            total += y.size(0)
    return correct / total
    
# Number of training epochs
num_epochs = 20

# Prefer the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
if torch.cuda.is_available(): print(torch.cuda.get_device_name(0))

# Training is started by the single train(...) call whose return values are
# captured for plotting. Calling train(...) here as well would train the same
# model for twice num_epochs and throw away the first run's metrics, so the
# plotted "epoch 1" would really be epoch num_epochs + 1.

In [None]:
# Train the model and keep the per-epoch loss and accuracy histories
train_loss_list, train_acc_list, test_acc_list = train(model, trainloader, testloader, loss, num_epochs, optimizer, device)

# Plot the training loss (left axis) and the accuracies (right axis) per epoch.
# The x values are derived from the returned history so they always match its length.
epochs = range(1, len(train_loss_list) + 1)
fig, ax1 = plt.subplots()

color = 'tab:red'
ax1.set_xlabel('epoch')
ax1.set_ylabel('loss')
ax1.plot(epochs, train_loss_list, color=color, label='train loss')
ax1.tick_params(axis='y', labelcolor=color)

ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

color = 'tab:blue'
ax2.set_ylabel('accuracy')
ax2.plot(epochs, train_acc_list, color=color, label='train acc')
ax2.plot(epochs, test_acc_list, color='purple', label='test acc')
ax2.tick_params(axis='y', labelcolor=color)

fig.tight_layout()  # otherwise the right y-label is slightly clipped
# plt.legend() would only collect the lines of the current axes (ax2) and drop
# the 'train loss' entry, so the handles of both axes are merged explicitly.
loss_handles, loss_labels = ax1.get_legend_handles_labels()
acc_handles, acc_labels = ax2.get_legend_handles_labels()
ax2.legend(loss_handles + acc_handles, loss_labels + acc_labels)
plt.show()

# Evaluate the final testing accuracy of the model
final_test_acc = evaluate_accuracy(model, testloader, device)
print(f"Final testing accuracy: {final_test_acc:.3f}")