In [None]:
import torch
from torch import nn
from torch import optim
from torch.utils.data import random_split, DataLoader
from torchvision import datasets, transforms
from torchvision.transforms import ToTensor

In [None]:
# Smoke test: this only succeeds when a CUDA GPU runtime (e.g. a T4) is attached
torch.randn(5).to("cuda")

tensor([ 1.0783, -1.1575,  0.0319,  0.0214,  0.1593], device='cuda:0')

In [None]:
# Load the MNIST training split and carve a validation subset out of it
train_data = datasets.MNIST(root='./data', train=True, download=True, transform=transforms.ToTensor())
# test_data = datasets.MNIST(root='./data', train=False, download=True, transform=transforms.ToTensor())

# Seed the split so the same 55,000 / 5,000 partition is produced on every run
train, val = random_split(train_data, [55000, 5000], generator=torch.Generator().manual_seed(42))
# Reshuffle the training samples each epoch so SGD does not see the same batch order every time;
# the validation order does not affect the metrics, so it stays unshuffled
train_loader = DataLoader(train, batch_size=32, shuffle=True)
val_loader = DataLoader(val, batch_size=32)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9.91M/9.91M [00:00<00:00, 17.8MB/s]


Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28.9k/28.9k [00:00<00:00, 504kB/s]


Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1.65M/1.65M [00:00<00:00, 4.50MB/s]


Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 404: Not Found

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4.54k/4.54k [00:00<00:00, 5.12MB/s]

Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw






In [None]:
# Define model: a plain fully-connected classifier, 784 input pixels -> 10 class logits
model = nn.Sequential(
    nn.Linear(28 * 28, 64),
    nn.ReLU(),
    nn.Linear(64, 64),
    nn.ReLU(),
    nn.Linear(64, 64),
    nn.ReLU(), # without a non-linearity here, this layer and the final one collapse into a single affine map at eval time
    # Add the stuff below for a 'fancy' model
    nn.Dropout(0.1), # randomly zeroes 10% of activations during training to curb overfitting
    nn.Linear(64, 10) # one output logit per digit class (0-9)
)

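Residual (skip) connections create a direct path between a layer x and a later layer x + c by adding the earlier layer's output to the later layer's output. (In the related highway networks the shortcut is gated, and the gate biases are initialized to negative numbers so that the network initially favors passing its input straight through.)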

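Residuals are useful because they let later layers reuse the features computed by earlier layers, so the model can extract more information from the original data instead of losing it as it passes through the network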

Residuals mitigate the vanishing gradient problem, which is when the gradients become really small as they are propagated back through many layers, so the early layers barely learn

So, residuals basically help us create deeper ML models

In [None]:
# Define a more flexible model using residuals
class ResNet(nn.Module):
    """Small fully-connected classifier with a single skip connection.

    Maps a flattened 28 x 28 input (784 features) to 10 logits. The
    activations of the two 64-unit ReLU layers are summed, passed through
    dropout, and projected to the output layer.
    """

    def __init__(self):
        super().__init__()
        self.l1 = nn.Linear(28 * 28, 64)  # input projection
        self.l2 = nn.Linear(64, 64)       # hidden layer bypassed by the skip connection
        self.l3 = nn.Linear(64, 10)       # output layer
        self.do = nn.Dropout(0.1)

    def forward(self, x):
        """Return the 10 unnormalized class scores (logits) for each row of `x`."""
        first = torch.relu(self.l1(x))
        second = torch.relu(self.l2(first))
        # Residual (skip) connection: the first layer's activations are added
        # to the second layer's before dropout is applied.
        skip_sum = second + first
        return self.l3(self.do(skip_sum))

model = ResNet().to("cuda") # build the network first, then transfer its parameters to the GPU

In [None]:
# Define optimizer: plain stochastic gradient descent over every model parameter
optimizer = optim.SGD(params=model.parameters(), lr=0.01)

In [None]:
# Objective function: cross-entropy between the model's logits and the integer class labels,
# averaged over the batch
loss = torch.nn.CrossEntropyLoss(reduction="mean")

In [None]:
# Training and validation loops
nb_epochs = 5
# Run each batch on whichever device the model's parameters already live on
device = next(model.parameters()).device

for epoch in range(nb_epochs): # full pass through the dataset
    # Training
    losses = list() # per-batch training losses for this epoch
    accuracies = list() # per-batch training accuracies for this epoch
    model.train() # put the model in training mode (needed because the model uses dropout)

    for batch in train_loader:
        x, y = batch

        # x is b x 1 x 28 x 28; flatten every image into a single vector
        b = x.size(0)
        x = x.view(b, -1).to(device) # -1 tells view() to infer the remaining dimension (1 * 28 * 28 = 784)
        y = y.to(device) # labels must be on the same device as the logits

        # Forward; keep the logits so the accuracy below reuses this same pass
        logits = model(x)
        l = loss(logits, y)

        # Clear the gradients left over from the previous batch
        optimizer.zero_grad()

        # Accumulate the partial derivatives of the loss wrt params;
        # backward() ADDS to the stored .grad values, hence the zero_grad() above
        l.backward()

        # Step in the opposite direction of the gradient: the gradient points
        # towards increasing loss, and we want to decrease it
        optimizer.step()

        # Record plain Python floats; .item() avoids keeping the autograd graph
        # of every batch alive for the whole epoch
        losses.append(l.item())
        # Accuracy comes from the logits that produced the loss, rather than a
        # second forward pass after the update with a different dropout mask
        accuracies.append(logits.detach().argmax(dim=1).eq(y).float().mean().item())

    print(f'Epoch {epoch + 1}', end = ', ')
    print(f'Training Loss: {torch.tensor(losses).mean():.2f}', end = ', ')
    print(f'Training Accuracy: {torch.tensor(accuracies).mean():.2f}')

    # Validation
    model.eval() # put the model in evaluation mode (disables dropout)
    losses = list() # per-batch validation losses for this epoch
    accuracies = list() # per-batch validation accuracies for this epoch

    for batch in val_loader:
        x, y = batch

        # x is b x 1 x 28 x 28; flatten every image into a single vector
        b = x.size(0)
        x = x.view(b, -1).to(device)
        y = y.to(device)

        # Single forward pass with gradient tracking disabled; both the loss and
        # the accuracy are derived from it
        with torch.no_grad():
            logits = model(x)
            l = loss(logits, y)

        losses.append(l.item())
        accuracies.append(logits.argmax(dim=1).eq(y).float().mean().item())

    print(f'Epoch {epoch + 1}', end = ', ')
    print(f'Validation Loss: {torch.tensor(losses).mean():.2f}', end = ', ')
    print(f'Validation Accuracy: {torch.tensor(accuracies).mean():.2f}')

Epoch 1, Training Loss: 0.21, Training Accuracy: 0.94
Epoch 1, Validation Loss: 0.20, Validation Accuracy: 0.94
Epoch 2, Training Loss: 0.19, Training Accuracy: 0.95
Epoch 2, Validation Loss: 0.19, Validation Accuracy: 0.95
Epoch 3, Training Loss: 0.18, Training Accuracy: 0.95
Epoch 3, Validation Loss: 0.17, Validation Accuracy: 0.95
Epoch 4, Training Loss: 0.16, Training Accuracy: 0.96
Epoch 4, Validation Loss: 0.16, Validation Accuracy: 0.95
Epoch 5, Training Loss: 0.15, Training Accuracy: 0.96
Epoch 5, Validation Loss: 0.15, Validation Accuracy: 0.96


# Explanation of Code (From Perplexity)

**Overall Goal:**

The code trains a neural network to classify handwritten digits from the MNIST dataset. MNIST contains images of digits 0 through 9. The network learns to take an image as input and output a prediction of which digit it represents.

**Key Components:**

1.  **Data Loading and Preprocessing:**
    *   The code uses `torchvision.datasets.MNIST` to download and load the MNIST dataset.
    *   It splits the training data into a training set (55,000 images) and a validation set (5,000 images).  The validation set is used to monitor the model's performance on held-out data during training, which helps detect overfitting.
    *   `DataLoader` is used to create batches of data.  This is more efficient than processing images one at a time.

2.  **Model Definition (ResNet):**
    *   The code defines a `ResNet` class, which is a type of neural network architecture that uses residual connections.
    *   **Layers:** The `ResNet` consists of linear layers (`nn.Linear`), ReLU activation functions (`nn.functional.relu`), and a dropout layer (`nn.Dropout`).
        *   The first linear layer (`l1`) takes the flattened image (28x28 pixels = 784 inputs) and transforms it to 64 features.
        *   The second linear layer (`l2`) transforms the 64 features to another 64 features.
        *   The third linear layer (`l3`) transforms the 64 features to 10 outputs (one for each digit 0-9).  These outputs are often called "logits".
    *   **Residual Connection:** The line `do = self.do(h2 + h1)` is the key part.  It adds the output of the first layer (`h1`) to the output of the second layer (`h2`) *before* applying dropout (`do`).  This creates a shortcut connection (a "highway") that allows information from earlier layers to flow more easily to later layers.

3.  **Why Residuals?**
    *   **Vanishing Gradients:** Residual connections help mitigate the vanishing gradient problem, which can occur in deep networks.  When gradients become very small, it becomes difficult for earlier layers to learn. Residuals provide an alternate path for gradients to flow.
    *   **Easier Optimization:** Residual connections can make the optimization landscape (the "shape" of the loss function) smoother, making it easier for the optimizer to find a good solution.
    *   **Deeper Models:** They allow you to train deeper networks more effectively.

4.  **Loss Function and Optimizer:**
    *   **Loss:** `nn.CrossEntropyLoss()` is the loss function.  It takes the model's raw logits, applies a log-softmax internally, and measures how far the resulting predicted probabilities are from the true labels.
    *   **Optimizer:** `torch.optim.SGD` is the optimizer (Stochastic Gradient Descent).  It's responsible for updating the model's parameters (weights and biases) to minimize the loss function. The learning rate (`lr=1e-2`) controls the step size during optimization.

5.  **Training Loop:**
    *   The code iterates through the training data for a specified number of epochs (`nb_epochs = 5`).
    *   **Forward Pass:** For each batch of images:
        *   The input images (`x`) are reshaped (flattened into a vector).
        *   The images are moved to the GPU (using `.cuda()`).
        *   The model makes a prediction (outputs "logits").
        *   The loss is calculated.
    *   **Backward Pass (Backpropagation):**
        *   `optimizer.zero_grad()` clears the gradients from the previous iteration.
        *   `l.backward()` calculates the gradients of the loss function with respect to the model's parameters.
        *   `optimizer.step()` updates the model's parameters using the calculated gradients.
    *   **Validation:** After each epoch, the model is evaluated on the validation set *without* updating the parameters. This gives an estimate of how well the model generalizes to unseen data.

6.  **Evaluation:**
    *   During validation, `torch.no_grad()` is used to disable gradient calculations, which saves memory and speeds up the process.
    *   The code calculates the validation loss and accuracy.

7.  **Output:**
    *   The code prints the training loss/accuracy and validation loss/accuracy for each epoch. This allows you to monitor the training process and see if the model is improving.

**In Summary:**

This code implements a basic image classification task using a ResNet-like neural network in PyTorch. It loads the MNIST dataset, defines a neural network architecture with residual connections, trains the network using stochastic gradient descent, and evaluates its performance on a validation set. The goal is to learn a model that can accurately classify handwritten digits.