# Optimizer

## Initialize the dataset

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
import tqdm

def load_mnist_data(root_path='./data', batch_size=4, num_workers=2):
    """Build the MNIST train and test data loaders.

    Both datasets are downloaded to ``root_path`` if necessary and share the
    same preprocessing: conversion to a tensor followed by normalization
    with mean 0.5 and standard deviation 0.5.

    Args:
        root_path: Directory in which the dataset is stored / downloaded.
        batch_size: Number of samples per batch for both loaders.
        num_workers: Number of worker processes used by each loader.

    Returns:
        A ``(trainloader, testloader)`` tuple. Only the training loader
        shuffles its samples.
    """
    # Normalize documents per-channel *sequences*; `(0.5)` is just the float
    # 0.5 in parentheses, so spell out the 1-tuples explicitly.
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ])

    trainset = torchvision.datasets.MNIST(
        root=root_path, train=True, download=True, transform=transform
    )
    trainloader = torch.utils.data.DataLoader(
        trainset, batch_size=batch_size, shuffle=True, num_workers=num_workers
    )

    testset = torchvision.datasets.MNIST(
        root=root_path, train=False, download=True, transform=transform
    )
    testloader = torch.utils.data.DataLoader(
        testset, batch_size=batch_size, shuffle=False, num_workers=num_workers
    )

    return trainloader, testloader

## Define the neural network structure

In [None]:
import torch.nn as nn

# Fully-connected classifier: 784 input features -> 10 output scores.
# Each hidden Linear layer is followed by a ReLU. Without a non-linearity in
# between, consecutive Linear layers compose into a single affine map, so the
# hidden layers would add parameters but no representational power.
model = nn.Sequential(
    nn.Linear(784, 32),  # input layer (do not change the in_features size of this layer - we need it later)
    nn.ReLU(),
    nn.Linear(32, 32),
    nn.ReLU(),
    # your layers
    nn.Linear(32, 10)  # you can change the in_features of this layer but keep the out_features at size 10 here - we need it later
)

## Training loop

In [None]:
from typing import Callable
from torch.optim import Optimizer

def train_model(
    model: nn.Module, loss_fn: Callable, optimizer: Optimizer,
    batch_size: int = 4, epochs: int = 10, log_every: int = 50
) -> list:
    """Train ``model`` on the MNIST training set and report progress per epoch.

    Every image batch is flattened to ``(batch, -1)`` before the forward
    pass. After each epoch the mean batch loss and the fraction of correctly
    classified samples are printed.

    Args:
        model: Network to train; it is switched to training mode.
        loss_fn: Callable invoked as ``loss_fn(outputs, targets)`` that
            returns a scalar tensor supporting ``backward()``.
        optimizer: Optimizer that updates the parameters of ``model``.
        batch_size: Batch size passed to ``load_mnist_data``.
        epochs: Number of passes over the training data.
        log_every: Number of iterations averaged into one entry of the
            returned loss history.

    Returns:
        A list of floats: the average loss of each consecutive window of
        ``log_every`` iterations (a shorter trailing window is included).

    Raises:
        ValueError: If ``log_every`` is smaller than 1.
    """
    if log_every < 1:
        raise ValueError(f'log_every must be a positive integer, got {log_every}')

    # we only consider the mnist train data for this example
    train_loader, _ = load_mnist_data(batch_size=batch_size)

    # Send the batches to wherever the model already lives. Picking CUDA
    # unconditionally while the model stays on the CPU makes the forward
    # pass fail with a device mismatch.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    criterion = loss_fn

    model.train()

    loss_history = []
    window_loss = 0.0
    window_steps = 0

    for epoch in range(epochs):
        running_loss = 0.0
        # Kept as a tensor on `device` so counting hits does not force an
        # extra host/device synchronisation on every iteration.
        correct = torch.zeros((), dtype=torch.long, device=device)
        seen = 0
        for imgs, targets in tqdm.tqdm(train_loader, desc=f'Training iteration {epoch + 1}'):
            imgs, targets = imgs.to(device=device), targets.to(device=device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(imgs.reshape(imgs.shape[0], -1))
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # accumulate statistics
            batch_loss = loss.item()
            running_loss += batch_loss
            window_loss += batch_loss
            window_steps += 1
            if window_steps == log_every:
                loss_history.append(window_loss / log_every)
                window_loss = 0.0
                window_steps = 0

            # Count correct predictions per sample (not per batch) so that a
            # smaller final batch does not skew the epoch accuracy.
            max_outputs = torch.max(outputs.detach(), dim=1).indices
            correct += (max_outputs == targets).sum()
            seen += targets.shape[0]

        epoch_accuracy = correct.item() / seen
        print(f'Epoch {epoch + 1} finished with loss: {running_loss / len(train_loader):.3f} and accuracy {epoch_accuracy:.3f}')

    # Do not drop the iterations of an incomplete final window.
    if window_steps:
        loss_history.append(window_loss / window_steps)

    return loss_history

## Define the optimizers

In [59]:
from torch.optim import SGD, Adagrad, RMSprop, Adam

# Your code here

## Run the training loop with different optimizers

In [61]:
# Store the average loss of every 50th iteration in some iterable structure, e.g. dictionaries
# Create a mapping between the optimizer and the loss, so that you know which losses were achieved for which optimizer
# Update the method train_model accordingly to save the losses

# Update the 'train_model' method above and run the method with different optimizers


# Your code here

## Visualize the results of the optimizers / losses

In [60]:
# Plot the results with matplotlib and show the difference in convergence speed with different optimizers.
# Plot the decreasing loss of each model with each optimizer

# Your code here