# Optimizer

## Initialize the dataset

In [1]:
import copy
import torch
import torchvision
import torchvision.transforms as transforms
import tqdm

def load_mnist_data(root_path='./data', batch_size=4):
    transform = transforms.Compose(
        [transforms.ToTensor(),
        transforms.Normalize((0.5), (0.5))]
    )

    trainset = torchvision.datasets.MNIST(root=root_path, train=True, download=True, transform=transform)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2)

    testset = torchvision.datasets.MNIST(root=root_path, train=False, download=True, transform=transform)
    testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=2)

    return trainloader, testloader

## Define the neural network structure

In [10]:
import torch.nn as nn



model = nn.Sequential(
    nn.Linear(784, 32),  # input layer (do not change the in_features size of this layer - we need it later)
    #nn.ReLU(),
    nn.Linear(32, 32),
    #nn.ReLU(),
    nn.Linear(32, 10)  # you can change the in_features of this layer but let the out_features at size 10 here - we need it layer
)

## Training loop

In [15]:
from typing import Callable
from torch.optim import Optimizer

avg_losses = {'iterations': [], 'loss': []}

def train_model(
    model: nn.Module, loss_fn: Callable, optimizer: Optimizer,
    batch_size: int = 4, epochs: int = 10
):
    # we only consider the mnist train data for this example
    train_loader, _ = load_mnist_data(batch_size=batch_size)

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    criterion = loss_fn

    avg_losses = {'iterations': [], 'loss': []}

    for epoch in range(epochs):
        running_loss = 0.0
        running_accuracy = []
        for i, (imgs, targets) in enumerate(tqdm.tqdm(train_loader, desc=f'Training iteration {epoch + 1}')):
            imgs, targets = imgs.to(device=device), targets.to(device=device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(imgs.reshape(imgs.shape[0], -1))
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()

            # Calculate the Accuracy (how many of all samples are correctly classified?)
            max_outputs = torch.max(outputs, dim=1).indices
            accuracy = (max_outputs.detach() == targets.detach()).to(dtype=torch.float32).mean()
            running_accuracy.append(accuracy)
#this part is for printing avg values
        if (i + 1) % 50 == 0:
            avg_loss = running_loss / 50
            avg_losses['iterations'].append(i + 1)
            avg_losses['loss'].append(avg_loss)
            print(f'Epoch {epoch + 1}, Iteration {i + 1}, Avg Loss: {avg_loss:.3f}')
            running_loss = 0.0

        print(f'Epoch {epoch + 1} finished with loss: {running_loss / len(train_loader):.3f} and accuracy {torch.tensor(running_accuracy).mean():.3f}')

## Define the optimizers

In [16]:
from torch.optim import SGD, Adagrad, RMSprop, Adam
loss_fn = nn.CrossEntropyLoss()

epochs = 5

optimizers = [SGD, Adagrad, RMSprop, Adam]

for optimizer_class in optimizers:
    current_model = copy.deepcopy(model)
    optimizer = optimizer_class(current_model.parameters(), lr = 0.01)

train_model(current_model, loss_fn, optimizer, epochs = epochs,)



Training iteration 1: 100%|██████████| 15000/15000 [00:29<00:00, 516.26it/s]


Epoch 1, Iteration 15000, Avg Loss: 626.353
Epoch 1 finished with loss: 0.000 and accuracy 0.750


Training iteration 2:  36%|███▌      | 5434/15000 [00:12<00:16, 578.60it/s]

## Run the training loop with different optimizers

''I did it in previous step with for loop''

In [None]:
# Store the average loss of every 50th iteration in some iterable structure, e. g. dictionaries
# Create a mapping between the optimizer and the loss, so that you know which losses where achieved for which optimizer
# Update the method train_model accordingly to save the losses

# Update the 'train_model' method above and run the method with different optimizers


avg_losses_sgd = train_model(model, loss_fn, SGD, epochs=5)
avg_losses_adagrad = train_model(model, loss_fn, Adagrad, epochs=5)
avg_losses_rmsprop = train_model(model, loss_fn, RMSprop, epochs=5)
avg_losses_adam = train_model(model, loss_fn, Adam, epochs=5)

print("SGD Losses:", avg_losses_sgd)
print("Adagrad Losses:", avg_losses_adagrad)
print("RMSprop Losses:", avg_losses_rmsprop)
print("Adam Losses:", avg_losses_adam)

## Visualize the results of the optimizers / losses

In [None]:
# Plot the results with matplotlib and show the difference in convergence speed with different optimizers.
# Plot the decreasing loss of each model with each optimizer
import matplotlib.pyplot as plt

# Function to plot the results
def plot_results(avg_losses_list, optimizer_name):
    iterations = avg_losses['iterations']
    losses = avg_losses['loss']

    
    plt.plot(iterations, avg_losses, label=optimizer_name)
    plt.xlabel('Iteration')
    plt.ylabel('Average Loss')
    plt.title(f'Convergence with {optimizer_name}')
    plt.legend()
    
plt.figure(figsize=(12, 8))

plot_results(avg_losses_sgd, 'SGD')
plot_results(avg_losses_adagrad, 'Adagrad')
plot_results(avg_losses_rmsprop, 'RMSprop')
plot_results(avg_losses_adam, 'Adam')

plt.show()

NameError: name 'avg_losses_sgd' is not defined

<Figure size 1200x800 with 0 Axes>