In [4]:
import torch
import matplotlib.pyplot as plt
import numpy as np

from utils import visualize_optimizer
from checker import test_optimizer

In [5]:
from typing import List

class Optimizer:
    """Base class for each optimizer"""

    def __init__(self, initial_params):
        # store model weights
        self.params = initial_params

    def step(self):
        """Updates the weights stored in self.params"""
        raise NotImplementedError()

    def zero_grad(self):
        """Torch accumulates gradients, so we need to clear them after every update"""
        for param in self.params:
            if param.grad is not None:
                param.grad.detach_()
                param.grad.zero_()


class GradientDescent(Optimizer):

    def __init__(self, initial_params: List[torch.tensor], learning_rate):
        super().__init__(initial_params)
        self.learning_rate = learning_rate

    @torch.no_grad()
    def step(self):
        for param in self.params:
            # Please note that it's important to change the parameters in-place (-=) so the original tensors are modified
            param -= self.learning_rate * param.grad

In [22]:
class Momentum(Optimizer):

    def __init__(self, initial_params, learning_rate, gamma):
        super().__init__(initial_params)

        self.learning_rate = learning_rate
        self.gamma = gamma
        # how do I know self.params is a list?
        self.deltas = [torch.full(self.params[i].size(), 0) for i in range(len(self.params))]

    @torch.no_grad()
    def step(self):
        i = 0
        for param in self.params:
            delta = self.gamma * self.deltas[i] + self.learning_rate * param.grad
            param -= delta
            self.deltas[i] = delta
            i += 1

test_optimizer(Momentum)

In [29]:
class Adagrad(Optimizer):

    def __init__(self, initial_params, learning_rate, epsilon):
        super().__init__(initial_params)
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.sums_of_sq_grads = [torch.full(self.params[i].size(), 0.) for i in range(len(self.params))]

    @torch.no_grad()
    def step(self):
        for i in range(len(self.params)):
            squared_grad = self.params[i].grad * self.params[i].grad
            self.sums_of_sq_grads[i] += squared_grad
            adapted_lr = self.learning_rate / torch.sqrt(self.sums_of_sq_grads[i]+self.epsilon)
            self.params[i] -= adapted_lr * self.params[i].grad

test_optimizer(Adagrad)

In [32]:
class RMSProp(Optimizer):

    def __init__(self, initial_params, learning_rate, gamma, epsilon):
        super().__init__(initial_params)
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.epsilon = epsilon
        self.moving_avg = [torch.full(self.params[i].size(), 0.) for i in range(len(self.params))]

    @torch.no_grad()
    def step(self):
        for i in range(len(self.params)):
            new_moving_avg = self.gamma * self.moving_avg[i] + (1 - self.gamma) * self.params[i].grad * self.params[i].grad
            self.moving_avg[i] = new_moving_avg
            adapted_lr = self.learning_rate / torch.sqrt(self.moving_avg[i]+self.epsilon)
            self.params[i] -= adapted_lr * self.params[i].grad


test_optimizer(RMSProp)

In [39]:
class Adadelta(Optimizer):
    def __init__(self, initial_params, gamma, epsilon):
        super().__init__(initial_params)
        self.gamma = gamma
        self.epsilon = epsilon
        self.moving_avg_grads = [torch.full(self.params[i].size(), 0.) for i in range(len(self.params))]
        self.moving_avg_deltas = [torch.full(self.params[i].size(), 0.) for i in range(len(self.params))]

    @torch.no_grad()
    def step(self):
        for i in range(len(self.params)):
            grad = self.params[i].grad
            self.moving_avg_grads[i] = self.gamma * self.moving_avg_grads[i] + (1 - self.gamma) * grad * grad
            e = self.epsilon
            delta = (torch.sqrt(self.moving_avg_deltas[i] + e) / torch.sqrt(self.moving_avg_grads[i] + e)) * self.params[i].grad
            self.params[i] -= delta
            self.moving_avg_deltas[i] = self.gamma * self.moving_avg_deltas[i] + (1 - self.gamma) * (delta * delta)

test_optimizer(Adadelta)

In [60]:
class Adam(Optimizer):

    def __init__(self, initial_params, learning_rate, beta1, beta2, epsilon):
        super().__init__(initial_params)
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = [torch.full(self.params[i].size(), 0.) for i in range(len(self.params))]
        self.v = [torch.full(self.params[i].size(), 0.) for i in range(len(self.params))]
        self.step_counter = 0

    @torch.no_grad()
    def step(self,):
        for i in range(len(self.params)):
            self.step_counter += 1
            grad = self.params[i].grad
            self.m[i] = self.beta1 * self.m[i] + (1 - self.beta1) * grad
            self.v[i] = self.beta2 * self.v[i] + (1 - self.beta2) * grad * grad
            m_hat = self.m[i] / (1 - self.beta1 ** self.step_counter)
            v_hat = self.v[i] / (1 - self.beta2 ** self.step_counter)
            self.params[i] -= self.learning_rate * (m_hat / (torch.sqrt(v_hat) + self.epsilon))

test_optimizer(Adam)

AssertionError: 