In [52]:
import numpy as np

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.optim import Optimizer

In [25]:
# Génération du jeu de données linéaire
np.random.seed(0)
n_samples = 100
x_linear = np.linspace(-10, 10, n_samples)
y_linear = 3 * x_linear + 5 + np.random.normal(0, 2, n_samples)

 # Génération du jeu de données non linéaire
y_nonlinear = 0.5 * x_linear **2 - 4 * x_linear + np.random.normal(0 ,5 ,n_samples)

# ***Utils***

In [65]:
def optimizer_testing_loop(parameters : dict[str,]):
    model = parameters['model']

    criterion = parameters['criterion']
    optimizer = parameters['optimizer']

    x_tensor = parameters['x_tensor']
    y_tensor = parameters['y_tensor']

    epochs = parameters['epochs']
    for epoch in range(epochs):
        optimizer.zero_grad()
        predictions = model(x_tensor)
        loss = criterion(predictions, y_tensor)
        loss.backward()
        optimizer.step()

        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

    for name, param in model.named_parameters():
        print(f"{name}: {param.data}")

# ***SGD***

## **Implementation de SGD**

In [72]:
class SGD(Optimizer):
    def __init__(self, params, learning_rate=0.01):
        hyperparams = {'lr': learning_rate}
        super().__init__(params=params, defaults=hyperparams)

    @torch.no_grad()
    def step(self):
        for group in self.param_groups:
            lr = group['lr']

            for theta_t in group['params']:
                if theta_t.grad is None:
                    continue

                theta_t -= lr * theta_t.grad

## **Test de SGD**

In [73]:
model = nn.Linear(1, 1)
linear_parameters = {
    'model': model,
    'criterion': nn.MSELoss(),
    'optimizer': SGD(model.parameters(), learning_rate=0.01),
    'x_tensor': torch.from_numpy(x_linear).float().view(-1, 1),
    'y_tensor': torch.from_numpy(y_linear).float().view(-1, 1),
    'epochs': 100
}

optimizer_testing_loop(linear_parameters)

Epoch 10, Loss: 23.399639129638672
Epoch 20, Loss: 16.962400436401367
Epoch 30, Loss: 12.664852142333984
Epoch 40, Loss: 9.795772552490234
Epoch 50, Loss: 7.880355358123779
Epoch 60, Loss: 6.601606369018555
Epoch 70, Loss: 5.747902870178223
Epoch 80, Loss: 5.177964687347412
Epoch 90, Loss: 4.797468185424805
Epoch 100, Loss: 4.5434441566467285
weight: tensor([[2.9703]])
bias: tensor([4.4196])


In [74]:
model = nn.Linear(1, 1)
linear_parameters = {
    'model': model,
    'criterion': nn.MSELoss(),
    'optimizer': SGD(model.parameters(), learning_rate=0.01),
    'x_tensor': torch.from_numpy(x_linear).float().view(-1, 1),
    'y_tensor': torch.from_numpy(y_nonlinear).float().view(-1, 1),
    'epochs': 100
}

optimizer_testing_loop(linear_parameters)

Epoch 10, Loss: 483.17266845703125
Epoch 20, Loss: 414.6569519042969
Epoch 30, Loss: 368.91534423828125
Epoch 40, Loss: 338.3778381347656
Epoch 50, Loss: 317.9908142089844
Epoch 60, Loss: 304.3802795410156
Epoch 70, Loss: 295.29376220703125
Epoch 80, Loss: 289.2275695800781
Epoch 90, Loss: 285.1777038574219
Epoch 100, Loss: 282.47393798828125
weight: tensor([[-4.1445]])
bias: tensor([15.1297])


# ***RMSProp***

## **Implementation de RMSProp**

In [76]:
class RMSProp(Optimizer):
    def __init__(self, params, learning_rate=0.01, decay=0.9):
        hyperparams = {'lr': learning_rate, 'decay': decay}
        super().__init__(params=params, defaults=hyperparams)

    @torch.no_grad()
    def step(self):
        for group in self.param_groups:
            decay = group['decay']
            lr = group['lr']

            for theta_t in group['params']:
                if theta_t.grad is None:
                    continue

                state = self.state[theta_t]
                if 'square_avg' not in state:
                    state['square_avg'] = torch.zeros_like(theta_t)

                square_avg = state['square_avg']
                square_avg = decay * square_avg + (1 - decay) * (theta_t.grad ** 2)
                state['square_avg'] = square_avg

                theta_t -= lr * theta_t.grad / square_avg.sqrt()

## **Test de RMSProp**

In [81]:
model = nn.Linear(1, 1)
linear_parameters = {
    'model': model,
    'criterion': nn.MSELoss(),
    'optimizer': RMSProp(model.parameters(), learning_rate=0.01, decay=0.8),
    'x_tensor': torch.from_numpy(x_linear).float().view(-1, 1),
    'y_tensor': torch.from_numpy(y_linear).float().view(-1, 1),
    'epochs': 100
}

optimizer_testing_loop(linear_parameters)

Epoch 10, Loss: 166.65988159179688
Epoch 20, Loss: 152.1524658203125
Epoch 30, Loss: 138.68487548828125
Epoch 40, Loss: 125.93838500976562
Epoch 50, Loss: 113.8799819946289
Epoch 60, Loss: 102.50534057617188
Epoch 70, Loss: 91.81302642822266
Epoch 80, Loss: 81.8017807006836
Epoch 90, Loss: 72.47024536132812
Epoch 100, Loss: 63.81686019897461
weight: tensor([[1.8320]])
bias: tensor([1.2606])


In [82]:
model = nn.Linear(1, 1)
linear_parameters = {
    'model': model,
    'criterion': nn.MSELoss(),
    'optimizer': RMSProp(model.parameters(), learning_rate=0.01, decay=0.8),
    'x_tensor': torch.from_numpy(x_linear).float().view(-1, 1),
    'y_tensor': torch.from_numpy(y_nonlinear).float().view(-1, 1),
    'epochs': 100
}

optimizer_testing_loop(linear_parameters)

Epoch 10, Loss: 1153.8677978515625
Epoch 20, Loss: 1122.767578125
Epoch 30, Loss: 1093.081298828125
Epoch 40, Loss: 1064.1612548828125
Epoch 50, Loss: 1035.94189453125
Epoch 60, Loss: 1008.4160766601562
Epoch 70, Loss: 981.5826416015625
Epoch 80, Loss: 955.44140625
Epoch 90, Loss: 929.9918823242188
Epoch 100, Loss: 905.2341918945312
weight: tensor([[-1.0310]])
bias: tensor([0.2064])


# ***Adagrad***

## **Implementation de Adagrad**

In [92]:
class Adagrad(Optimizer):
    def __init__(self, params, learning_rate=0.01):
        hyperparams = {'lr': learning_rate}
        super().__init__(params=params, defaults=hyperparams)

    @torch.no_grad()
    def step(self):
        for group in self.param_groups:
            lr = group['lr']

            for theta_t in group['params']:
                if theta_t.grad is None:
                    continue

                state = self.state[theta_t]
                if 'sum_squared_grads' not in state:
                    state['sum_squared_grads'] = torch.zeros_like(theta_t)

                sum_squared_grads = state['sum_squared_grads']
                sum_squared_grads += theta_t.grad ** 2
                state['sum_squared_grads'] = sum_squared_grads

                adjusted_lr = lr / sum_squared_grads.sqrt()

                theta_t -= adjusted_lr * theta_t.grad

## **Test de Adagrad**

In [93]:
model = nn.Linear(1, 1)
linear_parameters = {
    'model': model,
    'criterion': nn.MSELoss(),
    'optimizer': Adagrad(model.parameters(), learning_rate=0.5),
    'x_tensor': torch.from_numpy(x_linear).float().view(-1, 1),
    'y_tensor': torch.from_numpy(y_linear).float().view(-1, 1),
    'epochs': 100
}

optimizer_testing_loop(linear_parameters)

Epoch 10, Loss: 53.2936897277832
Epoch 20, Loss: 15.242056846618652
Epoch 30, Loss: 7.162252902984619
Epoch 40, Loss: 5.094719409942627
Epoch 50, Loss: 4.459561347961426
Epoch 60, Loss: 4.224538326263428
Epoch 70, Loss: 4.124268531799316
Epoch 80, Loss: 4.077759742736816
Epoch 90, Loss: 4.055278778076172
Epoch 100, Loss: 4.0442070960998535
weight: tensor([[2.9694]])
bias: tensor([5.0186])


In [94]:
model = nn.Linear(1, 1)
linear_parameters = {
    'model': model,
    'criterion': nn.MSELoss(),
    'optimizer': Adagrad(model.parameters(), learning_rate=0.5),
    'x_tensor': torch.from_numpy(x_linear).float().view(-1, 1),
    'y_tensor': torch.from_numpy(y_nonlinear).float().view(-1, 1),
    'epochs': 100
}

optimizer_testing_loop(linear_parameters)

Epoch 10, Loss: 703.102783203125
Epoch 20, Loss: 552.7655639648438
Epoch 30, Loss: 485.4203186035156
Epoch 40, Loss: 448.4309387207031
Epoch 50, Loss: 425.0162353515625
Epoch 60, Loss: 408.40093994140625
Epoch 70, Loss: 395.54473876953125
Epoch 80, Loss: 384.9854431152344
Epoch 90, Loss: 375.9720458984375
Epoch 100, Loss: 368.0896911621094
weight: tensor([[-4.0899]])
bias: tensor([7.9156])


# ***Adam***

## **Implementation de Adam**

In [109]:
class Adam(Optimizer):
    def __init__(self, params, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        hyperparams = {'lr': learning_rate, 'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon}
        super().__init__(params=params, defaults=hyperparams)

    @torch.no_grad()
    def step(self):
        for group in self.param_groups:
            lr = group['lr']
            beta1 = group['beta1']
            beta2 = group['beta2']
            epsilon = group['epsilon']

            for theta_t in group['params']:
                if theta_t.grad is None:
                    continue

                state = self.state[theta_t]
                if 'm' not in state: # Moment d'ordre 1
                    state['m'] = torch.zeros_like(theta_t)
                if 'v' not in state: # Moment d'ordre 2
                    state['v'] = torch.zeros_like(theta_t)
                if 't' not in state: # Temps
                    state['t'] = 0

                # Premier Moment
                m = state['m']
                m_t = beta1 * m + (1 - beta1) * theta_t.grad
                state['m'] = m_t

                # Second Moment
                v = state['v']
                v_t = beta2 * v + (1 - beta2) * theta_t.grad ** 2
                state['v'] = v_t

                # Temps
                t = state['t'] + 1
                state['t'] = t

                # Correction des biais
                m_hat = m_t / (1 - beta1 ** t)
                v_hat = v_t / (1 - beta2 ** t)

                theta_t -= lr * m_hat / (v_hat.sqrt() + epsilon)

## **Test de Adam**

In [110]:
model = nn.Linear(1, 1)
linear_parameters = {
    'model': model,
    'criterion': nn.MSELoss(),
    'optimizer': Adam(model.parameters(), learning_rate=0.1, beta1=0.9, beta2=0.999, epsilon=1e-8),
    'x_tensor': torch.from_numpy(x_linear).float().view(-1, 1),
    'y_tensor': torch.from_numpy(y_linear).float().view(-1, 1),
    'epochs': 100
}

optimizer_testing_loop(linear_parameters)

Epoch 10, Loss: 89.61981201171875
Epoch 20, Loss: 22.54928207397461
Epoch 30, Loss: 6.636232852935791
Epoch 40, Loss: 6.613010883331299
Epoch 50, Loss: 5.385785102844238
Epoch 60, Loss: 4.148060321807861
Epoch 70, Loss: 4.068060874938965
Epoch 80, Loss: 4.093616485595703
Epoch 90, Loss: 4.043006420135498
Epoch 100, Loss: 4.036981105804443
weight: tensor([[2.9789]])
bias: tensor([5.1578])


In [111]:
model = nn.Linear(1, 1)
linear_parameters = {
    'model': model,
    'criterion': nn.MSELoss(),
    'optimizer': Adam(model.parameters(), learning_rate=0.1, beta1=0.9, beta2=0.999, epsilon=1e-8),
    'x_tensor': torch.from_numpy(x_linear).float().view(-1, 1),
    'y_tensor': torch.from_numpy(y_nonlinear).float().view(-1, 1),
    'epochs': 100
}

optimizer_testing_loop(linear_parameters)

Epoch 10, Loss: 1076.4212646484375
Epoch 20, Loss: 826.5248413085938
Epoch 30, Loss: 653.0672607421875
Epoch 40, Loss: 543.5895385742188
Epoch 50, Loss: 479.2992248535156
Epoch 60, Loss: 441.5941467285156
Epoch 70, Loss: 416.8678894042969
Epoch 80, Loss: 397.67169189453125
Epoch 90, Loss: 381.0584411621094
Epoch 100, Loss: 366.2650451660156
weight: tensor([[-4.1924]])
bias: tensor([8.0455])


# ***AdamW***

In [113]:
class AdamW(Optimizer):
    def __init__(self, params, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, weight_decay=0.01):
        hyperparams = {'lr': learning_rate, 'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon, 'weight_decay': weight_decay}
        super().__init__(params=params, defaults=hyperparams)

    @torch.no_grad()
    def step(self):
        for group in self.param_groups:
            lr = group['lr']
            beta1 = group['beta1']
            beta2 = group['beta2']
            epsilon = group['epsilon']
            weight_decay = group['weight_decay']

            for theta_t in group['params']:
                if theta_t.grad is None:
                    continue

                state = self.state[theta_t]
                if 'm' not in state: # Moment d'ordre 1
                    state['m'] = torch.zeros_like(theta_t)
                if 'v' not in state: # Moment d'ordre 2
                    state['v'] = torch.zeros_like(theta_t)
                if 't' not in state: # Temps
                    state['t'] = 0

                # Premier Moment
                m = state['m']
                m_t = beta1 * m + (1 - beta1) * theta_t.grad
                state['m'] = m_t

                # Second Moment
                v = state['v']
                v_t = beta2 * v + (1 - beta2) * theta_t.grad ** 2
                state['v'] = v_t

                # Temps
                t = state['t'] + 1
                state['t'] = t

                # Correction des biais
                m_hat = m_t / (1 - beta1 ** t)
                v_hat = v_t / (1 - beta2 ** t)

                theta_t -= lr * m_hat / (v_hat.sqrt() + epsilon) - lr * weight_decay * theta_t

## ***Test de AdamW***

In [114]:
model = nn.Linear(1, 1)
linear_parameters = {
    'model': model,
    'criterion': nn.MSELoss(),
    'optimizer': AdamW(model.parameters(), learning_rate=0.1, beta1=0.9, beta2=0.999, epsilon=1e-8, weight_decay=0.01),
    'x_tensor': torch.from_numpy(x_linear).float().view(-1, 1),
    'y_tensor': torch.from_numpy(y_linear).float().view(-1, 1),
    'epochs': 100
}

optimizer_testing_loop(linear_parameters)

Epoch 10, Loss: 60.561180114746094
Epoch 20, Loss: 15.564854621887207
Epoch 30, Loss: 10.691732406616211
Epoch 40, Loss: 8.701184272766113
Epoch 50, Loss: 5.131641387939453
Epoch 60, Loss: 4.197311878204346
Epoch 70, Loss: 4.089041709899902
Epoch 80, Loss: 4.040408134460449
Epoch 90, Loss: 4.091141223907471
Epoch 100, Loss: 4.080957889556885
weight: tensor([[2.9939]])
bias: tensor([5.2811])


In [115]:
model = nn.Linear(1, 1)
linear_parameters = {
    'model': model,
    'criterion': nn.MSELoss(),
    'optimizer': AdamW(model.parameters(), learning_rate=0.1, beta1=0.9, beta2=0.999, epsilon=1e-8, weight_decay=0.01),
    'x_tensor': torch.from_numpy(x_linear).float().view(-1, 1),
    'y_tensor': torch.from_numpy(y_linear).float().view(-1, 1),
    'epochs': 100
}

optimizer_testing_loop(linear_parameters)

Epoch 10, Loss: 178.56480407714844
Epoch 20, Loss: 66.68789672851562
Epoch 30, Loss: 19.171457290649414
Epoch 40, Loss: 8.25043773651123
Epoch 50, Loss: 7.443469047546387
Epoch 60, Loss: 6.193104267120361
Epoch 70, Loss: 4.727298259735107
Epoch 80, Loss: 4.123226165771484
Epoch 90, Loss: 4.0350494384765625
Epoch 100, Loss: 4.044890880584717
weight: tensor([[2.9782]])
bias: tensor([5.2258])


# ***Evaluation des Optimiseurs***