# ***TP3 - Clovis Lechien***

In [156]:
import numpy as np

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.optim import Optimizer

In [157]:
# Génération du jeu de données linéaire
np.random.seed(0)
n_samples = 100
x_linear = np.linspace(-10, 10, n_samples)
y_linear = 3 * x_linear + 5 + np.random.normal(0, 2, n_samples)

 # Génération du jeu de données non linéaire
y_nonlinear = 0.5 * x_linear **2 - 4 * x_linear + np.random.normal(0 ,5 ,n_samples)

# ***Utils***

In [158]:
def optimizer_testing_loop(parameters : dict[str,]):
    model = parameters['model']

    criterion = parameters['criterion']
    optimizer = parameters['optimizer']

    x_tensor = parameters['x_tensor']
    y_tensor = parameters['y_tensor']

    epochs = parameters['epochs']
    for epoch in range(epochs):
        optimizer.zero_grad()
        predictions = model(x_tensor)
        loss = criterion(predictions, y_tensor)
        loss.backward()
        optimizer.step()

        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

    for name, param in model.named_parameters():
        print(f"{name}: {param.data}")

# ***SGD***

## **Implementation de SGD**

In [159]:
class SGD(Optimizer):
    def __init__(self, params, learning_rate=0.01):
        hyperparams = {'lr': learning_rate}
        super().__init__(params=params, defaults=hyperparams)

    @torch.no_grad()
    def step(self):
        for group in self.param_groups:
            lr = group['lr']

            for theta_t in group['params']:
                if theta_t.grad is None:
                    continue

                theta_t -= lr * theta_t.grad

## **Test de SGD**

In [160]:
model = nn.Linear(1, 1)
linear_parameters = {
    'model': model,
    'criterion': nn.MSELoss(),
    'optimizer': SGD(model.parameters(), learning_rate=0.01),
    'x_tensor': torch.from_numpy(x_linear).float().view(-1, 1),
    'y_tensor': torch.from_numpy(y_linear).float().view(-1, 1),
    'epochs': 100
}

optimizer_testing_loop(linear_parameters)

Epoch 10, Loss: 25.781906127929688
Epoch 20, Loss: 18.552831649780273
Epoch 30, Loss: 13.726634979248047
Epoch 40, Loss: 10.504631042480469
Epoch 50, Loss: 8.353592872619629
Epoch 60, Loss: 6.917543888092041
Epoch 70, Loss: 5.958826065063477
Epoch 80, Loss: 5.318777561187744
Epoch 90, Loss: 4.891477584838867
Epoch 100, Loss: 4.606206893920898
weight: tensor([[2.9703]])
bias: tensor([4.3778])


In [161]:
model = nn.Linear(1, 1)
linear_parameters = {
    'model': model,
    'criterion': nn.MSELoss(),
    'optimizer': SGD(model.parameters(), learning_rate=0.01),
    'x_tensor': torch.from_numpy(x_linear).float().view(-1, 1),
    'y_tensor': torch.from_numpy(y_nonlinear).float().view(-1, 1),
    'epochs': 100
}

optimizer_testing_loop(linear_parameters)

Epoch 10, Loss: 499.5698547363281
Epoch 20, Loss: 425.6038818359375
Epoch 30, Loss: 376.2235412597656
Epoch 40, Loss: 343.2569580078125
Epoch 50, Loss: 321.2481384277344
Epoch 60, Loss: 306.55487060546875
Epoch 70, Loss: 296.7455139160156
Epoch 80, Loss: 290.19677734375
Epoch 90, Loss: 285.8247375488281
Epoch 100, Loss: 282.90594482421875
weight: tensor([[-4.1445]])
bias: tensor([15.0406])


# ***RMSProp***

## **Implementation de RMSProp**

In [162]:
class RMSProp(Optimizer):
    def __init__(self, params, learning_rate=0.01, decay=0.9):
        hyperparams = {'lr': learning_rate, 'decay': decay}
        super().__init__(params=params, defaults=hyperparams)

    @torch.no_grad()
    def step(self):
        for group in self.param_groups:
            decay = group['decay']
            lr = group['lr']

            for theta_t in group['params']:
                if theta_t.grad is None:
                    continue

                state = self.state[theta_t]
                if 'square_avg' not in state:
                    state['square_avg'] = torch.zeros_like(theta_t)

                square_avg = state['square_avg']
                square_avg = decay * square_avg + (1 - decay) * (theta_t.grad ** 2)
                state['square_avg'] = square_avg

                theta_t -= lr * theta_t.grad / square_avg.sqrt()

## **Test de RMSProp**

In [163]:
model = nn.Linear(1, 1)
linear_parameters = {
    'model': model,
    'criterion': nn.MSELoss(),
    'optimizer': RMSProp(model.parameters(), learning_rate=0.01, decay=0.8),
    'x_tensor': torch.from_numpy(x_linear).float().view(-1, 1),
    'y_tensor': torch.from_numpy(y_linear).float().view(-1, 1),
    'epochs': 100
}

optimizer_testing_loop(linear_parameters)

Epoch 10, Loss: 303.2937316894531
Epoch 20, Loss: 283.24786376953125
Epoch 30, Loss: 264.368408203125
Epoch 40, Loss: 246.2269287109375
Epoch 50, Loss: 228.77969360351562
Epoch 60, Loss: 212.02163696289062
Epoch 70, Loss: 195.95184326171875
Epoch 80, Loss: 180.56961059570312
Epoch 90, Loss: 165.87445068359375
Epoch 100, Loss: 151.8656768798828
weight: tensor([[1.0768]])
bias: tensor([0.1647])


In [164]:
model = nn.Linear(1, 1)
linear_parameters = {
    'model': model,
    'criterion': nn.MSELoss(),
    'optimizer': RMSProp(model.parameters(), learning_rate=0.01, decay=0.8),
    'x_tensor': torch.from_numpy(x_linear).float().view(-1, 1),
    'y_tensor': torch.from_numpy(y_nonlinear).float().view(-1, 1),
    'epochs': 100
}

optimizer_testing_loop(linear_parameters)

Epoch 10, Loss: 1004.470458984375
Epoch 20, Loss: 976.4335327148438
Epoch 30, Loss: 949.7418212890625
Epoch 40, Loss: 923.80908203125
Epoch 50, Loss: 898.5750732421875
Epoch 60, Loss: 874.0335693359375
Epoch 70, Loss: 850.1834106445312
Epoch 80, Loss: 827.0245361328125
Epoch 90, Loss: 804.5563354492188
Epoch 100, Loss: 782.7786865234375
weight: tensor([[-1.4310]])
bias: tensor([1.5010])


# ***Adagrad***

## **Implementation de Adagrad**

In [165]:
class Adagrad(Optimizer):
    def __init__(self, params, learning_rate=0.01):
        hyperparams = {'lr': learning_rate}
        super().__init__(params=params, defaults=hyperparams)

    @torch.no_grad()
    def step(self):
        for group in self.param_groups:
            lr = group['lr']

            for theta_t in group['params']:
                if theta_t.grad is None:
                    continue

                state = self.state[theta_t]
                if 'sum_squared_grads' not in state:
                    state['sum_squared_grads'] = torch.zeros_like(theta_t)

                sum_squared_grads = state['sum_squared_grads']
                sum_squared_grads += theta_t.grad ** 2
                state['sum_squared_grads'] = sum_squared_grads

                adjusted_lr = lr / sum_squared_grads.sqrt()

                theta_t -= adjusted_lr * theta_t.grad

## **Test de Adagrad**

In [166]:
model = nn.Linear(1, 1)
linear_parameters = {
    'model': model,
    'criterion': nn.MSELoss(),
    'optimizer': Adagrad(model.parameters(), learning_rate=0.5),
    'x_tensor': torch.from_numpy(x_linear).float().view(-1, 1),
    'y_tensor': torch.from_numpy(y_linear).float().view(-1, 1),
    'epochs': 100
}

optimizer_testing_loop(linear_parameters)

Epoch 10, Loss: 19.23186492919922
Epoch 20, Loss: 6.453787803649902
Epoch 30, Loss: 4.7115607261657715
Epoch 40, Loss: 4.281811714172363
Epoch 50, Loss: 4.131573677062988
Epoch 60, Loss: 4.072866916656494
Epoch 70, Loss: 4.049282550811768
Epoch 80, Loss: 4.0397443771362305
Epoch 90, Loss: 4.035879135131836
Epoch 100, Loss: 4.0343122482299805
weight: tensor([[2.9703]])
bias: tensor([5.0884])


In [167]:
model = nn.Linear(1, 1)
linear_parameters = {
    'model': model,
    'criterion': nn.MSELoss(),
    'optimizer': Adagrad(model.parameters(), learning_rate=0.5),
    'x_tensor': torch.from_numpy(x_linear).float().view(-1, 1),
    'y_tensor': torch.from_numpy(y_nonlinear).float().view(-1, 1),
    'epochs': 100
}

optimizer_testing_loop(linear_parameters)

Epoch 10, Loss: 662.9287719726562
Epoch 20, Loss: 539.48095703125
Epoch 30, Loss: 485.64019775390625
Epoch 40, Loss: 455.394775390625
Epoch 50, Loss: 435.1168212890625
Epoch 60, Loss: 419.7544860839844
Epoch 70, Loss: 407.2055969238281
Epoch 80, Loss: 396.5013732910156
Epoch 90, Loss: 387.1409912109375
Epoch 100, Loss: 378.831787109375
weight: tensor([[-4.1229]])
bias: tensor([7.3640])


# ***Adam***

## **Implementation de Adam**

In [168]:
class Adam(Optimizer):
    def __init__(self, params, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        hyperparams = {'lr': learning_rate, 'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon}
        super().__init__(params=params, defaults=hyperparams)

    @torch.no_grad()
    def step(self):
        for group in self.param_groups:
            lr = group['lr']
            beta1 = group['beta1']
            beta2 = group['beta2']
            epsilon = group['epsilon']

            for theta_t in group['params']:
                if theta_t.grad is None:
                    continue

                state = self.state[theta_t]
                if 'm' not in state: # Moment d'ordre 1
                    state['m'] = torch.zeros_like(theta_t)
                if 'v' not in state: # Moment d'ordre 2
                    state['v'] = torch.zeros_like(theta_t)
                if 't' not in state: # Temps
                    state['t'] = 0

                # Premier Moment
                m = state['m']
                m_t = beta1 * m + (1 - beta1) * theta_t.grad
                state['m'] = m_t

                # Second Moment
                v = state['v']
                v_t = beta2 * v + (1 - beta2) * theta_t.grad ** 2
                state['v'] = v_t

                # Temps
                t = state['t'] + 1
                state['t'] = t

                # Correction des biais
                m_hat = m_t / (1 - beta1 ** t)
                v_hat = v_t / (1 - beta2 ** t)

                theta_t -= lr * m_hat / (v_hat.sqrt() + epsilon)

## **Test de Adam**

In [169]:
model = nn.Linear(1, 1)
linear_parameters = {
    'model': model,
    'criterion': nn.MSELoss(),
    'optimizer': Adam(model.parameters(), learning_rate=0.1, beta1=0.9, beta2=0.999, epsilon=1e-8),
    'x_tensor': torch.from_numpy(x_linear).float().view(-1, 1),
    'y_tensor': torch.from_numpy(y_linear).float().view(-1, 1),
    'epochs': 100
}

optimizer_testing_loop(linear_parameters)

Epoch 10, Loss: 152.90664672851562
Epoch 20, Loss: 51.90208435058594
Epoch 30, Loss: 12.439159393310547
Epoch 40, Loss: 5.2970733642578125
Epoch 50, Loss: 5.51474142074585
Epoch 60, Loss: 4.866863250732422
Epoch 70, Loss: 4.160236358642578
Epoch 80, Loss: 4.036962985992432
Epoch 90, Loss: 4.063048839569092
Epoch 100, Loss: 4.046626091003418
weight: tensor([[2.9538]])
bias: tensor([5.1655])


In [170]:
model = nn.Linear(1, 1)
linear_parameters = {
    'model': model,
    'criterion': nn.MSELoss(),
    'optimizer': Adam(model.parameters(), learning_rate=0.1, beta1=0.9, beta2=0.999, epsilon=1e-8),
    'x_tensor': torch.from_numpy(x_linear).float().view(-1, 1),
    'y_tensor': torch.from_numpy(y_nonlinear).float().view(-1, 1),
    'epochs': 100
}

optimizer_testing_loop(linear_parameters)

Epoch 10, Loss: 1092.7698974609375
Epoch 20, Loss: 835.7798461914062
Epoch 30, Loss: 655.69189453125
Epoch 40, Loss: 540.6663208007812
Epoch 50, Loss: 472.496337890625
Epoch 60, Loss: 432.7613525390625
Epoch 70, Loss: 407.51483154296875
Epoch 80, Loss: 388.6913146972656
Epoch 90, Loss: 372.80157470703125
Epoch 100, Loss: 358.7831726074219
weight: tensor([[-4.1903]])
bias: tensor([8.4491])


# ***AdamW***

In [171]:
class AdamW(Optimizer):
    def __init__(self, params, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, weight_decay=0.01):
        hyperparams = {'lr': learning_rate, 'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon, 'weight_decay': weight_decay}
        super().__init__(params=params, defaults=hyperparams)

    @torch.no_grad()
    def step(self):
        for group in self.param_groups:
            lr = group['lr']
            beta1 = group['beta1']
            beta2 = group['beta2']
            epsilon = group['epsilon']
            weight_decay = group['weight_decay']

            for theta_t in group['params']:
                if theta_t.grad is None:
                    continue

                state = self.state[theta_t]
                if 'm' not in state: # Moment d'ordre 1
                    state['m'] = torch.zeros_like(theta_t)
                if 'v' not in state: # Moment d'ordre 2
                    state['v'] = torch.zeros_like(theta_t)
                if 't' not in state: # Temps
                    state['t'] = 0

                # Premier Moment
                m = state['m']
                m_t = beta1 * m + (1 - beta1) * theta_t.grad
                state['m'] = m_t

                # Second Moment
                v = state['v']
                v_t = beta2 * v + (1 - beta2) * theta_t.grad ** 2
                state['v'] = v_t

                # Temps
                t = state['t'] + 1
                state['t'] = t

                # Correction des biais
                m_hat = m_t / (1 - beta1 ** t)
                v_hat = v_t / (1 - beta2 ** t)

                theta_t -= lr * m_hat / (v_hat.sqrt() + epsilon) - lr * weight_decay * theta_t

## ***Test de AdamW***

In [172]:
model = nn.Linear(1, 1)
linear_parameters = {
    'model': model,
    'criterion': nn.MSELoss(),
    'optimizer': AdamW(model.parameters(), learning_rate=0.1, beta1=0.9, beta2=0.999, epsilon=1e-8, weight_decay=0.01),
    'x_tensor': torch.from_numpy(x_linear).float().view(-1, 1),
    'y_tensor': torch.from_numpy(y_linear).float().view(-1, 1),
    'epochs': 100
}

optimizer_testing_loop(linear_parameters)

Epoch 10, Loss: 70.88858032226562
Epoch 20, Loss: 17.531997680664062
Epoch 30, Loss: 9.452136039733887
Epoch 40, Loss: 8.697772979736328
Epoch 50, Loss: 5.476622104644775
Epoch 60, Loss: 4.144920825958252
Epoch 70, Loss: 4.08082389831543
Epoch 80, Loss: 4.044642448425293
Epoch 90, Loss: 4.080211162567139
Epoch 100, Loss: 4.091483116149902
weight: tensor([[3.0000]])
bias: tensor([5.2814])


In [173]:
model = nn.Linear(1, 1)
linear_parameters = {
    'model': model,
    'criterion': nn.MSELoss(),
    'optimizer': AdamW(model.parameters(), learning_rate=0.1, beta1=0.9, beta2=0.999, epsilon=1e-8, weight_decay=0.01),
    'x_tensor': torch.from_numpy(x_linear).float().view(-1, 1),
    'y_tensor': torch.from_numpy(y_linear).float().view(-1, 1),
    'epochs': 100
}

optimizer_testing_loop(linear_parameters)

Epoch 10, Loss: 292.70428466796875
Epoch 20, Loss: 135.1478271484375
Epoch 30, Loss: 48.26994323730469
Epoch 40, Loss: 13.050810813903809
Epoch 50, Loss: 4.737183570861816
Epoch 60, Loss: 4.707895278930664
Epoch 70, Loss: 5.05279541015625
Epoch 80, Loss: 4.710052967071533
Epoch 90, Loss: 4.320013046264648
Epoch 100, Loss: 4.137866973876953
weight: tensor([[3.0154]])
bias: tensor([5.2814])


# ***Evaluation des Optimiseurs***

In [174]:
def f(x):
    return (x - 2) ** 2


def f_nonconvexe(x):
    return 3*x ** 2 - 2*x

In [175]:
def eval_optim(x : torch.Tensor, convexe : bool = True):
    if convexe:
        print(f"Optimisation de la fonction convexe f(x) = (x - 2)²")
        y = f(x)
    else:
        print(f"Optimisation de la fonction non convexe f(x) = 3x² - 2x")
        y = f_nonconvexe(x)

    y.backward()
    print(f"Gradient de f en x={x.item()}: x.grad={x.grad.item()}")

    optimizers = [
        SGD,
        RMSProp,
        Adagrad,
        Adam,
        AdamW
    ]

    for optimizer in optimizers:
        x = torch.tensor([100.], requires_grad=True)
        optimizer = optimizer([x])
        for i in range(100):
            optimizer.zero_grad()
            if convexe:
                y = f(x)
            else:
                y = f_nonconvexe(x)
            y.backward()
            optimizer.step()
        print(f"Optimiseur {optimizer.__class__.__name__}: x={x.item()}, f(x)={f(x).item()}")

In [176]:
x = torch.tensor([69.], requires_grad=True)
eval_optim(x, convexe=True)
print()
eval_optim(x, convexe=False)

Optimisation de la fonction convexe f(x) = (x - 2)²
Gradient de f en x=69.0: x.grad=134.0
Optimiseur SGD: x=14.996723175048828, f(x)=168.91481018066406
Optimiseur RMSProp: x=98.90937805175781, f(x)=9391.427734375
Optimiseur Adagrad: x=99.81420135498047, f(x)=9567.6181640625
Optimiseur Adam: x=99.90005493164062, f(x)=9584.4208984375
Optimiseur AdamW: x=100.0, f(x)=9604.0

Optimisation de la fonction non convexe f(x) = 3x² - 2x
Gradient de f en x=69.0: x.grad=546.0
Optimiseur SGD: x=0.5381360054016113, f(x)=2.1370463371276855
Optimiseur RMSProp: x=98.90937805175781, f(x)=9391.427734375
Optimiseur Adagrad: x=99.81420135498047, f(x)=9567.6181640625
Optimiseur Adam: x=99.90005493164062, f(x)=9584.4208984375
Optimiseur AdamW: x=100.0, f(x)=9604.0


# ***Réseau de Neurones***

In [177]:
def func_nn(x, W1, b1, W2, b2):
    h1 = W1 * x + b1
    y = W2 * h1 + b2
    return y


def mse(y, y_hat):
    return (y - y_hat) ** 2

In [178]:
def eval_nn_optim():

    optimizers = [
        SGD,
        RMSProp,
        Adagrad,
        Adam,
        AdamW
    ]

    for optimizer in optimizers:
        W1 = torch.tensor([1.], requires_grad=True)
        b1 = torch.tensor([1.], requires_grad=True)
        W2 = torch.tensor([1.], requires_grad=True)
        b2 = torch.tensor([1.], requires_grad=True)

        x = torch.tensor([1.], requires_grad=True)
        y = torch.tensor([10.])

        optimizer = optimizer([W1, b1, W2, b2])

        for i in range(100):
            optimizer.zero_grad()
            y_hat = func_nn(x, W1, b1, W2, b2)
            loss = mse(y, y_hat)
            loss.backward()
            optimizer.step()
        print(f"Optimiseur {optimizer.__class__.__name__}:\nW1={W1.item()}, b1={b1.item()}, W2={W2.item()}, b2={b2.item()}")
        print(f"Optimisation du réseau de neurones:\nW1={W1.item()}, b1={b1.item()}, W2={W2.item()}, b2={b2.item()}\n")

In [179]:
eval_nn_optim()

Optimiseur SGD:
W1=1.7965515851974487, b1=1.7965515851974487, W2=2.356534481048584, b2=1.532727837562561
Optimisation du réseau de neurones:
W1=1.7965515851974487, b1=1.7965515851974487, W2=2.356534481048584, b2=1.532727837562561

Optimiseur RMSProp:
W1=1.956400752067566, b1=1.956400752067566, W2=1.956400752067566, b2=1.899566411972046
Optimisation du réseau de neurones:
W1=1.956400752067566, b1=1.956400752067566, W2=1.956400752067566, b2=1.899566411972046

Optimiseur Adagrad:
W1=1.186563491821289, b1=1.186563491821289, W2=1.186563491821289, b2=1.1806892156600952
Optimisation du réseau de neurones:
W1=1.186563491821289, b1=1.186563491821289, W2=1.186563491821289, b2=1.1806892156600952

Optimiseur Adam:
W1=1.1003371477127075, b1=1.1003371477127075, W2=1.1003371477127075, b2=1.0987093448638916
Optimisation du réseau de neurones:
W1=1.1003371477127075, b1=1.1003371477127075, W2=1.1003371477127075, b2=1.0987093448638916

Optimiseur AdamW:
W1=1.1013890504837036, b1=1.1013890504837036, W2=1.

# ***Scheduler de Taux d'apprentissage***

## **Implementation de LRScheduler**

In [181]:
class LRScheduler:
    def __init__(self, optimizer, initial_lr):
        self.optimizer = optimizer
        self.initial_lr = initial_lr
        # TODO: Implementer le reste.

## **Implementation de LRSchedulerOnPlateau**

In [182]:
class LRSchedulerOnPlateau(LRScheduler):
    def __init__(self, optimizer, initial_lr, patience=10, factor=0.1, min_lr=1e-6, mode='min', threshold=1e-4):
        super().__init__(optimizer, initial_lr)
        self.patience = patience
        self.factor = factor
        self.min_lr = min_lr
        self.mode = mode
        self.threshold = threshold
        # TODO: Implementer le reste.