# TP3: Optimisation : Implémentation et Evaluation
- Maël Reynaud
- Alexandre Devaux-Rivière

### Import de la classe Tensor depuis le dernier TP (TP2)

In [13]:
class Tensor:
    def __init__(self, data, _children=(), _op=''):
        self.data = data
        self.grad = 0.0

        # internal variables used for autograd graph construction
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op # the op that produced this node, for graphviz / debugging / etc

    def __add__(self, other):
        other = other if isinstance(other, Tensor) else Tensor(other)
        out = Tensor(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad += out.grad
            other.grad += out.grad

        out._backward = _backward

        return out

    def __mul__(self, other):
        other = other if isinstance(other, Tensor) else Tensor(other)
        out = Tensor(self.data * other.data, [self, other], '*')

        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad

        out._backward = _backward

        return out

    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"

        out = Tensor(self.data**other, (self,), f'**{other}')

        def _backward():
            self.grad += (other * self.data**(other-1)) * out.grad

        out._backward = _backward

        return out


    def build_topo(self, visited=None, topo=None):
        if self not in visited:
            visited.add(self)
            for child in self._prev:
                child.build_topo(visited=visited, topo=topo)
            topo.append(self)
        return topo

    def backward(self):
        # topological order all of the children in the graph
        topo = []
        visited = set()
        topo = self.build_topo(topo=topo, visited=visited)

        # go one variable at a time and apply the chain rule to get its gradient
        self.grad = 1.0
        for v in reversed(topo):
            v._backward()

    def __neg__(self): # -self
        return self * -1

    def __radd__(self, other): # other + self
        return self + other

    def __sub__(self, other): # self - other
        return self + (-other)

    def __rsub__(self, other): # other - self
        return other + (-self)

    def __rmul__(self, other): # other * self
        return self * other

    def __repr__(self):
        return f"Tensor(data={self.data}, grad={self.grad})"

### Travail pour le TP3

In [113]:
import numpy as np
import torch

from typing import List, Dict
from abc import ABC, abstractmethod

np.random.seed(0)

## Préparation des jeux de données synthétiques pour l'évaluation

### Données linaires

Le jeu de données linéaire est généré à partir d’une relation linéaire entre les variables x et y avec l’ajout d’un bruit gaussien.

In [114]:
n_samples = 100
x_linear = np.linspace(-10, 10, n_samples)
y_linear = 3 * x_linear + 5 + np.random.normal(0, 2, n_samples)

### Données non linéaires

Le jeu de données non linéaire est basé sur une relation quadratique entre x et y avec un bruit ajouté.

In [115]:
y_nonlinear = 0.5 * x_linear **2 - 4 * x_linear + np.random.normal(0, 5, n_samples)

## Implémentation des algorithmes

Préparation de la classe optimiseur pour l'implémentation de nos optimizer customisés.

In [123]:
class Optimizer(ABC):
    def __init__(self, params: List[Tensor], lr: float):
        self.params = params
        self.lr = lr

    @abstractmethod
    def step(self):
        pass

    def zero_grad(self):
        for param in self.params:
            param.grad = 0

In [124]:
class LinearModel(torch.nn.Module):
  def __init__(self):
    super().__init__()
    self.W = torch.nn.parameter.Parameter(torch.tensor(1.0))
    self.b = torch.nn.parameter.Parameter(torch.tensor(0.0))

  def forward(self, x):
    return self.W * x + self.b

In [125]:
def test_optimizer(optim: Optimizer, torch_optim: torch.optim.Optimizer, optim_parameters: Dict):
  W = Tensor(1.0)
  b = Tensor(0.0)
  torch_model = LinearModel()

  optimizer = optim([W, b], **optim_parameters)
  torch_optimizer = torch_optim(torch_model.parameters(), **optim_parameters)

  print("Linear dataset (3 * x + 5)")
  for _ in range(100):
    for x, y in zip(x_linear, y_linear):
      loss = (y - (W * x + b)) ** 2
      torch_loss = (y - torch_model(x)) ** 2

      optimizer.zero_grad()
      torch_optimizer.zero_grad()

      loss.backward()
      torch_loss.backward()

      optimizer.step()
      torch_optimizer.step()

  print(f"W : {W.data}, b : {b.data}")
  print(f"[PYTORCH] W : {torch_model.W.data}, b : {torch_model.b.data}")


  W = Tensor(1.0)
  b = Tensor(0.0)
  torch_model = LinearModel()

  optimizer = optim([W, b], **optim_parameters)
  torch_optimizer = torch_optim(torch_model.parameters(), **optim_parameters)

  print("Non linear dataset (0.5 * x^2 - 4 * x)")
  for _ in range(100):
    for x, y in zip(x_linear, y_nonlinear):
      loss = (y - (W * x + b)) ** 2
      torch_loss = (y - torch_model(x)) ** 2

      optimizer.zero_grad()
      torch_optimizer.zero_grad()

      loss.backward()
      torch_loss.backward()

      optimizer.step()
      torch_optimizer.step()

  print(f"W : {W.data}, b : {b.data}")
  print(f"[PYTORCH] W : {torch_model.W.data}, b : {torch_model.b.data}")

### Stochastic Gradient Descent (SGD)

TODO EXPLAIN

In [126]:
class SGD(Optimizer):
    def __init__(self, params: List[Tensor], lr: float = 0.01):
        super().__init__(params=params, lr=lr)

    def step(self):
        for param in self.params:
            param.data -= self.lr * param.grad

In [127]:
test_optimizer(SGD, torch.optim.SGD, {"lr": 0.001})

Linear dataset (3 * x + 5)
W : 3.1124139870722645, b : 4.931713920861934
[PYTORCH] W : 3.1124141216278076, b : 4.931710720062256
Non linear dataset (0.5 * x^2 - 4 * x)
W : -0.7479125849883421, b : 13.664536448952166
[PYTORCH] W : -0.7479139566421509, b : 13.664549827575684


TODO EXPLAIN

### Root Mean Square Propagation (RMSProp)

TODO EXPLAIN

In [155]:
class RMSProp(Optimizer):
    def __init__(self, params: List[Tensor], lr: float = 0.001, weight_decay: float = 0.9, alpha: float = 0.99, eps: int = 1e-8):
        super().__init__(params=params, lr=lr)
        self.weight_decay = weight_decay
        self.v = [0] * len(params)
        self.alpha = alpha
        self.eps = eps

    def step(self):
        for i, param in enumerate(self.params):
            gt = param.grad + self.weight_decay * param.data
            self.v[i] = self.alpha * self.v[i] + (1 - self.alpha) * (param.grad ** 2) 
            param.data -= self.lr * gt / (np.sqrt(self.v[i]) + self.eps)

In [156]:
test_optimizer(RMSProp, torch.optim.RMSprop, {"lr": 0.001, "weight_decay": 0.9, "alpha": 0.99, "eps": 1e-8})

Linear dataset (3 * x + 5)
W : 2.9699739947949975, b : 3.4696535126733967
[PYTORCH] W : 2.970080852508545, b : 3.521855592727661
Non linear dataset (0.5 * x^2 - 4 * x)
W : -2.681837225663783, b : 4.731417453278267
[PYTORCH] W : -2.684574842453003, b : 4.821386337280273


TODO EXPLAIN

### Adagrad

TODO EXPLAIN

In [134]:
class Adagrad(Optimizer):
    def __init__(self , params: List[Tensor], lr: float = 0.01):
        super().__init__(params=params, lr=lr)
        self.G = [np.zeros_like(param.data) for param in params]
        self.epsilon = 1e-8

    def step(self):
        for index, param in enumerate(self.params):
            self.G[index] += param.grad ** 2
            coef = np.sqrt(self.G[index]) + self.epsilon
            param.data -= (self.lr / coef) * param.grad

In [135]:
test_optimizer(Adagrad, torch.optim.Adagrad, {"lr": 0.001})

Linear dataset (3 * x + 5)
W : 1.1379994326592422, b : 0.07407535004631868
[PYTORCH] W : 1.1380008459091187, b : 0.07407517731189728
Non linear dataset (0.5 * x^2 - 4 * x)
W : 0.8920318490186844, b : 0.08891015717408494
[PYTORCH] W : 0.8920310735702515, b : 0.08890986442565918


TODO EXPLAIN

### Adam

TODO EXPLAIN

In [None]:
# OLD IMPLEM

# class Adam(Optimizer):
#     def __init__(self, params: list, lr: float = 0.001, betas: float = (0.9, 0.999), eps: float = 1e-8):
#         super().__init__(params=params, lr=lr)
#         self.epsilon = eps
#         self.beta1 = betas[0]
#         self.beta2 = betas[1]
#         self.m = [np.zeros_like(param.data) for param in params]
#         self.v = [np.zeros_like(param.data) for param in params]
# 
#     def step(self):
#         for index, param in enumerate(self.params):
#             self.m[index] = self.beta1 * self.m[index] + (1 - self.beta1) * param.grad
#             self.v[index] = self.beta2 * self.v[index] + (1 - self.beta2) * (param.grad ** 2)
#             coef = np.sqrt(self.v[index]) + self.epsilon
#             param.data -= self.lr * self.m[index] / coef 

In [144]:
# SAME AS PYTORCH IMPLEM

class Adam(Optimizer):
    def __init__(self, params: list, lr: float = 0.001, betas: float = (0.9, 0.999), eps: float = 1e-8):
        super().__init__(params=params, lr=lr)
        self.timestep = 1
        self.epsilon = eps
        self.beta1 = betas[0]
        self.beta2 = betas[1]
        self.m = [np.zeros_like(param.data) for param in params]
        self.v = [np.zeros_like(param.data) for param in params]

    def step(self):
        for index, param in enumerate(self.params):
            self.m[index] = self.beta1 * self.m[index] + (1 - self.beta1) * param.grad
            self.v[index] = self.beta2 * self.v[index] + (1 - self.beta2) * (param.grad ** 2)
            mhat = self.m[index] / (1 - self.beta1 ** self.timestep)
            vhat = self.v[index] / (1 - self.beta2 ** self.timestep)
            coef = np.sqrt(vhat) + self.epsilon
            param.data -= self.lr * mhat / coef 
        self.timestep += 1

In [145]:
test_optimizer(Adam, torch.optim.Adam, {"lr": 0.001, "betas": (0.9, 0.999), "eps": 1e-08})

Linear dataset (3 * x + 5)
W : 2.974818069941666, b : 4.70047297094164
[PYTORCH] W : 2.974820613861084, b : 4.70046329498291
Non linear dataset (0.5 * x^2 - 4 * x)
W : -2.8413365377254745, b : 4.957100575280201
[PYTORCH] W : -2.841322422027588, b : 4.957065105438232


TODO EXPLAIN

### AdamW

TODO EXPLAIN

In [None]:
class AdamW(Optimizer):
    def __init__(self, params: list, lr: float = 0.001 , beta1: float = 0.9, beta2: float = 0.999 , eps: float = 1e-8, weight_decay: float = 0.01):
        super().__init__(params=params, lr=lr)
        self.epsilon = eps
        self.beta1 = beta1
        self.beta2 = beta2
        self.weight_decay = weight_decay
        self.m = [np.zeros_like(param.data) for param in params]
        self.v = [np.zeros_like(param.data) for param in params]

    def step(self):
        for index, param in enumerate(self.params):
            self.m[index] = self.beta1 * self.m[index] + (1 - self.beta1) * param.grad
            self.v[index] = self.beta2 * self.v[index] + (1 - self.beta2) * param.grad ** 2
            coef = np.sqrt(self.v[index]) + self.epsilon
            param -= self.lr / coef * self.m[index] - self.lr * self.weight_decay * param

TODO EXPLAIN

## Evaluation des Optimiseur

### Fonctions de Perte

TODO EXPLAIN

In [None]:
def f(x):
    return (x - 2)**2

def f_nonconvexe(x):
    return 3*x**2 - 2*x

TODO EXPLAIN

### Expérimentation

TODO EXPLAIN

In [None]:
def eval_optim():
    pass

TODO EXPLAIN

## Implémentation de réseau de Neurones

### Définition du modèle

In [None]:
def func_nn(x, W1 , b1 , W2 , b2):
    h1 = W1 * x + b1
    y = W2 * h1 + b2
    return y

def mse(y, y_hat):
    return (y - y_hat) ** 2

### Entraînement du réseau

TODO EXPLAIN

In [None]:
def eval_nn_optim ():
    pass

TODO EXPLAIN

## Implémentation des scheduler de taux d’apprentissage

### LRScheduler

TODO EXPLAIN

In [None]:
class LRScheduler:
    def __init__(self , optimizer , initial_lr):
        pass

TODO EXPLAIN

### LRSchedulerOnPlateau

TODO EXPLAIN

In [None]:
class LRSchedulerOnPlateau(LRScheduler):
    def __init__(self , optimizer , initial_lr , patience =10, factor =0.1, min_lr =1e-6, mode="min", threshold =1e-4):
        pass

TODO EXPLAIN