# TP3: Optimisation : Implémentation et Evaluation
- Maël Reynaud
- Alexandre Devaux-Rivière

### Import de la classe Tensor depuis le dernier TP (TP2)

In [None]:
class Tensor:
    """Value wrapper that records arithmetic operations for reverse-mode autodiff.

    Each operation returns a new ``Tensor`` that remembers its operands
    (``_prev``) and a closure (``_backward``) that propagates ``out.grad``
    to those operands. Calling :meth:`backward` on the final result walks
    that graph and fills in ``grad`` on every node.

    Attributes:
        data: The wrapped value; it must support ``+``, ``*`` and ``**``.
        grad: Accumulated gradient, initialised to ``0.0``. Gradients are
            added with ``+=`` and are never reset by this class.
    """

    def __init__(self, data, _children=(), _op=''):
        self.data = data
        self.grad = 0.0

        # internal variables used for autograd graph construction
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op  # the op that produced this node, for graphviz / debugging / etc

    def __add__(self, other):
        """Return ``self + other``; non-Tensor operands are wrapped first."""
        other = other if isinstance(other, Tensor) else Tensor(other)
        out = Tensor(self.data + other.data, (self, other), '+')

        def _backward():
            # d(a + b)/da = d(a + b)/db = 1
            self.grad += out.grad
            other.grad += out.grad

        out._backward = _backward

        return out

    def __mul__(self, other):
        """Return ``self * other``; non-Tensor operands are wrapped first."""
        other = other if isinstance(other, Tensor) else Tensor(other)
        out = Tensor(self.data * other.data, (self, other), '*')

        def _backward():
            # d(a * b)/da = b and d(a * b)/db = a
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad

        out._backward = _backward

        return out

    def __pow__(self, other):
        """Return ``self ** other`` for a constant int/float exponent."""
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"

        out = Tensor(self.data**other, (self,), f'**{other}')

        def _backward():
            # d(a ** n)/da = n * a ** (n - 1)
            self.grad += (other * self.data**(other-1)) * out.grad

        out._backward = _backward

        return out

    def build_topo(self, visited=None, topo=None):
        """Append this node and its ancestors to ``topo`` in topological order.

        Operands always appear before the node computed from them. The walk
        is iterative so that long chains of operations do not exhaust the
        interpreter's recursion limit.

        Args:
            visited: Set of nodes already emitted; a fresh set when ``None``.
            topo: List to extend; a fresh list when ``None``.

        Returns:
            The ``topo`` list, with the newly discovered nodes appended.
        """
        if visited is None:
            visited = set()
        if topo is None:
            topo = []
        if self in visited:
            return topo

        visited.add(self)
        # Each stack entry pairs a node with an iterator over the operands
        # that still have to be explored (depth-first, post-order).
        stack = [(self, iter(self._prev))]
        while stack:
            node, pending = stack[-1]
            for child in pending:
                if child not in visited:
                    visited.add(child)
                    stack.append((child, iter(child._prev)))
                    break
            else:
                # Every operand has been emitted, so the node itself can be.
                topo.append(node)
                stack.pop()
        return topo

    def backward(self):
        """Back-propagate from this node through the whole recorded graph.

        Sets ``self.grad`` to ``1.0`` and then runs every node's
        ``_backward`` closure in reverse topological order. Gradients already
        stored on other nodes are not cleared first, so they accumulate
        across calls.
        """
        # topological order all of the children in the graph
        topo = self.build_topo(topo=[], visited=set())

        # go one variable at a time and apply the chain rule to get its gradient
        self.grad = 1.0
        for v in reversed(topo):
            v._backward()

    def __neg__(self):  # -self
        return self * -1

    def __radd__(self, other):  # other + self
        return self + other

    def __sub__(self, other):  # self - other
        return self + (-other)

    def __rsub__(self, other):  # other - self
        return other + (-self)

    def __rmul__(self, other):  # other * self
        return self * other

    def __repr__(self):
        return f"Tensor(data={self.data}, grad={self.grad})"

### Travail pour le TP3

In [65]:
import numpy as np
import matplotlib.pyplot as plt

from abc import ABC, abstractmethod

# Seed NumPy's global random generator so the noise drawn below is reproducible.
np.random.seed(0)

## Préparation des jeux de données synthétiques pour l'évaluation

### Données linéaires

Le jeu de données linéaire est généré à partir d’une relation linéaire entre les variables x et y avec l’ajout d’un bruit gaussien.

In [62]:
# Linear dataset: y = 3x + 5 on an evenly spaced grid over [-10, 10],
# perturbed by Gaussian noise with standard deviation 2.
n_samples = 100
x_linear = np.linspace(-10, 10, num=n_samples)
y_linear = 3 * x_linear + 5 + np.random.normal(loc=0, scale=2, size=n_samples)

### Données non linéaires

Le jeu de données non linéaire est basé sur une relation quadratique entre x et y avec un bruit ajouté.

In [63]:
# Quadratic dataset: y = 0.5 x^2 - 4 x on the same grid as the linear data,
# perturbed by Gaussian noise with standard deviation 5.
y_nonlinear = (
    0.5 * x_linear ** 2
    - 4 * x_linear
    + np.random.normal(loc=0, scale=5, size=n_samples)
)

## Implémentation des algorithmes

Préparation de la classe abstraite `Optimizer`, qui sert de base à l'implémentation de nos optimiseurs personnalisés.

In [66]:
class Optimizer(ABC):
    """Abstract base class for parameter-update rules.

    Subclasses implement :meth:`step`, which reads each parameter's ``grad``
    and updates its ``data``.

    Args:
        params: Iterable of parameter objects exposing ``data`` and ``grad``.
            It is copied into a list so that a generator can be passed and
            the parameters can be iterated on every step.
        learning_rate: Step size used by the update rule.
    """

    def __init__(self, params, learning_rate):
        self.params = list(params)
        self.learning_rate = learning_rate

    def zero_grad(self):
        """Reset the gradient of every parameter to ``0.0``.

        Call this before each backward pass when the parameters accumulate
        gradients additively; otherwise a step would use the sum of all
        gradients computed so far.
        """
        for param in self.params:
            param.grad = 0.0

    @abstractmethod
    def step(self):
        """Apply one update to every parameter using its current gradient."""
        pass

### Stochastic Gradient Descent (SGD)

TODO EXPLAIN

In [None]:
class SGD(Optimizer):
    """Plain gradient descent: ``data <- data - learning_rate * grad``.

    Args:
        params: Parameters to optimise; each must expose ``data`` and ``grad``.
        learning_rate: Step size applied to every gradient.
    """

    def __init__(self, params: list, learning_rate: float = 0.01):
        super().__init__(params=params, learning_rate=learning_rate)

    def step(self):
        """Move every parameter one step against its gradient."""
        for param in self.params:
            # Update the stored value explicitly: `param -= ...` would build a
            # new object and merely rebind the loop variable, leaving the
            # parameter held in self.params untouched.
            param.data = param.data - self.learning_rate * param.grad

TODO EXPLAIN

### Root Mean Square Propagation (RMSProp)

TODO EXPLAIN

In [None]:
class RMSProp(Optimizer):
    """RMSProp: scale each step by a running average of squared gradients.

    Update rule, per parameter::

        avg  <- decay * avg + (1 - decay) * grad ** 2
        data <- data - learning_rate * grad / (sqrt(avg) + epsilon)

    Args:
        params: Parameters to optimise; each must expose ``data`` and ``grad``.
        learning_rate: Base step size.
        decay: Exponential decay factor of the squared-gradient average.
    """

    def __init__(self, params, learning_rate=0.01, decay=0.9):
        super().__init__(params=params, learning_rate=learning_rate)
        self.decay = decay
        self.epsilon = 1e-8  # keeps the denominator away from zero
        # Float accumulators, one per parameter, shaped like the parameter.
        self.avg_sq_grad = [
            np.zeros_like(param.data, dtype=float) for param in self.params
        ]

    def step(self):
        """Apply one RMSProp update to every parameter."""
        for index, param in enumerate(self.params):
            grad = param.grad
            self.avg_sq_grad[index] = (
                self.decay * self.avg_sq_grad[index]
                + (1 - self.decay) * grad ** 2
            )
            denom = np.sqrt(self.avg_sq_grad[index]) + self.epsilon
            # Assign to .data so the stored parameter itself is modified.
            param.data = param.data - self.learning_rate * grad / denom

TODO EXPLAIN

### Adagrad

TODO EXPLAIN

In [None]:
class Adagrad(Optimizer):
    """Adagrad: per-parameter step size shrinking with accumulated gradients.

    Update rule, per parameter::

        G    <- G + grad ** 2
        data <- data - learning_rate * grad / (sqrt(G) + epsilon)

    Args:
        params: Parameters to optimise; each must expose ``data`` and ``grad``.
        learning_rate: Base step size.
    """

    def __init__(self, params: list, learning_rate: float = 0.01):
        super().__init__(params=params, learning_rate=learning_rate)
        # Force a float accumulator: with integer-valued data, zeros_like would
        # otherwise produce an integer array that cannot hold squared gradients.
        self.G = [np.zeros_like(param.data, dtype=float) for param in self.params]
        self.epsilon = 1e-8  # keeps the denominator away from zero

    def step(self):
        """Apply one Adagrad update to every parameter."""
        for index, param in enumerate(self.params):
            grad = param.grad
            self.G[index] = self.G[index] + grad ** 2
            coef = np.sqrt(self.G[index]) + self.epsilon
            # Assign to .data: `param -= ...` would only rebind the loop
            # variable and leave the stored parameter unchanged.
            param.data = param.data - (self.learning_rate / coef) * grad

TODO EXPLAIN

### Adam

TODO EXPLAIN

In [None]:
class Adam(Optimizer):
    """Adam: momentum plus per-parameter scaling, with bias correction.

    Update rule at step ``t``, per parameter::

        m     <- beta1 * m + (1 - beta1) * grad
        v     <- beta2 * v + (1 - beta2) * grad ** 2
        m_hat  = m / (1 - beta1 ** t)
        v_hat  = v / (1 - beta2 ** t)
        data  <- data - learning_rate * m_hat / (sqrt(v_hat) + eps)

    Args:
        params: Parameters to optimise; each must expose ``data`` and ``grad``.
        learning_rate: Base step size.
        beta1: Decay rate of the first-moment (mean) estimate.
        beta2: Decay rate of the second-moment (uncentred variance) estimate.
        eps: Small constant added to the denominator.
    """

    def __init__(self, params: list, learning_rate: float = 0.001, beta1: float = 0.9, beta2: float = 0.999, eps: float = 1e-8):
        super().__init__(params=params, learning_rate=learning_rate)
        self.epsilon = eps
        self.beta1 = beta1
        self.beta2 = beta2
        self.t = 0  # number of steps taken, needed for bias correction
        self.m = [np.zeros_like(param.data, dtype=float) for param in self.params]
        self.v = [np.zeros_like(param.data, dtype=float) for param in self.params]

    def step(self):
        """Apply one Adam update to every parameter."""
        self.t += 1
        # Both moments start at zero and are therefore biased towards zero
        # during the first steps; dividing by these factors compensates.
        correction1 = 1 - self.beta1 ** self.t
        correction2 = 1 - self.beta2 ** self.t

        for index, param in enumerate(self.params):
            grad = param.grad
            self.m[index] = self.beta1 * self.m[index] + (1 - self.beta1) * grad
            self.v[index] = self.beta2 * self.v[index] + (1 - self.beta2) * grad ** 2
            m_hat = self.m[index] / correction1
            v_hat = self.v[index] / correction2
            coef = np.sqrt(v_hat) + self.epsilon
            # Assign to .data: `param -= ...` would only rebind the loop
            # variable and leave the stored parameter unchanged.
            param.data = param.data - self.learning_rate * m_hat / coef

TODO EXPLAIN

### AdamW

TODO EXPLAIN

In [None]:
class AdamW(Optimizer):
    """AdamW: Adam with weight decay decoupled from the gradient update.

    Update rule at step ``t``, per parameter::

        m     <- beta1 * m + (1 - beta1) * grad
        v     <- beta2 * v + (1 - beta2) * grad ** 2
        m_hat  = m / (1 - beta1 ** t)
        v_hat  = v / (1 - beta2 ** t)
        data  <- data - learning_rate * (m_hat / (sqrt(v_hat) + eps)
                                         + weight_decay * data)

    Args:
        params: Parameters to optimise; each must expose ``data`` and ``grad``.
        learning_rate: Base step size.
        beta1: Decay rate of the first-moment estimate.
        beta2: Decay rate of the second-moment estimate.
        eps: Small constant added to the denominator.
        weight_decay: Coefficient of the decay term that shrinks the weights.
    """

    def __init__(self, params: list, learning_rate: float = 0.001, beta1: float = 0.9, beta2: float = 0.999, eps: float = 1e-8, weight_decay: float = 0.01):
        super().__init__(params=params, learning_rate=learning_rate)
        self.epsilon = eps
        self.beta1 = beta1
        self.beta2 = beta2
        self.weight_decay = weight_decay
        self.t = 0  # number of steps taken, needed for bias correction
        self.m = [np.zeros_like(param.data, dtype=float) for param in self.params]
        self.v = [np.zeros_like(param.data, dtype=float) for param in self.params]

    def step(self):
        """Apply one AdamW update to every parameter."""
        self.t += 1
        # Compensate for the zero initialisation of both moment estimates.
        correction1 = 1 - self.beta1 ** self.t
        correction2 = 1 - self.beta2 ** self.t

        for index, param in enumerate(self.params):
            grad = param.grad
            self.m[index] = self.beta1 * self.m[index] + (1 - self.beta1) * grad
            self.v[index] = self.beta2 * self.v[index] + (1 - self.beta2) * grad ** 2
            m_hat = self.m[index] / correction1
            v_hat = self.v[index] / correction2
            coef = np.sqrt(v_hat) + self.epsilon

            adam_term = self.learning_rate * m_hat / coef
            # The decay term is subtracted together with the Adam term so that
            # it pulls the weights towards zero; adding it would grow them.
            decay_term = self.learning_rate * self.weight_decay * param.data
            # Assign to .data: `param -= ...` would only rebind the loop
            # variable and leave the stored parameter unchanged.
            param.data = param.data - adam_term - decay_term

TODO EXPLAIN

## Évaluation des optimiseurs

### Fonctions de Perte

TODO EXPLAIN

In [None]:
def f(x):
    """Return ``(x - 2) ** 2``, a quadratic whose minimum value 0 is reached at x = 2."""
    offset = x - 2
    return offset ** 2

def f_nonconvexe(x):
    """Return ``3 * x**2 - 2 * x``.

    NOTE(review): despite the name, this quadratic has a positive leading
    coefficient and is therefore convex -- confirm which function was intended.
    """
    quadratic_term = 3 * x ** 2
    linear_term = 2 * x
    return quadratic_term - linear_term

TODO EXPLAIN

### Expérimentation

TODO EXPLAIN

In [None]:
def eval_optim():
    """Placeholder for the optimiser comparison; currently does nothing."""
    return None

TODO EXPLAIN

## Implémentation d'un réseau de neurones

### Définition du modèle

In [None]:
def func_nn(x, W1, b1, W2, b2):
    """Evaluate two stacked affine maps: ``W2 * (W1 * x + b1) + b2``.

    No activation is applied between the two layers.

    Args:
        x: Input value.
        W1: Weight of the first layer.
        b1: Bias of the first layer.
        W2: Weight of the second layer.
        b2: Bias of the second layer.

    Returns:
        The output of the second layer.
    """
    hidden = W1 * x + b1
    return W2 * hidden + b2

def mse(y, y_hat):
    """Return the squared difference ``(y - y_hat) ** 2``.

    No averaging is done here; the result has the same shape as the
    difference of the inputs.
    """
    residual = y - y_hat
    return residual ** 2

### Entraînement du réseau

TODO EXPLAIN

In [None]:
def eval_nn_optim():
    """Placeholder for the neural-network training comparison; currently does nothing."""
    return None

TODO EXPLAIN

## Implémentation des schedulers de taux d’apprentissage

### LRScheduler

TODO EXPLAIN

In [None]:
class LRScheduler:
    """Base class for learning-rate schedulers (not implemented yet)."""

    def __init__(self, optimizer, initial_lr):
        """Accept the optimiser and initial rate; both are currently ignored.

        Nothing is stored on the instance.
        """

TODO EXPLAIN

### LRSchedulerOnPlateau

TODO EXPLAIN

In [None]:
class LRSchedulerOnPlateau(LRScheduler):
    """Plateau-based learning-rate scheduler (not implemented yet)."""

    def __init__(self, optimizer, initial_lr, patience=10, factor=0.1, min_lr=1e-6, mode="min", threshold=1e-4):
        """Accept the scheduler settings; all of them are currently ignored.

        Nothing is stored on the instance and the parent constructor is not
        called.
        NOTE(review): the parameter names suggest a reduce-on-plateau policy
        -- confirm the intended semantics before implementing.
        """

TODO EXPLAIN