# ***TP3 - Clovis Lechien***

1. [Utils](#Utils)
2. [SGD](#SGD)
3. [RMSProp](#RMSProp)
4. [Adagrad](#Adagrad)
5. [Adam](#Adam)
6. [AdamW](#AdamW)
7. [Evaluation des Optimiseurs](#evaluation-des-optimiseurs) FIXME
8. [Réseau de Neurones](#réseau-de-neurones) FIXME
9. [Scheduler de Taux d'Apprentissage](#schedulers) FIXME

In [20]:
import numpy as np
import torch
import torch.nn as nn
from torch.optim import Optimizer

In [21]:
# Génération du jeu de données linéaire
np.random.seed(0)
n_samples = 100
x_linear = np.linspace(-10, 10, n_samples)
y_linear = 3 * x_linear + 5 + np.random.normal(0, 2, n_samples)

 # Génération du jeu de données non linéaire
y_nonlinear = 0.5 * x_linear **2 - 4 * x_linear + np.random.normal(0 ,5 ,n_samples)

# ***Custom Tensor class***

In [109]:
class Tensor:

    """ stores a single scalar Tensor and its gradient """

    def __init__(self, data, _children=(), _op=''):

        self.data = data
        self.grad = 0.0
        # internal variables used for autograd graph construction
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op # the op that produced this node, for graphviz / debugging / etc

    def __add__(self, other):
        other = other if isinstance(other, Tensor) else Tensor(other)

        out = Tensor(self.data + other.data, (self, other), '+')

        def _backward():
            self.grad += out.grad
            other.grad += out.grad

        out._backward = _backward

        out._prev = set([self, other])
        return out

    def __mul__(self, other):

        other = other if isinstance(other, Tensor) else Tensor(other)

        out = Tensor(self.data * other.data, [self, other], '*')

        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward

        return out

    def __pow__(self, other):

        assert isinstance(other, (int, float)), "only supporting int/float powers for now"

        out = Tensor(self.data**other, (self,), f'**{other}')

        def _backward():
            self.grad += (other * self.data**(other-1)) * out.grad

        out._backward = _backward

        return out

    def relu(self):
        # FIXME: implement relu
        pass

    def build_topo(self, visited=None, topo=None):
        if self not in visited:
            visited.add(self)
            for child in self._prev:
                child.build_topo(visited=visited, topo=topo)
            topo.append(self)
        return topo

    def backward(self):
        # topological order all of the children in the graph
        topo = []
        visited = set()
        topo = self.build_topo(topo=topo, visited=visited)

        # go one variable at a time and apply the chain rule to get its gradient
        self.grad = 1.0
        for v in reversed(topo):
            v._backward()

    def __neg__(self): # -self
        return self * -1

    def __radd__(self, other): # other + self
        return self + other

    def __sub__(self, other): # self - other
        return self + (-other)

    def __rsub__(self, other): # other - self
        return other + (-self)

    def __rmul__(self, other): # other * self
        return self * other

    def __truediv__(self, other): # self / other
        return self * other**-1

    def __rtruediv__(self, other): # other / self
        return other * self**-1

    def __repr__(self):
        return f"Tensor(data={self.data}, grad={self.grad})"

## ***Custom operations***

In [111]:
def log_d(dual_number: Tensor):
    out = Tensor(np.log(dual_number.data), (dual_number,), 'log')

    def _backward():
        dual_number.grad += (1 / dual_number.data) * out.grad

    out._backward = _backward
    return out

def exp_d(dual_number: Tensor):
    out = Tensor(np.exp(dual_number.data), (dual_number,), 'exp')

    def _backward():
        dual_number.grad += np.exp(dual_number.data) * out.grad

    out._backward = _backward
    return out

def sin_d(dual_number: Tensor):
    out = Tensor(np.sin(dual_number.data), (dual_number,), 'sin')

    def _backward():
        dual_number.grad += np.cos(dual_number.data) * out.grad

    out._backward = _backward
    return out

def cos_d(dual_number: Tensor):
    out = Tensor(np.cos(dual_number.data), (dual_number,), 'cos')

    def _backward():
        dual_number.grad += -np.sin(dual_number.data) * out.grad

    out._backward = _backward
    return out

def sigmoid_d(dual_number: Tensor):
    sig = 1 / (1 + np.exp(-dual_number.data))
    out = Tensor(sig, (dual_number,), 'sigmoid')

    def _backward():
        dual_number.grad += sig * (1 - sig) * out.grad

    out._backward = _backward
    return out

def tanh_d(dual_number: Tensor):
    tanh = np.tanh(dual_number.data)
    out = Tensor(tanh, (dual_number,), 'tanh')

    def _backward():
        dual_number.grad += (1 - tanh**2) * out.grad

    out._backward = _backward
    return out

def tan_d(dual_number: Tensor):
    out = Tensor(np.tan(dual_number.data), (dual_number,), 'tan')

    def _backward():
        dual_number.grad += (1 / np.cos(dual_number.data)**2) * out.grad

    out._backward = _backward
    return out

def sqrt_d(dual_number: Tensor):
    out = Tensor(np.sqrt(dual_number.data), (dual_number,), 'sqrt')

    def _backward():
        dual_number.grad += (0.5 / np.sqrt(dual_number.data)) * out.grad

    out._backward = _backward
    return out


def pow_d(dual_number: Tensor, power: int):
    out = Tensor(dual_number.data**power, (dual_number,), f'pow{power}')

    def _backward():
        dual_number.grad += (power * dual_number.data**(power-1)) * out.grad

    out._backward = _backward
    return out

def softmax_d(dual_number: Tensor):
    e = np.exp(dual_number.data - np.max(dual_number.data))
    out = Tensor(e / np.sum(e), (dual_number,), 'softmax')

    def _backward():
        for i in range(len(dual_number.data)):
            for j in range(len(dual_number.data)):
                if i == j:
                    dual_number.grad[i] += out.data[i] * (1 - out.data[i]) * out.grad[i]
                else:
                    dual_number.grad[i] += -out.data[i] * out.data[j] * out.grad[j]

    out._backward = _backward
    return out

# ***Utils<a name="Utils"></a>***

In [154]:
def optimizer_testing_loop(parameters : dict[str,]):
    model = parameters['model']

    criterion = parameters['criterion']
    optimizer = parameters['optimizer']

    x_tensor = parameters['x_tensor']
    y_tensor = parameters['y_tensor']

    epochs = parameters['epochs']
    for epoch in range(epochs):
        optimizer.zero_grad()
        predictions = model(x_tensor)
        loss = criterion(predictions, y_tensor)
        loss.backward()
        optimizer.step()

        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

    for name, param in model.named_parameters():
        print(f"{name}: {param.data}")


def check_diffs(a : list[float], b : list[float], tol : float = 1e-4):
    res = np.allclose(a, b, atol=tol)
    if res:
        print(f"All elements between\n{a}\and\n{b}\nare close within a tolerance of {tol}")
    else:
        print("Test failed")

# ***SGD<a name="SGD"></a>***

## **Implementation de SGD**

In [25]:
class SGD_torch(Optimizer):
    def __init__(self, params, learning_rate=0.01):
        hyperparams = {'lr': learning_rate}
        super().__init__(params=params, defaults=hyperparams)

    @torch.no_grad()
    def step(self):
        for group in self.param_groups:
            lr = group['lr']

            for theta_t in group['params']:
                if theta_t.grad is None:
                    continue

                theta_t -= lr * theta_t.grad


class SGD_custom:
    def __init__(self, params, learning_rate=0.01):
        self.params = params
        self.learning_rate = learning_rate

    def step(self):
        for param in self.params:
            if param.grad is not None:
                param.data -= self.learning_rate * param.grad

    def zero_grad(self):
        for param in self.params:
            param.grad = 0.0

## **Test de SGD**

In [26]:
model = nn.Linear(1, 1)
linear_parameters = {
    'model': model,
    'criterion': nn.MSELoss(),
    'optimizer': SGD_torch(model.parameters(), learning_rate=0.01),
    'x_tensor': torch.from_numpy(x_linear).float().view(-1, 1),
    'y_tensor': torch.from_numpy(y_linear).float().view(-1, 1),
    'epochs': 100
}

optimizer_testing_loop(linear_parameters)

Epoch 10, Loss: 18.837690353393555
Epoch 20, Loss: 13.9168119430542
Epoch 30, Loss: 10.63158893585205
Epoch 40, Loss: 8.438353538513184
Epoch 50, Loss: 6.974130153656006
Epoch 60, Loss: 5.996603965759277
Epoch 70, Loss: 5.343999862670898
Epoch 80, Loss: 4.908313751220703
Epoch 90, Loss: 4.617447376251221
Epoch 100, Loss: 4.4232635498046875
weight: tensor([[2.9703]])
bias: tensor([4.5076])


In [27]:
model = nn.Linear(1, 1)
linear_parameters = {
    'model': model,
    'criterion': nn.MSELoss(),
    'optimizer': SGD_torch(model.parameters(), learning_rate=0.01),
    'x_tensor': torch.from_numpy(x_linear).float().view(-1, 1),
    'y_tensor': torch.from_numpy(y_nonlinear).float().view(-1, 1),
    'epochs': 100
}

optimizer_testing_loop(linear_parameters)

Epoch 10, Loss: 464.37384033203125
Epoch 20, Loss: 402.1067199707031
Epoch 30, Loss: 360.53668212890625
Epoch 40, Loss: 332.7842102050781
Epoch 50, Loss: 314.2564697265625
Epoch 60, Loss: 301.8872375488281
Epoch 70, Loss: 293.62933349609375
Epoch 80, Loss: 288.1163635253906
Epoch 90, Loss: 284.43585205078125
Epoch 100, Loss: 281.97869873046875
weight: tensor([[-4.1445]])
bias: tensor([15.2363])


# ***RMSProp<a name="RMSProp"></a>***

## **Implementation de RMSProp**

In [114]:
class RMSProp_torch(Optimizer):
    def __init__(self, params, learning_rate=0.01, decay=0.9):
        hyperparams = {'lr': learning_rate, 'decay': decay}
        super().__init__(params=params, defaults=hyperparams)

    @torch.no_grad()
    def step(self):
        for group in self.param_groups:
            decay = group['decay']
            lr = group['lr']

            for theta_t in group['params']:
                if theta_t.grad is None:
                    continue

                state = self.state[theta_t]
                if 'square_avg' not in state:
                    state['square_avg'] = torch.zeros_like(theta_t)

                square_avg = state['square_avg']
                square_avg = decay * square_avg + (1 - decay) * (theta_t.grad ** 2)
                state['square_avg'] = square_avg

                theta_t -= lr * theta_t.grad / square_avg.sqrt()


class RMSProp_custom:
    def __init__(self, params, learning_rate=0.01, decay=0.9):
        self.params = params
        self.learning_rate = learning_rate
        self.decay = decay
        self.state = {param: {'square_avg': Tensor(0.0)} for param in params}

    def step(self):
        for theta_t in self.params:
            if theta_t.grad is None:
                continue

            state = self.state[theta_t]

            square_avg = state['square_avg']
            square_avg.data = self.decay * square_avg.data + (1 - self.decay) * (theta_t.grad ** 2)
            state['square_avg'] = square_avg

            theta_t.data -= self.learning_rate * theta_t.grad / np.sqrt(square_avg.data)

    def zero_grad(self):
        for param in self.params:
            param.grad = 0.0

## **Test de RMSProp**

In [29]:
model = nn.Linear(1, 1)
linear_parameters = {
    'model': model,
    'criterion': nn.MSELoss(),
    'optimizer': RMSProp_torch(model.parameters(), learning_rate=0.01, decay=0.8),
    'x_tensor': torch.from_numpy(x_linear).float().view(-1, 1),
    'y_tensor': torch.from_numpy(y_linear).float().view(-1, 1),
    'epochs': 100
}

optimizer_testing_loop(linear_parameters)

Epoch 10, Loss: 416.08563232421875
Epoch 20, Loss: 392.09796142578125
Epoch 30, Loss: 369.3656921386719
Epoch 40, Loss: 347.38226318359375
Epoch 50, Loss: 326.0961608886719
Epoch 60, Loss: 305.5015869140625
Epoch 70, Loss: 285.5976257324219
Epoch 80, Loss: 266.3839416503906
Epoch 90, Loss: 247.86012268066406
Epoch 100, Loss: 230.02578735351562
weight: tensor([[0.4664]])
bias: tensor([1.7960])


In [30]:
model = nn.Linear(1, 1)
linear_parameters = {
    'model': model,
    'criterion': nn.MSELoss(),
    'optimizer': RMSProp_torch(model.parameters(), learning_rate=0.01, decay=0.8),
    'x_tensor': torch.from_numpy(x_linear).float().view(-1, 1),
    'y_tensor': torch.from_numpy(y_nonlinear).float().view(-1, 1),
    'epochs': 100
}

optimizer_testing_loop(linear_parameters)

Epoch 10, Loss: 924.3912353515625
Epoch 20, Loss: 900.039306640625
Epoch 30, Loss: 876.9500122070312
Epoch 40, Loss: 854.6093139648438
Epoch 50, Loss: 832.9650268554688
Epoch 60, Loss: 812.0113525390625
Epoch 70, Loss: 791.7471923828125
Epoch 80, Loss: 772.171875
Epoch 90, Loss: 753.2850952148438
Epoch 100, Loss: 735.08642578125
weight: tensor([[-1.9983]])
bias: tensor([0.1038])


# ***Adagrad<a name="Adagrad"></a>***

## **Implementation de Adagrad**

In [116]:
class Adagrad_torch(Optimizer):
    def __init__(self, params, learning_rate=0.01):
        hyperparams = {'lr': learning_rate}
        super().__init__(params=params, defaults=hyperparams)

    @torch.no_grad()
    def step(self):
        for group in self.param_groups:
            lr = group['lr']

            for theta_t in group['params']:
                if theta_t.grad is None:
                    continue

                state = self.state[theta_t]
                if 'sum_squared_grads' not in state:
                    state['sum_squared_grads'] = torch.zeros_like(theta_t)

                sum_squared_grads = state['sum_squared_grads']
                sum_squared_grads += theta_t.grad ** 2
                state['sum_squared_grads'] = sum_squared_grads

                adjusted_lr = lr / sum_squared_grads.sqrt()

                theta_t -= adjusted_lr * theta_t.grad


class Adagrad_custom:
    def __init__(self, params, learning_rate=0.01):
        self.params = params
        self.learning_rate = learning_rate
        self.state = {param: {'sum_squared_grads': Tensor(0.0)} for param in params}

    def step(self):
        for theta_t in self.params:
            if theta_t.grad is None:
                continue

            state = self.state[theta_t]

            sum_squared_grads = state['sum_squared_grads']
            sum_squared_grads.data += theta_t.grad ** 2
            state['sum_squared_grads'] = sum_squared_grads

            adjusted_lr = self.learning_rate / np.sqrt(sum_squared_grads.data)

            theta_t.data -= adjusted_lr * theta_t.grad

    def zero_grad(self):
        for param in self.params:
            param.grad = 0.0

## **Test de Adagrad**

In [32]:
model = nn.Linear(1, 1)
linear_parameters = {
    'model': model,
    'criterion': nn.MSELoss(),
    'optimizer': Adagrad_torch(model.parameters(), learning_rate=0.5),
    'x_tensor': torch.from_numpy(x_linear).float().view(-1, 1),
    'y_tensor': torch.from_numpy(y_linear).float().view(-1, 1),
    'epochs': 100
}

optimizer_testing_loop(linear_parameters)

Epoch 10, Loss: 60.79481887817383
Epoch 20, Loss: 17.981473922729492
Epoch 30, Loss: 8.222616195678711
Epoch 40, Loss: 5.550739765167236
Epoch 50, Loss: 4.680315017700195
Epoch 60, Loss: 4.341108322143555
Epoch 70, Loss: 4.1887288093566895
Epoch 80, Loss: 4.114058494567871
Epoch 90, Loss: 4.075798034667969
Epoch 100, Loss: 4.055779933929443
weight: tensor([[2.9690]])
bias: tensor([4.9744])


In [33]:
model = nn.Linear(1, 1)
linear_parameters = {
    'model': model,
    'criterion': nn.MSELoss(),
    'optimizer': Adagrad_torch(model.parameters(), learning_rate=0.5),
    'x_tensor': torch.from_numpy(x_linear).float().view(-1, 1),
    'y_tensor': torch.from_numpy(y_nonlinear).float().view(-1, 1),
    'epochs': 100
}

optimizer_testing_loop(linear_parameters)

Epoch 10, Loss: 728.67822265625
Epoch 20, Loss: 556.0093994140625
Epoch 30, Loss: 476.72149658203125
Epoch 40, Loss: 433.33746337890625
Epoch 50, Loss: 406.7573547363281
Epoch 60, Loss: 388.84918212890625
Epoch 70, Loss: 375.7533874511719
Epoch 80, Loss: 365.5209045410156
Epoch 90, Loss: 357.1200866699219
Epoch 100, Loss: 349.97967529296875
weight: tensor([[-4.0441]])
bias: tensor([8.9320])


# ***Adam<a name="Adam"></a>***

## **Implementation de Adam**

In [124]:
class Adam_torch(Optimizer):
    def __init__(self, params, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        hyperparams = {'lr': learning_rate, 'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon}
        super().__init__(params=params, defaults=hyperparams)

    @torch.no_grad()
    def step(self):
        for group in self.param_groups:
            lr = group['lr']
            beta1 = group['beta1']
            beta2 = group['beta2']
            epsilon = group['epsilon']

            for theta_t in group['params']:
                if theta_t.grad is None:
                    continue

                state = self.state[theta_t]
                if 'm' not in state: # Moment d'ordre 1
                    state['m'] = torch.zeros_like(theta_t)
                if 'v' not in state: # Moment d'ordre 2
                    state['v'] = torch.zeros_like(theta_t)
                if 't' not in state: # Temps
                    state['t'] = 0

                # Premier Moment
                m = state['m']
                m_t = beta1 * m + (1 - beta1) * theta_t.grad
                state['m'] = m_t

                # Second Moment
                v = state['v']
                v_t = beta2 * v + (1 - beta2) * theta_t.grad ** 2
                state['v'] = v_t

                # Temps
                t = state['t'] + 1
                state['t'] = t

                # Correction des biais
                m_hat = m_t / (1 - beta1 ** t)
                v_hat = v_t / (1 - beta2 ** t)

                theta_t -= lr * m_hat / (v_hat.sqrt() + epsilon)


class Adam_custom:
    def __init__(self, params, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.params = params
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.state = {param: {'m': Tensor(0.0), 'v': Tensor(0.0), 't': 0} for param in params}

    def step(self):
        for theta_t in self.params:
            if theta_t.grad is None:
                continue

            state = self.state[theta_t]

            # Premier Moment
            m = state['m']
            m_t = self.beta1 * m.data + (1 - self.beta1) * theta_t.grad
            state['m'].data = m_t

            # Second Moment
            v = state['v']
            v_t = self.beta2 * v.data + (1 - self.beta2) * theta_t.grad ** 2
            state['v'].data = v_t

            # Temps
            t = state['t'] + 1
            state['t'] = t

            # Correction des biais
            m_hat = m_t / (1 - self.beta1 ** t)
            v_hat = v_t / (1 - self.beta2 ** t)

            theta_t.data -= self.learning_rate * m_hat / (np.sqrt(v_hat) + self.epsilon)

    def zero_grad(self):
        for param in self.params:
            param.grad = 0.0

## **Test de Adam**

In [35]:
model = nn.Linear(1, 1)
linear_parameters = {
    'model': model,
    'criterion': nn.MSELoss(),
    'optimizer': Adam_torch(model.parameters(), learning_rate=0.1, beta1=0.9, beta2=0.999, epsilon=1e-8),
    'x_tensor': torch.from_numpy(x_linear).float().view(-1, 1),
    'y_tensor': torch.from_numpy(y_linear).float().view(-1, 1),
    'epochs': 100
}

optimizer_testing_loop(linear_parameters)

Epoch 10, Loss: 306.93963623046875
Epoch 20, Loss: 145.0484619140625
Epoch 30, Loss: 54.37504577636719
Epoch 40, Loss: 15.743021011352539
Epoch 50, Loss: 5.091212272644043
Epoch 60, Loss: 4.114846706390381
Epoch 70, Loss: 4.423768997192383
Epoch 80, Loss: 4.31403923034668
Epoch 90, Loss: 4.114806652069092
Epoch 100, Loss: 4.03961181640625
weight: tensor([[2.9796]])
bias: tensor([5.1556])


In [36]:
model = nn.Linear(1, 1)
linear_parameters = {
    'model': model,
    'criterion': nn.MSELoss(),
    'optimizer': Adam_torch(model.parameters(), learning_rate=0.1, beta1=0.9, beta2=0.999, epsilon=1e-8),
    'x_tensor': torch.from_numpy(x_linear).float().view(-1, 1),
    'y_tensor': torch.from_numpy(y_nonlinear).float().view(-1, 1),
    'epochs': 100
}

optimizer_testing_loop(linear_parameters)

Epoch 10, Loss: 1138.9935302734375
Epoch 20, Loss: 869.0379638671875
Epoch 30, Loss: 676.70654296875
Epoch 40, Loss: 551.13232421875
Epoch 50, Loss: 475.0643310546875
Epoch 60, Loss: 430.46612548828125
Epoch 70, Loss: 402.9716491699219
Epoch 80, Loss: 383.5935974121094
Epoch 90, Loss: 367.9411315917969
Epoch 100, Loss: 354.38629150390625
weight: tensor([[-4.1813]])
bias: tensor([8.6936])


# ***AdamW<a name="AdamW"></a>***

In [132]:
class AdamW_torch(Optimizer):
    def __init__(self, params, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, weight_decay=0.01):
        hyperparams = {'lr': learning_rate, 'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon, 'weight_decay': weight_decay}
        super().__init__(params=params, defaults=hyperparams)

    @torch.no_grad()
    def step(self):
        for group in self.param_groups:
            lr = group['lr']
            beta1 = group['beta1']
            beta2 = group['beta2']
            epsilon = group['epsilon']
            weight_decay = group['weight_decay']

            for theta_t in group['params']:
                if theta_t.grad is None:
                    continue

                state = self.state[theta_t]
                if 'm' not in state: # Moment d'ordre 1
                    state['m'] = torch.zeros_like(theta_t)
                if 'v' not in state: # Moment d'ordre 2
                    state['v'] = torch.zeros_like(theta_t)
                if 't' not in state: # Temps
                    state['t'] = 0

                # Premier Moment
                m = state['m']
                m_t = beta1 * m + (1 - beta1) * theta_t.grad
                state['m'] = m_t

                # Second Moment
                v = state['v']
                v_t = beta2 * v + (1 - beta2) * theta_t.grad ** 2
                state['v'] = v_t

                # Temps
                t = state['t'] + 1
                state['t'] = t

                # Correction des biais
                m_hat = m_t / (1 - beta1 ** t)
                v_hat = v_t / (1 - beta2 ** t)

                theta_t -= lr * m_hat / (np.sqrt(v_hat) + epsilon) - lr * weight_decay * theta_t


class AdamW_custom:
    def __init__(self, params, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, weight_decay=0.01):
        self.params = params
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.weight_decay = weight_decay
        self.state = {param: {'m': Tensor(0.0), 'v': Tensor(0.0), 't': 0} for param in params}

    def step(self):
        for theta_t in self.params:
            if theta_t.grad is None:
                continue

            state = self.state[theta_t]

            # Premier Moment
            m = state['m']
            m_t = self.beta1 * m.data + (1 - self.beta1) * theta_t.grad
            state['m'].data = m_t

            # Second Moment
            v = state['v']
            v_t = self.beta2 * v.data + (1 - self.beta2) * theta_t.grad ** 2
            state['v'].data = v_t

            # Temps
            t = state['t'] + 1
            state['t'] = t

            # Correction des biais
            m_hat = m_t / (1 - self.beta1 ** t)
            v_hat = v_t / (1 - self.beta2 ** t)

            theta_t.data -= self.learning_rate * m_hat / (np.sqrt(v_hat) + self.epsilon) - self.learning_rate * self.weight_decay * theta_t.data

    def zero_grad(self):
        for param in self.params:
            param.grad = 0.0

## ***Test de AdamW***

In [133]:
model = nn.Linear(1, 1)
linear_parameters = {
    'model': model,
    'criterion': nn.MSELoss(),
    'optimizer': AdamW_torch(model.parameters(), learning_rate=0.1, beta1=0.9, beta2=0.999, epsilon=1e-8, weight_decay=0.01),
    'x_tensor': torch.from_numpy(x_linear).float().view(-1, 1),
    'y_tensor': torch.from_numpy(y_linear).float().view(-1, 1),
    'epochs': 100
}

optimizer_testing_loop(linear_parameters)

Epoch 10, Loss: 210.1480255126953
Epoch 20, Loss: 82.62165832519531
Epoch 30, Loss: 22.42963218688965
Epoch 40, Loss: 5.376733303070068
Epoch 50, Loss: 4.686898231506348
Epoch 60, Loss: 5.464333534240723
Epoch 70, Loss: 4.954035758972168
Epoch 80, Loss: 4.336297988891602
Epoch 90, Loss: 4.101994037628174
Epoch 100, Loss: 4.055665969848633
weight: tensor([[2.9842]])
bias: tensor([5.2393])


In [134]:
model = nn.Linear(1, 1)
linear_parameters = {
    'model': model,
    'criterion': nn.MSELoss(),
    'optimizer': AdamW_torch(model.parameters(), learning_rate=0.1, beta1=0.9, beta2=0.999, epsilon=1e-8, weight_decay=0.01),
    'x_tensor': torch.from_numpy(x_linear).float().view(-1, 1),
    'y_tensor': torch.from_numpy(y_linear).float().view(-1, 1),
    'epochs': 100
}

optimizer_testing_loop(linear_parameters)

Epoch 10, Loss: 118.39867401123047
Epoch 20, Loss: 35.853355407714844
Epoch 30, Loss: 10.578374862670898
Epoch 40, Loss: 8.456700325012207
Epoch 50, Loss: 7.222285747528076
Epoch 60, Loss: 5.000164031982422
Epoch 70, Loss: 4.1139140129089355
Epoch 80, Loss: 4.039061069488525
Epoch 90, Loss: 4.043933868408203
Epoch 100, Loss: 4.068148612976074
weight: tensor([[2.9916]])
bias: tensor([5.2697])


# ***Evaluation des Optimiseurs<a name="evaluation-des-optimiseurs"></a>***

In [135]:
def f(x : torch.Tensor | Tensor):
    return (x - 2) ** 2


def f_nonconvexe(x : torch.Tensor | Tensor):
    return 3*x ** 2 - 2*x

In [158]:
def eval_optim(x : torch.Tensor | Tensor, convexe : bool = True, scheduler : bool = False):
    if convexe:
        print(f"Optimisation de la fonction convexe f(x) = (x - 2)²")
        y = f(x)
    else:
        print(f"Optimisation de la fonction non convexe f(x) = 3x² - 2x")
        y = f_nonconvexe(x)

    y.backward()
    if isinstance(x, torch.Tensor):
        print(f"Gradient de f en x={x.item()}: x.grad={x.grad.item()}")
    else:
        print(f"Gradient de f en x={x.data}: x.grad={x.grad}")

    resulting_x = []
    resulting_fx = []

    if isinstance(x, torch.Tensor):
        optimizers_torch = [
            SGD_torch,
            RMSProp_torch,
            Adagrad_torch,
            Adam_torch,
            AdamW_torch
        ]

        for optimizer in optimizers_torch:
            optimizer = optimizer([x])
            if scheduler:
                scheduler = LRSchedulerOnPlateau(optimizer, initial_lr=0.01, patience=5, factor=0.5, min_lr=1e-6, mode='min', threshold=1e-4)
            for i in range(100):
                optimizer.zero_grad()
                if convexe:
                    y = f(x)
                else:
                    y = f_nonconvexe(x)
                y.backward()
                optimizer.step()
                if scheduler:
                    scheduler.step(y)
            print(f"Optimiseur {optimizer.__class__.__name__}: x={x.item()}, f(x)={f(x).item()}")
            resulting_x.append(x.item())
            resulting_fx.append(f(x).item())

    else:
        optimizers_custom = [
            SGD_custom,
            RMSProp_custom,
            Adagrad_custom,
            Adam_custom,
            AdamW_custom
        ]

        for optimizer in optimizers_custom:
            optimizer = optimizer([x])
            if scheduler:
                scheduler = LRSchedulerOnPlateau(optimizer, initial_lr=0.01, patience=5, factor=0.5, min_lr=1e-6, mode='min', threshold=1e-4)
            for i in range(100):
                optimizer.zero_grad()
                if convexe:
                    y = f(x)
                else:
                    y = f_nonconvexe(x)
                y.backward()
                optimizer.step()
                if scheduler:
                    scheduler.step(y)
            print(f"Optimiseur {optimizer.__class__.__name__}: x={x.data}, f(x)={f(x).data}")
            resulting_x.append(x.data)
            resulting_fx.append(f(x).data)

    return resulting_x, resulting_fx

In [159]:
x = torch.tensor([69.], requires_grad=True)
conv_torch_x, conv_torch_fx = eval_optim(x, convexe=True)
print()
nonconv_torch_x, nonconv_torch_fx = eval_optim(x, convexe=False)

Optimisation de la fonction convexe f(x) = (x - 2)²
Gradient de f en x=69.0: x.grad=134.0
Optimiseur SGD_torch: x=10.885512351989746, f(x)=78.95233154296875
Optimiseur RMSProp_torch: x=9.80399227142334, f(x)=60.90229415893555
Optimiseur Adagrad_torch: x=9.618916511535645, f(x)=58.047889709472656
Optimiseur Adam_torch: x=9.519140243530273, f(x)=56.537471771240234
Optimiseur AdamW_torch: x=9.428813934326172, f(x)=55.18727493286133

Optimisation de la fonction non convexe f(x) = 3x² - 2x
Gradient de f en x=9.428813934326172: x.grad=69.43231201171875
Optimiseur SGD_torch: x=0.3520234227180481, f(x)=2.715826988220215
Optimiseur RMSProp_torch: x=0.33838367462158203, f(x)=2.7609689235687256
Optimiseur Adagrad_torch: x=0.3333333432674408, f(x)=2.777777671813965
Optimiseur Adam_torch: x=0.3333333432674408, f(x)=2.777777671813965
Optimiseur AdamW_torch: x=0.3333345949649811, f(x)=2.77777361869812


In [160]:
x = Tensor(69.)
conv_custom_x, conv_custom_fx = eval_optim(x, convexe=True)
print()
nonconv_custom_x, nonconv_custom_fx = eval_optim(x, convexe=False)

Optimisation de la fonction convexe f(x) = (x - 2)²
Gradient de f en x=69.0: x.grad=134.0
Optimiseur SGD_custom: x=10.885510244948467, f(x)=78.95229231308416
Optimiseur RMSProp_custom: x=9.803987775902403, f(x)=60.90222520643413
Optimiseur Adagrad_custom: x=9.618913070757662, f(x)=58.04783637976195
Optimiseur Adam_custom: x=9.519135827365043, f(x)=56.53740359036458
Optimiseur AdamW_custom: x=9.428814427498079, f(x)=55.18728379820361

Optimisation de la fonction non convexe f(x) = 3x² - 2x
Gradient de f en x=9.428814427498079: x.grad=69.43231722986997
Optimiseur SGD_custom: x=0.3520234079595069, f(x)=2.715826847913398
Optimiseur RMSProp_custom: x=0.3383837368874569, f(x)=2.760968605840092
Optimiseur Adagrad_custom: x=0.3333333333333333, f(x)=2.777777777777778
Optimiseur Adam_custom: x=0.3333333333333333, f(x)=2.777777777777778
Optimiseur AdamW_custom: x=0.33333456306680137, f(x)=2.77777367866773


In [165]:
check_diffs(conv_torch_x, conv_custom_x, tol=1e-4)
print()
check_diffs(conv_torch_fx, conv_custom_fx, tol=1e-4)

All elements between
[10.885512351989746, 9.80399227142334, 9.618916511535645, 9.519140243530273, 9.428813934326172]nd
[10.885510244948467, 9.803987775902403, 9.618913070757662, 9.519135827365043, 9.428814427498079]
are close within a tolerance of 0.0001

All elements between
[78.95233154296875, 60.90229415893555, 58.047889709472656, 56.537471771240234, 55.18727493286133]nd
[78.95229231308416, 60.90222520643413, 58.04783637976195, 56.53740359036458, 55.18728379820361]
are close within a tolerance of 0.0001


# ***Réseau de Neurones<a name="réseau-de-neurones"></a>***

In [142]:
def func_nn(x, W1, b1, W2, b2):
    h1 = W1 * x + b1
    y = W2 * h1 + b2
    return y


def mse(y, y_hat):
    return (y - y_hat) ** 2

In [166]:
def eval_nn_optim(scheduler : bool = False, custom : bool = True):
    results = []

    if not custom:
        optimizers = [
            SGD_torch,
            RMSProp_torch,
            Adagrad_torch,
            Adam_torch,
            AdamW_torch
        ]

        for optimizer in optimizers:
            W1 = torch.tensor([1.], requires_grad=True)
            b1 = torch.tensor([1.], requires_grad=True)
            W2 = torch.tensor([1.], requires_grad=True)
            b2 = torch.tensor([1.], requires_grad=True)

            x = torch.tensor([1.], requires_grad=True)
            y = torch.tensor([10.])

            optimizer = optimizer([W1, b1, W2, b2])

            if scheduler:
                scheduler = LRSchedulerOnPlateau(optimizer, initial_lr=0.01, patience=5, factor=0.5, min_lr=1e-6, mode='min', threshold=1e-4)

            for i in range(100):
                optimizer.zero_grad()

                y_hat = func_nn(x, W1, b1, W2, b2)
                loss = mse(y, y_hat)

                loss.backward()
                optimizer.step()

                if scheduler:
                    scheduler.step(loss)

            print(f"Optimiseur {optimizer.__class__.__name__}:\nW1={W1.item()}, b1={b1.item()}, W2={W2.item()}, b2={b2.item()}")
            results.append([W1.item(), b1.item(), W2.item(), b2.item()])

    else:
        optimizers = [
            SGD_custom,
            RMSProp_custom,
            Adagrad_custom,
            Adam_custom,
            AdamW_custom
        ]

        for optimizer in optimizers:
            W1 = Tensor(1.)
            b1 = Tensor(1.)
            W2 = Tensor(1.)
            b2 = Tensor(1.)

            x = Tensor(1.)
            y = Tensor(10.)

            optimizer = optimizer([W1, b1, W2, b2])

            if scheduler:
                scheduler = LRSchedulerOnPlateau(optimizer, initial_lr=0.01, patience=5, factor=0.5, min_lr=1e-6, mode='min', threshold=1e-4)

            for i in range(100):
                optimizer.zero_grad()

                y_hat = func_nn(x, W1, b1, W2, b2)
                loss = mse(y, y_hat)

                loss.backward()
                optimizer.step()

                if scheduler:
                    scheduler.step(loss)

            print(f"Optimiseur {optimizer.__class__.__name__}:\nW1={W1.data}, b1={b1.data}, W2={W2.data}, b2={b2.data}")
            results.append([W1.data, b1.data, W2.data, b2.data])

    return results

In [167]:
torch_nn = eval_nn_optim(scheduler=False, custom=False)

Optimiseur SGD_torch:
W1=1.7965515851974487, b1=1.7965515851974487, W2=2.356534481048584, b2=1.532727837562561
Optimiseur RMSProp_torch:
W1=1.956400752067566, b1=1.956400752067566, W2=1.956400752067566, b2=1.899566411972046
Optimiseur Adagrad_torch:
W1=1.186563491821289, b1=1.186563491821289, W2=1.186563491821289, b2=1.1806892156600952
Optimiseur Adam_torch:
W1=1.1003371477127075, b1=1.1003371477127075, W2=1.1003371477127075, b2=1.0987093448638916
Optimiseur AdamW_torch:
W1=1.1013890504837036, b1=1.1013890504837036, W2=1.1013890504837036, b2=1.0997445583343506


In [168]:
custom_nn = eval_nn_optim(scheduler=False, custom=True)

Optimiseur SGD_custom:
W1=1.7965517126874235, b1=1.7965517126874235, W2=2.3565344500454675, b2=1.532727995527799
Optimiseur RMSProp_custom:
W1=1.9564006987933535, b1=1.9564006987933535, W2=1.9564006987933535, b2=1.8995662439967032
Optimiseur Adagrad_custom:
W1=1.1865635594156694, b1=1.1865635594156694, W2=1.1865635594156694, b2=1.1806890649673805
Optimiseur Adam_custom:
W1=1.1003370383873736, b1=1.1003370383873736, W2=1.1003370384224467, b2=1.0987096217296624
Optimiseur AdamW_custom:
W1=1.1013890816161964, b1=1.1013890816161964, W2=1.1013890816512846, b2=1.0997449288859633


In [169]:
check_diffs(torch_nn, custom_nn, tol=1e-4)

All elements between
[[1.7965515851974487, 1.7965515851974487, 2.356534481048584, 1.532727837562561], [1.956400752067566, 1.956400752067566, 1.956400752067566, 1.899566411972046], [1.186563491821289, 1.186563491821289, 1.186563491821289, 1.1806892156600952], [1.1003371477127075, 1.1003371477127075, 1.1003371477127075, 1.0987093448638916], [1.1013890504837036, 1.1013890504837036, 1.1013890504837036, 1.0997445583343506]]nd
[[1.7965517126874235, 1.7965517126874235, 2.3565344500454675, 1.532727995527799], [1.9564006987933535, 1.9564006987933535, 1.9564006987933535, 1.8995662439967032], [1.1865635594156694, 1.1865635594156694, 1.1865635594156694, 1.1806890649673805], [1.1003370383873736, 1.1003370383873736, 1.1003370384224467, 1.0987096217296624], [1.1013890816161964, 1.1013890816161964, 1.1013890816512846, 1.0997449288859633]]
are close within a tolerance of 0.0001


# ***Scheduler de Taux d'apprentissage<a name="schedulers"></a>***

## **Implementation de LRScheduler**

In [174]:
class LRScheduler:
    def __init__(self, optimizer, initial_lr):
        self.optimizer = optimizer
        self.initial_lr = initial_lr

    def get_lr(self):
        return self.optimizer.param_groups[0]['lr']

    def set_lr(self, lr):
        for group in self.optimizer.param_groups:
            group['lr'] = lr

## **Implementation de LRSchedulerOnPlateau**

In [175]:
class LRSchedulerOnPlateau(LRScheduler):
    def __init__(self, optimizer, initial_lr, patience=10, factor=0.1, min_lr=1e-6, mode='min', threshold=1e-4):
        super().__init__(optimizer, initial_lr)
        self.patience = patience
        self.factor = factor
        self.min_lr = min_lr
        self.mode = mode
        self.threshold = threshold

        self.best_value = None
        self.num_bad_epochs = 0

    def step(self, current_value):
        if self.best_value is None:
            self.best_value = current_value
            return

        if self.mode == 'min':
            improvement = self.best_value - current_value
        elif self.mode == 'max':
            improvement = current_value - self.best_value
        else:
            raise ValueError("Mode must be either 'min' (minimize) or 'max' (maximize).")

        if improvement > self.threshold:
            self.best_value = current_value
            self.num_bad_epochs = 0
        else:
            self.num_bad_epochs += 1

        if self.num_bad_epochs >= self.patience:
            self.reduce_lr()

    def reduce_lr(self):
        current_lr = self.get_lr()
        new_lr = max(current_lr * self.factor, self.min_lr)
        if new_lr < current_lr:
            print(f"Reducing learning rate: {current_lr:.6f} -> {new_lr:.6f}")
            self.set_lr(new_lr)
        self.num_bad_epochs = 0

## **Test de LRSchedulerOnPlateau**

In [176]:
x = torch.tensor([69.], requires_grad=True)
conv_torch_scheduler_x, conv_torch_scheduler_fx = eval_optim(x, convexe=True, scheduler=True)
print()
nonconv_torch_scheduler_x, nonconv_torch_scheduler_fx = eval_optim(x, convexe=False, scheduler=True)

Optimisation de la fonction convexe f(x) = (x - 2)²
Gradient de f en x=69.0: x.grad=134.0
Optimiseur SGD_torch: x=10.885512351989746, f(x)=78.95233154296875
Optimiseur RMSProp_torch: x=9.80399227142334, f(x)=60.90229415893555
Optimiseur Adagrad_torch: x=9.618916511535645, f(x)=58.047889709472656
Optimiseur Adam_torch: x=9.519140243530273, f(x)=56.537471771240234
Optimiseur AdamW_torch: x=9.428813934326172, f(x)=55.18727493286133

Optimisation de la fonction non convexe f(x) = 3x² - 2x
Gradient de f en x=9.428813934326172: x.grad=69.43231201171875
Optimiseur SGD_torch: x=0.3520234227180481, f(x)=2.715826988220215
Reducing learning rate: 0.010000 -> 0.005000
Reducing learning rate: 0.005000 -> 0.002500
Reducing learning rate: 0.002500 -> 0.001250
Reducing learning rate: 0.001250 -> 0.000625
Reducing learning rate: 0.000625 -> 0.000313
Reducing learning rate: 0.000313 -> 0.000156
Reducing learning rate: 0.000156 -> 0.000078
Reducing learning rate: 0.000078 -> 0.000039
Reducing learning ra

In [177]:
x = Tensor(69.)
conv_custom_scheduler_x, conv_custom_scheduler_fx = eval_optim(x, convexe=True, scheduler=True)
print()
nonconv_custom_scheduler_x, nonconv_custom_scheduler_fx = eval_optim(x, convexe=False, scheduler=True)

Optimisation de la fonction convexe f(x) = (x - 2)²
Gradient de f en x=69.0: x.grad=134.0


TypeError: '>' not supported between instances of 'Tensor' and 'float'

In [49]:
eval_nn_optim(scheduler=True)

Reducing learning rate: 0.010000 -> 0.005000
Reducing learning rate: 0.005000 -> 0.002500
Reducing learning rate: 0.002500 -> 0.001250
Reducing learning rate: 0.001250 -> 0.000625
Reducing learning rate: 0.000625 -> 0.000313
Reducing learning rate: 0.000313 -> 0.000156
Reducing learning rate: 0.000156 -> 0.000078
Reducing learning rate: 0.000078 -> 0.000039
Reducing learning rate: 0.000039 -> 0.000020
Reducing learning rate: 0.000020 -> 0.000010
Reducing learning rate: 0.000010 -> 0.000005
Reducing learning rate: 0.000005 -> 0.000002
Reducing learning rate: 0.000002 -> 0.000001
Reducing learning rate: 0.000001 -> 0.000001
Optimiseur SGD_torch:
W1=1.796550989151001, b1=1.796550989151001, W2=2.3565328121185303, b2=1.5327274799346924
Optimisation du réseau de neurones:
W1=1.796550989151001, b1=1.796550989151001, W2=2.3565328121185303, b2=1.5327274799346924

Optimiseur RMSProp_torch:
W1=1.956400752067566, b1=1.956400752067566, W2=1.956400752067566, b2=1.899566411972046
Optimisation du rése