In [2]:
%matplotlib inline
import numpy as np
import torch

----------------------------
#Session 1 
----------------------------

# Training a Neural Network



## Linear regression from scratch

This material is adapted from from [this tutorial](https://colab.research.google.com/github/AyyappaSwamyPanthadi/JovianDLwithPyTorch/blob/main/02_linear_regression.ipynb).


### Quick recap on Linear Regression (You may skip this)



In this tutorial, we'll discuss one of the foundational algorithms in machine learning: *Linear regression*. We'll create a model that predicts crop yields for apples and oranges (*target variables*) by looking at the average temperature, rainfall, and humidity (*input variables or features*) in a region. Here's the training data:

![linear-regression-training-data](https://i.imgur.com/6Ujttb4.png)

In a linear regression model, each target variable is estimated to be a weighted sum of the input variables, offset by some constant, known as a bias :

```
yield_apple  = w11 * temp + w12 * rainfall + w13 * humidity + b1
yield_orange = w21 * temp + w22 * rainfall + w23 * humidity + b2
```

Visually, it means that the yield of apples is a linear or planar function of temperature, rainfall and humidity:

![linear-regression-graph](https://i.imgur.com/4DJ9f8X.png)

The *learning* part of linear regression is to figure out a set of weights `w11, w12,... w23, b1 & b2` using the training data, to make accurate predictions for new data. The _learned_ weights will be used to predict the yields for apples and oranges in a new region using the average temperature, rainfall, and humidity for that region. 

We'll _train_ our model by adjusting the weights slightly many times to make better predictions, using an optimization technique called *gradient descent*. Let's begin by importing Numpy and PyTorch.

### Training data

We can represent the training data using two matrices: `inputs` and `targets`, each with one row per observation, and one column per variable.

In [47]:
# Input (temp, rainfall, humidity)
inputs_np = np.array([[73, 67, 43], 
                   [91, 88, 64], 
                   [87, 134, 58], 
                   [102, 43, 37], 
                   [69, 96, 70]], dtype='float32')

In [48]:
# Targets (apples, oranges)
targets_np = np.array([[56, 70], 
                    [81, 101], 
                    [119, 133], 
                    [22, 37], 
                    [103, 119]], dtype='float32')

We've separated the input and target variables because we'll operate on them separately. Also, we've created numpy arrays, because this is typically how you would work with training data: read some CSV files as numpy arrays, do some processing, and then convert them to PyTorch tensors.

Let's convert the arrays to PyTorch tensors.

In [49]:
# Convert inputs and targets to tensors
inputs = torch.from_numpy(inputs_np)
targets = torch.from_numpy(targets_np)
print(inputs)
print(targets)

tensor([[ 73.,  67.,  43.],
        [ 91.,  88.,  64.],
        [ 87., 134.,  58.],
        [102.,  43.,  37.],
        [ 69.,  96.,  70.]])
tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.]])


### Prediction model and trainable parameters
The weights and biases (`w11, w12,... w23, b1 & b2`) can also be represented as matrices, initialized as random values. The first row of `w` and the first element of `b` are used to predict the first target variable, i.e., yield of apples, and similarly, the second for oranges.

In [50]:
# Weights and biases
w = torch.randn(2, 3, requires_grad=True)
b = torch.randn(2, requires_grad=True)
print(w)
print(b)

tensor([[ 1.2687,  1.4088, -0.0919],
        [ 0.5960, -1.3285, -0.0507]], requires_grad=True)
tensor([ 0.8669, -0.8363], requires_grad=True)


`torch.randn` creates a tensor with the given shape, with elements picked randomly from a [normal distribution](https://en.wikipedia.org/wiki/Normal_distribution) with mean 0 and standard deviation 1.



## Exercise 1: define the linear model
Our *model* is simply a function that performs a matrix multiplication of the `inputs` and the weights `w` (transposed) and adds the bias `b` (replicated for each observation).

![matrix-mult](https://i.imgur.com/WGXLFvA.png)

* Explain why the transpose and why the weight $W$ are at the right-hand-side of the matrix multiplication.
* Implement the model as a closure (recall that  `@` represents matrix multiplication in PyTorch, and the `.t` method returns the transpose of a tensor.)


In [51]:
# we use X * WT + b instead of W * X + b because the convention of pytorch uses column tensors 
debug = False
def exercise1():
  def model(inputs):
      '''This closure implements a linear model. 
      It should capture w and b from the local scope, while inputs are passed explicitly.'''
      wt = torch.t(w)
      return inputs@wt+b
  return model

def test_exercise1():
    model = exercise1()
    preds = model(inputs)
    global w, b
    with torch.no_grad():
      w*=2
      b*=2
      preds2 = model(inputs)
      w/=2
      b/=2
    if  preds.shape==targets.shape and torch.allclose(preds2, 2*preds, 1e-3):
      print('Exercise 1: OK')
    else:
      print('Exercise 1: NOK')
      if debug:
        print(f'targest have shape {targets.shape} while preds have shape {preds.shape}')
        print(f' result : {preds2}')
        print(f' expected : {preds*2}')
  

test_exercise1()

Exercise 1: OK



The matrix obtained by passing the input data into the model is a set of predictions for the target variables.

In [52]:
# Generate predictions
model=exercise1()
preds = model(inputs)
print(preds)

tensor([[ 183.9185,  -48.5211],
        [ 234.4100,  -66.7573],
        [ 294.6909, -129.9496],
        [ 187.4501,    0.9518],
        [ 217.2185,  -90.8017]], grad_fn=<AddBackward0>)


Let's compare the predictions of our model with the actual targets.

In [53]:
# Compare with targets
print(targets)

tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.]])


You can see a big difference between our model's predictions and the actual targets because we've initialized our model with random weights and biases. Obviously, we can't expect a randomly initialized model to *just work*.

## Exercise 2: Loss function

Before we improve our model, we need a way to evaluate how well our model is performing. We can compare the model's predictions with the actual targets using the following method:

* Calculate the difference between the two matrices (`preds` and `targets`).
* Square all elements of the difference matrix to remove negative values.
* Calculate the average of the elements in the resulting matrix.

The result is a single number, known as the **mean squared error** (MSE).

In [76]:
debug = False
def mse(t1, t2):
    sub = t1 - t2
    square = sub.square()
    return square.mean()

def test_exercise2():
    t1=torch.linspace(1,25,12).view((3,-1))
    t2 = t1/2;
    loss = mse(t1,t2)
    expected = torch.tensor(56.4318)
    if  torch.allclose(loss, expected, 1e-3):
      print('Exercise 2: OK')
    else:
      print('Exercise 2: NOK')
      if debug:
        print(f' result : {loss}')
        print(f' expected : {expected}')
  

test_exercise2()

Exercise 2: OK


`torch.sum` returns the sum of all the elements in a tensor. The `.numel` method of a tensor returns the number of elements in a tensor. Let's compute the mean squared error for the current predictions of our model.

In [55]:
# Compute loss
loss = mse(preds, targets)
print(loss)

tensor(26783.3184, grad_fn=<MeanBackward0>)


Here’s how we can interpret the result: *On average, each element in the prediction differs from the actual target by the square root of the loss*. And that’s pretty bad, considering the numbers we are trying to predict are themselves in the range 50–200. The result is called the *loss* because it indicates how bad the model is at predicting the target variables. It represents information loss in the model: the lower the loss, the better the model.

### Compute gradients with back-propagation

With PyTorch, we can automatically compute the gradient or derivative of the loss w.r.t. to the weights and biases because they have `requires_grad` set to `True`. We'll see how this is useful in just a moment.

In [56]:
# Compute gradients
loss.backward()

The gradients are stored in the `.grad` property of the respective tensors. Note that the derivative of the loss w.r.t. the weights matrix is itself a matrix with the same dimensions.

In [57]:
# Gradients for weights
print(w)
print(w.grad)

tensor([[ 1.2687,  1.4088, -0.0919],
        [ 0.5960, -1.3285, -0.0507]], requires_grad=True)
tensor([[ 12668.0938,  12738.5078,   7925.1528],
        [-12989.5615, -15925.9668,  -9420.7705]])


### Adjust weights and biases to reduce the loss

In [58]:
w
w.grad

tensor([[ 12668.0938,  12738.5078,   7925.1528],
        [-12989.5615, -15925.9668,  -9420.7705]])

In [59]:
with torch.no_grad():
    w -= w.grad * 1e-5
    b -= b.grad * 1e-5

We multiply the gradients with a very small number (`10^-5` in this case) to ensure that we don't modify the weights by a very large amount. We want to take a small step in the downhill direction of the gradient, not a giant leap. This number is called the *learning rate* of the algorithm. 

We use `torch.no_grad` to indicate to PyTorch that we shouldn't track, calculate, or modify gradients while updating the weights and biases.

In [60]:
# Let's verify that the loss is actually lower
preds = model(inputs)
loss = mse(preds, targets)
print(loss)

tensor(18617.1914, grad_fn=<MeanBackward0>)


Before we proceed, we reset the gradients to zero by invoking the `.zero_()` method. We need to do this because PyTorch accumulates gradients. Otherwise, the next time we invoke `.backward` on the loss, the new gradient values are added to the existing gradients, which may lead to unexpected results.

In [61]:
w.grad.zero_()
b.grad.zero_()
print(w.grad)
print(b.grad)

tensor([[0., 0., 0.],
        [0., 0., 0.]])
tensor([0., 0.])


### Train the model using gradient descent

As seen above, we reduce the loss and improve our model using the gradient descent optimization algorithm. Thus, we can _train_ the model using the following steps:

1. Generate predictions

2. Calculate the loss

3. Compute gradients w.r.t the weights and biases

4. Adjust the weights by subtracting a small quantity proportional to the gradient

5. Reset the gradients to zero

Let's implement the above step by step.

In [62]:
# Generate predictions
preds = model(inputs)
print(preds)

tensor([[162.7268, -24.3158],
        [206.5986, -34.8910],
        [262.0020, -91.8423],
        [166.1174,  24.5366],
        [190.6994, -59.9538]], grad_fn=<AddBackward0>)


In [63]:
# Calculate the loss
loss = mse(preds, targets)
loss_at_init = loss.item()
print(loss_at_init)

18617.19140625


In [64]:
# Compute gradients
loss.backward()
print(w.grad)
print(b.grad)

tensor([[ 10482.5879,  10396.3672,   6478.5972],
        [-10486.2988, -13224.3857,  -7756.2744]])
tensor([ 121.4288, -129.2933])


Let's update the weights and biases using the gradients computed above.

In [65]:
# Adjust weights & reset gradients
with torch.no_grad():
    w -= w.grad * 1e-5
    b -= b.grad * 1e-5
    w.grad.zero_()
    b.grad.zero_()

Let's take a look at the new weights and biases.

In [66]:
print(w)
print(b)

tensor([[ 1.0372,  1.1774, -0.2359],
        [ 0.8308, -1.0370,  0.1211]], requires_grad=True)
tensor([ 0.8642, -0.8335], requires_grad=True)


With the new weights and biases, the model should have a lower loss.

In [73]:
# Calculate loss
preds = model(inputs)
loss = mse(preds, targets)
loss_after_update = loss.item()
print(f'Loss before update: {loss_at_init}')
print(f'Loss after  update: {loss_after_update}')

Loss before update: 18617.19140625
Loss after  update: 13107.2548828125


We have already achieved a significant reduction in the loss merely by adjusting the weights and biases slightly using gradient descent.

##  Exercise 3: Gathering the update code in one function
Assuming a set of `inputs,targets` annotations, perform a complete update of the model parameters `w,b`.

In [79]:
debug = False

def perform_one_update(inputs, targets, w, b):
    preds = model(inputs)
    loss = mse(preds, targets)
    loss.backward()
    with torch.no_grad():
        w -= w.grad * 1e-5
        b -= b.grad * 1e-5
        w.grad.zero_()
        b.grad.zero_()
    

def test_exercise3():   
    model = exercise1()
    preds = model(inputs)
    loss = mse(preds, targets)
    loss_at_init = loss.item()
    perform_one_update(inputs, targets, w, b)
    preds = model(inputs)
    loss = mse(preds, targets)
    loss_after_one_update = loss.item()
    if  loss_after_one_update < loss_at_init:
        print('Exercise 3: OK')
    else:
        print('Exercise 3: NOK')
        if debug:
            print(f'before update : {loss_at_init}')
            print(f'after update : {loss_after_one_update}')
  

test_exercise3()


Exercise 3: OK


## Full-batch gradient descent
To reduce the loss further, we can repeat the process of adjusting the weights and biases using the gradients multiple times. Each iteration is called an _epoch_. Let's train the model for 100 epochs.

In [80]:
def train_with_GD(inputs, targets, w, b, epochs=100):
    for epoch in range(epochs):
        perform_one_update(inputs, targets, w, b)
  
train_with_GD(inputs, targets, w, b, epochs=100)


Once again, let's verify that the loss is now lower:

In [81]:
# Calculate loss
preds = model(inputs)
loss = mse(preds, targets)
loss_after_all_epochs_of_GD = loss.item()
print(f'Loss at init: {loss_at_init}')
print(f'Loss at the end: {loss_after_all_epochs_of_GD}')

Loss at init: 18617.19140625
Loss at the end: 533.44775390625


The loss is now much lower than its initial value. Let's look at the model's predictions and compare them with the targets.

In [82]:
# Predictions
preds

tensor([[ 62.1546,  79.1335],
        [ 77.2815, 105.5760],
        [121.8764, 107.6118],
        [ 48.4121,  89.0070],
        [ 77.5716,  97.1679]], grad_fn=<AddBackward0>)

In [83]:
# Targets
targets

tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.]])

The predictions are now quite close to the target variables. We can get even better results by training for a few more epochs. 

## Exercise 4: Implement Linear regression with SGD (instead of full batch GD)
In this exercise, you will implement SGD for the regression problem at hand. And you should notice that for 100 epochs, and the same initialization:
* SGD reaches a better loss than GD
* each epoch of SGD is slower than for GD because of lower parallelism

You can also implement SGD with Minibatch and verify it provides a trade-off between the loss and computation time.

In [85]:
#@title  New dataset
# Input (temp, rainfall, humidity)
inputs = np.array([[73, 67, 43], 
                   [91, 88, 64], 
                   [87, 134, 58], 
                   [102, 43, 37], 
                   [69, 96, 70], 
                   [74, 66, 43], 
                   [91, 87, 65], 
                   [88, 134, 59], 
                   [101, 44, 37], 
                   [68, 96, 71], 
                   [73, 66, 44], 
                   [92, 87, 64], 
                   [87, 135, 57], 
                   [103, 43, 36], 
                   [68, 97, 70]], 
                  dtype='float32')

# Targets (apples, oranges)
targets = np.array([[56, 70], 
                    [81, 101], 
                    [119, 133], 
                    [22, 37], 
                    [103, 119],
                    [57, 69], 
                    [80, 102], 
                    [118, 132], 
                    [21, 38], 
                    [104, 118], 
                    [57, 69], 
                    [82, 100], 
                    [118, 134], 
                    [20, 38], 
                    [102, 120]], 
                   dtype='float32')

inputs = torch.from_numpy(inputs)
targets = torch.from_numpy(targets)

In [99]:
#@title Adapt the train_with_GD into train_with_SGD : you should observe that the SGD algorithm is more efficient than GD
def train_with_SGD(inputs, targets, w, b, epochs=100):
    for epoch in range(epochs):
        np.random.shuffle(inputs) #PAS SHUFFLE INPUT MAIS LES INDICES SINON INPUTS MATCH PLUS AVEC TARGETS 
        for i in range(0,inputs.shape[0]):
            perform_one_update(inputs[i], targets[i], w, b)
    
def train_with_minibatchSGD(inputs, targets, w, b, epochs=100, bs=-1):
    raise Exception('You need to implement the function train_with_minibatchSGD')
                
def run_method_and_return_loss(method, w0, b0):
    kwargs={}
    if method=='SGD':
        train_function = train_with_SGD
    elif(method=='mini'):
        train_function = train_with_minibatchSGD        
        kwargs['bs'] = 4
    else:
        train_function = train_with_GD
    global w, b
    w = w0.clone().requires_grad_()
    b = b0.clone().requires_grad_()
    def model(inputs):
        return inputs @ w.t() +  b
    with torch.no_grad():
        preds = model(inputs)
        loss_at_init = mse(preds, targets).item()
    # perform  for  100 epochs
    train_function(inputs, targets, w, b, epochs=100, **kwargs)
    with torch.no_grad():
        preds = model(inputs)
        loss_at_end = mse(preds, targets).item()
    return loss_at_init, loss_at_end

def test_exercise4():
    # init weights and biases
    w0 = torch.randn(2,3)
    b0 = torch.randn(2)
    loss_at_init, loss_at_end = run_method_and_return_loss('GD', w0, b0)
    print(f'Loss at init of  GD: {loss_at_init}')
    print(f'Loss after 100 epochs of GD: {loss_at_end}')
    loss_at_init, loss_at_end = run_method_and_return_loss('SGD', w0, b0)
    print(f'Loss at init of SGD: {loss_at_init}')
    print(f'Loss after 100 epochs of SGD: {loss_at_end}')
    loss_at_init, loss_at_end = run_method_and_return_loss('mini', w0, b0)
    print(f'Loss at init of miniSGD: {loss_at_init}')
    print(f'Loss after 100 epochs of miniSGD: {loss_at_end}')
    print('What\'s the catch ?')
    print('=======================Profiling GD=======================')
    %time run_method_and_return_loss('GD', w0, b0)
    print('=======================Profiling SGD======================')
    %time run_method_and_return_loss('SGD', w0, b0)
    print('=====================Profiling mini SGD===================')
    %time run_method_and_return_loss('mini', w0, b0)

test_exercise4()


Loss at init of  GD: 16225.15625
Loss after 100 epochs of GD: 1190.6666259765625
Loss at init of SGD: 16225.15625
Loss after 100 epochs of SGD: 1191.3359375


Exception: You need to implement the function train_with_minibatchSGD

## Exercise 5: Other optimization variants

If you finish early enough you can also consider implementing the following extensions:
* momentum (a.k.a heavy ball)
* Nesterov acceleration
* ADAM


You can also experiment with so-called `weight decay` (which is merely an L$^2$ regularization on the weight matrix $W$).

For all variants, you should investigate its impact on 
* convergence speed (in terms of loss), 
* computational burden of each epoch

In real examples (by opposition to this toy example) you should also investigate generalization (regression loss on a hold-out set).

In [83]:
class OptimSGD:
    def __init__(self, w, b):
        self.w = w
        self.b = b
        def model(inputs):
            return inputs @ w.t() +  b
        self.model = model
        self.deltaw_prev = torch.zeros_like(w)
        self.deltab_prev = torch.zeros_like(b)
    
    def step(self, inputs, targets):
        preds = self.model(inputs)
        loss = mse(preds, targets)
        loss.backward()
        delta_w = w.grad * 1e-5
        delta_b = b.grad * 1e-5
        with torch.no_grad():
            self.w -= delta_w
            self.b -= delta_b
            self.w.grad.zero_()
            self.b.grad.zero_()
        self.update_deltas(delta_w, delta_b)
            
    def update_deltas(self, delta_w, delta_b):
        self.deltaw_prev.copy_(delta_w)
        self.deltab_prev.copy_(delta_b)


class OptimMomentum(OptimSGD):
    def __init__(self, w, b, gamma):
        super().__init__(w, b)
        self.gamma = gamma
    
    def step(self, inputs, targets):
        raise Exception('You need to implement the function OptimMomentum::step()')


class OptimNesterov(OptimSGD):
    def __init__(self, w, b, gamma):
        super().__init__(w, b)
        self.gamma = gamma
        
    def step(self, inputs, targets):        
        raise Exception('You need to implement the function OptimMomentum::step()')

def optim_factory(method, w, b):
    if method == 'SGD':
        return OptimSGD(w,b)
    
    elif method == 'momentum':
        return OptimMomentum(w,b, gamma= 1e-1)
    
    elif method == 'nesterov':
        return OptimNesterov(w,b, gamma= 1e-1)

    
def train_with_minibatchSGD(method, inputs, targets, w, b, epochs=100, bs=-1):
    optim = optim_factory(method, w, b)
    nsamples = inputs.shape[0]
    if bs == -1: 
        bs = nsamples
    nb_batches = int(nsamples/bs)
    for epoch in range(epochs):
        shuffled_indices = torch.randperm(nsamples)
        batches = torch.chunk(shuffled_indices, nb_batches)
        for batch in batches:
            optim.step(inputs[batch], targets[batch])
            
def run_method_and_return_loss(method, w0, b0):
    global w, b
    w = w0.clone().requires_grad_()
    b = b0.clone().requires_grad_()
    with torch.no_grad():
        preds = model(inputs)
        loss_at_init = mse(preds, targets).item()
    # perform  for  100 epochs
    train_with_minibatchSGD(method, inputs, targets, w, b, epochs=100, bs=1)
    with torch.no_grad():
        preds = model(inputs)
        loss_at_end = mse(preds, targets).item()
    return loss_at_init, loss_at_end


def test_exercise5():
    # init weights and biases
    w0 = torch.randn(2,3)
    b0 = torch.randn(2)
    loss_at_init, loss_at_end = run_method_and_return_loss('SGD', w0, b0)
    print(f'Loss at init of  SGD: {loss_at_init}')
    print(f'Loss after 100 epochs of SGD: {loss_at_end}')
    loss_at_init, loss_at_end = run_method_and_return_loss('momentum', w0, b0)
    print(f'Loss at init of  Momentum: {loss_at_init}')
    print(f'Loss after 100 epochs of Momentum: {loss_at_end}')
    loss_at_init, loss_at_end = run_method_and_return_loss('nesterov', w0, b0)
    print(f'Loss at init of  Nesterov: {loss_at_init}')
    print(f'Loss after 100 epochs of Nesterov: {loss_at_end}')
    print('=======================Profiling SGD=======================')
    %time run_method_and_return_loss('SGD', w0, b0)
    print('=======================Profiling momentum=======================')
    %time run_method_and_return_loss('momentum', w0, b0)
    print('=======================Profiling nesterov=======================')
    %time run_method_and_return_loss('nesterov', w0, b0)
   
test_exercise5()

Loss at init of  SGD: 28292.6953125
Loss after 100 epochs of SGD: 2.2992875576019287
Loss at init of  Momentum: 28292.6953125
Loss after 100 epochs of Momentum: 1.1301101446151733
Loss at init of  Nesterov: 28292.6953125
Loss after 100 epochs of Nesterov: 1.1042059659957886
CPU times: user 224 ms, sys: 1.4 ms, total: 226 ms
Wall time: 226 ms
CPU times: user 249 ms, sys: 1.23 ms, total: 250 ms
Wall time: 250 ms
CPU times: user 271 ms, sys: 1.09 ms, total: 272 ms
Wall time: 272 ms
