In [None]:
import torch
from torch import nn
import numpy as np
import matplotlib.pyplot as plt
import sys
from platform import python_version

In [None]:
# Record the installed PyTorch version alongside the notebook output.
print(torch.__version__)

In [None]:
# Record the Python interpreter version (platform.python_version).
print(python_version())

# 2.1 Introduction to Autograd

## What is Autograd

The autograd package is PyTorch's engine for computing derivatives; technically, it computes vector-Jacobian products. It provides automatic differentiation for all operations on Tensors. It is a define-by-run framework, which means that your backprop is defined by how your code is run, and that every single iteration can be different.

## Why Autograd

Neural networks are nothing more than composite mathematical functions that are delicately tweaked (trained) to output the required result. The tweaking, or training, is done through a remarkable algorithm called backpropagation. Backpropagation calculates the gradients of the loss with respect to the network's weights; these gradients are then used to update the weights and eventually reduce the loss. With autograd, we can skip all the steps of manually calculating our gradients. This can save us time and energy.

## How to perform Autograd

In [None]:
# without autograd: requires_grad is not requested for this tensor
t_1 = torch.randn(5)
print(f'Without autograd: {t_1}')

# with autograd: requires_grad=True asks PyTorch to track operations on t_2
t_2 = torch.randn(5, requires_grad=True)
print(f'With autograd: {t_2}')

In [None]:
# Add a tensor created without requires_grad (t_1) to one created with it (t_2);
# the second print shows whether the result itself requires grad.
t_add = t_1 + t_2
print(f'Addition of tensor: {t_add}')
print(f'Addition of tensor with autograd has attribute of requires_grad: {t_add.requires_grad}')

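In this case, `t_add.grad_fn` is an `AddBackward` node (shown as `AddBackward0` in recent PyTorch versions) because the tensor was produced by an addition. You can check it with `print(t_add.grad_fn)`.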

In [None]:
# Apply a few more operations that involve t_2 (directly, or through t_add)
# and report requires_grad for each result.
t_sub = t_1 - t_2
t_mul = t_1 * t_2
t_mean = t_add.mean()  # reduces t_add to a single value; used for backward() later
t_sig = t_add.sigmoid()
print(f'Subtraction of tensor with autograd has attribute of requires_grad: {t_sub.requires_grad}')
print(f'Multiplication of tensor with autograd has attribute of requires_grad: {t_mul.requires_grad}')
print(f'Mean of tensor with autograd has attribute of requires_grad: {t_mean.requires_grad}')
print(f'Sigmoid of tensor with autograd has attribute of requires_grad: {t_sig.requires_grad}')

## How grad is stored in tensor

Gradients are calculated automatically by calling the `.backward()` function on the output tensor.

In [None]:
# t_mean is the mean of t_add (a single value), so backward() is called
# without an explicit gradient argument.
t_mean.backward()

In [None]:
# Compare .grad on t_1 (created without requires_grad) and
# t_2 (created with requires_grad=True) after the backward pass.
print(f'Gradient of t_1 without autograd: {t_1.grad}')
print(f'Gradient of t_2 with autograd: {t_2.grad}')

> grad can be implicitly created only for **_scalar_** outputs.

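For a non-scalar output, `.backward()` needs a `gradient` argument: a vector with the same shape as the output. Autograd multiplies this vector with the Jacobian (a vector-Jacobian product), which is equivalent to reducing the output to a scalar before differentiating.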

## How to exclude gradient calculation

Sometimes, for example when we update our weights inside the training loop, an operation should not be part of the gradient computation, so we need to exclude it from gradient tracking. We can do this in 3 ways:
- `x.requires_grad_(False)`
- `x.detach()`
- `with torch.no_grad():`

In [None]:
# Way 1: switch tracking off on the tensor itself.
t_1 = torch.randn(5, requires_grad=True)
print(f'Autograd tensor: {t_1}')

# the trailing underscore marks an in-place method: t_1 itself is modified
t_1.requires_grad_(False)
print(f'Without autograd tensor: {t_1}')

In [None]:
# Way 2: detach() returns a new tensor with the same values that does not
# require the gradient; t_1 itself is left as it is.

t_2 = t_1.detach()
print(f't_2: {t_2}')

In [None]:
# In-place variant of detach(): detach_() modifies t_1 instead of returning
# a new tensor.
t_1 = torch.randn(5, requires_grad=True)
print(f'Autograd tensor: {t_1}')

t_1.detach_()
print(f'Inplace detach tensor: {t_1}')

In [None]:
# Way 3: compute the same expression outside and inside torch.no_grad()
# and compare what gets printed.
t_1 = torch.randn(5, requires_grad=True)
print(f't_1: {t_1}')

# computed with gradient tracking active
t_ans = t_1 + 2
print(f't_ans: {t_ans}')

# same expression, evaluated inside the no_grad context
with torch.no_grad():
    t_ans = t_1 + 2
    print(f't_ans: {t_ans}')
# t_ans still refers to the value assigned inside the no_grad block
print(f'Final t_ans: {t_ans}')

Note that whenever we call `.backward()`, the gradient for the tensor is accumulated in its `.grad` attribute. As a result, the gradients from successive calls are summed up.

In [None]:
# Demonstrates gradient accumulation: weights.grad is never reset inside the
# loop, so every backward() call adds to the values already stored there.
weights = torch.tensor([1., 2., 3., 4., 5.], requires_grad=True)
for epoch in range(5):
    output = (weights*2).sum()  # scalar output, so backward() needs no argument
    output.backward()
    print(f'Epoch {epoch}: {weights.grad}')

The gradients from every epoch are summed up, so the values stored in `.grad` are clearly incorrect. Before the next iteration and optimization step, we must reset the gradient by calling `.grad.zero_()`.

In [None]:
# Same loop as before, but the stored gradient is cleared at the end of each
# iteration so the next backward() starts from zero.
weights = torch.tensor([1., 2., 3., 4., 5.], requires_grad=True)
for epoch in range(5):
    output = (weights*2).sum()
    output.backward()
    print(f'Epoch {epoch}: {weights.grad}')
    weights.grad.zero_()  # in-place reset of the accumulated gradient

# 2.2 Linear Regression Example

General linear regression function: $$y = wX + b$$

In this example, let our function be $f(x) = 4x$; for now we ignore the bias $b$. First we generate dummy data for $X$ and $y$. Below is the plot of our data, which follows $f(x) = 4x$. We will use linear regression to find the weight (slope), which should be $w = 4$, and to predict the $y$ value for $x = 250$, which should be $y = 1000$.

In [None]:
# Dummy data following y = 4x, stored as float32 NumPy arrays.
X = np.array([2, 5, 7, 10, 15], dtype=np.float32)
y = np.array([8, 20, 28, 40, 60], dtype=np.float32)
# Scatter plot of the five (X, y) samples.
plt.scatter(X, y)
plt.xlabel('X')
plt.ylabel('y')
plt.show()

There are many ways to do linear regression. Here we will show you 3 ways to do so which are:
- Linear Regression using **NumPy** 
- Linear Regression using **PyTorch** without **_Autograd_**
- Linear Regression using **PyTorch** with **_Autograd_**

## Linear Regression using NumPy

First, let's initialise the weight to zero and set our hyperparameters.

In [None]:
w = 0.0                # initial weight (plain Python float for the NumPy version)
learning_rate = 0.001  # step size used in the weight update
n_iters = 15           # number of training iterations (epochs)

Define `forward()` function to return model prediction.

In [None]:
def forward(X):
    """Return the model prediction ``w * X``.

    ``w`` is read from the notebook's global scope at call time, so the result
    always reflects the current value of the weight.
    """
    return w * X

Define `lossMSE()` for our loss function. In this case, we will use the **Mean Squared Error** as our loss function.
$$MSE = \frac{1}{N}\sum_{i=1}^{N}(Y\_Pred_i - Y_i)^2$$

In [None]:
def lossMSE(y, y_pred):
    """Return the mean squared error between targets ``y`` and predictions ``y_pred``."""
    residual = y_pred - y
    squared_error = residual ** 2
    return squared_error.mean()

Define `gradient()` function to return the gradient of the loss with respect to our parameter $w$.
$$\frac{dJ}{dw} = \frac{1}{N}\sum_{i=1}^{N} 2X_i\,(Y\_Pred_i - Y_i)$$

In [None]:
def gradient(X, y, y_pred):
    """Return dJ/dw of the mean squared error for the model ``y_pred = w * X``.

    Implements ``(1/N) * sum(2 * X_i * (y_pred_i - y_i))``.

    The per-sample terms are averaged element-wise. Using ``np.dot`` here would
    already collapse them into their *sum* (a scalar), after which ``.mean()``
    has no effect and the gradient comes out N times too large.

    Parameters
    ----------
    X : np.ndarray
        Input samples.
    y : np.ndarray
        Target values, same shape as ``X``.
    y_pred : np.ndarray
        Model predictions, same shape as ``X``.
    """
    return (2 * X * (y_pred - y)).mean()

Then we first use the untrained linear regression model to predict $f(250)$ before training.

In [None]:
# Prediction with the current (not yet trained) weight w.
print(f'Prediction before training: f(250) = {forward(250):.3f}')

Now we will train our linear regression model.

In [None]:
# Plain gradient descent on the single weight w, for n_iters iterations.
for epoch in range(n_iters):
    # forward pass: predictions with the current weight
    y_pred = forward(X)
    
    # loss: mean squared error of the current predictions
    loss = lossMSE(y, y_pred)
    
    # backward pass: hand-written derivative of the loss with respect to w
    dw = gradient(X, y, y_pred)
    
    # update weights: step against the gradient (rebinds the global w)
    w -= learning_rate * dw
    
    print(f'Epoch {epoch+1}: weight = {w:.5f}, loss = {loss:.10f}')

In [None]:
# Prediction with the weight produced by the training loop.
print(f'Prediction after training: f(250) = {forward(250):.3f}')

## Linear Regression using PyTorch without Autograd

Now we will use PyTorch to train our linear regression model. First, we convert our dataset $X$, $y$ from NumPy arrays to PyTorch tensors using `torch.from_numpy()` and initialise the tensor $w$ with zero using `torch.zeros()`.

In [None]:
# Rebind X and y to tensors built from the NumPy arrays.
# NOTE(review): because the names are reused, re-running this cell once X and y
# are already tensors presumably fails in from_numpy — confirm.
X = torch.from_numpy(X)
y = torch.from_numpy(y)
# single-element float32 weight, initialised to zero
w = torch.zeros(1, dtype=torch.float32)

Then we define some functions for our linear regression.

In [None]:
def forward(X):
    """Return the model prediction ``w * X``, reading ``w`` from the global scope."""
    return w * X
def lossMSE(y, y_pred):
    """Return the mean squared error between targets ``y`` and predictions ``y_pred``."""
    residual = y_pred - y
    squared_error = residual ** 2
    return squared_error.mean()
def gradient(X, y, y_pred):
    """Return dJ/dw of the mean squared error for the model ``y_pred = w * X``.

    Implements ``(1/N) * sum(2 * X_i * (y_pred_i - y_i))`` and returns a
    0-dimensional tensor.

    The per-sample terms are averaged element-wise. ``torch.matmul`` on two 1-D
    tensors would already reduce them to their *sum*, after which ``.mean()``
    has no effect and the gradient comes out N times too large.
    """
    return (2 * X * (y_pred - y)).mean()

Then we define our linear regression function without autograd.

In [None]:
def linearRegressionNoAutograd(X, y, w):
    """Train the weight by gradient descent using the hand-written ``gradient``.

    Runs ``n_iters`` updates with step size ``learning_rate`` (both read from
    the notebook's global scope). Prints the weight and loss after every
    update, plus the prediction for x = 250 before and after training.

    ``w`` is updated in place. ``forward`` reads the global ``w``, so the
    tensor passed in is expected to be that same object.
    """
    print(f'Prediction before training: f(250) = {forward(250).item():.3f}')

    for epoch in range(n_iters):
        predictions = forward(X)
        loss = lossMSE(y, predictions)
        grad_w = gradient(X, y, predictions)

        # in-place step so every reference to the weight tensor sees the update
        w -= learning_rate * grad_w

        print(f'Epoch {epoch+1}: weight = {w.item():.5f}, loss = {loss.item():.10f}')

    print(f'Prediction after training: f(250) = {forward(250).item():.3f}')

In [None]:
# Train with the hand-written gradient; w is modified in place by the call.
linearRegressionNoAutograd(X, y, w)

## Linear Regression using PyTorch with Autograd

With `autograd`, we no longer need to calculate the gradient manually; calling `.backward()` on the loss does it for us.

In [None]:
# Start the autograd experiment from a fresh zero weight that tracks gradients.
# The tensor is re-created rather than mutated: calling w.zero_() on a leaf
# tensor that already has requires_grad=True raises a RuntimeError, which would
# make this cell fail whenever it is run a second time.
w = torch.zeros(1, dtype=torch.float32, requires_grad=True)
w

Then we define our linear regression function with autograd.

In [None]:
def linearRegressionAutograd(X, y, w):
    """Train the weight by gradient descent, letting autograd compute the gradient.

    Runs ``n_iters`` updates with step size ``learning_rate`` (both read from
    the notebook's global scope). Prints the weight and loss after every
    update, plus the prediction for x = 250 before and after training.

    ``w`` must have ``requires_grad=True``. ``forward`` reads the global ``w``,
    so the tensor passed in is expected to be that same object.
    """
    print(f'Prediction before training: f(250) = {forward(250).item():.3f}')

    for epoch in range(n_iters):
        loss = lossMSE(y, forward(X))

        # autograd fills w.grad with d(loss)/dw
        loss.backward()

        # the update itself must not be recorded in the autograd graph
        with torch.no_grad():
            w -= learning_rate * w.grad

        # gradients accumulate across backward() calls, so clear them each step
        w.grad.zero_()

        print(f'Epoch {epoch+1}: weight = {w.item():.5f}, loss = {loss.item():.10f}')

    print(f'Prediction after training: f(250) = {forward(250).item():.3f}')

In [None]:
# Train with autograd-computed gradients; w is modified in place by the call.
linearRegressionAutograd(X, y, w)

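`.backward()` computes the exact gradient of the loss as it is defined. Here that is the *mean* squared error, so the gradient is averaged over the $N$ samples. The resulting step size can differ from a hand-written gradient that is scaled differently, so we may need to tune our hyperparameters such as the number of iterations and the learning rate. Let's reset our weight to zero before training again. Note that we can't reset $w$ with a plain in-place operation (e.g. `w.zero_()`) while autograd is tracking it; the write has to bypass autograd, either through the underlying `.storage()` or inside a `torch.no_grad()` block.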
> PyTorch doesn’t allow in-place operations on leaf variables that have `requires_grad=True` (such as parameters of your model) because the developers could not decide how such an operation should behave. If you want the operation to be differentiable, you can work around the limitation by cloning the leaf variable (or use a non-inplace version of the operator). Source: [PyTorch Forum](https://discuss.pytorch.org/t/leaf-variable-was-used-in-an-inplace-operation/308/2)

In [None]:
# Retrain from zero with a larger step size.
learning_rate = 0.005
# Reset the weight in place without recording the write in the autograd graph.
# torch.no_grad() is the supported way to modify a leaf tensor that requires
# grad; writing through w.storage() depends on the deprecated TypedStorage API.
with torch.no_grad():
    w.zero_()
linearRegressionAutograd(X, y, w)

### Introduction to Optimizer and Loss Function

**Optimizers** are algorithms or methods used to change the attributes of the neural network such as weights and learning rate to reduce the losses. Optimizers are used to solve optimization problems by minimizing the function. [Source](https://towardsdatascience.com/overview-of-various-optimizers-in-neural-networks-17c1be2df6d5)

A **loss function** is a method of evaluating how well a specific algorithm models the given data. If the predictions deviate too much from the actual results, the loss function returns a very large number. Gradually, with the help of an optimization function, the model learns to reduce the error in its predictions. [Source](https://towardsdatascience.com/common-loss-functions-in-machine-learning-46af0ffc4d23)

In [None]:
# SGD over the single parameter w, using the current value of learning_rate.
optimizer = torch.optim.SGD([w], lr=learning_rate)
# NOTE: this rebinds lossMSE, replacing the hand-written function defined
# earlier with PyTorch's built-in mean-squared-error module.
lossMSE = nn.MSELoss()

In [None]:
def linearRegressionAutogradOptimizerLoss(X, y, w):
    """Train the weight using autograd, the global ``optimizer`` and ``lossMSE``.

    Runs ``n_iters`` updates (read from the notebook's global scope). Prints
    the weight and loss after every update, plus the prediction for x = 250
    before and after training.

    ``optimizer`` and ``lossMSE`` are module-level objects; the optimizer must
    have been built over the same tensor that is passed as ``w`` and that
    ``forward`` reads from the global scope.
    """
    print(f'Prediction before training: f(250) = {forward(250).item():.3f}')
    for epoch in range(n_iters):
        y_pred = forward(X)
        # nn.MSELoss is documented as loss(input, target): predictions first,
        # ground truth second. (MSE is symmetric, so the value is unchanged.)
        loss = lossMSE(y_pred, y)
        loss.backward()
        optimizer.step()       # apply the update rule to the tracked parameters
        optimizer.zero_grad()  # clear accumulated gradients for the next step
        print(f'Epoch {epoch+1}: weight = {w.item():.5f}, loss = {loss.item():.10f}')
    print(f'Prediction after training: f(250) = {forward(250).item():.3f}')

In [None]:
# Reset the weight in place without recording the write in the autograd graph.
# torch.no_grad() is the supported way to modify a leaf tensor that requires
# grad; writing through w.storage() depends on the deprecated TypedStorage API.
# The tensor object is kept (not re-created) so the optimizer still refers to it.
with torch.no_grad():
    w.zero_()
linearRegressionAutogradOptimizerLoss(X, y, w)

# Exercise

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.datasets import make_regression

In [None]:
# Synthetic single-feature regression problem; random_state fixes the data.
X, y = make_regression(n_samples=50, n_features=1, noise=2, random_state=428)
# Convert to tensors shaped (50, 1) so features and targets line up row by row.
# NOTE(review): the arrays are passed through without a dtype conversion, so
# their dtype may differ from the float32 weight used below — confirm.
X = torch.from_numpy(X).reshape(50, 1)
y = torch.from_numpy(y).reshape(50, 1)
plt.scatter(X, y)
plt.xlabel('X')
plt.ylabel('y')
plt.show()

In [None]:
# Confirm both tensors have the shape produced by the reshape above.
print(f'Size of X: {X.shape}')
print(f'Size of y: {y.shape}')

**_TASK_**: Build a **linear regression model** to predict the $y$ value of 428 using **PyTorch** with autograd, optimizer and loss function.
> **Challenge**: Convergence of model within 15 iterations.

In [None]:
# Exercise template: this cell does not run as-is. Replace every placeholder
# (the 0 / None values and the empty `.()` calls) with your own solution.
w = torch.zeros(1, dtype=torch.float32, )  # TODO: add the argument that enables autograd
learning_rate = 0  # TODO: choose a step size
n_iters = 0  # TODO: choose the number of iterations (challenge: at most 15)
optimizer = None  # TODO: create an optimizer over w
lossMSE = None  # TODO: create the loss function
def forward(X):
    # TODO: return the model prediction for X
    return None
print(f'Prediction before training: f(428) = {forward(428).item():.3f}')
for epoch in range(n_iters):
    y_pred = None  # TODO: forward pass
    loss = None  # TODO: compute the loss
    # TODO: fill in the method names for the backward pass, the optimizer
    # update and the gradient reset
    loss.()
    optimizer.()
    optimizer.()
    print(f'Epoch {epoch+1}: weight = {w.item():.5f}, loss = {loss.item():.10f}')
print(f'Prediction after training: f(428) = {forward(428).item():.3f}')

**Expected Output:**
```
weight = 29.835, loss = 3.494
Prediction after training: f(428) = 12769.(approx)
```