## Here there isneed of finding gradient for optimization 

In [15]:
import torch

x = torch.randn(3, requires_grad=True)
print(x)

tensor([-0.8525, -0.6867, -0.5735], requires_grad=True)


In [16]:
y = x +2
print(y)

tensor([1.1475, 1.3133, 1.4265], grad_fn=<AddBackward0>)


Here there is gradient function addBackward

In [17]:
y = x - 2 
y

tensor([-2.8525, -2.6867, -2.5735], grad_fn=<SubBackward0>)

Here there is gradient function subtractBackward

In [18]:
z = x.mean()
z

tensor(-0.7042, grad_fn=<MeanBackward0>)

When backward() is called it will do the optimization of the gradient function ie grad_fn mentioned in the value ie example grad_fn=<SubBackward0>

In [21]:
vector = torch.tensor([0.1, 1.0, 0.001], dtype = torch.float32)
y.backward(vector)
y

tensor([-2.8525, -2.6867, -2.5735], grad_fn=<SubBackward0>)

In [22]:
z.backward()
z

tensor(-0.7042, grad_fn=<MeanBackward0>)

Usually when backward() is called the Jacobian vector multiplication is being taken place
So when we run z.backward(), z should be a scalar value, if not then we should give a vector
Suppose you have a neural network 
𝑓
𝜃
(
𝑥
)
f 
θ
​
 (x) parameterized by 
𝜃
θ, where 
𝑥
x is the input data and 
𝜃
θ are the model parameters. To compute gradients efficiently, you might compute the Jacobian-vector product 
𝐽
𝑓
𝜃
(
𝑥
)
⋅
𝑣
, where 
𝑣
v is a vector representing the direction in parameter space you want to compute the effect of the function 
𝑓
𝜃

​

## After finding optimization we need to end the value optimziation so we need to do it by : 

Method 1 : x.detach()

In [24]:
removed = z.detach()
removed

tensor(-0.7042)

Method 2 : x.requires_grad_(False)

In [26]:
x.requires_grad_(False)
x

tensor([-0.8525, -0.6867, -0.5735])

## Need to reset the gradient while in Gradient optimization

In [29]:
import torch 

weights = torch.ones(4, requires_grad=True)

for epoch in range(3):
    print(f"At the beginning of iteration : {weights.grad}")
    model_output = (weights*3).sum()
    model_output.backward()
    print(f"At the end of iteration : {weights.grad}")

At the beginning of iteration : None
At the end of iteration : tensor([3., 3., 3., 3.])
At the beginning of iteration : tensor([3., 3., 3., 3.])
At the end of iteration : tensor([6., 6., 6., 6.])
At the beginning of iteration : tensor([6., 6., 6., 6.])
At the end of iteration : tensor([9., 9., 9., 9.])


Here the gradient is being accumulated on each iteration so we need to reset it to zero

In [32]:
import torch 

weights = torch.ones(4, requires_grad=True)

for epoch in range(3):
    print(f"At the beginning of iteration : {weights.grad}")
    model_output = (weights*3).sum()
    model_output.backward()
    print(f"At the end of iteration : {weights.grad}")
    weights.grad.zero_()

At the beginning of iteration : None
At the end of iteration : tensor([3., 3., 3., 3.])
At the beginning of iteration : tensor([0., 0., 0., 0.])
At the end of iteration : tensor([3., 3., 3., 3.])
At the beginning of iteration : tensor([0., 0., 0., 0.])
At the end of iteration : tensor([3., 3., 3., 3.])


In [42]:
import torch 
from torch.optim import SGD

x = torch.tensor(1.0)
y = torch.tensor(2.0)

w = torch.tensor(1.0, requires_grad=True)
optimizer = SGD([w], lr=0.05)

for epoch in range(4):
    y_hat = w * x
    loss = (y_hat - y)**2

    print(loss)

    loss.backward()

    print(f"Gradient = {w.grad} , Weight = {w}")

    optimizer.step()
    print(f"After step weight = {w}")
    optimizer.zero_grad()


tensor(1., grad_fn=<PowBackward0>)
gradient = -2.0 weight = 1.0
After step weight = 1.100000023841858
tensor(0.8100, grad_fn=<PowBackward0>)
gradient = -1.7999999523162842 weight = 1.100000023841858
After step weight = 1.190000057220459
tensor(0.6561, grad_fn=<PowBackward0>)
gradient = -1.619999885559082 weight = 1.190000057220459
After step weight = 1.2710000276565552
tensor(0.5314, grad_fn=<PowBackward0>)
gradient = -1.4579999446868896 weight = 1.2710000276565552
After step weight = 1.3438999652862549


## Small optimization

### Using numpy

In [45]:
import numpy as np

x = np.array([1, 2, 3, 4], dtype=np.float32)
y = np.array([2, 4, 6, 8], dtype=np.float32)

w = 0.0

def forward(x):
    return w*x

def loss(y, y_predicted):
    return ((y_predicted - y)**2).mean()

# gradient
# MSE = 1/N * (w*x - y)**2
# d3/dw = 1/N * 2x(w*x - y)
def gradient(x, y, y_predicted):
    return np.dot(2*x, y_predicted - y).mean()

print(f"Prediction before training: f(5) = {forward(5):.3f}")

# Training

learning_rate = 0.01
n_iters = 12

for epoch in range(n_iters):
    y_predicted = forward(x)
    l = loss(y, y_predicted)
    dw = gradient(x, y, y_predicted)
    
    w -= learning_rate * dw
    
    if(epoch % 1 == 0):
        print(f"epoch {epoch+1}: w = {w:.3f}, loss = {l:.8f}")
        
print(f"Prediction after training: f(5) = {forward( 5):.3f}")

Prediction before training: f(5) = 0.000
epoch 1: w = 1.200, loss = 30.00000000
epoch 2: w = 1.680, loss = 4.79999924
epoch 3: w = 1.872, loss = 0.76800019
epoch 4: w = 1.949, loss = 0.12288000
epoch 5: w = 1.980, loss = 0.01966083
epoch 6: w = 1.992, loss = 0.00314574
epoch 7: w = 1.997, loss = 0.00050331
epoch 8: w = 1.999, loss = 0.00008053
epoch 9: w = 1.999, loss = 0.00001288
epoch 10: w = 2.000, loss = 0.00000206
epoch 11: w = 2.000, loss = 0.00000033
epoch 12: w = 2.000, loss = 0.00000005
Prediction after training: f(5) = 10.000


### Using pytorch

In [55]:
import torch

x = torch.tensor([1, 2, 3, 4], dtype=torch.float32)
y = torch.tensor([2, 4, 6, 8], dtype=torch.float32)

w = torch.tensor(0.0, dtype = torch.float32, requires_grad=True)

def forward(x):
    return w*x

def loss(y, y_predicted):
    return ((y_predicted - y)**2).mean()

print(f"Prediction before training: f(5) = {forward(5):.3f}")

# Training

learning_rate = 0.01
n_iters = 100

for epoch in range(n_iters):
    y_predicted = forward(x)
    l = loss(y, y_predicted)
    
    l.backward() # Here no need of gradient function as in numpy
    
    with torch.no_grad():
        w -= learning_rate * w.grad
    
    w.grad.zero_()
    
    if(epoch % 1 == 0):
        print(f"epoch {epoch+1}: w = {w:.3f}, loss = {l:.8f}")
        
print(f"Prediction after training: f(5) = {forward( 5):.3f}")

Prediction before training: f(5) = 0.000
epoch 1: w = 0.300, loss = 30.00000000
epoch 2: w = 0.555, loss = 21.67499924
epoch 3: w = 0.772, loss = 15.66018772
epoch 4: w = 0.956, loss = 11.31448650
epoch 5: w = 1.113, loss = 8.17471695
epoch 6: w = 1.246, loss = 5.90623236
epoch 7: w = 1.359, loss = 4.26725292
epoch 8: w = 1.455, loss = 3.08308983
epoch 9: w = 1.537, loss = 2.22753215
epoch 10: w = 1.606, loss = 1.60939169
epoch 11: w = 1.665, loss = 1.16278565
epoch 12: w = 1.716, loss = 0.84011245
epoch 13: w = 1.758, loss = 0.60698116
epoch 14: w = 1.794, loss = 0.43854395
epoch 15: w = 1.825, loss = 0.31684780
epoch 16: w = 1.851, loss = 0.22892261
epoch 17: w = 1.874, loss = 0.16539653
epoch 18: w = 1.893, loss = 0.11949898
epoch 19: w = 1.909, loss = 0.08633806
epoch 20: w = 1.922, loss = 0.06237914
epoch 21: w = 1.934, loss = 0.04506890
epoch 22: w = 1.944, loss = 0.03256231
epoch 23: w = 1.952, loss = 0.02352631
epoch 24: w = 1.960, loss = 0.01699772
epoch 25: w = 1.966, loss = 

Here we do the optimization manually there is need of many number of epochs