Adapted from tutorials at pytorch.org

# Autograd: Automatic differentiation

In [2]:
from torch.autograd import Variable
import torch
import numpy as np

In [3]:
a = Variable(torch.randn(2,3), requires_grad=True)

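A Variable exposes three attributes: `.data` (the underlying tensor), `.grad` (the gradient accumulated for it), and `.grad_fn` (the function that created it).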

In [4]:
a

Variable containing:
 0.8333 -0.7631  0.6214
-0.8634  0.6053 -0.9941
[torch.FloatTensor of size 2x3]

In [5]:
a.data


 0.8333 -0.7631  0.6214
-0.8634  0.6053 -0.9941
[torch.FloatTensor of size 2x3]

In [6]:
print(a.grad)

None


In [7]:
b = a*a

In [8]:
c = b.mean()

If you want to compute the derivatives, you can call .backward() on a Variable. 

If the Variable is a scalar (i.e. it holds a one-element tensor), you don't need to pass any arguments to backward(). If it has more elements, you must pass a gradient argument: a tensor whose shape matches the Variable.

In [9]:
c.backward()

In [10]:
c

Variable containing:
 0.6272
[torch.FloatTensor of size 1]

In [12]:
a.grad

Variable containing:
 0.2778 -0.2544  0.2071
-0.2878  0.2018 -0.3314
[torch.FloatTensor of size 2x3]

In [51]:
x = Variable(torch.ones(2, 2), requires_grad=True)
y = x + 2
z = y * y

# z is not a scalar, so backward() needs an explicit upstream gradient of the
# same shape. The retain_graph flag prevents the graph's internal buffers
# from being freed after this pass.
upstream = torch.ones(2, 2)
z.backward(upstream, retain_graph=True)
print(x.grad)

Variable containing:
 6  6
 6  6
[torch.FloatTensor of size 2x2]



In [57]:
# Rebuild y from x, so this cell backpropagates through a freshly built graph.
y = x + 2

gradient = torch.randn(2, 2)

# NOTE(review): the inherited remark that this "would fail if we didn't
# specify that we want to retain variables" looks stale here, since y was
# just recomputed above -- confirm against the upstream tutorial.
# backward() adds to x.grad rather than overwriting it, so the values
# printed below include whatever earlier backward passes left there.
y.backward(gradient)

print(x.grad)

Variable containing:
  9.1141  10.4848
  6.7608  10.2028
[torch.FloatTensor of size 2x2]



In [58]:
# A further backward pass through y with an all-ones gradient. Gradients
# accumulate, so every entry of x.grad grows by exactly 1 (compare with the
# previous output).
y.backward(torch.ones(2, 2))
print(x.grad)

Variable containing:
 10.1141  11.4848
  7.7608  11.2028
[torch.FloatTensor of size 2x2]



# 2-Layer Network (MLP)

In [115]:
N, D_in, H, D_out = 64, 1000, 100, 10

In [116]:
x = Variable(torch.randn(N, D_in), requires_grad= False)
y = Variable(torch.randn(N, D_out), requires_grad= False)

In [117]:
w1 = Variable(torch.randn(D_in, H), requires_grad=True)
w2 = Variable(torch.randn(H, D_out ), requires_grad=True)

In [120]:
w1.data


-1.0402e+00  2.7434e-01 -9.8606e-01  ...  -1.6858e+00  6.5415e-01  8.6206e-01
 1.6614e+00  7.0503e-01 -2.1643e+00  ...  -4.9011e-01 -8.0367e-04  7.8051e-01
-3.6560e-01  2.1856e+00  6.9547e-01  ...   6.2659e-01  1.4984e-01  1.1572e+00
                ...                   ⋱                   ...                
 1.4684e-01  9.9845e-01  1.2135e+00  ...   1.0289e-01  7.8113e-01 -4.4499e-01
 9.0211e-01  1.4462e+00 -8.0597e-01  ...  -2.3426e-01  2.9448e-01 -5.9979e-01
 4.4465e-01  1.4254e+00  5.1383e-01  ...  -3.9099e-01  1.9480e+00  9.8071e-01
[torch.FloatTensor of size 1000x100]

In [119]:
print(w1.grad)

None


Forward pass

In [121]:
h_out = x.mm(w1)

In [122]:
h_out_relu = h_out.clamp(min=0)

In [123]:
y_pred = h_out_relu.mm(w2)

MSE loss calculation

In [124]:
(y - y_pred).pow(2).mean()

Variable containing:
 49703.0195
[torch.FloatTensor of size 1]

In [125]:
loss = (y-y_pred).pow(2).mean()

Backward pass

In [126]:
loss.backward()

SGD learning rule: parameter update

In [127]:
learning_rate = 1e-5

In [128]:
w1.data -= learning_rate*w1.grad.data

In [129]:
w2.data -= learning_rate*w2.grad.data

In [130]:
w1.grad

Variable containing:
-6.5523e+00 -5.8850e+00  7.1335e+00  ...  -9.2303e+00  9.0762e+00 -2.5260e+00
-4.3329e+00 -4.3050e+00  1.4226e+01  ...   1.1594e+01 -3.8298e+00 -1.3600e+01
-3.1369e+00  2.3575e+00  1.6431e+01  ...  -2.9896e+00  1.1508e+01  1.9873e+00
                ...                   ⋱                   ...                
 3.4611e+01 -4.2962e+00  1.2984e+01  ...  -7.5931e+00 -4.2852e+00  3.5785e+00
-1.2022e+01 -2.4825e+01 -6.1322e+00  ...   3.0110e+01 -2.5653e+01 -2.5324e+00
 1.6616e+00  5.5213e-01 -7.8417e+00  ...  -1.5453e+01 -2.2512e+00  8.5819e+00
[torch.FloatTensor of size 1000x100]

In [131]:
w2.grad

Variable containing:
 -165.3033   509.7229  1023.4786  ...    320.2248  -324.8032  -106.2662
 -534.9585   296.9145   897.2170  ...     29.8218  -577.4785  -234.4964
 -300.4757   -33.3462   640.7281  ...    156.2877   -89.6661    69.2354
              ...                  ⋱                 ...               
 -109.2193    85.5342   642.8596  ...    121.7496  -116.1697  -135.5347
 -142.9786   237.6744   755.4649  ...     22.7992  -367.2899   -55.7950
 -195.6143    76.7552   518.6180  ...    198.4615     5.4108   -93.4247
[torch.FloatTensor of size 100x10]

Set w1.grad and w2.grad to zero after updating w1 and w2

In [133]:
w1.grad.data.zero_()
w2.grad.data.zero_();

In [73]:
for i in range(100):
    # Forward pass: linear layer -> ReLU (clamp at 0) -> linear layer
    h_out = x.mm(w1)
    h_out_relu = h_out.clamp(min=0)
    y_pred = h_out_relu.mm(w2)

    # Mean squared error between targets and predictions
    loss = (y-y_pred).pow(2).mean()
    if i%10==0:
        # .item() extracts the Python number from the scalar loss; indexing
        # it with [0] raises on PyTorch >= 0.5, where the loss is 0-dim.
        print(i, loss.item())

    # Backward pass: fills w1.grad and w2.grad
    loss.backward()

    # Plain SGD step applied directly to the underlying tensors
    w1.data -= learning_rate*w1.grad.data
    w2.data -= learning_rate*w2.grad.data

    # Reset the gradients so the next backward() does not accumulate onto them
    w1.grad.data.zero_()
    w2.grad.data.zero_()

0 2349.51220703125
10 2107.16650390625
20 1894.9832763671875
30 1709.343505859375
40 1547.046630859375
50 1402.998291015625
60 1275.3468017578125
70 1162.5869140625
80 1063.0400390625
90 973.5719604492188


# Using torch.nn

In [77]:
N, D_in, H, D_out = 64, 1000, 100, 10

In [78]:
x = Variable(torch.randn(N, D_in), requires_grad= False)
y = Variable(torch.randn(N, D_out), requires_grad= False)

In [79]:
# NOTE(review): w1/w2 are not referenced by the nn.Sequential model built
# below -- they look like a leftover from the manual-weights section.
w1 = Variable(torch.randn(D_in, H), requires_grad=True)
w2 = Variable(torch.randn(H, D_out ), requires_grad=True)

In [80]:
import torch.nn as nn

In [81]:
model = nn.Sequential(
    nn.Linear(D_in, H),
    nn.ReLU(),
    nn.Linear(H, D_out)
)

Forward pass

In [82]:
y_pred = model(x)

In [83]:
loss_fn = nn.MSELoss()

Loss calculation

In [84]:
loss_fn(y_pred, y)

Variable containing:
 1.0958
[torch.FloatTensor of size 1]

In [85]:
loss = loss_fn(y_pred, y)

Backward pass

In [86]:
loss.backward()

Parameter update

In [87]:
for param in model.parameters():
    param.data -= learning_rate*param.grad.data

Make gradients zero after updating to avoid accumulation of gradients 

In [89]:
model.zero_grad()

# Using optim for learning rule

In [108]:
N, D_in, H, D_out = 64, 1000, 100, 10

In [109]:
x = Variable(torch.randn(N, D_in), requires_grad= False)
y = Variable(torch.randn(N, D_out), requires_grad= False)

In [110]:
# NOTE(review): in this section w1/w2 are referenced only by the
# commented-out manual alternatives inside the training loop; the
# nn.Sequential model does not use them.
w1 = Variable(torch.randn(D_in, H), requires_grad=True)
w2 = Variable(torch.randn(H, D_out ), requires_grad=True)

In [111]:
model = nn.Sequential(
    nn.Linear(D_in, H),
    nn.ReLU(),
    nn.Linear(H, D_out)
)

In [112]:
optimizer = torch.optim.Adam(model.parameters())

In [113]:
loss_fn = nn.MSELoss()

In [114]:
for i in range(100):
    # Forward pass without torch.nn
    # h_out = x.mm(w1)
    # h_out_relu = h_out.clamp(min=0)
    # y_pred = h_out_relu.mm(w2)

    # Forward pass using torch.nn model
    y_pred = model(x)

    # Loss calculation from basics
    # loss = (y-y_pred).pow(2).mean()

    # Loss calculation using torch.nn Loss function
    loss = loss_fn(y_pred, y)
    if i%10==0:
        # .item() extracts the Python number from the scalar loss; indexing
        # it with [0] raises on PyTorch >= 0.5, where the loss is 0-dim.
        print(i, loss.item())

    # Backward pass
    loss.backward()

    # Using Learning rule with optim
    optimizer.step()

    # SGD learning rule without optim
    # for param in model.parameters():
    #     param.data -= learning_rate*param.grad.data

    # w1.data -= learning_rate*w1.grad.data
    # w2.data -= learning_rate*w2.grad.data

    # Setting gradient to zero so the next backward() starts from a clean slate
    model.zero_grad()

0 1.0697293281555176
10 0.10139219462871552
20 0.030052784830331802
30 0.009489026851952076
40 0.003732791170477867
50 0.001254924340173602
60 0.00047503304085694253
70 0.00016581712407059968
80 6.223517993930727e-05
90 2.1776450012112036e-05


# NN + Optim 

The nn package defines a set of Modules, which are roughly equivalent to neural network layers. 

A Module receives input Variables and computes output Variables, but may also hold internal state such as Variables containing learnable parameters. 

The nn package also defines a set of useful loss functions that are commonly used when training neural networks.

In [26]:
from torch.autograd import Variable

In [27]:
N, D_in, H, D_out = 64, 1000, 100, 10

In [37]:
x = Variable(torch.randn(N, D_in), requires_grad=False)
y = Variable(torch.randn(N, D_out), requires_grad=False)

In [38]:
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

In [39]:
# list(model.parameters())

In [40]:
# Sum (rather than average) the squared errors over all elements.
# reduction='sum' is the replacement for the deprecated size_average=False.
loss_fn = torch.nn.MSELoss(reduction='sum')

Forward pass

In [41]:
y_pred = model(x)

In [44]:
loss = loss_fn(y_pred, y)

In [45]:
loss.backward()

In [None]:
for param in model.parameters():
    param.data -= learning_rate * param.grad.data

In [58]:
# Sum (rather than average) the squared errors over all elements.
# reduction='sum' is the replacement for the deprecated size_average=False.
loss_fn = torch.nn.MSELoss(reduction='sum')

In [59]:
learning_rate = 1e-4

Forward pass

In [60]:
y_pred = model(x)

In [61]:
loss = loss_fn(y_pred, y)

In [62]:
loss

Variable containing:
 686.5364
[torch.FloatTensor of size 1]

<b> Important </b> 

Zero the gradients before running the backward pass.

In [43]:
model.zero_grad()

Backward pass

In [44]:
loss.backward()

In [48]:
for param in model.parameters():
    param.data -= learning_rate * param.grad.data

# Using optim : Advanced optimization functions

In [48]:
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

Make gradients zero as before

In [49]:
 optimizer.zero_grad()

Backward pass

In [65]:
# Re-run the forward pass before backpropagating. When the notebook is run
# top to bottom, the existing `loss` has already been through backward() once
# (its graph buffers are freed) and the weights have been updated since, so
# calling backward() on it again would fail.
y_pred = model(x)
loss = loss_fn(y_pred, y)
loss.backward()

<b> Update parameters of learning algorithm </b>

In [50]:
 optimizer.step()

In [51]:
# Batch size, input width, hidden width, output width
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets; neither needs gradients.
x = Variable(torch.randn(N, D_in), requires_grad=False)
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Two-layer MLP: Linear -> ReLU -> Linear
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# Summed (not averaged) squared error. reduction='sum' is the replacement
# for the deprecated size_average=False.
loss_fn = torch.nn.MSELoss(reduction='sum')

optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

for i in range(500):
    # Forward pass
    y_pred = model(x)

    # Calculate loss
    loss = loss_fn(y_pred, y)

    # Clear gradients left over from the previous iteration; backward()
    # accumulates into .grad instead of overwriting it.
    optimizer.zero_grad()

    # Backward pass
    loss.backward()

    # Learning rule to update my parameters
    optimizer.step()