# LEARNING PYTORCH WITH EXAMPLES

We will use a fully-connected ReLU network as our running example. The network will have a single hidden layer, and will be trained with gradient descent to fit random data by minimizing the squared Euclidean distance between the network output and the true output.

## Tensors: NumPy

In [1]:
import numpy as np

In [2]:
# Problem sizes shared by the examples below: batch size, input width,
# hidden-layer width and output width (these are dimensions, not layer counts).
bs = 64
inLayers = 1000
hiddenLayers = 100
outLayers = 10

In [3]:
# Create random input and output data, drawn from a standard normal
# distribution: x has shape (bs, inLayers) and y has shape (bs, outLayers).
x = np.random.randn(bs, inLayers)
y = np.random.randn(bs, outLayers)

In [4]:
# Randomly initialize weights from a standard normal distribution:
# w1 has shape (inLayers, hiddenLayers) and w2 has shape (hiddenLayers, outLayers).
w1 = np.random.randn(inLayers, hiddenLayers)
w2 = np.random.randn(hiddenLayers, outLayers)

In [5]:
# Learning rate: scales each gradient before it is subtracted from the weights.
lr = 1e-6

In [6]:
for t in range(500):
    # Forward pass: linear layer, ReLU, linear layer
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Loss is the sum of squared differences between prediction and target
    loss = np.sum(np.square(y_pred - y))
    print(t, loss)

    # Backward pass: apply the chain rule by hand, from the loss back to the
    # two weight matrices
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU passes no gradient where its input was negative
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Gradient descent step (in place)
    w1 -= lr * grad_w1
    w2 -= lr * grad_w2


0 24603906.853505544
1 16215284.200646996
2 12260312.909112968
3 10173330.840558935
4 8924197.835007789
5 8024561.749056915
6 7214095.037081659
7 6399624.058991681
8 5538171.882181951
9 4677424.108857464
10 3851629.727775866
11 3111542.1348418025
12 2472183.9035610417
13 1945472.0206272826
14 1521393.4706984898
15 1189324.5715634315
16 931531.8487052382
17 734007.6332559065
18 582388.3008606636
19 466300.6912089363
20 376833.75874461775
21 307673.43766566133
22 253762.31574757787
23 211413.56688292816
24 177810.10348672094
25 150924.20741080953
26 129205.6247805726
27 111501.77702019678
28 96914.66112679569
29 84795.73350186634
30 74633.66351218717
31 66051.1194656651
32 58743.27113912437
33 52477.682046226444
34 47073.45423126889
35 42383.3931587698
36 38287.42013491482
37 34691.85007324268
38 31518.43064616839
39 28705.448601111504
40 26201.390673681988
41 23973.748620509643
42 21976.835457987832
43 20180.88039017554
44 18563.3526651458
45 17099.92519747375
46 15772.27765787338
47 14

## Tensors: PyTorch

In [7]:
import torch

In [8]:
# Element type used for every Tensor created in the examples below
dtype = torch.float

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [9]:
# Create random input and output data on the selected device:
# x has shape (bs, inLayers) and y has shape (bs, outLayers).
x = torch.randn(bs, inLayers, device=device, dtype=dtype)
y = torch.randn(bs, outLayers, device=device, dtype=dtype)

In [10]:
# Randomly initialize weights. requires_grad is not set here, so these are
# plain Tensors and the gradients are computed by hand in the training loop.
w1 = torch.randn(inLayers, hiddenLayers, device=device, dtype=dtype)
w2 = torch.randn(hiddenLayers, outLayers, device=device, dtype=dtype)

In [11]:
# Learning rate: scales each gradient before it is subtracted from the weights.
lr = 1e-6

In [12]:
for t in range(500):
    # Forward pass: linear layer, ReLU (clamp at zero), linear layer
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squares loss, converted to a Python number for printing
    loss = torch.sum(torch.pow(y_pred - y, 2)).item()
    print(t, loss)

    # Backward pass: apply the chain rule by hand, from the loss back to the
    # two weight matrices
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU passes no gradient where its input was negative
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient descent step (in place)
    w1 -= lr * grad_w1
    w2 -= lr * grad_w2

0 27522666.0
1 22776952.0
2 22336470.0
3 22648548.0
4 21431724.0
5 17836912.0
6 12833208.0
7 8164951.0
8 4827895.0
9 2816305.5
10 1703229.25
11 1102266.125
12 770565.125
13 576711.875
14 454587.46875
15 371417.3125
16 310719.4375
17 264037.53125
18 226723.0625
19 196112.46875
20 170593.140625
21 149084.78125
22 130809.3046875
23 115150.8984375
24 101669.8125
25 90022.4296875
26 79913.953125
27 71113.1953125
28 63431.28125
29 56698.09765625
30 50781.7890625
31 45567.38671875
32 40959.51171875
33 36881.57421875
34 33265.52734375
35 30051.9765625
36 27190.587890625
37 24634.66015625
38 22349.7109375
39 20303.896484375
40 18469.349609375
41 16820.560546875
42 15337.57421875
43 14001.1318359375
44 12794.8662109375
45 11704.91015625
46 10718.9111328125
47 9825.345703125
48 9016.060546875
49 8280.80859375
50 7610.4345703125
51 7000.359375
52 6444.91943359375
53 5938.65380859375
54 5477.51318359375
55 5056.486328125
56 4672.49169921875
57 4321.02880859375
58 3999.30908203125
59 3704.0119628906

## Autograd

### PyTorch: Tensors and autograd

In [13]:
# Create random Tensors to hold input and outputs.
# requires_grad is left at its default of False, which indicates that we do
# not need to compute gradients with respect to these Tensors during the
# backward pass.
x = torch.randn(bs, inLayers, device=device, dtype=dtype)
y = torch.randn(bs, outLayers, device=device, dtype=dtype)

In [15]:
# Create random Tensors for weights.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass; after loss.backward()
# the results are available as w1.grad and w2.grad.
w1 = torch.randn(inLayers, hiddenLayers, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(hiddenLayers, outLayers, device=device, dtype=dtype, requires_grad=True)

In [16]:
# Learning rate: scales each gradient before it is subtracted from the weights.
lr = 1e-6

In [17]:
for t in range(500):
    # Forward pass. The operations are the same as in the hand-written
    # version (matrix multiply, clamp at zero, matrix multiply), but autograd
    # records them, so no intermediate values need to be kept for a manual
    # backward pass.
    y_pred = torch.mm(torch.clamp(torch.mm(x, w1), min=0), w2)

    # Sum-of-squares loss. The result is a zero-dimensional Tensor;
    # loss.item() extracts the Python scalar it holds.
    loss = torch.sum(torch.pow(y_pred - y, 2))
    print(t, loss.item())

    # Let autograd run the backward pass. It computes the gradient of the
    # loss with respect to every Tensor that has requires_grad=True and
    # stores the results in w1.grad and w2.grad.
    loss.backward()

    # The weight update itself must not be recorded by autograd, so it is
    # wrapped in torch.no_grad(). (torch.optim.SGD performs the same kind of
    # update; operating on weight.data / weight.grad.data is an older
    # alternative that also bypasses history tracking.)
    with torch.no_grad():
        w1.sub_(lr * w1.grad)
        w2.sub_(lr * w2.grad)

        # backward() accumulates into .grad, so reset both gradients for the
        # next iteration
        w1.grad.zero_()
        w2.grad.zero_()

0 25563466.0
1 21330816.0
2 19389156.0
3 17563348.0
4 14971865.0
5 11791179.0
6 8592242.0
7 5932050.0
8 3986951.25
9 2687096.0
10 1853246.75
11 1325691.625
12 987251.5625
13 764550.1875
14 612047.875
15 503290.8125
16 422397.34375
17 360011.78125
18 310430.59375
19 269974.84375
20 236382.421875
21 208061.84375
22 183922.453125
23 163158.0
24 145184.40625
25 129538.5625
26 115858.9375
27 103868.46875
28 93304.78125
29 83973.0
30 75707.734375
31 68370.09375
32 61837.0078125
33 56013.84375
34 50811.46484375
35 46151.0859375
36 41971.01953125
37 38214.796875
38 34834.69140625
39 31787.3125
40 29035.609375
41 26545.78125
42 24290.533203125
43 22248.515625
44 20396.275390625
45 18712.123046875
46 17180.103515625
47 15785.400390625
48 14514.2802734375
49 13354.2412109375
50 12295.357421875
51 11329.6728515625
52 10445.9892578125
53 9636.939453125
54 8895.66796875
55 8216.08203125
56 7592.564453125
57 7019.7783203125
58 6493.8056640625
59 6010.779296875
60 5566.462890625
61 5157.396484375
62 4

### PyTorch: Defining new autograd functions

In [18]:
class MyReLU(torch.autograd.Function):
    """
    ReLU written as a custom autograd Function.

    A custom Function subclasses torch.autograd.Function and provides static
    forward and backward methods that operate on Tensors. It is invoked
    through MyReLU.apply rather than by instantiating the class.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Compute the ReLU of the input Tensor and return the result.

        ctx is a context object used to pass information to backward. Tensors
        needed there are stored with ctx.save_for_backward; here the input is
        saved because backward needs to know where it was negative.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, gradOutput):
        """
        Receive the gradient of the loss with respect to the output and return
        the gradient of the loss with respect to the input: the incoming
        gradient with entries zeroed wherever the saved input was negative.
        """
        (saved_input,) = ctx.saved_tensors
        return gradOutput.masked_fill(saved_input < 0, 0)

In [19]:
# Create random Tensors to hold input and outputs.
# requires_grad is left at its default, so no gradients are computed for them.
x = torch.randn(bs, inLayers, device=device, dtype=dtype)
y = torch.randn(bs, outLayers, device=device, dtype=dtype)

# Create random Tensors for weights.
# requires_grad=True makes autograd compute gradients for them.
w1 = torch.randn(inLayers, hiddenLayers, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(hiddenLayers, outLayers, device=device, dtype=dtype, requires_grad=True)

# Learning rate: scales each gradient before it is subtracted from the weights.
lr = 1e-6

In [20]:
# To apply our Function, we use the Function.apply method. We alias this as
# 'relu' once, before the loop: the alias is the same on every iteration, so
# there is no need to rebuild it 500 times.
relu = MyReLU.apply

for t in range(500):
    # Forward pass: compute predicted y using operations; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss (sum of squared errors; .item() extracts the
    # Python scalar from the zero-dimensional loss Tensor)
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass. For the ReLU step autograd
    # calls MyReLU.backward.
    loss.backward()

    # Update weights using gradient descent. The update is wrapped in
    # torch.no_grad() so that it is not itself tracked by autograd.
    with torch.no_grad():
        w1 -= lr * w1.grad
        w2 -= lr * w2.grad

        # Manually zero the gradients after updating weights, because
        # backward() accumulates into .grad
        w1.grad.zero_()
        w2.grad.zero_()

0 30367084.0
1 24312102.0
2 20211664.0
3 16003799.0
4 11850859.0
5 8189737.0
6 5471554.0
7 3629030.5
8 2466955.0
9 1741329.75
10 1285627.75
11 989015.4375
12 787801.8125
13 644657.3125
14 538220.75
15 456018.71875
16 390687.75
17 337511.46875
18 293472.78125
19 256519.703125
20 225264.75
21 198633.640625
22 175744.53125
23 155958.96875
24 138789.484375
25 123813.640625
26 110706.84375
27 99203.4453125
28 89068.6796875
29 80115.859375
30 72186.34375
31 65168.953125
32 58926.16015625
33 53356.98828125
34 48377.828125
35 43916.984375
36 39916.6875
37 36320.3671875
38 33083.1328125
39 30165.154296875
40 27530.310546875
41 25148.033203125
42 22991.244140625
43 21036.857421875
44 19265.22265625
45 17655.576171875
46 16191.8671875
47 14859.7705078125
48 13645.673828125
49 12540.0185546875
50 11530.4267578125
51 10608.25
52 9765.3583984375
53 8994.5107421875
54 8288.873046875
55 7642.037109375
56 7049.15771484375
57 6504.98828125
58 6005.42919921875
59 5546.7353515625
60 5125.0361328125
61 473

### TensorFlow: Static Graphs

In [24]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import numpy as np

W0514 21:16:40.464212 4594152896 deprecation.py:323] From /usr/local/lib/python3.7/site-packages/tensorflow/python/compat/v2_compat.py:63: disable_resource_variables (from tensorflow.python.ops.variable_scope) is deprecated and will be removed in a future version.
Instructions for updating:
non-resource variables are not supported in the long term


In [25]:
# First we set up the computational graph:

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create placeholders for the input and target data; these will be filled
# with real data when we execute the graph. The leading None leaves the
# batch dimension unspecified.
x = tf.placeholder(tf.float32, shape=(None, D_in))
y = tf.placeholder(tf.float32, shape=(None, D_out))

# Create Variables for the weights and initialize them with random data.
# A TensorFlow Variable persists its value across executions of the graph.
w1 = tf.Variable(tf.random_normal((D_in, H)))
w2 = tf.Variable(tf.random_normal((H, D_out)))

# Forward pass: Compute the predicted y using operations on TensorFlow Tensors.
# Note that this code does not actually perform any numeric operations; it
# merely sets up the computational graph that we will later execute.
h = tf.matmul(x, w1)
# ReLU, written as an elementwise maximum with zero
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)

# Compute loss using operations on TensorFlow Tensors
# (sum of squared errors)
loss = tf.reduce_sum((y - y_pred) ** 2.0)

# Compute gradient of the loss with respect to w1 and w2.
grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])

# Update the weights using gradient descent. To actually update the weights
# we need to evaluate new_w1 and new_w2 when executing the graph. Note that
# in TensorFlow the act of updating the value of the weights is part of
# the computational graph; in PyTorch this happens outside the computational
# graph.
learning_rate = 1e-6
new_w1 = w1.assign(w1 - learning_rate * grad_w1)
new_w2 = w2.assign(w2 - learning_rate * grad_w2)

# Now we have built our computational graph, so we enter a TensorFlow session to
# actually execute the graph.
with tf.Session() as sess:
    # Run the graph once to initialize the Variables w1 and w2.
    sess.run(tf.global_variables_initializer())

    # Create numpy arrays holding the actual data for the inputs x and targets
    # y
    x_value = np.random.randn(N, D_in)
    y_value = np.random.randn(N, D_out)
    for _ in range(500):
        # Execute the graph many times. Each time it executes we want to bind
        # x_value to x and y_value to y, specified with the feed_dict argument.
        # Each time we execute the graph we want to compute the values for loss,
        # new_w1, and new_w2; the values of these Tensors are returned as numpy
        # arrays. Only the loss value is kept; the two updated-weight results
        # are discarded, since fetching them is what triggers the update.
        loss_value, _, _ = sess.run([loss, new_w1, new_w2],
                                    feed_dict={x: x_value, y: y_value})
        print(loss_value)

27961068.0
24249680.0
29023452.0
39042690.0
48117150.0
46143810.0
30491066.0
13679269.0
5017296.0
2055520.1
1142978.8
814920.9
654218.1
550036.7
471229.8
407666.66
355033.66
310804.66
273366.6
241512.25
214218.34
190724.42
170421.17
152776.34
137374.3
123866.96
111984.46
101489.68
92180.46
83898.94
76512.64
69912.805
64000.273
58684.1
53899.89
49578.92
45674.215
42136.53
38923.07
35997.453
33330.94
30896.117
28673.408
26636.725
24767.695
23050.223
21468.902
20014.945
18676.207
17440.057
16297.418
15240.135
14260.018
13351.357
12508.1455
11725.387
10998.373
10321.619
9691.315
9104.814
8557.853
8047.4736
7570.558
7124.9053
6708.2056
6318.1714
5953.2207
5611.3506
5290.7207
4990.1113
4708.216
4443.6465
4195.2646
3961.8918
3742.581
3536.354
3342.4893
3160.227
2988.5996
2826.9238
2674.7307
2531.265
2396.072
2268.6846
2148.5327
2035.1692
1928.2063
1827.2859
1731.9764
1641.9866
1556.9761
1476.6741
1400.7744
1329.0024
1261.1862
1197.0424
1136.3799
1079.0032
1024.6702
973.23395
924.55334
878.440

## nn module (PyTorch)

### nn

In [26]:
# Create random Tensors to hold input and outputs.
# No device or dtype is passed here, so PyTorch's defaults are used.
x = torch.randn(bs, inLayers)
y = torch.randn(bs, outLayers)

In [29]:
# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. Each Linear Module computes output from input using a
# linear function, and holds internal Tensors for its weight and bias.
# The layers here are: Linear (inLayers -> hiddenLayers), ReLU,
# Linear (hiddenLayers -> outLayers).
model = torch.nn.Sequential(
    torch.nn.Linear(inLayers, hiddenLayers),
    torch.nn.ReLU(),
    torch.nn.Linear(hiddenLayers, outLayers)
)

In [30]:
# The nn package also contains definitions of popular loss functions; in this
# case we will use MSELoss as our loss function. With reduction='sum' the
# squared errors are summed rather than averaged, which matches the
# sum-of-squares loss computed by hand in the earlier examples.
lossFunc = torch.nn.MSELoss(reduction='sum')

In [31]:
# Learning rate: scales each gradient before it is subtracted from a parameter.
lr = 1e-4
for t in range(500):
    # Forward pass. Modules are callable: calling the model on a Tensor of
    # input data returns a Tensor of output data.
    y_pred = model(x)

    # The loss function takes the predicted and the true values of y and
    # returns a Tensor holding the loss; .item() extracts the Python scalar.
    loss = lossFunc(y_pred, y)
    print(t, loss.item())

    # Clear gradients left over from the previous iteration before computing
    # new ones.
    model.zero_grad()

    # Backward pass. The learnable parameters of every Module in the model
    # are Tensors with requires_grad=True, so this one call computes the
    # gradient of the loss with respect to all of them.
    loss.backward()

    # Gradient descent step. Each parameter is a Tensor whose gradient is
    # available as param.grad; the in-place update is done under
    # torch.no_grad() so that autograd does not track it.
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(lr * param.grad)

0 704.6354370117188
1 656.0349731445312
2 613.5722045898438
3 576.0888061523438
4 542.3736572265625
5 511.8268737792969
6 483.8934631347656
7 457.8943786621094
8 433.7062683105469
9 411.2217102050781
10 389.97021484375
11 369.88800048828125
12 351.03369140625
13 333.1393737792969
14 316.1562805175781
15 299.9294128417969
16 284.4257507324219
17 269.69757080078125
18 255.63558959960938
19 242.19082641601562
20 229.32337951660156
21 217.00054931640625
22 205.17660522460938
23 193.87718200683594
24 183.09654235839844
25 172.8220977783203
26 163.0198516845703
27 153.67112731933594
28 144.77667236328125
29 136.28517150878906
30 128.25624084472656
31 120.68209838867188
32 113.54468536376953
33 106.78565979003906
34 100.40357208251953
35 94.3956298828125
36 88.74329376220703
37 83.42688751220703
38 78.43273162841797
39 73.726318359375
40 69.31039428710938
41 65.16813659667969
42 61.283321380615234
43 57.63856506347656
44 54.22392654418945
45 51.02602005004883
46 48.03166961669922
47 45.226699

### optim

In [32]:
# Create random Tensors to hold input and outputs.
# No device or dtype is passed here, so PyTorch's defaults are used.
x = torch.randn(bs, inLayers)
y = torch.randn(bs, outLayers)

In [33]:
# Use the nn package to define our model and loss function.
# The model is Linear -> ReLU -> Linear; with reduction='sum' the loss is the
# sum (not the mean) of the squared errors.
model = torch.nn.Sequential(
    torch.nn.Linear(inLayers, hiddenLayers),
    torch.nn.ReLU(),
    torch.nn.Linear(hiddenLayers, outLayers),
)
lossFunc = torch.nn.MSELoss(reduction='sum')

In [34]:
# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use Adam; the optim package contains many other
# optimization algorithms. The first argument to the Adam constructor tells the
# optimizer which Tensors it should update.
lr = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [35]:
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(x)

    # Compute and print loss.
    loss = lossFunc(y_pred, y)
    print(t, loss.item())

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update (which are the learnable
    # weights of the model). This is because by default, gradients are
    # accumulated in buffers (i.e., not overwritten) whenever .backward()
    # is called. Check out the docs of torch.autograd.backward for more details.
    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to model
    # parameters
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its
    # parameters
    optimizer.step()

0 607.0188598632812
1 590.6580200195312
2 574.820556640625
3 559.4537353515625
4 544.6851806640625
5 530.396728515625
6 516.5231323242188
7 503.0291748046875
8 489.9194641113281
9 477.24676513671875
10 464.9467468261719
11 452.95977783203125
12 441.3625183105469
13 430.1064147949219
14 419.2139892578125
15 408.70880126953125
16 398.4510192871094
17 388.5047912597656
18 378.8213806152344
19 369.4045715332031
20 360.24884033203125
21 351.3499450683594
22 342.6952819824219
23 334.2796325683594
24 326.0880126953125
25 318.1285705566406
26 310.3663635253906
27 302.7712707519531
28 295.3818664550781
29 288.1734619140625
30 281.1408386230469
31 274.26190185546875
32 267.55419921875
33 261.0174560546875
34 254.63961791992188
35 248.4164581298828
36 242.33798217773438
37 236.3878936767578
38 230.54200744628906
39 224.82925415039062
40 219.2278289794922
41 213.74957275390625
42 208.3717498779297
43 203.11180114746094
44 198.0037841796875
45 192.9966278076172
46 188.0975341796875
47 183.284332275

### Custom nn modules

In [37]:
class TwoLayerNet(torch.nn.Module):
    """Network with two linear layers and a ReLU between them."""

    def __init__(self, inLayers, hiddenLayers, outLayers):
        """
        Create the two nn.Linear sub-modules and store them as attributes.
        The first maps inLayers features to hiddenLayers features, the second
        maps hiddenLayers features to outLayers features.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(inLayers, hiddenLayers)
        self.linear2 = torch.nn.Linear(hiddenLayers, outLayers)

    def forward(self, x):
        """
        Map a Tensor of input data to a Tensor of output data: first linear
        layer, clamp at zero (ReLU), second linear layer. Besides the Modules
        created in the constructor, arbitrary Tensor operations may be used.
        """
        hidden = self.linear1(x)
        activated = torch.clamp(hidden, min=0)
        return self.linear2(activated)

In [38]:
# Create random Tensors to hold input and outputs.
# No device or dtype is passed here, so PyTorch's defaults are used.
x = torch.randn(bs, inLayers)
y = torch.randn(bs, outLayers)

In [39]:
# Construct our model by instantiating the TwoLayerNet class with the input,
# hidden and output sizes.
model = TwoLayerNet(inLayers, hiddenLayers, outLayers)

In [40]:
# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor returns the learnable parameters of the two
# nn.Linear modules which are members of the model. With reduction='sum' the
# loss is the sum (not the mean) of the squared errors.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

In [41]:
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss (.item() extracts the Python scalar from the
    # loss Tensor)
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    # Gradients are cleared first because backward() accumulates into them.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 698.6260375976562
1 648.302978515625
2 604.7340698242188
3 565.9534301757812
4 531.5799560546875
5 500.35870361328125
6 471.8827209472656
7 445.45245361328125
8 420.8814697265625
9 398.1556396484375
10 377.1867980957031
11 357.53033447265625
12 338.9509582519531
13 321.3117370605469
14 304.44232177734375
15 288.33184814453125
16 272.9393310546875
17 258.25775146484375
18 244.28768920898438
19 230.85391235351562
20 218.07339477539062
21 205.93829345703125
22 194.3413848876953
23 183.28509521484375
24 172.74557495117188
25 162.72438049316406
26 153.2340850830078
27 144.2152557373047
28 135.66685485839844
29 127.57860565185547
30 119.94268798828125
31 112.71531677246094
32 105.90383911132812
33 99.48690032958984
34 93.43436431884766
35 87.73951721191406
36 82.38789367675781
37 77.36138916015625
38 72.65276336669922
39 68.23396301269531
40 64.08425903320312
41 60.193275451660156
42 56.54343795776367
43 53.12583923339844
44 49.92079544067383
45 46.923606872558594
46 44.11262130737305
47 4