# LEARNING PYTORCH WITH EXAMPLES

We will use a fully-connected ReLU network as our running example. The network will have a single hidden layer, and will be trained with gradient descent to fit random data by minimizing the squared Euclidean distance (the sum of squared differences) between the network output and the true output.

## Tensors: NumPy

In [1]:
import numpy as np

In [2]:
# Sizes used throughout the notebook: number of samples per batch, width of
# each input row, width of the hidden activation, and width of each output row.
bs = 64
inLayers = 1000
hiddenLayers = 100
outLayers = 10

In [3]:
# Synthetic data sampled from a standard normal distribution: inputs `x` of
# shape (bs, inLayers) and targets `y` of shape (bs, outLayers).
# `x` is drawn before `y`, so the random stream is consumed in the same order.
x, y = (np.random.randn(bs, width) for width in (inLayers, outLayers))

In [4]:
# Standard-normal starting weights: `w1` is (inLayers, hiddenLayers) and
# `w2` is (hiddenLayers, outLayers). `w1` is drawn before `w2`.
w1, w2 = (
    np.random.randn(rows, cols)
    for rows, cols in ((inLayers, hiddenLayers), (hiddenLayers, outLayers))
)

In [5]:
# Learning rate: the factor each gradient is scaled by before it is
# subtracted from the weights in the training loop.
lr = 1e-6

In [6]:
# Train the two-layer ReLU network for 500 full-batch gradient-descent steps.
# Both the forward pass and the gradients are written out by hand; the loss
# is printed at every step.
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Loss: sum of squared differences over every output element.
    loss = np.sum(np.square(y_pred - y))
    print(t, loss)

    # Backward pass, applying the chain rule from the loss back to each
    # weight matrix.
    grad_y_pred = (y_pred - y) * 2.0
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU passes no gradient where its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Gradient-descent step; the arrays are updated in place.
    w1 -= grad_w1 * lr
    w2 -= grad_w2 * lr


0 34503479.22250166
1 34893992.17745902
2 38364004.68985458
3 37937977.28634244
4 29640918.749765597
5 17654562.308469646
6 8676989.380463904
7 4181782.0325094108
8 2315416.1811515987
9 1531061.5372748624
10 1151037.1028947309
11 928018.8964320219
12 774039.1676438965
13 657007.3412725374
14 563506.788302982
15 486901.0908901044
16 423161.09386332374
17 369567.4754970871
18 324229.56439463084
19 285650.19580753753
20 252660.58123200035
21 224159.41425009436
22 199534.64107258612
23 178136.16582005893
24 159477.38262902034
25 143163.64984494343
26 128829.34632347885
27 116194.28994870001
28 105026.0280247574
29 95123.67839699321
30 86318.72857035958
31 78463.59235692986
32 71441.58151132295
33 65157.028477316446
34 59519.82675198534
35 54449.49244270219
36 49876.275389530376
37 45746.92324756003
38 42012.37818886339
39 38626.40643877477
40 35553.649171932586
41 32759.709085881263
42 30214.86360414515
43 27894.782112086526
44 25775.154195343715
45 23836.849962373642
46 22063.433217831596

## Tensors: PyTorch

In [7]:
import torch

In [8]:
# Element type for every tensor created below.
dtype = torch.float

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [9]:
# Synthetic data sampled from a standard normal distribution, created directly
# on `device` with element type `dtype`: inputs `x` of shape (bs, inLayers)
# and targets `y` of shape (bs, outLayers). `x` is drawn before `y`.
x, y = (
    torch.randn(bs, width, device=device, dtype=dtype)
    for width in (inLayers, outLayers)
)

In [10]:
# Standard-normal starting weights on `device`: `w1` is
# (inLayers, hiddenLayers) and `w2` is (hiddenLayers, outLayers).
# `w1` is drawn before `w2`.
w1, w2 = (
    torch.randn(rows, cols, device=device, dtype=dtype)
    for rows, cols in ((inLayers, hiddenLayers), (hiddenLayers, outLayers))
)

In [11]:
# Learning rate: the factor each gradient is scaled by before it is
# subtracted from the weights in the training loop.
lr = 1e-6

In [13]:
# Train the two-layer ReLU network for 500 full-batch gradient-descent steps
# using tensor operations only; the gradients are derived by hand rather than
# by autograd. The loss is printed at every step.
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Loss: sum of squared differences, converted to a Python number for
    # printing.
    loss = torch.sum(torch.pow(y_pred - y, 2)).item()
    print(t, loss)

    # Backward pass, applying the chain rule from the loss back to each
    # weight matrix.
    grad_y_pred = (y_pred - y) * 2.0
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU passes no gradient where its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient-descent step; the tensors are updated in place.
    w1 -= grad_w1 * lr
    w2 -= grad_w2 * lr

0 28575828.0
1 22731716.0
2 20205798.0
3 18215448.0
4 15651395.0
5 12452147.0
6 9167774.0
7 6347785.5
8 4258071.0
9 2840891.5
10 1931549.75
11 1356425.375
12 991463.1875
13 754060.8125
14 594655.125
15 483036.03125
16 401668.875
17 339932.375
18 291548.0
19 252619.640625
20 220580.84375
21 193795.03125
22 171136.125
23 151772.15625
24 135089.5625
25 120626.609375
26 108032.890625
27 97007.390625
28 87319.7109375
29 78780.5078125
30 71231.703125
31 64536.8671875
32 58582.03125
33 53272.6640625
34 48526.38671875
35 44274.125
36 40456.21875
37 37021.1328125
38 33928.41796875
39 31134.40625
40 28606.33984375
41 26315.853515625
42 24235.646484375
43 22343.919921875
44 20621.626953125
45 19052.576171875
46 17625.400390625
47 16321.97265625
48 15131.7412109375
49 14039.5380859375
50 13036.40234375
51 12114.3076171875
52 11266.240234375
53 10485.5546875
54 9765.5966796875
55 9100.861328125
56 8487.162109375
57 7919.5322265625
58 7394.4833984375
59 6908.3935546875
60 6457.701171875
61 6039.5434