In [1]:
# Ali Babolhavaeji
# Compare PyTorch and TensorFlow
# on a simple two-layer fully connected (FC) neural network
# 7/7/2019

import numpy as np



# Problem dimensions: N samples per batch, D_in input features,
# H hidden units and D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialised weights of the two fully connected layers.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6

for t in range(600):
    # Forward pass: linear -> ReLU -> linear.
    h = x @ w1
    h_relu = np.maximum(h, 0)  # negative activations are clipped to zero
    y_pred = h_relu @ w2

    # Sum-of-squares loss between prediction and target.
    residual = y_pred - y
    loss = np.square(residual).sum()
    print(t, loss)

    # Backward pass: apply the chain rule by hand, from the loss back to
    # each weight matrix.
    grad_y_pred = 2.0 * residual
    grad_w2 = h_relu.T @ grad_y_pred
    grad_h_relu = grad_y_pred @ w2.T
    # ReLU passes the gradient through only where its input was not negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = x.T @ grad_h

    # Plain gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2


0 34831816.56015163
1 34575914.279136315
2 38118028.05142328
3 38406759.84523293
4 31272490.393196777
5 19675260.33534778
6 10023740.353332078
7 4777141.707017243
8 2480542.874601391
9 1518688.0400987137
10 1078980.7276639082
11 841631.393011281
12 689767.083769817
13 579688.4459068718
14 494118.933480188
15 425011.43633404945
16 368020.59797989554
17 320371.10502502014
18 280217.9034559954
19 246182.7034039523
20 217190.5106742406
21 192282.32732945462
22 170783.7666194326
23 152171.44234835697
24 135958.40770898227
25 121789.21676101041
26 109364.5845614615
27 98441.90876165574
28 88814.4308852404
29 80313.62706135235
30 72777.47451957826
31 66064.65096052599
32 60068.98446579503
33 54706.87757420363
34 49898.773960496976
35 45579.76032637006
36 41690.77474199217
37 38181.821763536216
38 35012.70675171603
39 32146.074513086656
40 29546.06203284904
41 27185.47907594876
42 25040.87700427756
43 23091.607937172295
44 21316.817599819886
45 19697.767903805954
46 18217.980403503832
47 16864

In [2]:
# print(loss)
# numpy cannot utilize GPUs to accelerate its computation.

# A Tensor is conceptually identical to a numpy array: it is an n-dimensional array.
# A Tensor can also keep track of a computational graph and gradients.

# Below, I use PyTorch to fit a two-layer network to random data.


In [3]:
import torch

dtype = torch.float
device = torch.device('cpu')
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# Batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialised weights; gradients are derived by hand below, so
# these Tensors are not tracked by autograd.
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6

for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squares loss; .item() turns the 0-d Tensor into a Python number.
    loss = torch.sum((y_pred - y) ** 2).item()
    print(t, loss)

    # Backward pass: chain rule written out by hand.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # Zero the gradient wherever the ReLU input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2




0 31468456.0
1 26551446.0
2 23531918.0
3 19659140.0
4 14867784.0
5 10166847.0
6 6526124.0
7 4115297.5
8 2662626.75
9 1811023.125
10 1306017.75
11 993097.375
12 787932.375
13 644729.5
14 539211.5
15 457816.4375
16 392953.84375
17 340060.8125
18 296124.09375
19 259201.4375
20 227871.328125
21 201121.15625
22 178143.65625
23 158291.625
24 141063.953125
25 126064.015625
26 112953.921875
27 101454.421875
28 91330.453125
29 82412.703125
30 74514.78125
31 67507.046875
32 61277.04296875
33 55723.66796875
34 50756.78515625
35 46306.58203125
36 42310.58203125
37 38718.56640625
38 35480.3828125
39 32559.142578125
40 29915.59375
41 27520.439453125
42 25347.5234375
43 23373.0390625
44 21576.04296875
45 19938.09765625
46 18443.166015625
47 17076.474609375
48 15824.0283203125
49 14676.0986328125
50 13624.14453125
51 12658.951171875
52 11772.9873046875
53 10957.6005859375
54 10206.3515625
55 9513.4658203125
56 8872.7431640625
57 8280.146484375
58 7732.1123046875
59 7223.90673828125
60 6752.68212890625

486 0.00012551953841466457
487 0.00012311412137933075
488 0.00012062439054716378
489 0.00011881873797392473
490 0.00011670697131194174
491 0.00011501891276566312
492 0.00011307183740427718
493 0.00011128957703476772
494 0.00010933560406556353
495 0.00010772822861326858
496 0.00010614385973894969
497 0.00010449755063746125
498 0.00010285170719726011
499 0.0001011282583931461


In [4]:
# Tensor and autograd

# In the above examples the forward pass and the backpropagation pass are
# implemented manually, which is fine when we have a small network.

# We want to use automatic differentiation for the backward passes of neural networks:
# https://en.wikipedia.org/wiki/Automatic_differentiation
# PyTorch provides it in its autograd package. With this package, the
# forward pass of the network defines a computational graph;
# nodes in this graph are Tensors and edges are functions.

# If x is a Tensor with x.requires_grad=True, then x.grad holds its gradient.

# In the next example, a two-layer network is implemented using autograd.


In [5]:
import torch

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# Batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Random weight Tensors. requires_grad=True asks autograd to compute
# gradients with respect to them during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)


learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear. No intermediate values need to
    # be kept by hand because autograd records the graph.
    hidden = torch.clamp(x.mm(w1), min=0)
    y_pred = hidden.mm(w2)
    loss = torch.sum((y_pred - y) ** 2)
    print(t, loss.item())

    # After this call w1.grad and w2.grad hold the gradient of the loss with
    # respect to w1 and w2.
    loss.backward()

    # Update the weights by hand. The update runs under torch.no_grad()
    # because the weights have requires_grad=True and the update itself must
    # not be recorded by autograd. torch.optim.SGD could do this step instead.
    with torch.no_grad():
        for weight in (w1, w2):
            weight.sub_(learning_rate * weight.grad)
            # Gradients accumulate across backward() calls, so reset them
            # once they have been used.
            weight.grad.zero_()
    


0 41798592.0
1 45546512.0
2 50925232.0
3 46702592.0
4 30434702.0
5 14328944.0
6 5927929.5
7 2919494.5
8 1865922.0
9 1408160.125
10 1141654.125
11 953044.875
12 806971.0
13 689336.25
14 592752.125
15 512605.375
16 445620.71875
17 389175.75
18 341333.03125
19 300626.21875
20 265772.15625
21 235745.265625
22 209755.59375
23 187196.09375
24 167517.359375
25 150289.03125
26 135178.90625
27 121882.90625
28 110123.4140625
29 99708.109375
30 90445.015625
31 82187.21875
32 74805.34375
33 68196.328125
34 62265.8046875
35 56914.7734375
36 52089.24609375
37 47733.34375
38 43790.2890625
39 40217.265625
40 36974.3046875
41 34026.93359375
42 31345.419921875
43 28900.666015625
44 26669.89453125
45 24635.373046875
46 22775.6953125
47 21074.6796875
48 19515.76953125
49 18083.978515625
50 16768.25390625
51 15558.1171875
52 14443.314453125
53 13415.7890625
54 12467.9892578125
55 11593.591796875
56 10786.01171875
57 10039.35546875
58 9349.447265625
59 8711.3017578125
60 8120.26416015625
61 7572.00927734375

In [None]:
# Each autograd function has two important parts:
# 1- forward --> computes the output Tensor from the input Tensor
# 2- backward --> receives the gradient of the output Tensor with respect to
# some scalar value and computes the gradient of the input Tensor with
# respect to that same scalar value.

# In PyTorch we define a custom autograd function by subclassing
# torch.autograd.Function and implementing the forward and backward functions.

In [28]:
import torch

class MyReLU(torch.autograd.Function):
    """
    Hand-written ReLU expressed as a custom autograd Function.

    Subclassing torch.autograd.Function and supplying static ``forward`` and
    ``backward`` methods that operate on Tensors is how a user-defined
    operation is plugged into autograd.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Return ``input`` with every negative entry replaced by zero.

        ``ctx`` is the context object shared with ``backward``. The input
        Tensor is stored on it with ``ctx.save_for_backward`` so the backward
        pass can tell which entries were negative.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Turn the gradient of the loss w.r.t. the output into the gradient of
        the loss w.r.t. the input.

        The incoming gradient is passed through unchanged, except that it is
        set to zero at every position where the saved input was negative.
        """
        (saved_input,) = ctx.saved_tensors
        # masked_fill returns a new Tensor, so grad_output is left untouched.
        return grad_output.masked_fill(saved_input < 0, 0)
        

dtype = torch.float
device = torch.device("cpu")

# Batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights; requires_grad=True makes autograd
# compute gradients with respect to them.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# A custom Function is invoked through Function.apply. The alias does not
# change between iterations, so it is bound once here instead of on every
# pass through the training loop.
relu = MyReLU.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> custom ReLU -> linear.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Sum-of-squares loss; printed as a Python number via .item().
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass; this goes through
    # MyReLU.backward for the ReLU step.
    loss.backward()

    # Update weights using gradient descent. Done under torch.no_grad() so
    # the in-place update is not recorded by autograd.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights, because
        # backward() accumulates into .grad.
        w1.grad.zero_()
        w2.grad.zero_()
    

0 29626868.0
1 27702852.0
2 31081560.0
3 34375416.0
4 32878486.0
5 24472560.0
6 14320701.0
7 7026127.5
8 3430206.0
9 1881991.75
10 1220792.0
11 901455.6875
12 718952.75
13 596613.6875
14 505336.15625
15 432949.34375
16 373624.28125
17 324213.03125
18 282641.75
19 247316.6875
20 217189.65625
21 191429.15625
22 169262.46875
23 150097.1875
24 133435.078125
25 118923.921875
26 106236.1640625
27 95108.6484375
28 85328.140625
29 76696.640625
30 69063.484375
31 62295.140625
32 56280.66796875
33 50925.21484375
34 46144.8984375
35 41874.16015625
36 38047.2421875
37 34616.328125
38 31533.126953125
39 28757.13671875
40 26253.517578125
41 23993.173828125
42 21949.501953125
43 20099.06640625
44 18420.826171875
45 16897.5859375
46 15512.826171875
47 14252.990234375
48 13105.587890625
49 12058.6279296875
50 11103.9189453125
51 10232.9755859375
52 9436.439453125
53 8707.623046875
54 8040.2548828125
55 7428.4755859375
56 6867.369140625
57 6351.77099609375
58 5878.16064453125
59 5442.62890625
60 5041.78

In [None]:
# TensorFlow: Static Graphs

# The biggest difference between the two frameworks is that
# TensorFlow's computational graphs are static,
# while PyTorch uses dynamic computational graphs.

# In TensorFlow, we define the computational graph once 
# and then execute the same graph over and over again, possibly 
# feeding different input data to the graph. In PyTorch, each 
# forward pass defines a new computational graph.

In [31]:
import tensorflow as tf
import numpy as np

# Batch size, input width, hidden width, output width.
# NOTE(review): tf.placeholder, tf.random_normal, tf.Session and
# tf.global_variables_initializer look like the TensorFlow 1.x graph-mode
# API — confirm the installed TensorFlow version before re-running this cell.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create placeholders for the input and target data; these will be filled
# with real data when we execute the graph. The leading dimension is None,
# so the batch size is not fixed in the graph.
x = tf.placeholder(tf.float32, shape=(None, D_in))
y = tf.placeholder(tf.float32, shape=(None, D_out))

# Create Variables for the weights and initialize them with random data.
# A TensorFlow Variable persists its value across executions of the graph.
w1 = tf.Variable(tf.random_normal((D_in, H)))
w2 = tf.Variable(tf.random_normal((H, D_out)))
# Forward pass: Compute the predicted y using operations on TensorFlow Tensors.
# Note that this code does not actually perform any numeric operations; it
# merely sets up the computational graph that we will later execute.
h = tf.matmul(x, w1)
# ReLU written as an element-wise maximum against a one-element zero tensor
# (presumably broadcast against h — TODO confirm).
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)

# Compute loss using operations on TensorFlow Tensors (sum of squared errors).
loss = tf.reduce_sum((y - y_pred) ** 2.0)

# Compute gradient of the loss with respect to w1 and w2.
grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])

# Update the weights using gradient descent. To actually update the weights
# we need to evaluate new_w1 and new_w2 when executing the graph. Note that
# in TensorFlow the act of updating the value of the weights is part of
# the computational graph; in PyTorch this happens outside the computational
# graph.
learning_rate = 1e-6
new_w1 = w1.assign(w1 - learning_rate * grad_w1)
new_w2 = w2.assign(w2 - learning_rate * grad_w2)


# Now we have built our computational graph, so we enter a TensorFlow session to
# actually execute the graph.
with tf.Session() as sess:
    # Run the graph once to initialize the Variables w1 and w2.
    sess.run(tf.global_variables_initializer())

    # Create numpy arrays holding the actual data for the inputs x and targets
    # y. The same arrays are fed on every iteration below.
    x_value = np.random.randn(N, D_in)
    y_value = np.random.randn(N, D_out)
    # The loop variable `_` is a throwaway; it is rebound by the tuple
    # unpacking inside the loop body, which does not affect the iteration.
    for _ in range(500):
        # Execute the graph many times. Each time it executes we want to bind
        # x_value to x and y_value to y, specified with the feed_dict argument.
        # Each time we execute the graph we want to compute the values for loss,
        # new_w1, and new_w2; the values of these Tensors are returned as numpy
        # arrays. Only the loss value is kept; the two updated weight arrays
        # are discarded.
        loss_value, _, _ = sess.run([loss, new_w1, new_w2],
                                    feed_dict={x: x_value, y: y_value})
        print(loss_value)




W0707 20:44:13.045311 140161560610560 deprecation.py:323] From /home/ali/.conda/envs/py37/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1205: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


28530208.0
22898420.0
22359064.0
23367076.0
23499546.0
21174508.0
16516468.0
11207066.0
6867215.0
4031419.2
2396695.0
1506696.0
1021842.06
747321.44
581291.06
472617.12
395872.66
338080.2
292397.88
255076.98
223886.28
197439.95
174784.0
155232.6
138259.11
123458.91
110507.61
99127.09
89106.84
80271.74
72449.85
65519.05
59349.15
53842.633
48918.402
44510.75
40553.496
36990.977
33781.145
30882.043
28262.496
25889.37
23738.234
21785.523
20011.467
18397.164
16926.955
15585.992
14362.619
13244.977
12222.386
11286.609
10428.936
9642.5625
8921.012
8258.354
7649.2715
7089.0825
6573.246
6098.4043
5660.4697
5256.7544
4884.6387
4541.0527
4223.478
3929.426
3657.4907
3405.7305
3172.5955
2956.6177
2756.3809
2570.6216
2398.2202
2238.1685
2089.471
1951.3049
1822.8994
1703.431
1592.2777
1488.8491
1392.5181
1302.8174
1219.3386
1141.5043
1068.9115
1001.1959
938.005
879.0066
823.9262
772.47144
724.3874
679.45087
637.44275
598.166
561.42664
527.06573
494.9007
464.7912
436.58768
410.16763
385.4157
362.234
3

In [None]:
# In TensorFlow, packages like Keras, TensorFlow-Slim,
# and TFLearn provide higher-level abstractions over 
# raw computational graphs that are useful for building neural networks.

#In PyTorch, the nn package serves this same purpose

# A Module receives input Tensors and computes output Tensors,
# but may also hold internal state such as Tensors containing 
# learnable parameters. The nn package also defines a set of useful 
# loss functions that are commonly used when training neural networks.

In [33]:
import torch

# Batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Build the model from the nn package. nn.Sequential is a Module that holds
# other Modules and applies them in order; each Linear Module computes a
# linear function of its input and owns its weight and bias Tensors.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# The nn package also provides common loss functions. Mean Squared Error is
# used here with reduction='sum', i.e. the squared errors are summed rather
# than averaged.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    # Forward pass. Modules override __call__, so the model is invoked like a
    # function: a Tensor of inputs goes in and a Tensor of outputs comes out.
    y_pred = model(x)

    # The loss function takes the predicted and the true values of y and
    # returns a Tensor holding the loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Clear the gradients left over from the previous iteration before
    # running the backward pass.
    model.zero_grad()

    # Backward pass. The parameters of every Module are Tensors with
    # requires_grad=True, so this computes the gradient of the loss with
    # respect to all learnable parameters of the model.
    loss.backward()

    # Gradient-descent step, performed by hand on each parameter Tensor and
    # kept out of the autograd graph via torch.no_grad().
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)



0 610.59326171875
1 568.4907836914062
2 531.9447021484375
3 499.69842529296875
4 470.82086181640625
5 444.6819763183594
6 421.02349853515625
7 399.228759765625
8 379.02874755859375
9 360.1925354003906
10 342.4465637207031
11 325.693115234375
12 309.89593505859375
13 294.9231872558594
14 280.77069091796875
15 267.2745666503906
16 254.40463256835938
17 242.1052703857422
18 230.2886505126953
19 218.97633361816406
20 208.2087860107422
21 197.90243530273438
22 188.0142822265625
23 178.54881286621094
24 169.49366760253906
25 160.834228515625
26 152.55699157714844
27 144.64683532714844
28 137.1046142578125
29 129.92333984375
30 123.07766723632812
31 116.56938171386719
32 110.37147521972656
33 104.4775161743164
34 98.87003326416016
35 93.5428237915039
36 88.49761199951172
37 83.69585418701172
38 79.14127349853516
39 74.82650756835938
40 70.73937225341797
41 66.86361694335938
42 63.19025802612305
43 59.72080993652344
44 56.4328727722168
45 53.32257843017578
46 50.38711929321289
47 47.6128959655

469 0.00017569407646078616
470 0.00017224586918018758
471 0.0001688711199676618
472 0.00016556266928091645
473 0.00016232070629484951
474 0.00015914413961581886
475 0.0001560240052640438
476 0.00015297229401767254
477 0.00014998008555267006
478 0.0001470585266361013
479 0.0001441857748432085
480 0.00014136763638816774
481 0.0001385980867780745
482 0.00013589125592261553
483 0.00013323542953003198
484 0.00013063408550806344
485 0.00012808444444090128
486 0.00012558199523482472
487 0.00012312910985201597
488 0.0001207257155328989
489 0.00011836475459858775
490 0.0001160593019449152
491 0.00011379580973880365
492 0.00011157344124512747
493 0.00010939353524008766
494 0.0001072618251782842
495 0.0001051740619004704
496 0.00010312270751455799
497 0.00010111513256561011
498 9.913914982462302e-05
499 9.721200331114233e-05


In [None]:
# Up to this point we have updated the weights of 
# our models by manually mutating the Tensors holding
# learnable parameters (with torch.no_grad()
# or .data to avoid tracking history in autograd). 


# The optim package in PyTorch abstracts the idea of an optimization
# algorithm and provides implementations of commonly used optimization
# algorithms.

# In this example we will use the nn package to define our model as before,
# but we will optimize the model using the Adam algorithm provided
# by the optim package:

In [36]:
import torch

# Batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Model and loss function, both taken from the nn package.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

loss_fn = torch.nn.MSELoss(reduction='sum')

# An Optimizer from the optim package updates the model's weights for us.
# Adam is used here; the optim package contains many other optimization
# algorithms. The first argument tells the optimizer which Tensors it is
# responsible for updating.
learning_rate = 1e-4
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: feed x through the model to get the prediction.
    y_pred = model(x)

    # Compute and print the loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Reset the gradients of every Tensor the optimizer manages (the
    # learnable weights of the model). This is needed because .backward()
    # accumulates into the gradient buffers instead of overwriting them; see
    # the docs of torch.autograd.backward for details.
    optimizer.zero_grad()

    # Backward pass: gradient of the loss with respect to the model
    # parameters.
    loss.backward()

    # One optimizer step updates the parameters from their gradients.
    optimizer.step()

0 663.5889282226562
1 646.459228515625
2 629.8394775390625
3 613.6710815429688
4 597.9603881835938
5 582.7056884765625
6 567.916748046875
7 553.5863037109375
8 539.7053833007812
9 526.2758178710938
10 513.2562255859375
11 500.64788818359375
12 488.4715270996094
13 476.5760192871094
14 464.99267578125
15 453.7293395996094
16 442.8023681640625
17 432.1817626953125
18 421.8706970214844
19 411.81024169921875
20 402.0400390625
21 392.59307861328125
22 383.44842529296875
23 374.5487060546875
24 365.86212158203125
25 357.4551086425781
26 349.2991943359375
27 341.34869384765625
28 333.6108093261719
29 326.0469970703125
30 318.6715087890625
31 311.46112060546875
32 304.4285888671875
33 297.5714416503906
34 290.8782653808594
35 284.32373046875
36 277.9141540527344
37 271.6457824707031
38 265.4954528808594
39 259.4707336425781
40 253.56874084472656
41 247.78541564941406
42 242.11964416503906
43 236.56622314453125
44 231.13232421875
45 225.81748962402344
46 220.58741760253906
47 215.4625244140625


440 3.555766738827515e-07
441 3.3006421062964364e-07
442 3.066090528136556e-07
443 2.844608388841152e-07
444 2.6415892762088333e-07
445 2.452551370879519e-07
446 2.2744654870621162e-07
447 2.1124940019490168e-07
448 1.9601777978550672e-07
449 1.8177412641762203e-07
450 1.686399571099173e-07
451 1.5643280448784935e-07
452 1.4506386492030288e-07
453 1.343967852562855e-07
454 1.2466733778637717e-07
455 1.1564938517949486e-07
456 1.0718498089090644e-07
457 9.928236721634676e-08
458 9.201831119298731e-08
459 8.518682648173126e-08
460 7.891681264027284e-08
461 7.309310490200005e-08
462 6.772359029127983e-08
463 6.268131613751393e-08
464 5.805562253158314e-08
465 5.3782219566755884e-08
466 4.9789644407383093e-08
467 4.612181214724842e-08
468 4.266382447326578e-08
469 3.950546201281213e-08
470 3.66051438049908e-08
471 3.384683111562481e-08
472 3.128772263494284e-08
473 2.8964203480086326e-08
474 2.674187271622941e-08
475 2.4702815792920774e-08
476 2.2900746188270205e-08
477 2.1206377098792473e

In [None]:
# PyTorch: Custom nn Modules

In [38]:
import torch

class TwoLayerNet(torch.nn.Module):
    """Two fully connected layers with a ReLU non-linearity between them."""

    def __init__(self, D_in, H, D_out):
        """
        Create the two nn.Linear modules and keep them as attributes.

        D_in, H and D_out are the input, hidden and output widths.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Map a Tensor of input data to a Tensor of output data.

        The Modules created in the constructor are combined here with an
        ordinary Tensor operation (clamping at zero, i.e. ReLU).
        """
        hidden = torch.clamp(self.linear1(x), min=0)
        return self.linear2(hidden)

    
# Batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Instantiate the custom Module defined above.
model = TwoLayerNet(D_in, H, D_out)

# Loss function and optimizer. model.parameters() hands the SGD constructor
# the learnable parameters of the two nn.Linear modules that are members of
# the model.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-4)

for t in range(500):
    # Forward pass: feed x through the model to get the prediction.
    y_pred = model(x)

    # Compute the loss and print it as a Python number.
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Clear old gradients, backpropagate, then let the optimizer update the
    # weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


0 633.211181640625
1 584.4163208007812
2 542.16845703125
3 505.1554870605469
4 472.06988525390625
5 442.4481201171875
6 415.5071105957031
7 390.7856750488281
8 368.0558776855469
9 346.93743896484375
10 327.2259521484375
11 308.7205810546875
12 291.2306213378906
13 274.76416015625
14 259.1430358886719
15 244.3948974609375
16 230.38055419921875
17 217.1249542236328
18 204.56077575683594
19 192.67449951171875
20 181.4388885498047
21 170.7839813232422
22 160.66757202148438
23 151.07484436035156
24 141.97320556640625
25 133.3880615234375
26 125.2781982421875
27 117.61164093017578
28 110.3675537109375
29 103.53289794921875
30 97.08149719238281
31 90.9969253540039
32 85.2608871459961
33 79.86323547363281
34 74.79491424560547
35 70.04734802246094
36 65.58991241455078
37 61.40578079223633
38 57.48429870605469
39 53.8143424987793
40 50.374942779541016
41 47.155303955078125
42 44.14723587036133
43 41.330665588378906
44 38.69758605957031
45 36.23670196533203
46 33.93800354003906
47 31.788757324218

In [None]:
# PyTorch: Control Flow + Weight Sharing


# As an example of dynamic graphs and weight sharing,
# we implement a very strange model: a fully-connected ReLU network that
# on each forward pass chooses a random number between 1 and 4 and uses 
# that many hidden layers, reusing the same weights multiple times to 
# compute the innermost hidden layer

In [41]:
import random
import torch


class DynamicNet(torch.nn.Module):
    """Fully connected ReLU network whose depth is re-drawn on every forward pass."""

    def __init__(self, D_in, H, D_out):
        """
        Create the three nn.Linear modules used by the forward pass: an input
        layer, a middle layer that may be applied repeatedly, and an output
        layer.
        """
        super().__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Run the input layer, then the middle layer 0, 1, 2 or 3 times (chosen
        at random on each call), then the output layer.

        Because every forward pass builds a fresh dynamic computation graph,
        ordinary Python control flow such as loops and conditionals can be
        used to define the model.

        The same middle_linear Module, and therefore the same weights, is
        reused for every repetition; reusing a Module several times within
        one computational graph is safe. This is a big improvement over Lua
        Torch, where each Module could be used only once.
        """
        hidden = torch.clamp(self.input_linear(x), min=0)
        repeats = random.randint(0, 3)
        for _ in range(repeats):
            hidden = torch.clamp(self.middle_linear(hidden), min=0)
        return self.output_linear(hidden)

# Batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Instantiate the custom Module defined above.
model = DynamicNet(D_in, H, D_out)


# Loss function and optimizer. This strange model is hard to train with
# vanilla stochastic gradient descent, so momentum is used.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-4, momentum=0.9)

for t in range(500):
    # Forward pass: feed x through the model to get the prediction.
    y_pred = model(x)

    # Compute the loss and print it as a Python number.
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Clear old gradients, backpropagate, then let the optimizer update the
    # weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


0 711.6954345703125
1 688.2286376953125
2 689.2037963867188
3 685.5953979492188
4 550.1834716796875
5 677.5451049804688
6 453.7674255371094
7 399.433837890625
8 339.59521484375
9 279.4507751464844
10 674.0021362304688
11 673.1001586914062
12 150.1975860595703
13 656.974853515625
14 103.40988159179688
15 668.416748046875
16 666.5835571289062
17 634.1448364257812
18 636.0421142578125
19 89.16863250732422
20 569.4816284179688
21 92.20834350585938
22 597.1493530273438
23 76.9737319946289
24 568.5272216796875
25 429.7373046875
26 611.8482055664062
27 594.8176879882812
28 336.27545166015625
29 543.8712158203125
30 63.49512481689453
31 247.32949829101562
32 445.9248046875
33 328.8293762207031
34 369.216796875
35 174.19329833984375
36 95.16177368164062
37 155.94361877441406
38 292.3994140625
39 210.6295623779297
40 97.24149322509766
41 198.59829711914062
42 129.93209838867188
43 72.09276580810547
44 100.5576400756836
45 88.79411315917969
46 78.01246643066406
47 74.44268798828125
48 62.55672073

487 0.12173724919557571
488 0.10886430740356445
489 0.9271794557571411
490 0.7783960103988647
491 0.7324033975601196
492 0.2033696472644806
493 1.11538827419281
494 0.459725022315979
495 0.4725465476512909
496 0.18160203099250793
497 1.1873395442962646
498 0.47252127528190613
499 0.28997641801834106
