In [1]:
#Tensors
#Warm-up Numpy

import numpy as np

# N is batch size; D_in is input dimension
# H is hidden dimension; D_out is output dimension

N, D_in, H, D_out= 64, 1000, 100, 10

#Create a random input and output data

x= np.random.randn(N, D_in)
y= np.random.randn(N, D_out)

In [2]:
#Randomly initialize weights
w1= np.random.randn(D_in, H)
w2= np.random.randn(H, D_out)

In [3]:
learning_rate= 1e-6
for t in range(500):
    #Forward pass: computed predicted y
    h= x.dot(w1)
    h_relu= np.maximum(h,0)
    y_pred= h_relu.dot(w2)
    
    #Compute and print loss
    loss= np.square(y_pred-y).sum()
    print(t, loss)
    
    #Backprop to compute gradients of w1 and w2 wrt loss
    grad_y_pred= 2.0*(y_pred-y)
    grad_w2= h_relu.T.dot(grad_y_pred)
    grad_h_relu= grad_y_pred.dot(w2.T)
    grad_h= grad_h_relu.copy()
    grad_h[h<0] = 0
    grad_w1= x.T.dot(grad_h)
    
    #Update Weights
    w1 -= learning_rate*grad_w1
    w2 -= learning_rate*grad_w2
    

0 33724405.56997866
1 29034222.77645179
2 26324203.653557844
3 22186272.725594405
4 16595329.298513532
5 10984878.088049687
6 6796032.390883205
7 4170553.2907033334
8 2684981.540652819
9 1854947.881631908
10 1374770.2753264857
11 1076424.6183633585
12 876396.6108915708
13 732153.6770488448
14 622211.9892843405
15 534969.9128911432
16 463813.68887134956
17 404739.9754847117
18 355058.33013927564
19 312820.00084642065
20 276654.9436625218
21 245519.18460356997
22 218597.770192486
23 195263.59360428283
24 174897.514949654
25 157043.68777869316
26 141337.87031057605
27 127481.91604019946
28 115212.98049725624
29 104323.51160947536
30 94635.97925475822
31 85998.49934142022
32 78272.79155840354
33 71348.88997017515
34 65130.92598537846
35 59533.55524718616
36 54489.551927509674
37 49936.454224688496
38 45817.53666586605
39 42087.90793995019
40 38704.115948322346
41 35630.85100344436
42 32834.233186797836
43 30289.21103173523
44 27966.443704140882
45 25847.67530843983
46 23911.082733581534
47

In [5]:
#PyTorch: Tensors

import torch

dtype= torch.float
device= torch.device("cpu")

# N is batch size; D_in is input dimension
# H is hidden dimension; D_out is output dimension

N, D_in, H, D_out = 64, 1000, 100, 10

#Create random input and output data
x= torch.randn(N, D_in, device=device, dtype=dtype)
y= torch.randn(N, D_out, device=device, dtype=dtype)

In [6]:
#Randomly initialize weights
w1= torch.randn(D_in, H, device=device, dtype=dtype)
w2= torch.randn(H, D_out, device=device, dtype=dtype)

In [10]:
learning_rate=1e-6
for t in range(500):
    #Forward Pass: Compute predicted y
    h= x.mm(w1)
    h_relu= h.clamp(min=0)
    y_pred= h_relu.mm(w2)
    
    #Compute and print loss
    loss= (y_pred-y).pow(2).sum().item()
    print(t, loss)
    
    #backprop to compute gradients of w1 and w2wrt loss
    grad_y_pred= 2.0*(y_pred-y)
    grad_w2= h_relu.t().mm(grad_y_pred)
    grad_h_relu= grad_y_pred.mm(w2.t())
    grad_h= grad_h_relu.clone()
    grad_h[h<0] = 0
    grad_w1= x.t().mm(grad_h)
    
    #Update weights using gradient descent
    w1-= learning_rate*grad_w1
    w2-= learning_rate*grad_w2
    
    
    

0 1.4963567082304507e-05
1 1.4919565728632733e-05
2 1.4802921214140952e-05
3 1.4692417607875541e-05
4 1.4659665794169996e-05
5 1.4490239664155524e-05
6 1.4422346794162877e-05
7 1.4312360690382775e-05
8 1.414841699443059e-05
9 1.4026240023667924e-05
10 1.3881945960747544e-05
11 1.3822695109411143e-05
12 1.3765581570623908e-05
13 1.359191446681507e-05
14 1.3545101865020115e-05
15 1.3502329238690436e-05
16 1.3353926988202147e-05
17 1.3286577086546458e-05
18 1.3227136150817387e-05
19 1.3076587492832914e-05
20 1.2986108231416438e-05
21 1.2878766938229091e-05
22 1.2819640687666833e-05
23 1.2762587175529916e-05
24 1.2656352737394627e-05
25 1.2496350791479927e-05
26 1.2397992577461991e-05
27 1.2280785085749812e-05
28 1.2172295100754127e-05
29 1.2055683328071609e-05
30 1.1939499927393626e-05
31 1.1851922863570508e-05
32 1.1844873370137066e-05
33 1.1774861377489287e-05
34 1.1704277312674094e-05
35 1.1664449630188756e-05
36 1.1573486517590936e-05
37 1.1467416697996669e-05
38 1.1519140571181197e-0

458 2.447185124765383e-06
459 2.4374585336772725e-06
460 2.4337141439900734e-06
461 2.4253349693026394e-06
462 2.433951522107236e-06
463 2.427693061690661e-06
464 2.4196833692258224e-06
465 2.420319788143388e-06
466 2.4098633275571046e-06
467 2.413933543721214e-06
468 2.400756329734577e-06
469 2.402754944341723e-06
470 2.402638074272545e-06
471 2.412756202829769e-06
472 2.4041773940552957e-06
473 2.4066796413535485e-06
474 2.4097596451611025e-06
475 2.396351874267566e-06
476 2.382300635872525e-06
477 2.3735958620818565e-06
478 2.368602281421772e-06
479 2.362809482292505e-06
480 2.359372047067154e-06
481 2.364209422012209e-06
482 2.3550917376269354e-06
483 2.3510417577199405e-06
484 2.342691004741937e-06
485 2.346961082366761e-06
486 2.3466991478926502e-06
487 2.3408631477650488e-06
488 2.321839019714389e-06
489 2.314321136509534e-06
490 2.3182651602837723e-06
491 2.313372533535585e-06
492 2.3190004867501557e-06
493 2.3028098894428695e-06
494 2.2948374862608034e-06
495 2.296058937645284

In [11]:
#PyTorch tensors and AutoGrad

import torch

dtype= torch.float
device= torch.device("cpu")

# N is batch size; D_in is input dimension
# H is hidden dimension; D_out is output dimension
N, D_in, H, D_out= 64,1000, 100, 10

In [12]:
# Create random Tensors to hold input and outputs.
# Setting requires_grad=False indicates that we do not need to compute gradients
# with respect to these Tensors during the backward pass.

x= torch.randn(N, D_in, device=device, dtype=dtype)
y= torch.randn(N, D_out, device=device, dtype=dtype)

In [13]:
# Create random Tensors for weights.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.

w1= torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2= torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

In [15]:
learning_rate= 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Tensors; these
    # are exactly the same operations we used to compute the forward pass using
    # Tensors, but we do not need to keep references to intermediate values since
    # we are not implementing the backward pass by hand.
    y_pred= x.mm(w1).clamp(min=0).mm(w2)
    
    # Compute and print loss using operations on Tensors.
    # Now loss is a Tensor of shape (1,)
    # loss.item() gets the a scalar value held in the loss.
    loss= (y_pred-y).pow(2).sum()
    print(t, loss)
    
     # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call w1.grad and w2.grad will be Tensors holding the gradient
    # of the loss with respect to w1 and w2 respectively.
    loss.backward()
    
    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but we don't need to track this
    # in autograd.
    # An alternative way is to operate on weight.data and weight.grad.data.
    # Recall that tensor.data gives a tensor that shares the storage with
    # tensor, but doesn't track history.
    # You can also use torch.optim.SGD to achieve this.
    with torch.no_grad():
        w1-= learning_rate*w1.grad
        w2-= learning_rate*w2.grad
        
        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()

0 tensor(27076118., grad_fn=<SumBackward0>)
1 tensor(10821847., grad_fn=<SumBackward0>)
2 tensor(9439236., grad_fn=<SumBackward0>)
3 tensor(8232111., grad_fn=<SumBackward0>)
4 tensor(7073195., grad_fn=<SumBackward0>)
5 tensor(5910679., grad_fn=<SumBackward0>)
6 tensor(4813059.5000, grad_fn=<SumBackward0>)
7 tensor(3819019., grad_fn=<SumBackward0>)
8 tensor(2978602.5000, grad_fn=<SumBackward0>)
9 tensor(2295171.2500, grad_fn=<SumBackward0>)
10 tensor(1761792.6250, grad_fn=<SumBackward0>)
11 tensor(1353413.2500, grad_fn=<SumBackward0>)
12 tensor(1047102.6875, grad_fn=<SumBackward0>)
13 tensor(818210.7500, grad_fn=<SumBackward0>)
14 tensor(647757.7500, grad_fn=<SumBackward0>)
15 tensor(520219.6250, grad_fn=<SumBackward0>)
16 tensor(424035.5000, grad_fn=<SumBackward0>)
17 tensor(350614.7188, grad_fn=<SumBackward0>)
18 tensor(293822.1250, grad_fn=<SumBackward0>)
19 tensor(249229.8906, grad_fn=<SumBackward0>)
20 tensor(213693.7969, grad_fn=<SumBackward0>)
21 tensor(184982.2969, grad_fn=<SumB

291 tensor(0.0252, grad_fn=<SumBackward0>)
292 tensor(0.0241, grad_fn=<SumBackward0>)
293 tensor(0.0231, grad_fn=<SumBackward0>)
294 tensor(0.0221, grad_fn=<SumBackward0>)
295 tensor(0.0211, grad_fn=<SumBackward0>)
296 tensor(0.0202, grad_fn=<SumBackward0>)
297 tensor(0.0193, grad_fn=<SumBackward0>)
298 tensor(0.0185, grad_fn=<SumBackward0>)
299 tensor(0.0177, grad_fn=<SumBackward0>)
300 tensor(0.0169, grad_fn=<SumBackward0>)
301 tensor(0.0162, grad_fn=<SumBackward0>)
302 tensor(0.0155, grad_fn=<SumBackward0>)
303 tensor(0.0148, grad_fn=<SumBackward0>)
304 tensor(0.0142, grad_fn=<SumBackward0>)
305 tensor(0.0136, grad_fn=<SumBackward0>)
306 tensor(0.0130, grad_fn=<SumBackward0>)
307 tensor(0.0124, grad_fn=<SumBackward0>)
308 tensor(0.0119, grad_fn=<SumBackward0>)
309 tensor(0.0114, grad_fn=<SumBackward0>)
310 tensor(0.0109, grad_fn=<SumBackward0>)
311 tensor(0.0104, grad_fn=<SumBackward0>)
312 tensor(0.0100, grad_fn=<SumBackward0>)
313 tensor(0.0096, grad_fn=<SumBackward0>)
314 tensor(

In [43]:
#PyTorch: Defining new autograd functions

import torch

class MyReLU(torch.autograd.Function):
    
    """
    We can implement our own custom autograd Functions by subclassing
    torch.autograd.Function and implementing the forward and backward passes
    which operate on Tensors.
    """
    
    @staticmethod
    def forward(ctx, input):
        """
        In the forward pass we receive a Tensor containing the input and return
        a Tensor containing the output. ctx is a context object that can be used
        to stash information for backward computation. You can cache arbitrary
        objects for use in the backward pass using the ctx.save_for_backward method.
        """
        ctx.save_for_backward(input)
        return input.clamp(min=0)
    
    @staticmethod
    def backward(ctx, grad_output):
        """
        In the backward pass we receive a Tensor containing the gradient of the loss
        with respect to the output, and we need to compute the gradient of the loss
        with respect to the input.
        """
        input, =ctx.saved_tensors
        grad_input= grad_output.clone()
        grad_input[input<0] = 0
        return grad_input
    
dtype= torch.float
device= torch.device("cpu")
    
    
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, D_out, H= 64, 1000, 10, 100
    
# Create random Tensors to hold input and outputs.
x= torch.randn(N, D_in, dtype=dtype, device=device)
y= torch.randn(N, D_out, dtype=dtype, device=device)
    
# Create random Tensors for weights.
w1= torch.randn(D_in, H, dtype=dtype, device=device, requires_grad=True)
w2= torch.randn(H, D_out, dtype=dtype, device=device, requires_grad=True)
    
learning_rate=1e-6
for t in range(500):
# To apply our Function, we use Function.apply method. We alias this as 'relu'
    relu= MyReLU.apply
        
    # Forward pass: compute predicted y using operations; we compute
    # ReLU using our custom autograd operation.
    y_pred= relu(x.mm(w1)).mm(w2)
        
    #Compute and print loss
    loss= (y_pred-y).pow(2).sum()
    print(t, loss.item())
        
    # Use autograd to compute the backward pass.
    loss.backward()
        
    # Update weights using gradient descent
    with torch.no_grad():
        w1-= learning_rate*w1.grad
        w2-= learning_rate*w2.grad
            
        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()

0 39202548.0
1 38869392.0
2 40418644.0
3 36323232.0
4 24806634.0
5 13298797.0
6 6285913.0
7 3244526.5
8 2003435.5
9 1444873.0
10 1136402.25
11 932296.25
12 781151.4375
13 662682.75
14 566854.9375
15 488099.9375
16 422732.15625
17 368071.90625
18 321945.90625
19 282776.34375
20 249328.90625
21 220633.8125
22 195894.109375
23 174463.859375
24 155839.015625
25 139577.703125
26 125337.0
27 112825.40625
28 101786.828125
29 92023.578125
30 83363.3828125
31 75668.84375
32 68808.578125
33 62673.16796875
34 57173.6875
35 52238.03515625
36 47794.71875
37 43788.46875
38 40165.125
39 36886.578125
40 33915.11328125
41 31217.94921875
42 28767.263671875
43 26535.677734375
44 24502.185546875
45 22646.27734375
46 20950.484375
47 19399.34375
48 17978.916015625
49 16675.9140625
50 15480.0703125
51 14380.9716796875
52 13369.3857421875
53 12437.9013671875
54 11579.73828125
55 10788.537109375
56 10058.1357421875
57 9383.23046875
58 8759.0302734375
59 8181.224609375
60 7645.90283203125
61 7149.94189453125
62

418 0.006252708379179239
419 0.006062432192265987
420 0.005877060350030661
421 0.005690125282853842
422 0.005513648968189955
423 0.0053383405320346355
424 0.005171135067939758
425 0.005007625557482243
426 0.004854121245443821
427 0.004702413454651833
428 0.004558888264000416
429 0.00441961782053113
430 0.004285200033336878
431 0.004152967128902674
432 0.00402637617662549
433 0.0039043258875608444
434 0.003785183420404792
435 0.0036734214518219233
436 0.003557545365765691
437 0.0034542393404990435
438 0.003349514678120613
439 0.0032488051801919937
440 0.0031539173796772957
441 0.003055509179830551
442 0.0029652882367372513
443 0.0028756295796483755
444 0.0027953425887972116
445 0.0027126597706228495
446 0.0026371439453214407
447 0.002559192478656769
448 0.0024855947121977806
449 0.0024118702858686447
450 0.0023437472991645336
451 0.00227629323489964
452 0.0022116098552942276
453 0.0021505337208509445
454 0.0020875202026218176
455 0.002029707422479987
456 0.001970677636563778
457 0.00191

In [52]:
#TensorFlow: Static Graphs

import tensorflow as tf
import numpy as np

# First we set up the computational graph:

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, D_out, H= 64, 1000, 10, 100

# Create placeholders for the input and target data; these will be filled
# with real data when we execute the graph.
x= tf.placeholder(tf.float32, shape=(None, D_in))
y= tf.placeholder(tf.float32, shape=(None, D_out))

# Create Variables for the weights and initialize them with random data.
# A TensorFlow Variable persists its value across executions of the graph.
w1= tf.Variable(tf.random_normal((D_in, H)))
w2= tf.Variable(tf.random_normal((H, D_out)))

# Forward pass: Compute the predicted y using operations on TensorFlow Tensors.
# Note that this code does not actually perform any numeric operations; it
# merely sets up the computational graph that we will later execute.

h= tf.matmul(x, w1)
h_relu= tf.maximum(h, tf.zeros(1))
y_pred= tf.matmul(h_relu, w2)

# Compute loss using operations on TensorFlow Tensors
loss= tf.reduce_sum((y-y_pred)**2.0)

# Compute gradient of the loss with respect to w1 and w2.
grad_w1, grad_w2= tf.gradients(loss, [w1,w2])

# Update the weights using gradient descent. To actually update the weights
# we need to evaluate new_w1 and new_w2 when executing the graph. Note that
# in TensorFlow the the act of updating the value of the weights is part of
# the computational graph; in PyTorch this happens outside the computational
# graph.

learning_rate=1e-6
new_w1= w1.assign(w1- learning_rate*grad_w1)
new_w2= w2.assign(w2- learning_rate*grad_w2)

# Now we have built our computational graph, so we enter a TensorFlow session to
# actually execute the graph.

with tf.Session() as sess:
    # Run the graph once to initialize the Variables w1 and w2.
    sess.run(tf.global_variables_initializer())
    
    # Create numpy arrays holding the actual data for the inputs x and targets y
    x_value= np.random.randn(N, D_in)
    y_value= np.random.randn(N, D_out)
    
    for t in range(500):
         # Execute the graph many times. Each time it executes we want to bind
        # x_value to x and y_value to y, specified with the feed_dict argument.
        # Each time we execute the graph we want to compute the values for loss,
        # new_w1, and new_w2; the values of these Tensors are returned as numpy
        # arrays.
        loss_value, _, _=sess.run([loss, new_w1, new_w2],
                                 feed_dict={x: x_value, y: y_value})
        print(loss_value)


32437714.0
31087796.0
31969452.0
30320974.0
24492516.0
16309620.0
9416295.0
5136260.0
2931730.0
1852920.6
1309648.4
1008030.6
819220.0
686959.75
586604.6
506650.94
440986.25
386062.28
339594.78
300012.75
266119.3
236887.89
211552.88
189499.1
170255.97
153414.39
138569.06
125440.93
113803.84
103450.49
94232.555
86004.67
78635.36
72014.24
66051.39
60670.68
55807.453
51400.14
47402.562
43771.21
40466.016
37452.29
34701.582
32187.875
29886.383
27776.129
25841.66
24063.729
22428.197
20923.205
19534.473
18251.719
17065.75
15968.168
14951.137
14008.277
13133.701
12320.924
11565.887
10863.232
10209.099
9599.549
9031.041
8500.685
8005.334
7542.383
7109.5986
6704.615
6325.4297
5970.314
5639.4316
5330.045
5039.6904
4767.13
4510.995
4270.2354
4043.8037
3830.7803
3630.213
3441.3042
3263.3274
3095.5095
2937.3022
2787.9434
2646.9663
2513.8403
2388.0967
2269.273
2156.8992
2050.6475
1950.1099
1855.0011
1764.9928
1679.7422
1599.0061
1522.4697
1449.9429
1381.147
1315.8961
1253.9761
1195.2863
1139.5956
10

In [58]:
#PyTorch: nn Module
    

import torch

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, D_out, H= 64, 1000, 10, 100

# Create random Tensors to hold inputs and outputs
x= torch.randn(N, D_in)
y=torch.randn(N, D_out)

# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. Each Linear Module computes output from input using a
# linear function, and holds internal Tensors for its weight and bias.
model= torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out))

# The nn package also contains definitions of popular loss functions; in this
# case we will use Mean Squared Error (MSE) as our loss function.
loss_fn= torch.nn.MSELoss(reduction='sum')

learning_rate=1e-6
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model. Module objects
    # override the __call__ operator so you can call them like functions. When
    # doing so you pass a Tensor of input data to the Module and it produces
    # a Tensor of output data.
    y_pred= model(x)
    
    # Compute and print loss. We pass Tensors containing the predicted and true
    # values of y, and the loss function returns a Tensor containing the
    # loss.
    loss= loss_fn(y_pred,y)
    print(t, loss)
    
    # Zero the gradients before running the backward pass.
    model.zero_grad()
    
    # Backward pass: compute gradient of the loss with respect to all the learnable
    # parameters of the model. Internally, the parameters of each Module are stored
    # in Tensors with requires_grad=True, so this call will compute gradients for
    # all learnable parameters in the model.
    loss.backward()
    
    # Update the weights using gradient descent. Each parameter is a Tensor, so
    # we can access its gradients like we did before.
    with torch.no_grad():
        for param in model.parameters():
            param-=param.grad*learning_rate

0 tensor(745.7253, grad_fn=<MseLossBackward>)
1 tensor(745.1293, grad_fn=<MseLossBackward>)
2 tensor(744.5339, grad_fn=<MseLossBackward>)
3 tensor(743.9393, grad_fn=<MseLossBackward>)
4 tensor(743.3455, grad_fn=<MseLossBackward>)
5 tensor(742.7532, grad_fn=<MseLossBackward>)
6 tensor(742.1618, grad_fn=<MseLossBackward>)
7 tensor(741.5711, grad_fn=<MseLossBackward>)
8 tensor(740.9813, grad_fn=<MseLossBackward>)
9 tensor(740.3922, grad_fn=<MseLossBackward>)
10 tensor(739.8038, grad_fn=<MseLossBackward>)
11 tensor(739.2162, grad_fn=<MseLossBackward>)
12 tensor(738.6291, grad_fn=<MseLossBackward>)
13 tensor(738.0427, grad_fn=<MseLossBackward>)
14 tensor(737.4571, grad_fn=<MseLossBackward>)
15 tensor(736.8723, grad_fn=<MseLossBackward>)
16 tensor(736.2883, grad_fn=<MseLossBackward>)
17 tensor(735.7051, grad_fn=<MseLossBackward>)
18 tensor(735.1227, grad_fn=<MseLossBackward>)
19 tensor(734.5411, grad_fn=<MseLossBackward>)
20 tensor(733.9604, grad_fn=<MseLossBackward>)
21 tensor(733.3808, gra

252 tensor(618.0703, grad_fn=<MseLossBackward>)
253 tensor(617.6411, grad_fn=<MseLossBackward>)
254 tensor(617.2124, grad_fn=<MseLossBackward>)
255 tensor(616.7840, grad_fn=<MseLossBackward>)
256 tensor(616.3561, grad_fn=<MseLossBackward>)
257 tensor(615.9288, grad_fn=<MseLossBackward>)
258 tensor(615.5023, grad_fn=<MseLossBackward>)
259 tensor(615.0765, grad_fn=<MseLossBackward>)
260 tensor(614.6510, grad_fn=<MseLossBackward>)
261 tensor(614.2261, grad_fn=<MseLossBackward>)
262 tensor(613.8015, grad_fn=<MseLossBackward>)
263 tensor(613.3776, grad_fn=<MseLossBackward>)
264 tensor(612.9541, grad_fn=<MseLossBackward>)
265 tensor(612.5308, grad_fn=<MseLossBackward>)
266 tensor(612.1077, grad_fn=<MseLossBackward>)
267 tensor(611.6853, grad_fn=<MseLossBackward>)
268 tensor(611.2631, grad_fn=<MseLossBackward>)
269 tensor(610.8412, grad_fn=<MseLossBackward>)
270 tensor(610.4200, grad_fn=<MseLossBackward>)
271 tensor(609.9990, grad_fn=<MseLossBackward>)
272 tensor(609.5784, grad_fn=<MseLossBac

In [59]:
#PyTorch: optim

import torch

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, D_out, H= 64, 1000, 10, 100

# Create random Tensors to hold inputs and outputs
x= torch.randn(N, D_in)
y= torch.randn(N, D_out)

# Use the nn package to define our model and loss function.
model= torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out))

loss_fn= torch.nn.MSELoss(reduction='sum')

# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use Adam; the optim package contains many other
# optimization algoriths. The first argument to the Adam constructor tells the
# optimizer which Tensors it should update.

learning_rate=1e-6
optimizer=torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred= model(x)
    
    # Compute and print loss.
    loss= loss_fn(y_pred, y)
    print(t, loss.item())
    
    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update (which are the learnable
    # weights of the model). This is because by default, gradients are
    # accumulated in buffers( i.e, not overwritten) whenever .backward()
    # is called. Checkout docs of torch.autograd.backward for more details.
    optimizer.zero_grad()
    
    # Backward pass: compute gradient of the loss with respect to model
    # parameters
    loss.backward()
    
    # Calling the step function on an Optimizer makes an update to its
    # parameters
    optimizer.step()

0 735.410888671875
1 735.2228393554688
2 735.034912109375
3 734.846923828125
4 734.6589965820312
5 734.47119140625
6 734.2833862304688
7 734.0955810546875
8 733.9078369140625
9 733.7200927734375
10 733.5324096679688
11 733.3447875976562
12 733.1572875976562
13 732.9700317382812
14 732.7827758789062
15 732.5955810546875
16 732.408447265625
17 732.2212524414062
18 732.0341186523438
19 731.8470458984375
20 731.6600341796875
21 731.4730834960938
22 731.2861328125
23 731.0993041992188
24 730.9124755859375
25 730.7256469726562
26 730.5389404296875
27 730.3524169921875
28 730.1659545898438
29 729.9795532226562
30 729.793212890625
31 729.6068725585938
32 729.4205932617188
33 729.2343139648438
34 729.0480346679688
35 728.86181640625
36 728.6756591796875
37 728.4895629882812
38 728.3035278320312
39 728.1175537109375
40 727.9315185546875
41 727.74560546875
42 727.5597534179688
43 727.3740234375
44 727.1883544921875
45 727.002685546875
46 726.8170776367188
47 726.6315307617188
48 726.4459838867188

492 649.4932250976562
493 649.3314208984375
494 649.169677734375
495 649.0081176757812
496 648.8464965820312
497 648.6849365234375
498 648.5234375
499 648.362060546875


In [63]:
#PyTorch: Custom nn Modules

import torch

class TwoLayerNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        """
        In the constructor we instantiate two nn.Linear modules and assign them as
        member variables.
        """
        super(TwoLayerNet, self).__init__()
        self.linear1= torch.nn.Linear(D_in, H)
        self.linear2= torch.nn.Linear(H, D_out)
        
    def forward(self, x):
        """
        In the forward function we accept a Tensor of input data and we must return
        a Tensor of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Tensors.
        """
        h_relu= self.linear1(x).clamp(min=0)
        y_pred= self.linear2(h_relu)
        return y_pred
    
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
n, D_in, D_out, H= 64, 1000, 10, 100

# Create random Tensors to hold inputs and outputs
x= torch.randn(N, D_in)
y= torch.randn(N, D_out)

# Construct our model by instantiating the class defined above
model= TwoLayerNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters of the two
# nn.Linear modules which are members of the model.
criterion= torch.nn.MSELoss(reduction='sum')
optimizer= torch.optim.SGD(model.parameters(), lr=1e-4)

for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred= model(x)
    
    # Compute and print loss
    loss= criterion(y_pred, y)
    print(y, loss)
    
    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

tensor([[-0.8105, -1.3391, -1.1458,  0.2609,  1.9668,  0.9404,  0.4225, -1.1355,
          1.1398, -1.0651],
        [-0.2697,  0.8769, -2.2202,  0.6719,  1.5369, -0.2797,  1.8110,  1.2522,
          1.0812,  0.4714],
        [-0.9371, -0.5206,  1.2659, -0.4720,  1.2975,  0.4455,  0.9960,  0.4565,
          0.0153,  0.4935],
        [ 0.9244, -0.1092,  1.9596, -0.3080,  0.9493,  0.4979, -0.0641,  1.6074,
         -1.7428, -0.5953],
        [ 0.1003, -0.2959,  0.2097, -0.0248, -1.2706, -0.0570, -0.6627,  0.1261,
         -0.1403,  0.1411],
        [ 0.1821,  0.7885, -0.3916, -0.0237, -0.0515, -0.3283, -0.3274,  1.7532,
         -0.2324,  0.1220],
        [ 0.4003, -0.0009,  1.2339, -0.5424,  0.5311,  0.4586,  2.2318, -1.6105,
         -0.5990,  0.3998],
        [ 0.7088,  2.1017,  0.7638, -1.9396, -2.0021, -0.9541, -1.3842,  0.3042,
          0.4627, -0.0162],
        [ 1.4741,  1.7762,  0.7077, -0.9738, -1.1907,  0.2450, -0.5634, -0.5015,
         -1.1932, -1.4105],
        [ 0.3444, -

          1.4218,  0.5596]]) tensor(24.2641, grad_fn=<MseLossBackward>)
tensor([[-0.8105, -1.3391, -1.1458,  0.2609,  1.9668,  0.9404,  0.4225, -1.1355,
          1.1398, -1.0651],
        [-0.2697,  0.8769, -2.2202,  0.6719,  1.5369, -0.2797,  1.8110,  1.2522,
          1.0812,  0.4714],
        [-0.9371, -0.5206,  1.2659, -0.4720,  1.2975,  0.4455,  0.9960,  0.4565,
          0.0153,  0.4935],
        [ 0.9244, -0.1092,  1.9596, -0.3080,  0.9493,  0.4979, -0.0641,  1.6074,
         -1.7428, -0.5953],
        [ 0.1003, -0.2959,  0.2097, -0.0248, -1.2706, -0.0570, -0.6627,  0.1261,
         -0.1403,  0.1411],
        [ 0.1821,  0.7885, -0.3916, -0.0237, -0.0515, -0.3283, -0.3274,  1.7532,
         -0.2324,  0.1220],
        [ 0.4003, -0.0009,  1.2339, -0.5424,  0.5311,  0.4586,  2.2318, -1.6105,
         -0.5990,  0.3998],
        [ 0.7088,  2.1017,  0.7638, -1.9396, -2.0021, -0.9541, -1.3842,  0.3042,
          0.4627, -0.0162],
        [ 1.4741,  1.7762,  0.7077, -0.9738, -1.1907,  0

          1.4218,  0.5596]]) tensor(2.6938, grad_fn=<MseLossBackward>)
tensor([[-0.8105, -1.3391, -1.1458,  0.2609,  1.9668,  0.9404,  0.4225, -1.1355,
          1.1398, -1.0651],
        [-0.2697,  0.8769, -2.2202,  0.6719,  1.5369, -0.2797,  1.8110,  1.2522,
          1.0812,  0.4714],
        [-0.9371, -0.5206,  1.2659, -0.4720,  1.2975,  0.4455,  0.9960,  0.4565,
          0.0153,  0.4935],
        [ 0.9244, -0.1092,  1.9596, -0.3080,  0.9493,  0.4979, -0.0641,  1.6074,
         -1.7428, -0.5953],
        [ 0.1003, -0.2959,  0.2097, -0.0248, -1.2706, -0.0570, -0.6627,  0.1261,
         -0.1403,  0.1411],
        [ 0.1821,  0.7885, -0.3916, -0.0237, -0.0515, -0.3283, -0.3274,  1.7532,
         -0.2324,  0.1220],
        [ 0.4003, -0.0009,  1.2339, -0.5424,  0.5311,  0.4586,  2.2318, -1.6105,
         -0.5990,  0.3998],
        [ 0.7088,  2.1017,  0.7638, -1.9396, -2.0021, -0.9541, -1.3842,  0.3042,
          0.4627, -0.0162],
        [ 1.4741,  1.7762,  0.7077, -0.9738, -1.1907,  0.

          1.4218,  0.5596]]) tensor(0.1903, grad_fn=<MseLossBackward>)
tensor([[-0.8105, -1.3391, -1.1458,  0.2609,  1.9668,  0.9404,  0.4225, -1.1355,
          1.1398, -1.0651],
        [-0.2697,  0.8769, -2.2202,  0.6719,  1.5369, -0.2797,  1.8110,  1.2522,
          1.0812,  0.4714],
        [-0.9371, -0.5206,  1.2659, -0.4720,  1.2975,  0.4455,  0.9960,  0.4565,
          0.0153,  0.4935],
        [ 0.9244, -0.1092,  1.9596, -0.3080,  0.9493,  0.4979, -0.0641,  1.6074,
         -1.7428, -0.5953],
        [ 0.1003, -0.2959,  0.2097, -0.0248, -1.2706, -0.0570, -0.6627,  0.1261,
         -0.1403,  0.1411],
        [ 0.1821,  0.7885, -0.3916, -0.0237, -0.0515, -0.3283, -0.3274,  1.7532,
         -0.2324,  0.1220],
        [ 0.4003, -0.0009,  1.2339, -0.5424,  0.5311,  0.4586,  2.2318, -1.6105,
         -0.5990,  0.3998],
        [ 0.7088,  2.1017,  0.7638, -1.9396, -2.0021, -0.9541, -1.3842,  0.3042,
          0.4627, -0.0162],
        [ 1.4741,  1.7762,  0.7077, -0.9738, -1.1907,  0.

          1.4218,  0.5596]]) tensor(0.0209, grad_fn=<MseLossBackward>)
tensor([[-0.8105, -1.3391, -1.1458,  0.2609,  1.9668,  0.9404,  0.4225, -1.1355,
          1.1398, -1.0651],
        [-0.2697,  0.8769, -2.2202,  0.6719,  1.5369, -0.2797,  1.8110,  1.2522,
          1.0812,  0.4714],
        [-0.9371, -0.5206,  1.2659, -0.4720,  1.2975,  0.4455,  0.9960,  0.4565,
          0.0153,  0.4935],
        [ 0.9244, -0.1092,  1.9596, -0.3080,  0.9493,  0.4979, -0.0641,  1.6074,
         -1.7428, -0.5953],
        [ 0.1003, -0.2959,  0.2097, -0.0248, -1.2706, -0.0570, -0.6627,  0.1261,
         -0.1403,  0.1411],
        [ 0.1821,  0.7885, -0.3916, -0.0237, -0.0515, -0.3283, -0.3274,  1.7532,
         -0.2324,  0.1220],
        [ 0.4003, -0.0009,  1.2339, -0.5424,  0.5311,  0.4586,  2.2318, -1.6105,
         -0.5990,  0.3998],
        [ 0.7088,  2.1017,  0.7638, -1.9396, -2.0021, -0.9541, -1.3842,  0.3042,
          0.4627, -0.0162],
        [ 1.4741,  1.7762,  0.7077, -0.9738, -1.1907,  0.

          1.4218,  0.5596]]) tensor(0.0027, grad_fn=<MseLossBackward>)
tensor([[-0.8105, -1.3391, -1.1458,  0.2609,  1.9668,  0.9404,  0.4225, -1.1355,
          1.1398, -1.0651],
        [-0.2697,  0.8769, -2.2202,  0.6719,  1.5369, -0.2797,  1.8110,  1.2522,
          1.0812,  0.4714],
        [-0.9371, -0.5206,  1.2659, -0.4720,  1.2975,  0.4455,  0.9960,  0.4565,
          0.0153,  0.4935],
        [ 0.9244, -0.1092,  1.9596, -0.3080,  0.9493,  0.4979, -0.0641,  1.6074,
         -1.7428, -0.5953],
        [ 0.1003, -0.2959,  0.2097, -0.0248, -1.2706, -0.0570, -0.6627,  0.1261,
         -0.1403,  0.1411],
        [ 0.1821,  0.7885, -0.3916, -0.0237, -0.0515, -0.3283, -0.3274,  1.7532,
         -0.2324,  0.1220],
        [ 0.4003, -0.0009,  1.2339, -0.5424,  0.5311,  0.4586,  2.2318, -1.6105,
         -0.5990,  0.3998],
        [ 0.7088,  2.1017,  0.7638, -1.9396, -2.0021, -0.9541, -1.3842,  0.3042,
          0.4627, -0.0162],
        [ 1.4741,  1.7762,  0.7077, -0.9738, -1.1907,  0.

          1.4218,  0.5596]]) tensor(0.0004, grad_fn=<MseLossBackward>)
tensor([[-0.8105, -1.3391, -1.1458,  0.2609,  1.9668,  0.9404,  0.4225, -1.1355,
          1.1398, -1.0651],
        [-0.2697,  0.8769, -2.2202,  0.6719,  1.5369, -0.2797,  1.8110,  1.2522,
          1.0812,  0.4714],
        [-0.9371, -0.5206,  1.2659, -0.4720,  1.2975,  0.4455,  0.9960,  0.4565,
          0.0153,  0.4935],
        [ 0.9244, -0.1092,  1.9596, -0.3080,  0.9493,  0.4979, -0.0641,  1.6074,
         -1.7428, -0.5953],
        [ 0.1003, -0.2959,  0.2097, -0.0248, -1.2706, -0.0570, -0.6627,  0.1261,
         -0.1403,  0.1411],
        [ 0.1821,  0.7885, -0.3916, -0.0237, -0.0515, -0.3283, -0.3274,  1.7532,
         -0.2324,  0.1220],
        [ 0.4003, -0.0009,  1.2339, -0.5424,  0.5311,  0.4586,  2.2318, -1.6105,
         -0.5990,  0.3998],
        [ 0.7088,  2.1017,  0.7638, -1.9396, -2.0021, -0.9541, -1.3842,  0.3042,
          0.4627, -0.0162],
        [ 1.4741,  1.7762,  0.7077, -0.9738, -1.1907,  0.

          1.4218,  0.5596]]) tensor(0.0001, grad_fn=<MseLossBackward>)
tensor([[-0.8105, -1.3391, -1.1458,  0.2609,  1.9668,  0.9404,  0.4225, -1.1355,
          1.1398, -1.0651],
        [-0.2697,  0.8769, -2.2202,  0.6719,  1.5369, -0.2797,  1.8110,  1.2522,
          1.0812,  0.4714],
        [-0.9371, -0.5206,  1.2659, -0.4720,  1.2975,  0.4455,  0.9960,  0.4565,
          0.0153,  0.4935],
        [ 0.9244, -0.1092,  1.9596, -0.3080,  0.9493,  0.4979, -0.0641,  1.6074,
         -1.7428, -0.5953],
        [ 0.1003, -0.2959,  0.2097, -0.0248, -1.2706, -0.0570, -0.6627,  0.1261,
         -0.1403,  0.1411],
        [ 0.1821,  0.7885, -0.3916, -0.0237, -0.0515, -0.3283, -0.3274,  1.7532,
         -0.2324,  0.1220],
        [ 0.4003, -0.0009,  1.2339, -0.5424,  0.5311,  0.4586,  2.2318, -1.6105,
         -0.5990,  0.3998],
        [ 0.7088,  2.1017,  0.7638, -1.9396, -2.0021, -0.9541, -1.3842,  0.3042,
          0.4627, -0.0162],
        [ 1.4741,  1.7762,  0.7077, -0.9738, -1.1907,  0.

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [2]:
#PyTorch: Control Flow + Weight Sharing

import torch
import random

class DynamicNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        """
        In the constructor we construct three nn.Linear instances that we will use
        in the forward pass.
        """
        super(DynamicNet, self).__init__()
        self.input_linear= torch.nn.Linear(D_in, H)
        self.middle_linear= torch.nn.Linear(H,H)
        self.output_linear= torch.nn.Linear(H, D_out)
        
    def forward(self, x):
        """
        For the forward pass of the model, we randomly choose either 0, 1, 2, or 3
        and reuse the middle_linear Module that many times to compute hidden layer
        representations.

        Since each forward pass builds a dynamic computation graph, we can use normal
        Python control-flow operators like loops or conditional statements when
        defining the forward pass of the model.

        Here we also see that it is perfectly safe to reuse the same Module many
        times when defining a computational graph. This is a big improvement from Lua
        Torch, where each Module could be used only once.
        """
        h_relu= self.input_linear(x).clamp(min=0)
        for _ in range(random.randint(0,3)):
            h_relu= self.middle_layer=(h_relu).clamp(min=0)
        y_pred= self.output_linear(h_relu)
        return y_pred
    
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Construct our model by instantiating the class defined above
model= DynamicNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. Training this strange model with
# vanilla stochastic gradient descent is tough, so we use momentum
criterion= torch.nn.MSELoss(reduction='sum')
optimizer= torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred= model(x)
    
    # Compute and print loss
    loss= criterion(y_pred, y)
    print(t,loss)
    
    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    

0 tensor(644.6586, grad_fn=<MseLossBackward>)
1 tensor(598.7096, grad_fn=<MseLossBackward>)
2 tensor(522.6442, grad_fn=<MseLossBackward>)
3 tensor(435.7083, grad_fn=<MseLossBackward>)
4 tensor(352.8583, grad_fn=<MseLossBackward>)
5 tensor(280.4581, grad_fn=<MseLossBackward>)
6 tensor(217.4393, grad_fn=<MseLossBackward>)
7 tensor(164.2935, grad_fn=<MseLossBackward>)
8 tensor(120.5012, grad_fn=<MseLossBackward>)
9 tensor(86.3611, grad_fn=<MseLossBackward>)
10 tensor(63.2298, grad_fn=<MseLossBackward>)
11 tensor(52.1151, grad_fn=<MseLossBackward>)
12 tensor(52.1589, grad_fn=<MseLossBackward>)
13 tensor(58.4908, grad_fn=<MseLossBackward>)
14 tensor(63.4142, grad_fn=<MseLossBackward>)
15 tensor(61.1949, grad_fn=<MseLossBackward>)
16 tensor(52.1237, grad_fn=<MseLossBackward>)
17 tensor(41.0252, grad_fn=<MseLossBackward>)
18 tensor(32.0078, grad_fn=<MseLossBackward>)
19 tensor(25.8487, grad_fn=<MseLossBackward>)
20 tensor(21.4589, grad_fn=<MseLossBackward>)
21 tensor(18.4874, grad_fn=<MseLoss

248 tensor(1.1035e-09, grad_fn=<MseLossBackward>)
249 tensor(9.9230e-10, grad_fn=<MseLossBackward>)
250 tensor(8.6154e-10, grad_fn=<MseLossBackward>)
251 tensor(7.6080e-10, grad_fn=<MseLossBackward>)
252 tensor(6.7935e-10, grad_fn=<MseLossBackward>)
253 tensor(6.0220e-10, grad_fn=<MseLossBackward>)
254 tensor(5.2374e-10, grad_fn=<MseLossBackward>)
255 tensor(4.5782e-10, grad_fn=<MseLossBackward>)
256 tensor(4.1284e-10, grad_fn=<MseLossBackward>)
257 tensor(3.9449e-10, grad_fn=<MseLossBackward>)
258 tensor(3.7992e-10, grad_fn=<MseLossBackward>)
259 tensor(3.4390e-10, grad_fn=<MseLossBackward>)
260 tensor(3.0205e-10, grad_fn=<MseLossBackward>)
261 tensor(2.5835e-10, grad_fn=<MseLossBackward>)
262 tensor(2.3299e-10, grad_fn=<MseLossBackward>)
263 tensor(2.1133e-10, grad_fn=<MseLossBackward>)
264 tensor(1.8832e-10, grad_fn=<MseLossBackward>)
265 tensor(1.6838e-10, grad_fn=<MseLossBackward>)
266 tensor(1.4977e-10, grad_fn=<MseLossBackward>)
267 tensor(1.3408e-10, grad_fn=<MseLossBackward>)
