A fully-connected ReLU network with one hidden layer and no biases, trained to predict y from x by minimizing squared Euclidean distance.

This implementation computes the forward pass using operations on PyTorch Tensors, and uses PyTorch autograd to compute gradients.

In this implementation we define our own custom autograd Function to compute the ReLU nonlinearity.

In [1]:
import torch

In [2]:
class MyReLU(torch.autograd.Function):
    """
    Custom autograd Function computing ReLU, i.e. elementwise max(input, 0).

    It subclasses torch.autograd.Function and supplies static forward and
    backward passes that operate directly on Tensors.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Return a Tensor equal to input with every negative entry replaced by 0.

        ctx is the context object shared with backward; the input Tensor is
        stashed on it with ctx.save_for_backward so that backward can work out
        which entries were clamped.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Return the gradient of the loss with respect to the input.

        grad_output is the gradient of the loss with respect to the output of
        forward. It is passed through unchanged wherever the saved input was
        >= 0 and replaced by 0 wherever the saved input was negative.
        """
        (saved_input,) = ctx.saved_tensors
        # masked_fill builds a new Tensor, so grad_output itself is not modified.
        return grad_output.masked_fill(saved_input < 0, 0)

In [3]:
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights. requires_grad=True tells autograd to
# accumulate gradients for them into w1.grad / w2.grad during backward().
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# To apply our Function, we use the Function.apply method. We alias this as
# 'relu' once, outside the loop, since the alias never changes.
relu = MyReLU.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss (sum of squared errors over the whole batch).
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent. The update is wrapped in
    # torch.no_grad() so the in-place weight changes are not tracked by autograd.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights; backward()
        # accumulates into .grad rather than overwriting it.
        w1.grad.zero_()
        w2.grad.zero_()


0 33358036.0
1 33026852.0
2 33183516.0
3 29210112.0
4 20982800.0
5 12396972.0
6 6579062.0
7 3557205.25
8 2147192.5
9 1477105.0
10 1122680.875
11 906445.0625
12 756681.75
13 643587.375
14 553462.125
15 479583.875
16 418045.15625
17 366227.625
18 322223.40625
19 284596.0625
20 252292.375
21 224410.640625
22 200204.265625
23 179123.90625
24 160696.0
25 144520.953125
26 130277.8671875
27 117706.375
28 106560.0703125
29 96660.859375
30 87856.0625
31 79997.53125
32 72964.46875
33 66654.5703125
34 60985.2890625
35 55879.6328125
36 51269.0078125
37 47103.09375
38 43332.890625
39 39912.078125
40 36802.8046875
41 33973.671875
42 31393.533203125
43 29038.42578125
44 26884.390625
45 24912.2578125
46 23106.615234375
47 21453.5546875
48 19937.904296875
49 18543.408203125
50 17259.013671875
51 16074.9169921875
52 14982.052734375
53 13972.712890625
54 13039.275390625
55 12175.7890625
56 11376.1982421875
57 10635.607421875
58 9948.8505859375
59 9311.5263671875
60 8719.79296875
61 8169.603515625
62 7657