In [1]:
import numpy as np

In [2]:
#https://pytorch.org/tutorials/beginner/pytorch_with_examples.html

In [3]:
N, D_in, H, D_out = 64, 1000, 100, 1

# Create random input and output data
x = np.random.randn(N, D_in)
y = np.array([np.sum(x, axis=1)]).T

# Randomly initialize weights
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

In [4]:
y[:,0][:5]

array([-16.81214661,  63.43331031, -30.11298366, -15.99777578,
       -37.89345025])

In [5]:

def backprop_numpy(x, y, w1, w2, learning_rate = 1e-6, iterations = 500):
    w1 = w1.copy()
    w2 = w2.copy()
    for t in range(iterations):
        # Forward pass: compute predicted y
        h = x.dot(w1)
        h_relu = np.maximum(h, 0)
        y_pred = h_relu.dot(w2)

        # Compute and print loss
        loss = np.square(y_pred - y).sum()
        if t%25 == 0: 
            print t, loss

        # Backprop to compute gradients of w1 and w2 with respect to loss
        grad_y_pred = 2.0 * (y_pred - y)
        grad_w2 = h_relu.T.dot(grad_y_pred)
        grad_h_relu = grad_y_pred.dot(w2.T)
        grad_h = grad_h_relu
        grad_h[h < 0] = 0
        grad_w1 = x.T.dot(grad_h)
        # Update weights
        w1 -= learning_rate * grad_w1
        w2 -= learning_rate * grad_w2
        
        

backprop_numpy(x,y,w1,w2)

0 2931687.0931147994
25 11025.313264349254
50 704.8506790624699
75 54.58659054862747
100 4.584337325208587
125 0.40398596588295244
150 0.03691188800463184
175 0.0034834734360802335
200 0.00034079533700742605
225 3.4592504676815905e-05
250 3.642108389390279e-06
275 3.976237743989936e-07
300 4.491794063816992e-08
325 5.233035473884851e-09
350 6.258896035202688e-10
375 7.650478491147152e-11
400 9.51595995633827e-12
425 1.1999778265914711e-12
450 1.5296750978969234e-13
475 1.9665125447343667e-14


In [6]:
#https://ml-cheatsheet.readthedocs.io/en/latest/forwardpropagation.html

In [7]:
def relu(z):
    return np.maximum(0,z)

def feed_forward(x, w1, w2):
    # Hidden layer
    Zh = np.dot(x, w1)
    H = relu(Zh)

    # Output layer
    prediction = np.dot(H, w2)
    return prediction

In [8]:
feed_forward(x, w1, w2)[:,0][:5]

array([376.00512671, 345.30534599, -55.24607577, 145.61110316,
       482.88286753])

In [9]:
def relu_prime(z):
    z[z>0] = 1
    z[z<=0] = 0
    return z


def cost(yHat, y):
    return  ((yHat - y)**2).sum()

def cost_prime(yHat, y):
    return 2*(yHat - y)

def backprop(x, y, w_hidden, w_output, lr = 1e-6, iterations = 500):
    w_hidden = w_hidden.copy()
    w_output = w_output.copy()
    for i in range(iterations):
        Zh = np.dot(x, w_hidden)
        H = relu(Zh)
        # Output layer
        yHat = np.dot(H, w_output)

        # Layer Error
        if i%25 == 0: 
            print i, cost(yHat,y)
        Eo = cost_prime(yHat,y)

        Eh = np.dot(Eo, w_output.T)*relu_prime(Zh)

        # Cost derivative for weights
        dWo = np.dot(Eo.T, H).T
        dWh = np.dot(Eh.T, x).T

        # Update weights
        w_hidden -= lr * dWh
        w_output -= lr * dWo

backprop(x,y,w1,w2)
print
backprop_numpy(x,y,w1,w2)

0 2931687.0931147994
25 11025.313264349239
50 704.850679062471
75 54.5865905486278
100 4.584337325208547
125 0.4039859658828533
150 0.03691188800458872
175 0.0034834734360745297
200 0.00034079533700559993
225 3.459250467675314e-05
250 3.642108389412839e-06
275 3.9762377439754667e-07
300 4.491794061724478e-08
325 5.2330354686945186e-09
350 6.25889605349028e-10
375 7.650478422533618e-11
400 9.515959874516949e-12
425 1.1999777900439918e-12
450 1.5296756132026948e-13
475 1.9665134845742937e-14

0 2931687.0931147994
25 11025.313264349254
50 704.8506790624699
75 54.58659054862747
100 4.584337325208587
125 0.40398596588295244
150 0.03691188800463184
175 0.0034834734360802335
200 0.00034079533700742605
225 3.4592504676815905e-05
250 3.642108389390279e-06
275 3.976237743989936e-07
300 4.491794063816992e-08
325 5.233035473884851e-09
350 6.258896035202688e-10
375 7.650478491147152e-11
400 9.51595995633827e-12
425 1.1999778265914711e-12
450 1.5296750978969234e-13
475 1.9665125447343667e-14


In [10]:
import torch


dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

In [11]:
x_t = torch.tensor(x, device=device, dtype = dtype)
y_t = torch.tensor(y, device=device, dtype = dtype)
# Randomly initialize weights

w1_t = torch.tensor(w1, device=device, dtype = dtype)
w2_t = torch.tensor(w2, device=device, dtype = dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = x_t.mm(w1_t)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2_t)

    # Compute and print loss
    loss = (y_pred - y_t).pow(2).sum().item()
    if t%25 == 0: 
        print t, loss

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y_t)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2_t.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x_t.t().mm(grad_h)

    # Update weights using gradient descent
    w1_t -= learning_rate * grad_w1
    w2_t -= learning_rate * grad_w2

0 2931686.75
25 11025.3076172
50 704.838317871
75 54.585193634
100 4.58413743973
125 0.403986006975
150 0.0369503945112
175 0.00357747706585
200 0.000459823611891
225 0.000104585691588
250 3.68825276382e-05
275 1.66584795807e-05
300 9.07837511477e-06
325 5.60935586691e-06
350 3.82332063964e-06
375 2.8110389394e-06
400 2.22174026021e-06
425 1.7269418322e-06
450 1.48425442603e-06
475 1.31212118504e-06


In [12]:
x_t = torch.tensor(x, device=device, dtype = dtype)
y_t = torch.tensor(y, device=device, dtype = dtype)
# Randomly initialize weights

w1_t = torch.tensor(w1, device=device, dtype = dtype, requires_grad=True)
w2_t = torch.tensor(w2, device=device, dtype = dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Tensors; these
    # are exactly the same operations we used to compute the forward pass using
    # Tensors, but we do not need to keep references to intermediate values since
    # we are not implementing the backward pass by hand.
    y_pred = x_t.mm(w1_t).clamp(min=0).mm(w2_t)

    # Compute and print loss using operations on Tensors.
    # Now loss is a Tensor of shape (1,)
    # loss.item() gets the a scalar value held in the loss.
    loss = (y_pred - y_t).pow(2).sum()
    if t%25 == 0: 
        print t, loss.item()

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call w1.grad and w2.grad will be Tensors holding the gradient
    # of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but we don't need to track this
    # in autograd.
    # An alternative way is to operate on weight.data and weight.grad.data.
    # Recall that tensor.data gives a tensor that shares the storage with
    # tensor, but doesn't track history.
    # You can also use torch.optim.SGD to achieve this.
    with torch.no_grad():
        w1_t -= learning_rate * w1_t.grad
        w2_t -= learning_rate * w2_t.grad

        # Manually zero the gradients after updating weights
        w1_t.grad.zero_()
        w2_t.grad.zero_()

0 2931686.75
25 11025.3076172
50 704.838317871
75 54.585193634
100 4.58413743973
125 0.403986006975
150 0.0369503945112
175 0.00357747706585
200 0.000459823611891
225 0.000104585691588
250 3.68825276382e-05
275 1.66584795807e-05
300 9.07837511477e-06
325 5.60935586691e-06
350 3.82332063964e-06
375 2.8110389394e-06
400 2.22174026021e-06
425 1.7269418322e-06
450 1.48425442603e-06
475 1.31212118504e-06


In [13]:
class MyReLU(torch.autograd.Function):
    """
    We can implement our own custom autograd Functions by subclassing
    torch.autograd.Function and implementing the forward and backward passes
    which operate on Tensors.
    """

    @staticmethod
    def forward(ctx, input):
        """
        In the forward pass we receive a Tensor containing the input and return
        a Tensor containing the output. ctx is a context object that can be used
        to stash information for backward computation. You can cache arbitrary
        objects for use in the backward pass using the ctx.save_for_backward method.
        """
        ctx.save_for_backward(input)
        return input.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        In the backward pass we receive a Tensor containing the gradient of the loss
        with respect to the output, and we need to compute the gradient of the loss
        with respect to the input.
        """
        input, = ctx.saved_tensors
        grad_input = grad_output.clone()
        grad_input[input < 0] = 0
        return grad_input

In [14]:
w1_t = torch.tensor(w1, device=device, dtype = dtype, requires_grad=True)
w2_t = torch.tensor(w2, device=device, dtype = dtype, requires_grad=True)
loss_fn = torch.nn.MSELoss(reduction='sum')
learning_rate = 1e-6
for t in range(500):
    # To apply our Function, we use Function.apply method. We alias this as 'relu'.
    relu = MyReLU.apply

    # Forward pass: compute predicted y using operations; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x_t.mm(w1_t)).mm(w2_t)

    # Compute and print loss
    loss = (y_pred - y_t).pow(2).sum()
    if t%25 == 0: 
        loss_other = loss_fn(y_pred, y_t)
        print t, loss.item(), loss_other.item()

    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent
    with torch.no_grad():
        w1_t -= learning_rate * w1_t.grad
        w2_t -= learning_rate * w2_t.grad

        # Manually zero the gradients after updating weights
        w1_t.grad.zero_()
        w2_t.grad.zero_()

0 2931686.75 2931687.0
25 11025.3076172 11025.3076172
50 704.838317871 704.838439941
75 54.585193634 54.5851860046
100 4.58413743973 4.58413743973
125 0.403986006975 0.403986006975
150 0.0369503945112 0.0369503907859
175 0.00357747706585 0.00357747729868
200 0.000459823611891 0.000459823582787
225 0.000104585691588 0.000104585698864
250 3.68825276382e-05 3.68825240002e-05
275 1.66584795807e-05 1.66584795807e-05
300 9.07837511477e-06 9.07837602426e-06
325 5.60935586691e-06 5.60935586691e-06
350 3.82332063964e-06 3.82332018489e-06
375 2.8110389394e-06 2.81103916677e-06
400 2.22174026021e-06 2.22174026021e-06
425 1.7269418322e-06 1.7269418322e-06
450 1.48425442603e-06 1.48425453972e-06
475 1.31212118504e-06 1.31212118504e-06


In [15]:
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# The nn package also contains definitions of popular loss functions; in this
# case we will use Mean Squared Error (MSE) as our loss function.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model. Module objects
    # override the __call__ operator so you can call them like functions. When
    # doing so you pass a Tensor of input data to the Module and it produces
    # a Tensor of output data.
    y_pred = model(x_t)

    # Compute and print loss. We pass Tensors containing the predicted and true
    # values of y, and the loss function returns a Tensor containing the
    # loss.
    loss = loss_fn(y_pred, y_t)
    if t%25 == 0: 
        print t, loss.item()

    # Zero the gradients before running the backward pass.
    model.zero_grad()

    # Backward pass: compute gradient of the loss with respect to all the learnable
    # parameters of the model. Internally, the parameters of each Module are stored
    # in Tensors with requires_grad=True, so this call will compute gradients for
    # all learnable parameters in the model.
    loss.backward()

    # Update the weights using gradient descent. Each parameter is a Tensor, so
    # we can access its gradients like we did before.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

0 59030.7890625
25 331.591674805
50 347.495849609
75 131.317733765
100 64.0915679932
125 27.1185035706
150 13.235871315
175 6.00596237183
200 2.89607429504
225 1.34787130356
250 0.645911574364
275 0.304044306278
300 0.144949376583
325 0.0685587823391
350 0.0326169840991
375 0.0154535165057
400 0.00733995297924
425 0.00347711169161
450 0.00165378209203
475 0.000785832933616


In [16]:
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(reduction='sum')

# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use Adam; the optim package contains many other
# optimization algoriths. The first argument to the Adam constructor tells the
# optimizer which Tensors it should update.
learning_rate = 1e-4
# optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) #converges very slowly
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(x_t)

    # Compute and print loss.
    loss = loss_fn(y_pred, y_t)
    if t%25 == 0: 
        print t, loss.item()

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update (which are the learnable
    # weights of the model). This is because by default, gradients are
    # accumulated in buffers( i.e, not overwritten) whenever .backward()
    # is called. Checkout docs of torch.autograd.backward for more details.
    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to model
    # parameters
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its
    # parameters
    optimizer.step()

0 59058.3867188
25 234.900405884
50 42.7491378784
75 5.22825193405
100 0.643084526062
125 0.0781154930592
150 0.00952059030533
175 0.00115811789874
200 0.000140730466228
225 1.70790754055e-05
250 2.00380077331e-06
275 1.99203086027e-07
300 1.42790979041e-08
325 6.08520567269e-10
350 3.99294819431e-10
375 2.88254087266e-10
400 8.48112691187e-11
425 2.24336105248e-11
450 4.74269512551e-11
475 5.06803488065e-11
