In [4]:
import numpy as np

In [5]:
# Configuration for the NumPy two-layer network defined in the cells below.
BATCH_SIZE = 64          # rows of the input and target arrays
INPUT_DIMENSION = 1000   # columns of the input array / rows of w1
HIDDEN_DIMENSION = 100   # columns of w1 / rows of w2
OUTPUT_DIMENSION = 10    # columns of w2 and of the target array
LEARNING_RATE = 1e-6     # gradient-descent step size read by backward_pass
# Backward-compatible alias: the constant was originally misspelled, which left
# the LEARNING_RATE name used by backward_pass undefined.
LEANING_RATE = LEARNING_RATE

In [6]:
# Random inputs and targets drawn from a standard normal distribution:
# x is BATCH_SIZE x INPUT_DIMENSION, y is BATCH_SIZE x OUTPUT_DIMENSION.
x = np.random.standard_normal(size=(BATCH_SIZE, INPUT_DIMENSION))
y = np.random.standard_normal(size=(BATCH_SIZE, OUTPUT_DIMENSION))

In [7]:
# Randomly initialised weight matrices (standard normal):
# w1 maps input -> hidden, w2 maps hidden -> output.
w1 = np.random.standard_normal(size=(INPUT_DIMENSION, HIDDEN_DIMENSION))
w2 = np.random.standard_normal(size=(HIDDEN_DIMENSION, OUTPUT_DIMENSION))

In [11]:
def forward_pass(inputs=None, targets=None, weights_hidden=None, weights_output=None):
    """Run the two-layer ReLU network forward and print the squared-error loss.

    Every argument is optional; an omitted argument falls back to the
    notebook-level array of the same role (``x``, ``y``, ``w1``, ``w2``), so a
    bare ``forward_pass()`` call keeps working.

    Args:
        inputs: Input array; defaults to the global ``x``.
        targets: Target array; defaults to the global ``y``.
        weights_hidden: First-layer weights; defaults to the global ``w1``.
        weights_output: Second-layer weights; defaults to the global ``w2``.

    Returns:
        Tuple ``(h, h_relu, y_pred, loss)``: the hidden pre-activation, the
        ReLU activation, the prediction and the summed squared error.
    """
    inputs = x if inputs is None else inputs
    targets = y if targets is None else targets
    weights_hidden = w1 if weights_hidden is None else weights_hidden
    weights_output = w2 if weights_output is None else weights_output

    h = inputs.dot(weights_hidden)
    # ReLU: element-wise max with zero (the original called a misspelled,
    # non-existent NumPy function here and raised AttributeError).
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(weights_output)
    loss = np.square(y_pred - targets).sum()
    print(loss)
    # Return the intermediates so callers can reuse them instead of losing
    # them when the function's locals go out of scope.
    return h, h_relu, y_pred, loss

In [12]:
def backward_pass(inputs=None, targets=None, weights_hidden=None,
                  weights_output=None, learning_rate=None):
    """Apply one gradient-descent step to the two-layer ReLU network.

    The forward activations are recomputed here, so the function does not
    depend on variables left behind by another cell. Both weight arrays are
    updated in place.

    Every argument is optional; an omitted argument falls back to the
    notebook-level value of the same role, so a bare ``backward_pass()`` call
    keeps working.

    Args:
        inputs: Input array; defaults to the global ``x``.
        targets: Target array; defaults to the global ``y``.
        weights_hidden: First-layer weights, modified in place; defaults to
            the global ``w1``.
        weights_output: Second-layer weights, modified in place; defaults to
            the global ``w2``.
        learning_rate: Step size; defaults to the notebook's learning-rate
            constant.

    Returns:
        None.
    """
    inputs = x if inputs is None else inputs
    targets = y if targets is None else targets
    weights_hidden = w1 if weights_hidden is None else weights_hidden
    weights_output = w2 if weights_output is None else weights_output
    if learning_rate is None:
        # The config cell has spelled this constant LEANING_RATE; accept
        # either spelling so the function runs regardless.
        scope = globals()
        if "LEARNING_RATE" in scope:
            learning_rate = scope["LEARNING_RATE"]
        else:
            learning_rate = scope["LEANING_RATE"]

    # Forward activations needed by the gradient formulas below.
    h = inputs.dot(weights_hidden)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(weights_output)

    # Gradients of the summed squared error, propagated layer by layer.
    grad_y_pred = 2.0 * (y_pred - targets)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(weights_output.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0  # ReLU passes no gradient where its input was negative
    grad_w1 = inputs.T.dot(grad_h)

    # In-place updates on the array objects: the caller's (or the global)
    # arrays change without rebinding any module-level name, which avoids the
    # UnboundLocalError the original augmented assignments to w1/w2 caused.
    weights_hidden -= learning_rate * grad_w1
    weights_output -= learning_rate * grad_w2

In [13]:
import torch

In [14]:
# Shared tensor options for the PyTorch version: CPU placement, 32-bit floats.
device = torch.device("cpu")
dtype = torch.float32

In [15]:
# Sizes used to build the PyTorch tensors below.
N = 64        # rows of x and y
D_in = 1000   # columns of x / rows of w1
H = 100       # columns of w1 / rows of w2
D_out = 10    # columns of w2 and of y

In [16]:
# Random inputs (N x D_in) and targets (N x D_out), standard normal.
x = torch.randn((N, D_in), dtype=dtype, device=device)
y = torch.randn((N, D_out), dtype=dtype, device=device)

In [17]:
# Randomly initialised weights: w1 is D_in x H, w2 is H x D_out.
w1 = torch.randn((D_in, H), dtype=dtype, device=device)
w2 = torch.randn((H, D_out), dtype=dtype, device=device)

In [18]:
# Step size applied to both weight updates in the training loop.
learning_rate = 0.000001

In [19]:
for t in range(500):
    # Forward pass: linear layer -> ReLU (clamp at zero) -> linear layer.
    h = torch.mm(x, w1)
    h_relu = h.clamp(min=0)
    y_pred = torch.mm(h_relu, w2)

    # Summed squared error, pulled out as a plain Python float.
    loss = ((y_pred - y) ** 2).sum().item()
    # Report on the last iteration of every block of 100 (t = 99, 199, ...).
    if (t + 1) % 100 == 0:
        print(t, loss)

    # Backward pass: hand-written gradients of the loss w.r.t. both weights.
    grad_y_pred = (y_pred - y) * 2.0
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0  # no gradient flows where the ReLU input was negative
    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient-descent step, applied in place to the weight tensors.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
    

99 679.5360107421875
199 3.765549898147583
299 0.03461253643035889
399 0.0006114020943641663
499 7.162222027545795e-05


In [22]:
# Inner product of two 1-D arrays: 1*4 + 2*5 + 3*6.
a1 = np.array((1, 2, 3))
a2 = np.array((4, 5, 6))
a3 = a1 @ a2
print(a3)

32
