In [2]:
import torch

device = torch.device("cuda:0")

## pytorch手动前向传播/反向传播

In [3]:
n, dim_in, dim_hidden, dim_out = 64, 1000, 100, 10

x = torch.randn(n, dim_in, device=device)
y = torch.randn(n, dim_out, device=device)

w1 = torch.randn(dim_in, dim_hidden, device=device)
w2 = torch.rand(dim_hidden, dim_out, device=device)

$$
\frac{\partial y}{\partial y_{pred}} = 2(y_{pred} - y) \\
\frac{\partial y}{\partial w_2} = \frac{\partial y_{pred}}{\partial w_2} \cdot \frac{\partial y}{\partial y_{pred}}
$$

In [20]:
learning_rate = 1e-6
for t in range(500):
    
    # 前向传播
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Loss
    loss = (y_pred - y).pow(2).mean().item()
    if t % 100 == 99:
            print(t, loss)

    # 反向传播
    grad_y_pred = 2 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)
    
    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

99 16.625436782836914
199 3.523359775543213
299 1.0563571453094482
399 0.3721386194229126
499 0.1454431563615799


## pytorch自动前向传播/反向传播

In [22]:
x = torch.randn(n, dim_in, device=device)
y = torch.randn(n, dim_out, device=device)
w1 = torch.randn(dim_in, dim_hidden, device=device, requires_grad=True)
w2 = torch.randn(dim_hidden, dim_out, device=device, requires_grad=True)

In [26]:
for t in range(500):
    # 前向传播
    y_pred = x.mm(w1).clamp(min=0).mm(w2)
    
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())
    
    loss.backward()
    
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()

99 459.09027099609375
199 3.5914642810821533
299 0.045708272606134415
399 0.0009173029102385044
499 8.977029210655019e-05
