[Reference](https://medium.com/data-science-collective/pytorch-tensors-explained-433ea1a91c0f)

In [1]:
import torch
import torch.nn as nn

# Set seed for reproducibility. This must run before every random draw below:
# the two randn() calls and the random weight initialisation inside nn.Linear.
torch.manual_seed(0)

# Dummy data
x = torch.randn(5, 2)     # 5 samples, 2 features
y = torch.randn(5, 1)     # Target values

# Simple MLP: 2 -> 3 -> 1
model = nn.Sequential(
    nn.Linear(2, 3),
    nn.ReLU(),
    nn.Linear(3, 1)
)

# Loss and optimizer
loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# One training step
# backward() *adds* into each parameter's .grad, so clear any previous
# gradients first; otherwise repeating this step would apply summed gradients.
optimizer.zero_grad()
y_pred = model(x)         # forward pass, shape: (5, 1)
loss = loss_fn(y_pred, y) # scalar MSE
loss.backward()           # autograd does all the work here!
optimizer.step()          # plain SGD update: p <- p - lr * p.grad

In [2]:
import numpy as np

# Seed the legacy global RNG so the draws below (x, y, W1, W2 — in that order)
# are reproducible.
np.random.seed(0)

# Dummy input and target
x = np.random.randn(5, 2)  # 5 samples, 2 features
y = np.random.randn(5, 1)  # 5 samples, 1 output

# Initialize weights (this hand-written network has no bias terms)
W1 = np.random.randn(2, 3)  # (input_dim, hidden_dim)
W2 = np.random.randn(3, 1)  # (hidden_dim, output_dim)


def forward_pass(x, W1, W2):
    """Run the two-layer ReLU network.

    Returns the tuple (z1, a1, y_pred): hidden pre-activation, hidden
    activation and network output. The intermediates are returned because
    the manual backward pass needs them.
    """
    z1 = x @ W1
    a1 = np.maximum(0, z1)     # ReLU activation
    y_pred = a1 @ W2
    return z1, a1, y_pred


def mse_loss(y_pred, y):
    """Mean squared error, averaged over every element of the target."""
    return np.mean((y_pred - y)**2)


# Forward pass
z1, a1, y_pred = forward_pass(x, W1, W2)   # shapes: (5, 3), (5, 3), (5, 1)

# Compute loss (MSE)
loss = mse_loss(y_pred, y)
print(f"Loss before: {loss:.4f}")

# Backward pass (manual gradients)

# dL/dy_pred
# np.mean divides by the total element count, so the gradient must too.
# (y.size equals y.shape[0] here because there is a single output column.)
grad_y_pred = 2 * (y_pred - y) / y.size      # shape: (5, 1)

# dL/dW2 = a1^T @ grad_y_pred
grad_W2 = a1.T @ grad_y_pred                 # shape: (3, 1)

# dL/da1 = grad_y_pred @ W2^T
grad_a1 = grad_y_pred @ W2.T                 # shape: (5, 3)

# dL/dz1 = grad_a1 * ReLU'(z1)   (ReLU' is 1 where z1 > 0, else 0)
grad_z1 = grad_a1 * (z1 > 0).astype(float)   # shape: (5, 3)

# dL/dW1 = x^T @ grad_z1
grad_W1 = x.T @ grad_z1                      # shape: (2, 3)

# Gradient descent step (both gradients were computed from the pre-update
# weights above, so the update order does not matter)
lr = 0.01
W1 -= lr * grad_W1
W2 -= lr * grad_W2

# Forward again after update
z1, a1, y_pred = forward_pass(x, W1, W2)
loss = mse_loss(y_pred, y)
print(f"Loss after: {loss:.4f}")

Loss before: 1.7328
Loss after: 1.4521
