In [None]:
import math
import torch

# Report the installed PyTorch version so printed results can be tied to it.
print("torch", torch.__version__)

# Device string passed to every tensor and model created in the cells below.
device = "cpu"

# Seed the global RNG so the random draws below repeat on every run.
# `manual_seed` returns a Generator object; the trailing semicolon keeps its
# repr (which embeds a memory address) out of the cell output.
torch.manual_seed(0);

## 1) Tensors vs NumPy arrays (conceptual)

Important differences:

- tensors can live on GPU (`cuda`) and track gradients (`requires_grad=True`)
- ops build a computation graph (for differentiable tensors)
- many ops are similar to NumPy but not identical


In [None]:
# A fixed input vector and a randomly initialised weight vector tracked by autograd.
x = torch.tensor([1.0, 2.0, 3.0], device=device)
w = torch.randn(3, device=device).requires_grad_(True)

# Scalar output: elementwise product of w and x, summed over all entries.
y = torch.sum(w * x)

# Backpropagate from the scalar; this fills in w.grad with dy/dw.
y.backward()

print("y", y.item())
print("grad w", w.grad)

### Common gotcha: gradients accumulate

Each call to `.backward()` adds to the existing `.grad` instead of overwriting it, so you typically zero the gradients before every optimization step.


In [None]:
# Clear the gradient stored by the earlier backward pass (in place), so the
# next backward pass starts from zero instead of adding to the old values.
w.grad.zero_()

# Recompute the same scalar and backpropagate again from a clean slate.
y2 = torch.sum(w * x)
y2.backward()

print("grad after fresh backward", w.grad)

## 2) From math to code: linear regression

We’ll fit a model $\hat{y} = Xw + b$ by minimizing mean squared error (MSE).

Key idea: autograd gives gradients, and gradient descent updates parameters.


In [None]:
# Synthetic data: n samples with d features, generated from a known linear
# model plus scaled random noise, so the fit can be compared to w_true/b_true.
n = 200
d = 3
X = torch.randn((n, d), device=device)
w_true = torch.tensor([2.0, -1.0, 0.5], device=device)
b_true = torch.tensor(0.7, device=device)
noise = torch.randn(n, device=device) * 0.1
y = torch.matmul(X, w_true) + b_true + noise

# Parameters to learn: both start at zero and are tracked by autograd.
w = torch.zeros(d, device=device).requires_grad_(True)
b = torch.zeros((), device=device).requires_grad_(True)


def mse(yhat, y):
    """Return the mean squared error between predictions and targets.

    Args:
        yhat: Predicted values.
        y: Target values; subtracted elementwise from ``yhat``.

    Returns:
        The mean of the squared elementwise differences ``(yhat - y) ** 2``.
    """
    residual = yhat - y
    squared_error = residual ** 2
    return squared_error.mean()


# Plain gradient descent: forward pass, backward pass, manual parameter update.
lr = 0.1
for step in range(200):
    yhat = X @ w + b
    loss = mse(yhat, y)
    loss.backward()  # populates w.grad and b.grad

    # The in-place updates must not be recorded by autograd, hence no_grad().
    with torch.no_grad():
        w -= lr * w.grad
        b -= lr * b.grad
        # backward() accumulates into .grad, so reset before the next step.
        w.grad.zero_()
        b.grad.zero_()

    if step % 50 == 0:
        # .item() extracts the Python number from the one-element loss tensor,
        # matching how the autograd demo above reads `y`; it avoids calling
        # float() on a tensor that is still attached to the autograd graph.
        print(step, loss.item())

print("w learned", w.detach().cpu().numpy())
print("b learned", b.item())

## 3) `nn.Module` version (still minimal)

This is the same model, but packaged the PyTorch way.


In [None]:
import torch.nn as nn

# nn.Linear owns the weight and bias that the manual version kept as `w` and `b`;
# the optimizer performs the update step that was previously written by hand.
model = nn.Linear(d, 1, bias=True).to(device)
opt = torch.optim.SGD(model.parameters(), lr=0.1)

for step in range(200):
    # The layer has one output feature, so drop that trailing size-1 dimension
    # before subtracting `y`.
    yhat = model(X).squeeze(-1)
    loss = ((yhat - y) ** 2).mean()
    opt.zero_grad()  # clear gradients accumulated by the previous step
    loss.backward()
    opt.step()

    if step % 50 == 0:
        # .item() extracts the Python number from the one-element loss tensor
        # without calling float() on a tensor attached to the autograd graph.
        print(step, loss.item())

# Detach the learned parameters from the graph; squeeze(0) removes the leading
# size-1 dimension that comes from the layer having a single output feature.
w_learned = model.weight.detach().squeeze(0)
b_learned = model.bias.detach().squeeze(0)
print("w learned", w_learned.cpu().numpy())
print("b learned", b_learned.item())

# Exercises

## Exercise A — Manual gradient check

For linear regression with MSE, derive gradients w.r.t. `w` and `b` and compare to autograd on a single batch.

## Exercise B — Logistic regression

Generate a 2D synthetic classification dataset and train logistic regression with `binary_cross_entropy_with_logits`.

## Exercise C — L2 regularization

Add $\lambda \|w\|^2$ to the loss and observe how weights shrink as $\lambda$ increases.


In [None]:
# Starter for Exercise B (logistic regression)
# NOTE: this cell rebinds n, X, y, model and opt from the regression sections;
# re-run those cells before going back to them.
n = 400
X = torch.randn(n, 2, device=device)
true_w = torch.tensor([1.5, -2.0], device=device)
true_b = torch.tensor(-0.25, device=device)

# Labels come from thresholding a known linear score at 0. The score gets its
# own name so it is not confused with the model's logits computed in the loop.
true_logits = X @ true_w + true_b
y = (true_logits > 0).float()

model = nn.Linear(2, 1).to(device)
opt = torch.optim.SGD(model.parameters(), lr=0.2)

for step in range(200):
    # Drop the trailing size-1 output dimension so logits line up with y.
    logits = model(X).squeeze(-1)
    loss = torch.nn.functional.binary_cross_entropy_with_logits(logits, y)
    opt.zero_grad()
    loss.backward()
    opt.step()

    if step % 50 == 0:
        # Accuracy uses the logits from before this step's update, i.e. the
        # same forward pass that produced `loss`.
        with torch.no_grad():
            pred = (logits > 0).float()
            acc = (pred == y).float().mean()
        # .item() extracts Python numbers from the one-element tensors without
        # calling float() on a loss still attached to the autograd graph.
        print(step, loss.item(), acc.item())