---
## Chapter 5 – First Steps with PyTorch
---

## Mini-Lab* 

In [1]:
import numpy as np
import torch

np_rng = np.random.default_rng(42)
torch.manual_seed(42)

# NumPy baseline
arr = np_rng.normal(size=(3, 2))
norm_numpy = np.linalg.norm(arr, axis=1)

# PyTorch equivalent
ten = torch.tensor(arr, dtype=torch.float32)
norm_torch = ten.norm(dim=1)

print(norm_numpy)
print(norm_torch)

[1.08370634 1.20326181 2.34567896]
tensor([1.0837, 1.2033, 2.3457])


In [2]:
x = torch.tensor([3.5], requires_grad=True)
lr = 0.1

for step in range(28):
    loss = (x - 2.0) ** 2 # simple quadratic
    loss.backward()
    with torch.no_grad():
        x -= lr * x.grad
    x.grad.zero_()
    print(f"step {step:02d} | x={x.item():.4f} | loss={loss.item():.6f}")

step 00 | x=3.2000 | loss=2.250000
step 01 | x=2.9600 | loss=1.440000
step 02 | x=2.7680 | loss=0.921600
step 03 | x=2.6144 | loss=0.589824
step 04 | x=2.4915 | loss=0.377488
step 05 | x=2.3932 | loss=0.241592
step 06 | x=2.3146 | loss=0.154619
step 07 | x=2.2517 | loss=0.098956
step 08 | x=2.2013 | loss=0.063332
step 09 | x=2.1611 | loss=0.040532
step 10 | x=2.1288 | loss=0.025941
step 11 | x=2.1031 | loss=0.016602
step 12 | x=2.0825 | loss=0.010625
step 13 | x=2.0660 | loss=0.006800
step 14 | x=2.0528 | loss=0.004352
step 15 | x=2.0422 | loss=0.002785
step 16 | x=2.0338 | loss=0.001783
step 17 | x=2.0270 | loss=0.001141
step 18 | x=2.0216 | loss=0.000730
step 19 | x=2.0173 | loss=0.000467
step 20 | x=2.0138 | loss=0.000299
step 21 | x=2.0111 | loss=0.000191
step 22 | x=2.0089 | loss=0.000123
step 23 | x=2.0071 | loss=0.000078
step 24 | x=2.0057 | loss=0.000050
step 25 | x=2.0045 | loss=0.000032
step 26 | x=2.0036 | loss=0.000021
step 27 | x=2.0029 | loss=0.000013


## Exercises 
1. Write a small function using broadcasting only; verify shape math explicitly.
2. Build a mini autograd example (scalar or vector) and sanity-check gradients.

## Exercise 1
### Step 1 – Use Broadcasting

In [3]:
import torch

def pairwise_distances(A, B):
    """
    Compute pairwise Euclidean distances between A and B using broadcasting only.
    A: tensor of shape [m, d]
    B: tensor of shape [n, d]
    Returns: tensor of shape [m, n]
    """
    # Reshape for broadcasting
    A_exp = A[:, None, :]    # [m, 1, d]
    B_exp = B[None, :, :]    # [1, n, d]
    
    # Subtract with broadcasting → [m, n, d]
    diff = A_exp - B_exp
    
    # Square, sum along the last dimension (d)
    dist_sq = (diff ** 2).sum(dim=-1)
    
    # Take sqrt → [m, n]
    return torch.sqrt(dist_sq)


## Step 2 – Verify Shape Math

In [4]:
A = torch.tensor([[1.0, 2.0], [3.0, 4.0]])   # shape [2, 2]
B = torch.tensor([[5.0, 6.0], [7.0, 8.0], [1.0, 0.0]])  # shape [3, 2]

print("A shape:", A.shape)
print("B shape:", B.shape)

D = pairwise_distances(A, B)
print("D shape:", D.shape)
print(D)

A shape: torch.Size([2, 2])
B shape: torch.Size([3, 2])
D shape: torch.Size([2, 3])
tensor([[5.6569, 8.4853, 2.0000],
        [2.8284, 5.6569, 4.4721]])


## Exercise 2
### Mini Autograd Example (Scalar + Vector Cases)
### Scalar Case

$$
y = w^2 + 3w + 1
$$

$$
\frac{dy}{dw} = 2w + 3
$$

$$
\frac{dy}{dw}\Big|_{w=2} = 2(2) + 3 = 7
$$


In [5]:
# Define a scalar tensor with gradient tracking
w = torch.tensor(2.0, requires_grad=True)

# Define a simple differentiable function
y = w**2 + 3*w + 1   # y = w^2 + 3w + 1

# Compute gradient dy/dw
y.backward()

print("w:", w.item())
print("y:", y.item())
print("w.grad:", w.grad.item())

w: 2.0
y: 11.0
w.grad: 7.0


### Vector Case

$$
y = \sum_i \big( w_i^2 + 3w_i + 1 \big)
$$

$$
\frac{\partial y}{\partial w_i} = 2w_i + 3
$$

$$
\frac{\partial y}{\partial w} =
\begin{bmatrix}
2(1)+3 \\
2(2)+3 \\
2(3)+3
\end{bmatrix}
=
\begin{bmatrix}
5 \\
7 \\
9
\end{bmatrix}
$$


In [6]:
# Reset
w = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)

# Vectorized function: y = (w^2 + 3w + 1).sum()
y = (w**2 + 3*w + 1).sum()

# Backprop
y.backward()

print("w:", w)
print("y:", y.item())
print("w.grad:", w.grad)

w: tensor([1., 2., 3.], requires_grad=True)
y: 35.0
w.grad: tensor([5., 7., 9.])


### Optional Gradient Check

In [7]:
from torch.autograd import gradcheck

w = torch.tensor([1.0, 2.0, 3.0], dtype=torch.double, requires_grad=True)

def func(x):
    return (x**2 + 3*x + 1).sum()

test = gradcheck(func, (w,), eps=1e-6, atol=1e-4)
print("Gradcheck passed:", test)


Gradcheck passed: True


### Summary Formulas

$$
y_{\text{scalar}} = w^2 + 3w + 1
$$

$$
\frac{dy}{dw} = 2w + 3
$$

$$
y_{\text{vector}} = \sum_i (w_i^2 + 3w_i + 1)
$$

$$
\nabla_w y = [\, 2w_1 + 3,\; 2w_2 + 3,\; 2w_3 + 3 \,]
$$


### Exercise 1 — Environment Check
Run the environment check and confirm versions and device information.


In [8]:
# Run PyTorch environment check
import torch, platform

print(f"Python version  : {platform.python_version()}")
print(f"Torch version   : {torch.__version__}")
print(f"CUDA available  : {torch.cuda.is_available()}")
print(f"CUDA device     : {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU only'}")

Python version  : 3.12.11
Torch version   : 2.8.0
CUDA available  : False
CUDA device     : CPU only


### Exercise 2 — NumPy ↔ PyTorch Equivalence
- Tensor creation 
- Broadcasting  
- Reduction


$$
\text{Broadcasting: } (3\times1) + (1\times2) \Rightarrow (3\times2)
$$

$$
\text{Reduction: } \text{mean}(a_{ij}) = \frac{1}{n} \sum_{i,j} a_{ij}
$$

In [9]:
import numpy as np

# Tensor creation
a_np = np.array([[1, 2], [3, 4]])
a_torch = torch.tensor([[1, 2], [3, 4]])
print("Tensor equality:", np.allclose(a_np, a_torch.numpy()))

# Broadcasting
b_np = np.array([[1], [2], [3]])
c_np = np.array([10, 20])
res_np = b_np + c_np

b_torch = torch.tensor([[1], [2], [3]])
c_torch = torch.tensor([10, 20])
res_torch = b_torch + c_torch

print("Broadcast check:", np.allclose(res_np, res_torch.numpy()))

# Reduction
print("Numpy mean:", a_np.mean(), "Torch mean:", a_torch.float().mean().item())

Tensor equality: True
Broadcast check: True
Numpy mean: 2.5 Torch mean: 2.5


### Exercise 3 — Autograd Demo
Create tensor `x` with `requires_grad=True`, compute a quadratic loss, backpropagate, and inspect gradients.

#### Function

$$
y = (x - 3)^2 + 2x
$$


#### Derivative

$$
\frac{dy}{dx} = 2(x - 3) + 2
$$


$ At 𝑥 = 5 x=5 $

$$
\frac{dy}{dx}\Big|_{x=5} = 2(5 - 3) + 2 = 6
$$


In [10]:
x = torch.tensor(5.0, requires_grad=True)
y = (x - 3)**2 + 2*x  # sample quadratic loss

y.backward()
print("x:", x.item())
print("y:", y.item())
print("dy/dx:", x.grad.item())

x: 5.0
y: 14.0
dy/dx: 6.0


### Exercise 4 — Manual Gradient Descent
Perform a few steps of gradient descent on \( y=(x-3)^2 \) and track progress.

#### Function
$$
y = (x - 3)^2
$$


#### Gradient
$$
\frac{dy}{dx} = 2(x - 3)
$$

#### Gradient Descent Update Rule
$$
x_{t+1} = x_t - \eta \, \frac{dy}{dx}
$$


#### At Convergence
$$
x^* = 3,\quad y^* = 0
$$


In [11]:
x = torch.tensor(0.0, requires_grad=True)

lr = 0.1
for step in range(6):
    y = (x - 3)**2
    y.backward()

    with torch.no_grad():
        print(f"Step {step}: x={x.item():.4f}, y={y.item():.4f}, grad={x.grad.item():.4f}")
        x -= lr * x.grad
        x.grad.zero_()

Step 0: x=0.0000, y=9.0000, grad=-6.0000
Step 1: x=0.6000, y=5.7600, grad=-4.8000
Step 2: x=1.0800, y=3.6864, grad=-3.8400
Step 3: x=1.4640, y=2.3593, grad=-3.0720
Step 4: x=1.7712, y=1.5099, grad=-2.4576
Step 5: x=2.0170, y=0.9664, grad=-1.9661


### Exercise 5 — torch.no_grad() Demonstration
Update a parameter within `torch.no_grad()` and verify gradients stay frozen.

#### Function
$$
y = (w - 5)^2
$$


#### Gradient
$$
\frac{dy}{dw} = 2(w - 5)
$$


#### Update Step
$$
w_{\text{new}} = w_{\text{old}} - \eta \, \frac{dy}{dw}
$$

#### When inside `torch.no_grad()`
$$
\nabla_w \text{ (update) } = 0
$$


In [12]:
w = torch.tensor(2.0, requires_grad=True)

# Forward + backward
loss = (w - 5)**2
loss.backward()
print("Before update → w.grad:", w.grad.item())

# Parameter update without tracking
with torch.no_grad():
    w -= 0.3 * w.grad
    w.grad.zero_()

print("After update → w:", w.item())
print("Grad after update (should be 0):", w.grad.item())

Before update → w.grad: -6.0
After update → w: 3.8000001907348633
Grad after update (should be 0): 0.0


## Challenges Chapter 5
### Challenge 1 — Linear Regression: NumPy → PyTorch
We’ll port a NumPy-based linear regression to pure PyTorch tensors and compare results to `scikit-learn`’s LinearRegression.

$$
\hat{y} = X w + b
$$

$$
\mathcal{L} = \frac{1}{N} \sum_i (\hat{y}_i - y_i)^2
$$

$$
\nabla_w \mathcal{L} = \frac{2}{N} X^\top (Xw + b - y)
$$


In [13]:
from sklearn.linear_model import LinearRegression

# Generate synthetic data
np.random.seed(0)
X_np = np.random.rand(100, 1)
y_np = 3 * X_np.squeeze() + 2 + np.random.randn(100) * 0.1

# Convert to torch tensors
X = torch.tensor(X_np, dtype=torch.float32)
y = torch.tensor(y_np, dtype=torch.float32).view(-1, 1)

# Initialize parameters
w = torch.randn(1, 1, requires_grad=True)
b = torch.randn(1, requires_grad=True)

# Train with gradient descent
lr = 0.1
for _ in range(200):
    y_pred = X @ w + b
    loss = ((y_pred - y)**2).mean()
    loss.backward()
    with torch.no_grad():
        w -= lr * w.grad
        b -= lr * b.grad
        w.grad.zero_()
        b.grad.zero_()

print(f"PyTorch results → w={w.item():.4f}, b={b.item():.4f}, loss={loss.item():.6f}")

# Compare to scikit-learn
lr_sklearn = LinearRegression().fit(X_np, y_np)
print(f"Sklearn results  → w={lr_sklearn.coef_[0]:.4f}, b={lr_sklearn.intercept_:.4f}")

PyTorch results → w=2.9029, b=2.0682, loss=0.010637
Sklearn results  → w=2.9937, b=2.0222


### Challenge 2 — CPU vs GPU Benchmark
Compare CPU and GPU matrix multiply speed, including data transfer overheads.

$$
C = A \times B
$$

$$
t_{\text{total}} = t_{\text{transfer}}^{(h \to d)} + t_{\text{compute}}^{(gpu)} + t_{\text{transfer}}^{(d \to h)}
$$


In [14]:
import time

sizes = [100, 500, 1000, 5000, 10000]
if torch.cuda.is_available():
    device = torch.device("cuda")
    for n in sizes:
        A_cpu = torch.randn(n, n)
        B_cpu = torch.randn(n, n)

        # CPU benchmark
        start = time.time()
        _ = A_cpu @ B_cpu
        cpu_time = time.time() - start

        # GPU benchmark (including transfers)
        start = time.time()
        A_gpu, B_gpu = A_cpu.to(device), B_cpu.to(device)
        torch.cuda.synchronize()
        _ = A_gpu @ B_gpu
        torch.cuda.synchronize()
        gpu_time = time.time() - start

        print(f"{n:>5}×{n:>5} | CPU: {cpu_time:.4f}s | GPU: {gpu_time:.4f}s")
else:
    print("⚠️ No CUDA device available")

⚠️ No CUDA device available


In [15]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Device:", device)

sizes = [100, 500, 1000, 2000, 5000]
for n in sizes:
    A = torch.randn(n, n, device=device)
    B = torch.randn(n, n, device=device)
    torch.mps.synchronize() if device.type == "mps" else None
    start = time.time()
    _ = A @ B
    torch.mps.synchronize() if device.type == "mps" else None
    print(f"{n:>5}×{n:>5} | Time: {time.time()-start:.4f}s")

Device: mps
  100×  100 | Time: 0.0054s
  500×  500 | Time: 0.0029s
 1000× 1000 | Time: 0.0046s
 2000× 2000 | Time: 0.0219s
 5000× 5000 | Time: 0.1762s


### Challenge 3 — Two-Layer MLP (Manual Tensor Ops)

$$
z_1 = X W_1 + b_1
$$

$$
a_1 = \text{ReLU}(z_1) = \max(0, z_1)
$$

$$
\hat{y} = a_1 W_2 + b_2
$$

$$
\mathcal{L} = \frac{1}{N}\sum_i (\hat{y}_i - y_i)^2
$$


In [16]:
torch.manual_seed(0)

# Synthetic data
X = torch.randn(5, 3)
y = torch.randn(5, 1)

# Parameters
W1 = torch.randn(3, 4, requires_grad=True)
b1 = torch.randn(4, requires_grad=True)
W2 = torch.randn(4, 1, requires_grad=True)
b2 = torch.randn(1, requires_grad=True)

# Forward pass (manual ops)
z1 = X @ W1 + b1
a1 = torch.relu(z1)
y_pred = a1 @ W2 + b2
loss = ((y_pred - y)**2).mean()

# Backward pass
loss.backward()

print("Loss:", loss.item())
print("Grad W1:", W1.grad.shape)
print("Grad W2:", W2.grad.shape)

Loss: 14.349802017211914
Grad W1: torch.Size([3, 4])
Grad W2: torch.Size([4, 1])


## Challenge 4 — Reproducibility (Set All Seeds)
$$
\text{Seed all RNGs → Python, NumPy, PyTorch (CPU/GPU)}
$$

$$
\text{Reproducibility: } f(\text{seeded inputs}) = \text{constant output across runs}
$$


In [17]:
import random

def set_all_seeds(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Demonstrate reproducibility
set_all_seeds(123)
x1 = torch.randn(3)
set_all_seeds(123)
x2 = torch.randn(3)

print("Identical tensors:", torch.allclose(x1, x2))
print("x1:", x1)
print("x2:", x2)

Identical tensors: True
x1: tensor([-0.1115,  0.1204, -0.3696])
x2: tensor([-0.1115,  0.1204, -0.3696])
