
# PyTorch `Conv2d` Practice — Shapes, Hyperparameters, and Parameter Counts


**What you'll do:**
- Practice computing **output shapes** for `Conv2d` given **kernel size, stride, padding, dilation**.
- Design hyperparameter **combinations** to reach a **target output size** from a given input.
- Understand **parameter sharing**, compute **trainable parameters** per filter and per layer.
- Compare conv parameter counts to an equivalent **fully connected (MLP)**.


## Setup & Helper Functions

In [1]:

import math
import torch
import torch.nn as nn

def _to_pair(x):
    if isinstance(x, tuple): return x
    return (x, x)

def conv2d_output_hw(H, W, kernel_size=3, stride=1, padding=0, dilation=1):
    """Compute (H_out, W_out) for Conv2d using PyTorch's formula.
    out = floor((in + 2*pad - dilation*(kernel-1) - 1)/stride + 1)
    Accepts ints or (h,w) tuples for kernel_size/stride/padding/dilation.
    """
    kH, kW = _to_pair(kernel_size)
    sH, sW = _to_pair(stride)
    pH, pW = _to_pair(padding)
    dH, dW = _to_pair(dilation)
    H_out = math.floor((H + 2*pH - dH*(kH-1) - 1)/sH + 1)
    W_out = math.floor((W + 2*pW - dW*(kW-1) - 1)/sW + 1)
    return H_out, W_out

def conv2d_output_shape(C_in, H, W, C_out, **kwargs):
    H2, W2 = conv2d_output_hw(H, W, **kwargs)
    return (C_out, H2, W2)

def enumerate_combos(H, W, H_target, W_target,
                     ks_range=(1,7), stride_range=(1,3), pad_range=(0,5), dil_range=(1,3)):
    """Enumerate (kH,kW),(sH,sW),(pH,pW),(dH,dW) that map (H,W)->(H_target,W_target).
    Ranges are inclusive small search windows.
    """
    sols = []
    for k in range(ks_range[0], ks_range[1]+1):
        for s in range(stride_range[0], stride_range[1]+1):
            for p in range(pad_range[0], pad_range[1]+1):
                for d in range(dil_range[0], dil_range[1]+1):
                    h2, w2 = conv2d_output_hw(H, W, kernel_size=k, stride=s, padding=p, dilation=d)
                    if h2 == H_target and w2 == W_target:
                        sols.append(((k,k),(s,s),(p,p),(d,d)))
    return sols

def same_padding_for(kernel_size, dilation=1, stride=1):
    """For stride=1, return padding that preserves spatial size (if integer).
    Uses effective_kernel = dilation*(k-1) + 1, padding = (effective_kernel - 1)/2
    """
    k = kernel_size
    eff = dilation*(k-1) + 1
    pad = (eff - 1) / 2
    return pad  # may be non-integer if (k,d) incompatible with exact 'same'



## Part A — Output Shape Practice

Use the formula (and optionally the helper) to compute output sizes. Then verify with PyTorch.



### **Problem A1**  
Input **(C=3, H=32, W=32)** → Conv2d with:
- `out_channels=16`, `kernel_size=3`, `stride=1`, `padding=1`, `dilation=1`  
**Task:** Compute the output shape `(C_out, H_out, W_out)`.

<mark>H_out = floor((H_in + 2 * padding - dilation * (kernel_size - 1) - 1) / stride + 1)


In [7]:

# TODO: Compute by hand, then check with helper
H_out, W_out = conv2d_output_hw(32, 32, kernel_size=3, stride=1, padding=1)
print("Your answer (fill mentally): (16, 32, 32) expected")
print("Helper:", (16, H_out, W_out))

# Verify by PyTorch
x = torch.randn(1, 3, 32, 32)
conv = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, dilation=1, bias=False)
y = conv(x)
print("PyTorch:", tuple(y.shape[1:]))  # (C_out,H_out,W_out)


Your answer (fill mentally): (16, 32, 32) expected
Helper: (16, 32, 32)
PyTorch: (16, 32, 32)


In [5]:
#test problem
# TODO: Compute by hand, then check with helper
H_out, W_out = conv2d_output_hw(64, 64, kernel_size=3, stride=2, padding=2, dilation=0)
print("Your answer (fill mentally): (16, 32, 32) expected")
print("Helper:", (16, H_out, W_out))

# Verify by PyTorch
x = torch.randn(1, 3, 32, 32)
conv = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, dilation=1, bias=False)
y = conv(x)
print("PyTorch:", tuple(y.shape[1:]))  # (C_out,H_out,W_out)


Your answer (fill mentally): (16, 32, 32) expected
Helper: (16, 34, 34)
PyTorch: (16, 32, 32)



### **Problem A2**  
Input **(C=1, H=28, W=28)** → Conv2d with:
- `out_channels=32`, `kernel_size=3`, `stride=2`, `padding=1`, `dilation=1`  
**Task:** Compute `(32, H_out, W_out)`.

<mark>H_out = floor((H_in + 2 * padding - dilation * (kernel_size - 1) - 1) / stride + 1)


In [None]:

H_out, W_out = conv2d_output_hw(28, 28, kernel_size=3, stride=2, padding=1, dilation=1)
print("Helper:", (32, H_out, W_out))

x = torch.randn(1, 1, 28, 28)
conv = nn.Conv2d(1, 32, 3, stride=2, padding=1, dilation=1, bias=False)
print("PyTorch:", tuple(conv(x).shape[1:]))



### **Problem A3** (asymmetric hyperparameters)  
Input **(C=8, H=128, W=128)** → Conv2d with:
- `out_channels=8`, `kernel_size=(3,5)`, `stride=(2,1)`, `padding=(1,2)`, `dilation=(1,1)`  
**Task:** Compute `(8, H_out, W_out)`.


In [None]:

H_out, W_out = conv2d_output_hw(128, 128, kernel_size=(3,5), stride=(2,1), padding=(1,2), dilation=(1,1))
print("Helper:", (8, H_out, W_out))

x = torch.randn(1, 8, 128, 128)
conv = nn.Conv2d(8, 8, kernel_size=(3,5), stride=(2,1), padding=(1,2), dilation=(1,1), bias=False)
print("PyTorch:", tuple(conv(x).shape[1:]))



### **Problem A4** (dilation)  
Input **(C=3, H=64, W=96)** → Conv2d with:
- `out_channels=12`, `kernel_size=7`, `stride=2`, `padding=3`, `dilation=2`  
**Task:** Compute `(12, H_out, W_out)`.


In [None]:

H_out, W_out = conv2d_output_hw(64, 96, kernel_size=7, stride=2, padding=3, dilation=2)
print("Helper:", (12, H_out, W_out))

x = torch.randn(1, 3, 64, 96)
conv = nn.Conv2d(3, 12, kernel_size=7, stride=2, padding=3, dilation=2, bias=False)
print("PyTorch:", tuple(conv(x).shape[1:]))



### **Problem A5** (“same” size with dilation)  
For **stride=1**, find the **padding** needed to preserve size `H_out=H` for a given `kernel_size` and `dilation`.

- Formula: `effective_kernel = dilation*(kernel - 1) + 1`, then  
  `padding = (effective_kernel - 1)/2` (must be an integer to be exact).

**Task:** For `H=W=32`, `kernel_size=3`, `dilation=3`, `stride=1` — compute the padding that preserves size.

<mark>H_out = floor((H_in + 2 * padding - dilation * (kernel_size - 1) - 1) / stride + 1)


In [None]:

pad = same_padding_for(kernel_size=3, dilation=3, stride=1)
print("Required padding (per side):", pad, "→ use padding=3 to keep size with stride=1")
print("Check:", conv2d_output_hw(32, 32, kernel_size=3, stride=1, padding=3, dilation=3))



## Part B — Design Hyperparameters to Hit a Target Size

Given input `(H, W)` and a **target** `(H_target, W_target)`, propose `(kernel, stride, padding, dilation)` that achieve it.
Try by hand first, then use the enumerator to check/collect solutions.



### **Problem B1**  
Input `H=W=32` → Target `H=W=16`. Propose **three** different `(k, s, p, d)` combos.

<mark>H_out = floor((H_in + 2 * padding - dilation * (kernel_size - 1) - 1) / stride + 1)


In [None]:

sols = enumerate_combos(32, 32, 16, 16,
                        ks_range=(1,7), stride_range=(1,3), pad_range=(0,5), dil_range=(1,3))
print(f"Found {len(sols)} combos (showing first 10):")
for i, sol in enumerate(sols[:10]):
    (k,_),(s,_),(p,_),(d,_) = sol
    print(f"{i+1:02d}: k={k}, s={s}, p={p}, d={d}")



### **Problem B2**  
Input `H=W=64` → Target `H=W=31`. Find **one** valid set `(k, s, p, d)`.

<mark>H_out = floor((H_in + 2 * padding - dilation * (kernel_size - 1) - 1) / stride + 1)


In [None]:

sols = enumerate_combos(64, 64, 31, 31,
                        ks_range=(1,7), stride_range=(1,3), pad_range=(0,7), dil_range=(1,3))
print("Some solutions:", sols[:1])



### **Problem B3**  
Input `(H, W) = (72, 96)` → Target `(36, 48)`. Propose at least **two** solutions.


In [None]:

sols = enumerate_combos(72, 96, 36, 48,
                        ks_range=(1,7), stride_range=(1,3), pad_range=(0,5), dil_range=(1,3))
print(f"Solutions found: {len(sols)} (first 10):")
for sol in sols[:1]: print(sol)



## Part C — Parameter Sharing & Trainable Parameter Counts

**Key idea:** In conv layers, a small filter (e.g., 3×3×C_in) is **shared** across all spatial positions.  
You learn that filter **once**, then slide it over the image. This yields **far fewer parameters** than a fully connected layer over all pixels.

**Counts:**
- **Per filter:** params = `kH * kW * C_in` (+ 1 bias if used)
- **Per layer with `C_out` filters:** params = `(kH*kW*C_in + bias) * C_out` where `bias` is 1 if enabled else 0


In [None]:

def conv2d_param_count(C_in, C_out, kH, kW, bias=True):
    per_filter = kH * kW * C_in + (1 if bias else 0)
    return per_filter * C_out

def mlp_param_count(sizes, bias=True):
    """sizes = [in_dim, h1, h2, ..., out_dim]"""
    total = 0
    for a, b in zip(sizes[:-1], sizes[1:]):
        total += a * b + (b if bias else 0)
    return total

# Examples
print("Conv example (C_in=3, C_out=64, k=3x3, bias=True):",
      conv2d_param_count(3, 64, 3, 3, bias=True))

print("Two conv layers stack:")
p1 = conv2d_param_count(3, 32, 3, 3, bias=True)
p2 = conv2d_param_count(32, 64, 3, 3, bias=True)
print("  layer1:", p1, "  layer2:", p2, "  total:", p1+p2)



### **Problem C1**  
Compute params for: `C_in=3, C_out=64, kernel=5×5`, with and without bias.


In [None]:

print("With bias:", conv2d_param_count(3, 64, 5, 5, bias=True))
print("No bias :", conv2d_param_count(3, 64, 5, 5, bias=False))



### **Problem C2**  
Two-layer conv stack:  
- Layer1: `C_in=3 → C_out=16`, `k=3×3`, bias **True**  
- Layer2: `C_in=16 → C_out=32`, `k=3×3`, bias **False**  
**Task:** Compute total trainable params.


In [None]:

p1 = conv2d_param_count(3, 16, 3, 3, bias=True)
p2 = conv2d_param_count(16, 32, 3, 3, bias=False)
print("Total params:", p1 + p2)



### **Compare to a Fully Connected MLP**

Consider an RGB input **32×32×3** flattened to `3072` units, then an MLP with hidden `256` and output `10` classes:
- MLP sizes: `[3072, 256, 10]`  
- Conv alternative: `Conv2d(3→64, k=3×3)` followed by `Conv2d(64→64, k=3×3)` followed by max pool 2d.

**Task:** Compare parameter counts.


In [None]:

mlp_params = mlp_param_count([32*32*3, 256, 10], bias=True)
conv_a = conv2d_param_count(3, 64, 3, 3, bias=True)
conv_b = conv2d_param_count(64, 64, 3, 3, bias=True)
print("MLP params:", mlp_params)
print("Conv stack params (two 3x3 convs):", conv_a + conv_b)
# print("final FC layer in conv net: assumming 32*32*64 input, 10 output:", mlp_param_count([32*32*64, 10], bias=True))
print("After max poool2d on output of conv stack (16*16*64 input, 10 output):", mlp_param_count([16*16*64, 10], bias=True))
print("Conv net total (with final FC):", conv_a + conv_b + mlp_param_count([16*16*64, 10], bias=True))

#so the final linear layer with the 64 filters is whats blowing up the size of the conv net
#can reduce this by using fewer filters in final layer



In [None]:

import math
import torch, torch.nn as nn

def _to_pair(x): return x if isinstance(x, tuple) else (x, x)

def conv2d_output_hw(H, W, kernel_size=3, stride=1, padding=0, dilation=1):
    kH, kW = _to_pair(kernel_size)
    sH, sW = _to_pair(stride)
    pH, pW = _to_pair(padding)
    dH, dW = _to_pair(dilation)
    H_out = math.floor((H + 2*pH - dH*(kH-1) - 1)/sH + 1)
    W_out = math.floor((W + 2*pW - dW*(kW-1) - 1)/sW + 1)
    return H_out, W_out

def conv2d_param_count(C_in, C_out, kH, kW, bias=True):
    per_filter = kH * kW * C_in + (1 if bias else 0)
    return per_filter * C_out

def mlp_param_count(sizes, bias=True):
    tot=0
    for a,b in zip(sizes[:-1], sizes[1:]):
        tot += a*b + (b if bias else 0)
    return tot

# Part A checks
print("A1:", (16,) + conv2d_output_hw(32, 32, 3, 1, 1, 1))
print("A2:", (8,)  + conv2d_output_hw(64, 64, 5, 2, 0, 1))
print("A3:", (32,) + conv2d_output_hw(28, 28, 3, 2, 1, 1))
print("A4:", (8,)  + conv2d_output_hw(128, 128, (3,5), (2,1), (1,2), (1,1)))
print("A5:", (12,) + conv2d_output_hw(64, 96, 7, 2, 3, 2))
# A6 padding for same with stride=1, k=3, d=3:
eff = 3* (3-1) + 1
pad = (eff - 1)//2
print("A6 padding:", pad)

# Part B example solutions (show small subsets)
def enumerate_combos(H, W, H_target, W_target,
                     ks_range=(1,7), stride_range=(1,3), pad_range=(0,5), dil_range=(1,3)):
    sols = []
    for k in range(ks_range[0], ks_range[1]+1):
        for s in range(stride_range[0], stride_range[1]+1):
            for p in range(pad_range[0], pad_range[1]+1):
                for d in range(dil_range[0], dil_range[1]+1):
                    h2, w2 = conv2d_output_hw(H, W, kernel_size=k, stride=s, padding=p, dilation=d)
                    if h2 == H_target and w2 == W_target:
                        sols.append((k,s,p,d))
    return sols

print("\nB1 32→16 examples:", enumerate_combos(32,32,16,16)[:5])
print("B2 64→31 examples:", enumerate_combos(64,64,31,31, pad_range=(0,7))[:5])
print("B3 72×96→36×48 examples:", enumerate_combos(72,96,36,48)[:5])

# Part C counts
print("\nC1 with bias:",  conv2d_param_count(3, 64, 5, 5, True))
print("C1 no bias:",       conv2d_param_count(3, 64, 5, 5, False))

p1 = conv2d_param_count(3, 16, 3, 3, True)
p2 = conv2d_param_count(16, 32, 3, 3, False)
print("C2 total:", p1 + p2)

mlp = mlp_param_count([32*32*3, 256, 10], True)
convA = conv2d_param_count(3, 64, 3, 3, True)
convB = conv2d_param_count(64, 64, 3, 3, True)
print("MLP params:", mlp)
print("Two 3x3 convs params:", convA + convB)
