# Chapter 4: Deep Learning

**Welcome to Chapter 4**. This notebook contains the listings for Chapter 4, which explains the fundamentals of Deep Learning in PyTorch.

# Listing 4-1 A Simple PyTorch Model
This listing implements a subset of the general skeleton for the end-to-end lifecycle of a PyTorch project, which includes loading and preparing data, defining or loading a model, specifying the loss function and optimizer, training with validation, evaluating performance, and saving the model for later use. This code demonstrates the essential stages of a working PyTorch program without including optional steps such as inference pipelines or model deployment.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

# 1. Load and prepare data
# Every image is converted to a tensor and then normalized with
# mean 0.1307 and std 0.3081 (one value each, for the single channel).
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
)

# Full MNIST training set (60,000) plus the official test set (10,000).
# Only the first call asks for a download.
full_train_dataset = datasets.MNIST(root=".", train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root=".", train=False, transform=transform)

# Hold out 10% of the training examples as a validation set
train_size = int(0.9 * len(full_train_dataset))  # 54,000
val_size = len(full_train_dataset) - train_size  # 6,000
train_dataset, val_dataset = torch.utils.data.random_split(
    full_train_dataset, lengths=[train_size, val_size]
)

# Only the training loader shuffles; the evaluation loaders keep a fixed
# order and use larger batches.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=1000, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1000, shuffle=False)

# 2. Define model
class Net(nn.Module):
    """Fully connected classifier: 784 inputs -> 128 -> 64 -> 10 class scores."""

    def __init__(self):
        super().__init__()
        # The attribute names fc1/fc2/fc3 become the keys of the state_dict.
        self.fc1 = nn.Linear(in_features=28 * 28, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=10)

    def forward(self, x):
        """Return raw class scores, one row of 10 per flattened image."""
        flat = x.view(-1, 28 * 28)            # flatten 28x28 -> 784
        hidden1 = F.relu(self.fc1(flat))
        hidden2 = F.relu(self.fc2(hidden1))
        return self.fc3(hidden2)              # raw class scores (one per digit)

model = Net()  # instantiate the network defined above

# 3. Specify loss function and optimizer
# Adam updates all of the model's parameters with a learning rate of 1e-3;
# the loss compares the raw class scores with the integer labels.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# 4. Train model
# Runs 5 epochs; each epoch does one pass over train_loader followed by a
# pass over val_loader, and prints the per-example mean losses and accuracy.
for epoch in range(5):
    model.train()
    running_loss = 0.0
    train_examples = 0

    for data, target in train_loader:
        optimizer.zero_grad()
        output = model(data)                 # predictions (raw scores)
        loss = criterion(output, target)     # compare predictions to labels
        loss.backward()
        optimizer.step()

        # The loss is assumed to be a batch mean (the nn.CrossEntropyLoss
        # default). Weight it by the batch size so a smaller final batch
        # does not count as much as a full one.
        batch_size = target.size(0)
        running_loss += loss.item() * batch_size
        train_examples += batch_size

    avg_train_loss = running_loss / train_examples

    # 5. Validate model during training (validation set, not test set)
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():  # no gradients are needed for evaluation
        for data, target in val_loader:
            output = model(data)
            loss = criterion(output, target)
            val_loss += loss.item() * target.size(0)  # same per-example weighting

            preds = output.argmax(dim=1)     # highest-scoring class per row
            correct += (preds == target).sum().item()
            total += target.size(0)

    avg_val_loss = val_loss / total
    val_accuracy = correct / total * 100

    print(
        f"Epoch {epoch + 1}: "
        f"train loss={avg_train_loss:.4f}, "
        f"val loss={avg_val_loss:.4f}, "
        f"val acc={val_accuracy:.2f}%"
    )

# 6. Final test (run once at the end)
# Accuracy over test_loader, with the model in eval mode and gradients off.
model.eval()
correct, total = 0, 0

with torch.no_grad():
    for data, target in test_loader:
        output = model(data)
        preds = output.argmax(dim=1)
        hits = (preds == target).sum().item()
        correct += hits
        total += target.size(0)

test_accuracy = correct / total * 100
print(f"Final Test Accuracy: {test_accuracy:.2f}%")

# 7. Save model
# Only the parameters (state_dict) are written, not the Python class.
torch.save(model.state_dict(), "mnist_model.pt")
print("Model saved to mnist_model.pt")


# Listing 4-2 Hyperparameter Tuning with Optuna


In [None]:
# Listing 4-2 Hyperparameter tuning with Optuna (learning rate + batch size)

# If Optuna is not installed in your environment, uncomment:
# !pip -q install optuna

import optuna
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

# ----------------------------
# Model (same as Listing 4-1)
# ----------------------------
class Net(nn.Module):
    """Three-layer perceptron mapping a 28x28 image to 10 class scores."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 10)

    def forward(self, x):
        """Flatten the input, apply the two ReLU hidden layers, return raw scores."""
        x = x.view(-1, 784)   # flatten 28x28 -> 784
        for hidden_layer in (self.fc1, self.fc2):
            x = F.relu(hidden_layer(x))
        return self.fc3(x)

# ----------------------------
# Data + device
# ----------------------------
# Convert to tensor, then normalize with mean 0.1307 / std 0.3081
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
)

# Use the GPU when one is available, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

full_train = datasets.MNIST(root=".", train=True, download=True, transform=transform)

# Train/validation split (do NOT tune on the test set).
# The split is made once here, so every trial uses the same two subsets.
train_size = int(0.9 * len(full_train))
val_size = len(full_train) - train_size
train_ds, val_ds = torch.utils.data.random_split(
    full_train, lengths=[train_size, val_size]
)

# Shared loss function used by train_one_epoch
criterion = nn.CrossEntropyLoss()

def train_one_epoch(model, loader, optimizer):
    """Train ``model`` for one pass over ``loader`` and return the mean loss.

    Uses the module-level ``device`` and ``criterion``.

    Args:
        model: Network to train; it is put into training mode.
        loader: Iterable yielding ``(inputs, labels)`` batches.
        optimizer: Optimizer that updates the model's parameters.

    Returns:
        The per-example mean training loss over the epoch, as a float.

    Raises:
        ZeroDivisionError: If ``loader`` yields no examples.
    """
    model.train()
    total_loss = 0.0
    total_examples = 0

    for x, y in loader:
        x, y = x.to(device), y.to(device)

        optimizer.zero_grad()        # clear old gradients
        preds = model(x)             # forward pass
        loss = criterion(preds, y)   # compute loss
        loss.backward()              # backward pass
        optimizer.step()             # update parameters

        # The loss is assumed to be a batch mean (the nn.CrossEntropyLoss
        # default). Weight it by the batch size so a smaller final batch
        # does not count as much as a full one.
        batch_size = y.size(0)
        total_loss += loss.item() * batch_size
        total_examples += batch_size

    return total_loss / total_examples

@torch.no_grad()
def evaluate_accuracy(model, loader):
    """Return the fraction (0..1) of examples in ``loader`` classified correctly.

    The model is switched to eval mode and gradients are disabled by the
    decorator. Batches are moved to the module-level ``device``.
    """
    model.eval()
    hits = 0
    seen = 0

    for x, y in loader:
        x = x.to(device)
        y = y.to(device)
        logits = model(x)
        matches = logits.argmax(dim=1) == y   # True where the top class equals the label
        hits += matches.sum().item()
        seen += y.size(0)

    return hits / seen

def objective(trial):
    """Optuna objective: train a fresh ``Net`` briefly and return validation accuracy.

    Samples a learning rate (log scale, 1e-4 to 1e-2) and a training batch
    size (32, 64 or 128) from ``trial``, trains for two epochs on
    ``train_ds``, and scores the result on ``val_ds``.
    """
    # Hyperparameters proposed for this trial
    lr = trial.suggest_float("learning_rate", low=1e-4, high=1e-2, log=True)
    batch_size = trial.suggest_categorical("batch_size", choices=[32, 64, 128])

    # Only the training loader depends on the sampled batch size
    train_loader = torch.utils.data.DataLoader(
        train_ds, batch_size=batch_size, shuffle=True
    )
    val_loader = torch.utils.data.DataLoader(
        val_ds, batch_size=512, shuffle=False
    )

    # Each trial starts from a newly initialized model and optimizer
    model = Net().to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Keep trials cheap: two epochs each
    epochs_per_trial = 2
    epochs_done = 0
    while epochs_done < epochs_per_trial:
        train_one_epoch(model, train_loader, optimizer)
        epochs_done += 1

    # Validation score to maximize
    return evaluate_accuracy(model, val_loader)

# The objective returns validation accuracy, so higher is better
study = optuna.create_study(direction="maximize")
study.optimize(func=objective, n_trials=20)

# Report the best score and the hyperparameters that produced it
print("Best trial:")
best_value, best_params = study.best_value, study.best_params
print("  Validation accuracy:", best_value)
print("  Best hyperparameters:", best_params)


# Listing 4-3 Using the Trained Model to Predict Custom Digits
This code reloads the saved model and uses it to classify new images. Because the model is already trained, the process is straightforward: load the stored weights, prepare the input images using the same preprocessing steps as during training, and then make predictions on the new images.


In [None]:
# Minimal MNIST inference in Colab

import io
import torch
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image
from google.colab import files
import torchvision.transforms as T

# 1) Model (same architecture as training)
class Net(nn.Module):
    """784 -> 128 -> 64 -> 10 fully connected classifier (fc1, fc2, fc3)."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(28 * 28, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 10)

    def forward(self, x):
        """Return the 10 raw class scores for each flattened input image."""
        x = x.view(-1, 28 * 28)   # flatten to rows of 784 values
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)
        return x

# 2) Load trained weights
model = Net()
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict needs, so a tampered checkpoint file cannot run
# arbitrary code while loading. map_location="cpu" loads the tensors onto the CPU.
model.load_state_dict(
    torch.load("/content/mnist_model.pt", map_location="cpu", weights_only=True)
)
model.eval()  # inference mode

# 3) Preprocessing (match MNIST: 1x28x28 + same normalization)
preprocess = T.Compose([
    T.Grayscale(),          # ensure 1 channel, since MNIST is grayscale
    T.Resize((28, 28)),     # the network expects 28*28 = 784 input values
    T.ToTensor(),
    T.Normalize((0.1307,), (0.3081,))
])

# 4) Upload image(s) and predict
@torch.no_grad()
def predict(name, content):
    """Classify one image given as raw bytes and print the predicted digit.

    Args:
        name: Label used in the printed message (the uploaded file name).
        content: Encoded image bytes, opened with PIL.
    """
    raw_bytes = io.BytesIO(content)
    img = Image.open(raw_bytes)
    batch = preprocess(img).unsqueeze(0)   # add a batch dimension: [1,1,28,28]
    scores = model(batch)
    pred = scores.argmax(dim=1).item()     # index of the highest score
    print(f"The predicted digit for {name} is: {pred}")

print("Upload MNIST-like digit image(s):")
# files.upload() is iterated as (file name, file bytes) pairs
uploaded = files.upload()
for fname, content in uploaded.items():
    predict(fname, content)


# Listing 4-4 Reusing a Trained Model to Classify New Images through Transfer Learning

This code reuses the learned weights from our MNIST network and adapts it to classify letters in the EMNIST dataset. It copies the weights for the shared layers (fc1 and fc2), replaces the final layer (fc3) to output 26 classes, trains only that new layer for a few epochs, and then optionally fine-tunes the second layer and the head with a smaller learning rate.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

# 1) Shared preprocessing
# Same pipeline for the EMNIST train and test sets:
# tensor conversion, then normalization with mean 0.1307 / std 0.3081.
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
)

# 2) Model definition (same as MNIST, but allow a different number of output classes)
class Net(nn.Module):
    """784 -> 128 -> 64 -> ``num_classes`` fully connected classifier.

    Args:
        num_classes: Size of the output layer (fc3); defaults to 10.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        self.fc1 = nn.Linear(in_features=28 * 28, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        # Only the head depends on the number of classes
        self.fc3 = nn.Linear(in_features=64, out_features=num_classes)

    def forward(self, x):
        """Return one row of ``num_classes`` raw scores per flattened image."""
        flat = x.view(-1, 28 * 28)   # flatten 28x28 -> 784
        features = F.relu(self.fc2(F.relu(self.fc1(flat))))
        return self.fc3(features)    # class scores

# 3) Load the pretrained MNIST model weights (fc1/fc2/fc3 trained for 10 digits)
mnist_model = Net(num_classes=10)
# weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state_dict needs, so a tampered checkpoint file cannot run
# arbitrary code while loading.
mnist_model.load_state_dict(
    torch.load("mnist_model.pt", map_location="cpu", weights_only=True)
)

# 4) Load EMNIST Letters data (26 classes)
# Note: EMNIST letters labels are 1..26, so we shift them to 0..25 in the loop.
# NOTE(review): torchvision's EMNIST images are reportedly stored transposed
# relative to MNIST. Plot a sample to confirm; if so, add a transpose to the
# transform so the reused MNIST features see upright letters.
train_loader = torch.utils.data.DataLoader(
    datasets.EMNIST('.', split='letters', train=True, download=True, transform=transform),
    batch_size=64,
    shuffle=True
)

# Evaluation data: fixed order, larger batches
test_loader = torch.utils.data.DataLoader(
    datasets.EMNIST('.', split='letters', train=False, download=True, transform=transform),
    batch_size=1000,
    shuffle=False
)

# 5) Create a new model with a 26-class output head, then copy shared weights
model = Net(num_classes=26)

# fc1 (low-level features) and fc2 (higher-level features) are copied from
# the MNIST model; fc3 is new (26 outputs) and keeps its random initialization.
for new_layer, pretrained_layer in (
    (model.fc1, mnist_model.fc1),
    (model.fc2, mnist_model.fc2),
):
    new_layer.load_state_dict(pretrained_layer.state_dict())

criterion = nn.CrossEntropyLoss()

# 6) Phase 1: freeze fc1/fc2 and train only the new classifier head (fc3)
model.fc1.requires_grad_(False)
model.fc2.requires_grad_(False)

# The optimizer is given fc3's parameters only
optimizer = optim.Adam(model.fc3.parameters(), lr=1e-3)

# Head-only training: 3 epochs with the optimizer configured above.
# The per-epoch "val" metrics are computed on test_loader; no separate
# validation split is used in this listing.
for epoch in range(3):
    model.train()
    running_loss = 0.0
    train_examples = 0

    for data, target in train_loader:
        target = target - 1  # 1..26 -> 0..25

        optimizer.zero_grad()             # clear old gradients
        output = model(data)              # forward pass
        loss = criterion(output, target)  # compute loss
        loss.backward()                   # compute gradients
        optimizer.step()                  # update fc3 weights

        # The loss is assumed to be a batch mean (the nn.CrossEntropyLoss
        # default). Weight it by the batch size so a smaller final batch
        # does not count as much as a full one.
        batch_size = target.size(0)
        running_loss += loss.item() * batch_size
        train_examples += batch_size

    avg_train_loss = running_loss / train_examples

    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in test_loader:
            target = target - 1
            output = model(data)
            loss = criterion(output, target)
            val_loss += loss.item() * target.size(0)  # same per-example weighting

            preds = output.argmax(dim=1)
            correct += (preds == target).sum().item()
            total += target.size(0)

    avg_val_loss = val_loss / total
    val_accuracy = correct / total * 100

    print(
        f"[Head-only] Epoch {epoch + 1}: "
        f"train loss={avg_train_loss:.4f}, "
        f"val loss={avg_val_loss:.4f}, "
        f"val acc={val_accuracy:.2f}%"
    )

# 7) Phase 2 (optional): fine-tune fc2 + fc3 with a smaller learning rate
# fc2 is made trainable again here; this block does not touch fc1.
for p in model.fc2.parameters():
    p.requires_grad = True

# New optimizer over fc2 and fc3 only, with a learning rate of 1e-4
optimizer = optim.Adam(
    list(model.fc2.parameters()) + list(model.fc3.parameters()),
    lr=1e-4
)

# The per-epoch "val" metrics are computed on test_loader; no separate
# validation split is used in this listing.
for epoch in range(2):
    model.train()
    running_loss = 0.0
    train_examples = 0

    for data, target in train_loader:
        target = target - 1  # shift labels down by one so they start at 0

        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        # The loss is assumed to be a batch mean (the nn.CrossEntropyLoss
        # default). Weight it by the batch size so a smaller final batch
        # does not count as much as a full one.
        batch_size = target.size(0)
        running_loss += loss.item() * batch_size
        train_examples += batch_size

    avg_train_loss = running_loss / train_examples

    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in test_loader:
            target = target - 1
            output = model(data)
            loss = criterion(output, target)
            val_loss += loss.item() * target.size(0)  # same per-example weighting

            preds = output.argmax(dim=1)
            correct += (preds == target).sum().item()
            total += target.size(0)

    avg_val_loss = val_loss / total
    val_accuracy = correct / total * 100

    print(
        f"[Fine-tune] Epoch {epoch + 1}: "
        f"train loss={avg_train_loss:.4f}, "
        f"val loss={avg_val_loss:.4f}, "
        f"val acc={val_accuracy:.2f}%"
    )

# 8) Save the transferred model
# Only the parameters (state_dict) are written to disk.
torch.save(obj=model.state_dict(), f="emnist_letters_from_mnist.pt")
print("Transferred model saved to emnist_letters_from_mnist.pt")
