# Chapter 5 Neural Building Blocks
Before a neural network can recognize a digit, detect objects, understand a sentence, translate text, or complete your email, it needs to *see*, *remember*, *interpret*, and *compress* information. There are four foundational deep learning architectures that make this possible: Autoencoders, CNNs, RNNs, and Transformers. Chapter 5 explains these architectures through examples and visual explanations.


# Listing 5-1 Reconstructing Images from the MNIST Dataset Using an Autoencoder
This listing builds a small autoencoder that learns to compress and reconstruct handwritten digits from the MNIST dataset.

In [None]:
# ------------------------------------------------------
# Step 1: Imports and Setup
# ------------------------------------------------------
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import matplotlib.pyplot as plt

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# ------------------------------------------------------
# Step 2: Load MNIST
# ------------------------------------------------------
# The training split is downloaded into ./data on first use and served in
# shuffled mini-batches of 256 images.
transform = transforms.ToTensor()
train_data = datasets.MNIST(
    root="data",
    train=True,
    transform=transform,
    download=True,
)
train_loader = DataLoader(dataset=train_data, batch_size=256, shuffle=True)

# ------------------------------------------------------
# Step 3: Define a simple fully connected autoencoder
# ------------------------------------------------------
class Autoencoder(nn.Module):
    """Fully connected autoencoder for 28x28 single-channel images.

    The encoder flattens each image and squeezes it through one hidden
    layer into a small bottleneck; the decoder mirrors that path back to
    784 values and the result is reshaped to ``[batch, 1, 28, 28]``.

    Improving the output: to get sharper images, build a wider model with
    ``Autoencoder(hidden_dim=128, latent_dim=64)`` instead of editing the
    layer definitions.

    Args:
        hidden_dim: Width of the intermediate layer in both the encoder
            and the decoder (default 64).
        latent_dim: Width of the bottleneck, i.e. the size of the
            compressed code (default 16).
    """

    def __init__(self, hidden_dim=64, latent_dim=16):
        super().__init__()
        n_pixels = 28 * 28

        # Encoder: 784 -> hidden_dim -> latent_dim (bottleneck)
        self.encoder = nn.Sequential(
            nn.Flatten(),
            nn.Linear(n_pixels, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
            nn.ReLU(),
        )
        # Decoder: latent_dim -> hidden_dim -> 784
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, n_pixels),
            nn.Sigmoid(),  # output pixels in [0, 1]
        )

    def forward(self, x):
        """Encode ``x``, decode the code, and return it in image layout."""
        z = self.encoder(x)
        out = self.decoder(z)
        # The decoder emits flat 784-vectors; restore [batch, 1, 28, 28].
        return out.view(-1, 1, 28, 28)

model = Autoencoder()
model = model.to(device)
criterion = nn.MSELoss()  # pixel-wise squared error between output and input
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# ------------------------------------------------------
# Step 4: Train the autoencoder to reconstruct its input
# ------------------------------------------------------
num_epochs = 3

for epoch in range(num_epochs):
    running_loss = 0.0

    # Labels are ignored: the target of each batch is the batch itself.
    for imgs, _ in train_loader:
        imgs = imgs.to(device)
        batch_size = imgs.size(0)

        optimizer.zero_grad()
        loss = criterion(model(imgs), imgs)
        loss.backward()
        optimizer.step()

        # criterion returns a batch mean; weight it by the batch size so
        # the epoch figure is a true per-image average.
        running_loss += batch_size * loss.item()

    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"Epoch {epoch + 1}: loss {epoch_loss:.4f}")

# ------------------------------------------------------
# Step 5: Visualize original and reconstructed digits
# ------------------------------------------------------
model.eval()

# Take the first eight images of one (shuffled) training batch.
imgs, _ = next(iter(train_loader))
imgs = imgs[:8].to(device)

# Inference only: no gradients are needed for the reconstructions.
with torch.no_grad():
    decoded = model(imgs).cpu()

imgs = imgs.cpu()

plt.figure(figsize=(12, 3))
for i in range(8):
    # Grid slots 1-8 form the top row (originals); slots 9-16 form the
    # bottom row (reconstructions).
    for slot_offset, batch in ((1, imgs), (9, decoded)):
        ax = plt.subplot(2, 8, i + slot_offset)
        plt.imshow(batch[i].squeeze(), cmap="gray")
        ax.axis("off")

plt.suptitle("Original digits (top) vs reconstructed digits (bottom)")
plt.tight_layout()
plt.show()


# Listing 5-2 Using reconstruction error from an autoencoder to detect anomalies
This program reuses the MNIST setup and teaches the network what “normal” looks like by training it only on digits 0 through 7. Later, we evaluate it on the full test set, including digits 8 and 9, and see how the reconstruction error changes.

In [None]:
# ------------------------------------------------------
# Step 1: Imports and Setup
# ------------------------------------------------------
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import numpy as np

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# ------------------------------------------------------------------
# Step 2: Define a simple fully connected autoencoder
# ------------------------------------------------------------------
class Autoencoder(nn.Module):
    """Fully connected autoencoder: 784 -> 64 -> 16 -> 64 -> 784.

    Each input is flattened, compressed to a 16-value bottleneck, expanded
    back to 784 values, and reshaped to ``[batch, 1, 28, 28]``.
    """

    def __init__(self):
        super().__init__()
        n_pixels = 28 * 28

        # Compressing half: ends in the 16-dimensional bottleneck.
        encoder_layers = [
            nn.Flatten(),
            nn.Linear(n_pixels, 64),
            nn.ReLU(),
            nn.Linear(64, 16),
            nn.ReLU(),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        # Expanding half: the final sigmoid keeps every pixel in [0, 1].
        decoder_layers = [
            nn.Linear(16, 64),
            nn.ReLU(),
            nn.Linear(64, n_pixels),
            nn.Sigmoid(),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return the reconstruction of ``x`` in image layout."""
        reconstruction = self.decoder(self.encoder(x))
        return reconstruction.view(-1, 1, 28, 28)

# ------------------------------------------------------------------
# Step 3: Load MNIST train and test sets
# ------------------------------------------------------------------
transform = transforms.ToTensor()

# Both splits are downloaded into ./data on first use.
train_data = datasets.MNIST(root="data", train=True, transform=transform, download=True)
test_data = datasets.MNIST(root="data", train=False, transform=transform, download=True)

# ------------------------------------------------------------------
# Step 4: Keep only digits 0–7 for training (normal data)
# ------------------------------------------------------------------
train_mask = train_data.targets < 8

# The images are read straight from `.data`, which bypasses `transform`,
# so the scaling to [0, 1] and the channel axis are added by hand.
x_train_normal = (
    train_data.data[train_mask]  # [N, 28, 28]
    .float()
    .div(255.0)
    .unsqueeze(1)                # [N, 1, 28, 28]
)

# The loader yields image batches only; there are no labels in this dataset.
normal_loader = DataLoader(x_train_normal, batch_size=256, shuffle=True)

# ------------------------------------------------------------------
# Step 5: Create and train a new autoencoder on normal digits only
# ------------------------------------------------------------------
model = Autoencoder().to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for imgs in normal_loader:
        imgs = imgs.to(device)

        # Forward pass: the reconstruction target is the input itself.
        recon = model(imgs)
        loss = criterion(recon, imgs)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight the batch-mean loss by the batch size so the epoch value
        # is an average over images, not over batches.
        running_loss += len(imgs) * loss.item()

    epoch_loss = running_loss / len(normal_loader.dataset)
    print(f"Epoch {epoch + 1}: loss {epoch_loss:.4f}")

# ------------------------------------------------------------------
# Step 6: Score the entire test set by reconstruction error
# ------------------------------------------------------------------
# Same manual preprocessing as the training images: scale to [0, 1] and
# add a channel axis -> [N, 1, 28, 28].
x_test = (test_data.data.float() / 255.0).unsqueeze(1)
y_test = test_data.targets  # digit labels

test_ds = TensorDataset(x_test, y_test)
# No shuffling, so the collected errors stay aligned with x_test / y_test.
test_loader = DataLoader(test_ds, batch_size=256, shuffle=False)

model.eval()
all_errors, all_labels = [], []

with torch.no_grad():
    for imgs, labels in test_loader:
        imgs = imgs.to(device)
        recon = model(imgs)

        # One score per image: squared error averaged over channel,
        # height, and width.
        squared_diff = (imgs - recon).pow(2)
        err = squared_diff.mean(dim=(1, 2, 3)).cpu()

        all_errors.append(err)
        all_labels.append(labels)

errors = torch.cat(all_errors).numpy()
labels = torch.cat(all_labels).numpy()

# ------------------------------------------------------------------
# Step 7: Compare reconstruction error for normal digits (0-7)
#         and unusual digits (8-9)
# ------------------------------------------------------------------
normal_mask = labels < 8
anom_mask = ~normal_mask  # digits 8 and 9

# The cut-off comes from NORMAL data only (typical anomaly-detection
# practice): 95% of the digits 0-7 fall below it.
threshold = np.percentile(errors[normal_mask], 95)

# 40 shared bins over the full error range keep both histograms comparable.
bins = np.linspace(errors.min(), errors.max(), 41)

plt.figure(figsize=(8, 4))

# Both series are drawn as raw counts. To compare shapes rather than
# counts, pass density=True to plt.hist below and change the y-label to
# "Probability density".
hist_series = (
    (normal_mask, "Digits 0–7 (test set)", {}),
    (anom_mask, "Digits 8–9 (test set)", {"linestyle": "--"}),
)
for mask, series_label, extra_style in hist_series:
    plt.hist(
        errors[mask],
        bins=bins,
        histtype="step",
        label=series_label,
        **extra_style,
    )

plt.axvline(
    threshold,
    color="black",
    linestyle=":",
    label="Threshold (95th pct of digits 0–7)",
)

plt.xlabel("Reconstruction error (MSE per image)")
plt.ylabel("Number of test images (per bin)")
plt.title("Autoencoder anomaly detection on MNIST")
plt.legend()
plt.tight_layout()
plt.show()

# ------------------------------------------------------------------
# Step 8: Confirm what contributes to the right tail
# ------------------------------------------------------------------
tail_mask = errors >= threshold

# (A) Which digits make up the tail?
unique, counts = np.unique(labels[tail_mask], return_counts=True)
tail_counts = {digit: n for digit, n in zip(unique.tolist(), counts.tolist())}
total_tail = int(tail_mask.sum())

print(f"Images in the tail (error ≥ threshold): {total_tail}")
print("Tail breakdown by digit label:")
for d in range(10):
    c = tail_counts.get(d, 0)
    if c <= 0:
        continue  # this digit never crosses the threshold
    print(f"  Digit {d}: {c} ({c / total_tail:.1%})")

# (B) The k worst reconstructions, largest error first
k = 12
idx_sorted = np.argsort(errors)[::-1]
top_idx = idx_sorted[:k]

print("\nTop error examples (label, error):")
for i, digit, err_value in zip(top_idx, labels[top_idx], errors[top_idx]):
    print(f"  {int(digit)}, {err_value:.4f}")

# Show those k images side by side, each titled with its true label.
fig, axs = plt.subplots(1, k, figsize=(1.2 * k, 2))
for ax, i in zip(axs, top_idx):
    picture = x_test[i].squeeze().numpy()
    ax.imshow(picture, cmap="gray")
    ax.set_title(f"{int(labels[i])}")
    ax.axis("off")
plt.suptitle("Highest reconstruction errors (titles are true labels)")
plt.tight_layout()
plt.show()

# Listing 5-3 Using a pretrained YOLOv5 model to detect objects in an image
This example lets you upload an image and count the objects in it using a pre-trained YOLOv5 object-detection model.

In [None]:
# ------------------------------------------------------------------
# Step 1: Set up YOLOv5 and install dependencies
# ------------------------------------------------------------------
!git clone https://github.com/ultralytics/yolov5  # clone
%cd yolov5
%pip install -r requirements.txt

# ------------------------------------------------------------------
# Step 2: Imports
# ------------------------------------------------------------------
import torch
import cv2
from collections import Counter
import matplotlib.pyplot as plt
from PIL import Image
from google.colab import files
import io

# ------------------------------------------------------------------
# Step 3: Upload an image
# ------------------------------------------------------------------
uploaded = files.upload()
# A cancelled upload leaves nothing to iterate; fail with a clear message
# instead of a bare StopIteration from next().
if not uploaded:
    raise RuntimeError("No file was uploaded; run this cell again and choose an image.")
# `uploaded` is keyed by filename; that name is what gets passed to the model.
image_path = next(iter(uploaded))  # take the first uploaded file

# ------------------------------------------------------------------
# Step 4: Load pretrained model
#         This example uses the medium-sized YOLOv5m variant.
# ------------------------------------------------------------------
# source='local' loads from the cloned repository in the current directory.
model = torch.hub.load('.', 'yolov5m', source='local')

# ------------------------------------------------------------------
# Step 5: Run inference
# ------------------------------------------------------------------
results = model(image_path)

# ------------------------------------------------------------------
# Step 6: Count detected classes
# ------------------------------------------------------------------
# xyxy[0] holds the detections for the first (only) image; the 'name'
# column carries the class label of each one.
detections = results.pandas().xyxy[0]
class_names = detections['name'].tolist()
counts = Counter(class_names)

print("\nDetected Objects:")
if not counts:
    # Make an empty result explicit rather than printing a bare heading.
    print("- (none)")
for label, count in counts.items():
    print(f"- {label}: {count}")

# ------------------------------------------------------------------
# Step 7: Display image with bounding boxes
# ------------------------------------------------------------------
# render() is called before reading results.ims so that the displayed
# image is the annotated one.
results.render()
img = Image.fromarray(results.ims[0])
plt.figure(figsize=(10, 6))
plt.imshow(img)
plt.axis('off')
plt.title("Detected Objects")
plt.show()

# Listing 5-4 Video object detection implemented as repeated image detection using YOLOv5
This code runs YOLOv5 on an uploaded video inside a Colab notebook and prints a compact per-second table. With a few additional steps, the same pattern can support traffic monitoring, wildlife observation, or automated inspection on a conveyor belt.

In [None]:
# ----------------------------------------------------------------------
# Step 1: Suppress nonessential warnings (including AMP FutureWarnings)
# ----------------------------------------------------------------------
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# ----------------------------------------------------------------------
# Step 2: Setup and imports
# ----------------------------------------------------------------------
!git clone https://github.com/ultralytics/yolov5
%cd yolov5
%pip install -r requirements.txt

import torch
import cv2
from collections import Counter, defaultdict
from google.colab import files
import math

# ----------------------------------------------------------------------
# Step 3: Load pretrained model
# ----------------------------------------------------------------------
# source='local' loads from the cloned repository in the current directory.
model = torch.hub.load('.', 'yolov5m', source='local')
model.conf = 0.25  # confidence threshold (optional)

# ----------------------------------------------------------------------
# Step 4: Upload video
# ----------------------------------------------------------------------
print("Please upload a video file (e.g., .mp4)")
uploaded = files.upload()
# A cancelled upload leaves nothing to iterate; fail with a clear message
# instead of a bare StopIteration from next().
if not uploaded:
    raise RuntimeError("No file was uploaded; run this cell again and choose a video.")
video_source = next(iter(uploaded.keys()))

cap = cv2.VideoCapture(video_source)
if not cap.isOpened():
    raise RuntimeError(f"Could not open video source: {video_source}")

# ----------------------------------------------------------------------
# Step 5: Determine FPS (fallback if missing)
# ----------------------------------------------------------------------
fps = cap.get(cv2.CAP_PROP_FPS)
# Treat a zero, NaN, or sub-1 frame rate as missing and assume 30 fps so
# the frame-to-second conversion stays meaningful.
if not fps or math.isnan(fps) or fps < 1:
    fps = 30.0

# ----------------------------------------------------------------------
# Step 6: Limit runtime for notebook safety
# ----------------------------------------------------------------------
# Only the first MAX_SECONDS of the video are processed.
MAX_SECONDS = 15
max_frames = int(MAX_SECONDS * fps)

# ----------------------------------------------------------------------
# Step 7: Store per-second MAX counts (non-accumulating)
# ----------------------------------------------------------------------
# Maps second -> Counter of {label: highest count seen in any single frame
# of that second}. Counts are maxima, not running totals.
per_second_max = defaultdict(Counter)

frame_idx = 0  # 0-based
try:
    # Stop at the frame budget or at the end of the video, whichever
    # comes first.
    while frame_idx < max_frames:
        ret, frame_bgr = cap.read()
        if not ret:
            break

        # Elapsed whole seconds at this frame.
        second = int(frame_idx / fps)
        frame_idx += 1

        # OpenCV uses BGR; YOLO expects RGB
        frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)

        # Run inference
        results = model(frame_rgb)
        detections = results.pandas().xyxy[0]
        frame_counts = Counter(detections["name"].tolist())

        # Counter union keeps the per-label maximum of both sides. Indexing
        # per_second_max[second] also creates the entry, so a second with
        # no detections at all still appears in the final table.
        per_second_max[second] |= frame_counts
finally:
    # Release the capture even if reading or inference raises.
    cap.release()

# ----------------------------------------------------------------------
# Step 8: Final per-second table
# ----------------------------------------------------------------------
print("\nPer-second detection summary (max per-frame counts):\n")

# One row per recorded second, in chronological order; labels within a
# row are listed from most to least frequent.
for second in sorted(per_second_max):
    counts = per_second_max[second]
    if counts:
        summary = ", ".join(f"{k}={v}" for k, v in counts.most_common())
    else:
        summary = "(none)"
    print(f"t={second:2d}s | {summary}")


# Listing 5-5 Removing backgrounds with a segmentation CNN
This example uses a pretrained DeepLabV3 network, which preserves the spatial structure of the image and assigns a class label to each pixel. The result is a detailed map that shows which parts of the scene belong to the subject and which parts are background. The code uses this map to remove the background of the image.

In [None]:
# CNN example: Remove the background with DeepLabV3 segmentation
# Upload one image (a child, a person, etc.) and the CNN will
# output a transparent PNG.

# ----------------------------------------------------------------------
# Step 1: Imports and Setup
# ----------------------------------------------------------------------
import io
import numpy as np
import torch
from torchvision import models
from PIL import Image
from google.colab import files
from IPython.display import display

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# ----------------------------------------------------------------------
# Step 2: Load pretrained DeepLabV3 model and its transforms
# ----------------------------------------------------------------------
weights = models.segmentation.DeepLabV3_ResNet50_Weights.DEFAULT
# eval() puts the network in inference mode; nothing is trained here.
model = models.segmentation.deeplabv3_resnet50(weights=weights).to(device).eval()
# Use the preprocessing pipeline that ships with these weights.
preprocess = weights.transforms()

# ----------------------------------------------------------------------
# Step 3: Upload an image
# ----------------------------------------------------------------------
print("Please upload a photo")
uploaded = files.upload()
# A cancelled upload leaves nothing to iterate; fail with a clear message
# instead of a bare StopIteration from next().
if not uploaded:
    raise RuntimeError("No file was uploaded; run this cell again and choose a photo.")
filename = next(iter(uploaded.keys()))

# Decode straight from the uploaded bytes and force three colour channels.
img = Image.open(io.BytesIO(uploaded[filename])).convert("RGB")

# ----------------------------------------------------------------------
# Step 4: Run the image through the CNN
# ----------------------------------------------------------------------
# Add a batch axis: the model expects [batch, channels, H, W].
input_tensor = preprocess(img).unsqueeze(0).to(device)

with torch.no_grad():
    output = model(input_tensor)["out"][0]  # [num_classes, H, W]

# Highest-scoring class per pixel.
labels = output.argmax(0).cpu().numpy()

# ----------------------------------------------------------------------
# Step 5: Build a foreground mask: anything not background (class 0)
# ----------------------------------------------------------------------
foreground_mask = labels != 0

# ----------------------------------------------------------------------
# Step 6: Make the background transparent (RGBA + alpha channel)
# ----------------------------------------------------------------------
# Opaque (255) where the subject is, fully transparent (0) elsewhere.
alpha = np.where(foreground_mask, 255, 0).astype(np.uint8)

# The mask is at the model's working resolution, which can differ from the
# upload. Scale the mask to the photo rather than the photo to the mask,
# so the result keeps the original resolution. Nearest-neighbour keeps the
# mask strictly 0/255.
alpha_full = Image.fromarray(alpha).resize(img.size, Image.NEAREST)

img_np = np.array(img.convert("RGBA"))
img_np[..., 3] = np.array(alpha_full)

# ----------------------------------------------------------------------
# Step 7: Display the result
# ----------------------------------------------------------------------
out_name = "background_removed.png"

# Back to a PIL image so it can be both shown inline and written to disk.
result_pil = Image.fromarray(img_np)
print("Showing result…")
display(result_pil)

# ----------------------------------------------------------------------
# Step 8: Optional: save a PNG with transparency
# ----------------------------------------------------------------------
# PNG is used because it stores the alpha channel.
result_pil.save(out_name)
print("Saved:", out_name)

# Listing 5-6 A simple LSTM model that trains on historical NVDA prices and plots the network’s forecast against the actual market data
This code trains on historical data and evaluates the LSTM on the last part of that same history. It is not forecasting future dates beyond the dataset. It is learning to map a window of recent prices to the next price inside the same historical period. This setup is useful for learning and for checking whether the network has captured basic trends.  

In [None]:
# ----------------------------------------------------------------------
# Step 1: Setup and Imports
# ----------------------------------------------------------------------
!pip install yfinance matplotlib scikit-learn torch --quiet
import yfinance as yf
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

# ----------------------------------------------------------------------
# Step 2: Download and scale the time-series data
# ----------------------------------------------------------------------
def get_timeseries(ticker, start, end, seq_len=20):
    """Download closing prices and cut them into sliding-window samples.

    Each sample pairs ``seq_len`` consecutive scaled prices with the scaled
    price that immediately follows them.

    Args:
        ticker: Symbol passed to ``yf.download``.
        start: Start of the download range.
        end: End of the download range.
        seq_len: Number of past prices in each input window.

    Returns:
        A tuple ``(X, y, scaler, dates)``: ``X`` holds the input windows,
        ``y`` the next-step targets, ``scaler`` is the fitted
        ``MinMaxScaler`` (needed to map predictions back to prices), and
        ``dates`` is the slice of the download's index that lines up with
        ``y``.

    Raises:
        ValueError: If the download has too few rows to form one window.
    """
    data = yf.download(ticker, start=start, end=end)
    prices = data[['Close']].values.astype('float32')

    # At least seq_len + 1 rows are needed: seq_len inputs plus one target.
    if len(prices) <= seq_len:
        raise ValueError(
            f"Need more than {seq_len} closing prices for {ticker!r} "
            f"between {start} and {end}, got {len(prices)}."
        )

    scaler = MinMaxScaler()

    # ----------------------------------------------------------------------
    # Step 2a: In strict forecasting pipelines, scalers are fit
    #          on training data only. Here the scaler is fit on the whole
    #          series for simplicity.
    # ----------------------------------------------------------------------
    scaled = scaler.fit_transform(prices)

    n_samples = len(scaled) - seq_len
    # Build one contiguous array before handing it to torch; converting a
    # Python list of ndarrays is the slow path and triggers a warning.
    X = np.array([scaled[i:i + seq_len] for i in range(n_samples)])
    # The target of window i is the value right after it, so the targets
    # are simply the series from position seq_len onward.
    y = np.array(scaled[seq_len:])
    return torch.tensor(X), torch.tensor(y), scaler, data.index[seq_len:]

# ----------------------------------------------------------------------
# Step 3: Define the LSTM model
# ----------------------------------------------------------------------
class PriceLSTM(nn.Module):
    """Single-layer LSTM followed by a linear head that emits one value.

    Args:
        hidden_size: Dimensionality of the LSTM's internal state.
    """

    def __init__(self, hidden_size=50):
        super().__init__()
        # One input feature per time step (the closing price). With
        # batch_first=True the input is (batch, sequence_length, features).
        self.lstm = nn.LSTM(input_size=1, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Return one prediction per sequence in the batch."""
        # hidden_states: (batch_size, sequence_length, hidden_size)
        hidden_states, _ = self.lstm(x)
        # Only the state after the final time step feeds the output layer.
        last_step = hidden_states[:, -1, :]
        return self.fc(last_step)

# ----------------------------------------------------------------------
# Step 4: Train the model
# ----------------------------------------------------------------------
def train_model(ticker='NVDA', start='2024-01-01', end='2027-12-31', epochs=200):
    """Train a ``PriceLSTM`` on the first 80% of a ticker's windows.

    The split is positional: the first 80% of samples are used for
    training and the remainder is returned untouched for evaluation.
    Every epoch is one full-batch update over the whole training portion.

    Args:
        ticker: Symbol to download.
        start: Start of the download range.
        end: End of the download range.
        epochs: Number of full-batch optimisation steps.

    Returns:
        ``(model, X_test, y_test, scaler, test_dates)`` where the last
        item is the part of the date index matching the held-out samples.
    """
    X, y, scaler, dates = get_timeseries(ticker, start, end)

    cut = int(0.8 * len(X))
    X_train, y_train = X[:cut], y[:cut]
    X_test, y_test = X[cut:], y[cut:]

    model = PriceLSTM()
    loss_fn = nn.MSELoss()
    opt = optim.Adam(model.parameters(), lr=0.001)

    model.train()
    for _ in range(epochs):
        opt.zero_grad()
        prediction = model(X_train)
        loss = loss_fn(prediction, y_train)
        loss.backward()
        opt.step()

    test_dates = dates[cut:]
    return model, X_test, y_test, scaler, test_dates

# ----------------------------------------------------------------------
# Step 5: Evaluate and visualize predictions
# ----------------------------------------------------------------------
def evaluate_model(model, X_test, y_test, scaler, dates, ticker):
    """Plot the model's predictions against the actual held-out values.

    Args:
        model: Trained model to evaluate.
        X_test: Input windows for the held-out period.
        y_test: Scaled targets for the held-out period.
        scaler: Fitted scaler used to map scaled values back to prices.
        dates: X-axis values, one per test sample.
        ticker: Symbol shown in the plot title.
    """
    model.eval()
    with torch.no_grad():
        preds = model(X_test)

    # Undo the scaling so both curves are plotted in price units.
    actual = scaler.inverse_transform(y_test)
    predicted = scaler.inverse_transform(preds.numpy())

    plt.figure(figsize=(12, 6))
    curves = (
        (actual, {"label": "Actual", "color": "black"}),
        (predicted, {"label": "Predicted", "linestyle": "--", "color": "gray"}),
    )
    for values, style in curves:
        plt.plot(dates, values, **style)

    plt.title(f"{ticker} Price Forecast")
    plt.xlabel("Date")
    plt.ylabel("Price")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

# ----------------------------------------------------------------------
# Step 6: Run the experiment
# ----------------------------------------------------------------------
# Six years of NVDA history and a short 50-epoch run; the overrides
# replace train_model's default date range and epoch count.
run_settings = {
    "ticker": "NVDA",
    "start": "2018-01-01",
    "end": "2023-12-31",
    "epochs": 50,
}
model, X_test, y_test, scaler, dates = train_model(**run_settings)

# `dates` already covers only the held-out portion returned above.
evaluate_model(model, X_test, y_test, scaler, dates, "NVDA")

# Listing 5-7: Sequence Modeling Demo: RNN vs. LSTM vs. GRU
Listing 5-7 in the Google Colab notebook presents a compact, end-to-end sequence modeling demonstration comparing three recurrent architectures: a basic RNN, an LSTM, and a GRU. The code is intentionally small and self-contained so that each transformation is visible.

The goal is conceptual clarity, not state-of-the-art language modeling. You will see how text becomes numbers, how sequences become supervised training examples, and how recurrent architectures transform short context into a probability distribution over the next word.


In [None]:
# ---------------------------------------------
# Step 1. Import Required Libraries
# ---------------------------------------------
# PyTorch: Deep learning framework for building neural networks
# torch.nn: Neural network modules (layers, loss functions)
# torch.optim: Optimization algorithms (Adam, SGD, etc.)
# numpy: Numerical computing library
# re: Regular expressions for text preprocessing
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import Counter
import re

# ---------------------------------------------
# Step 2. Set Random Seeds for Reproducibility
# ---------------------------------------------
# Seed the global random number generators of both PyTorch and NumPy with
# a fixed value so that repeated runs draw the same random numbers.
torch.manual_seed(42)
np.random.seed(42)

# ---------------------------------------------
# Step 3. Define Training Corpus
# ---------------------------------------------
# The training data: a small collection of simple sentences, one per line.
# The models learn patterns from these sentences to predict next words.
# Several sentences deliberately share openings ("Mary had a ...",
# "the cat ...", "the bird ...") so the same context has more than one
# continuation.
CORPUS = """
Mary had a little lamb
the cat sat on the mat
the dog sat on the log
the bird flew over the tree
The quick brown fox jumps over the lazy dog
the fish swam in the sea
Mary had a dog and they played together
the cat and the dog played together
the bird sang in the tree
the fish jumped out of the sea
the dog ran to the park
Mary had a bird that flew away
the cat climbed up the tree
the bird built a nest in the tree
"""

# ---------------------------------------------
# Step 4. Define RNN Model Architecture
# ---------------------------------------------
# RNN (Recurrent Neural Network) is the simplest sequential model
# It processes sequences one element at a time, maintaining a hidden state
# Problem: Struggles with long-term dependencies due to vanishing gradients
class RNNModel(nn.Module):
    """Next-word predictor built on a plain (ungated) recurrent layer.

    Args:
        vocab_size: Number of distinct words; also the size of the output.
        embedding_dim: Length of the vector learned for each word.
        hidden_dim: Size of the recurrent hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim

        # Lookup table: word index -> dense vector.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Recurrent layer; batch_first=True means (batch, sequence, features).
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        # Projects a hidden state onto one score per vocabulary word.
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        """Score every vocabulary word as the continuation of ``x``.

        Args:
            x: Batch of word-index sequences.
            hidden: Optional initial hidden state handed to the RNN.

        Returns:
            ``(scores, hidden)``: one score vector per sequence, and the
            final hidden state returned by the RNN.
        """
        word_vectors = self.embedding(x)
        # states holds the hidden state at every time step.
        states, hidden = self.rnn(word_vectors, hidden)
        # Only the last time step is used for the prediction.
        final_state = states[:, -1, :]
        return self.fc(final_state), hidden

# ---------------------------------------------
# Step 5. Define LSTM Model Architecture
# ---------------------------------------------
# LSTM (Long Short-Term Memory) improves upon RNN with memory cells
# It uses gates (input, forget, output) to control information flow
# Advantage: Better at capturing long-term dependencies in sequences
class LSTMModel(nn.Module):
    """Next-word predictor built on an LSTM layer.

    Args:
        vocab_size: Number of distinct words; also the size of the output.
        embedding_dim: Length of the vector learned for each word.
        hidden_dim: Size of the LSTM hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim

        # Lookup table: word index -> dense vector.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Gated recurrent layer that carries a hidden state (h) and a
        # cell state (c).
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        # Projects a hidden state onto one score per vocabulary word.
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        """Score every vocabulary word as the continuation of ``x``.

        Args:
            x: Batch of word-index sequences.
            hidden: Optional initial ``(h, c)`` pair handed to the LSTM.

        Returns:
            ``(scores, hidden)``: one score vector per sequence, and the
            final ``(h, c)`` pair returned by the LSTM.
        """
        word_vectors = self.embedding(x)
        states, hidden = self.lstm(word_vectors, hidden)
        # Only the last time step is used for the prediction.
        final_state = states[:, -1, :]
        return self.fc(final_state), hidden

# ---------------------------------------------
# Step 6. Define GRU Model Architecture
# ---------------------------------------------
# GRU (Gated Recurrent Unit) is a simplified version of LSTM
# Uses only 2 gates (reset and update) instead of 3
# Advantage: Fewer parameters than LSTM, often trains faster with similar performance
class GRUModel(nn.Module):
    """Next-word predictor built on a GRU layer.

    Args:
        vocab_size: Number of distinct words; also the size of the output.
        embedding_dim: Length of the vector learned for each word.
        hidden_dim: Size of the GRU hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim

        # Lookup table: word index -> dense vector.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Gated recurrent layer with a single hidden state (no separate
        # cell state, unlike the LSTM).
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        # Projects a hidden state onto one score per vocabulary word.
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        """Score every vocabulary word as the continuation of ``x``.

        Args:
            x: Batch of word-index sequences.
            hidden: Optional initial hidden state handed to the GRU.

        Returns:
            ``(scores, hidden)``: one score vector per sequence, and the
            final hidden state returned by the GRU.
        """
        word_vectors = self.embedding(x)
        states, hidden = self.gru(word_vectors, hidden)
        # Only the last time step is used for the prediction.
        final_state = states[:, -1, :]
        return self.fc(final_state), hidden

# ---------------------------------------------
# Step 7. Define SequencePredictor Class
# ---------------------------------------------
# This class orchestrates the entire workflow:
# - Data preprocessing and vocabulary building
# - Creating training sequences
# - Training models
# - Making predictions
class SequencePredictor:
    """Prepare next-word training data from a corpus and train/query models.

    The constructor tokenizes the corpus, builds the vocabulary and the
    word <-> index mappings, creates sliding-window training pairs, and
    instantiates one RNN, one LSTM and one GRU model for comparison.

    Args:
        corpus: Raw training text.
        embedding_dim: Size of word embedding vectors.
        hidden_dim: Size of the RNN/LSTM/GRU hidden state.
        seq_length: How many words to use as context for each prediction.
    """

    def __init__(self, corpus, embedding_dim=32, hidden_dim=64, seq_length=3):
        self.seq_length = seq_length
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim

        # Preprocess the corpus into a list of words
        self.words = self._preprocess(corpus)

        # Build vocabulary: unique words sorted alphabetically
        self.vocab = sorted(set(self.words))
        self.vocab_size = len(self.vocab)

        # Create bidirectional mappings between words and indices
        # word2idx: Convert words to numbers for neural network input
        # idx2word: Convert predictions back to words
        self.word2idx = {word: idx for idx, word in enumerate(self.vocab)}
        self.idx2word = {idx: word for word, idx in self.word2idx.items()}

        # Create training data: sequences of words and their next word
        self.X_train, self.y_train = self._create_sequences()

        # Initialize all three model architectures for comparison
        self.rnn_model = RNNModel(self.vocab_size, embedding_dim, hidden_dim)
        self.lstm_model = LSTMModel(self.vocab_size, embedding_dim, hidden_dim)
        self.gru_model = GRUModel(self.vocab_size, embedding_dim, hidden_dim)

    # ---------------------------------------------
    # Step 8. Text Preprocessing Method
    # ---------------------------------------------
    def _preprocess(self, text):
        """Lowercase ``text``, drop non-letter characters, and split into words.

        Returns:
            A list of lowercase tokens containing only the letters a-z.
        """
        # Convert all text to lowercase for consistency
        text = text.lower()

        # Remove everything except letters and whitespace
        # (punctuation and digits are deleted, not replaced by spaces)
        text = re.sub(r'[^a-z\s]', '', text)

        # Split text into individual words
        return text.split()

    # ---------------------------------------------
    # Step 9. Create Training Sequences
    # ---------------------------------------------
    def _create_sequences(self):
        """Create sliding-window (context, next word) training pairs.

        Returns:
            A ``(X, y)`` tuple of ``torch.long`` tensors. Each row of ``X``
            holds ``seq_length`` word indices and the matching entry of ``y``
            is the index of the word that follows them. Both tensors are
            empty when the corpus has ``seq_length`` words or fewer.
        """
        # X: Input sequences (context words)
        # y: Target words (what comes next)
        X, y = [], []

        # Slide a window through the text: seq_length words of context,
        # followed by the single word to predict
        for i in range(len(self.words) - self.seq_length):
            sequence = self.words[i:i + self.seq_length]
            target = self.words[i + self.seq_length]

            # Convert words to indices (neural networks need numbers)
            X.append([self.word2idx[word] for word in sequence])
            y.append(self.word2idx[target])

        # Explicit dtype: an empty list would otherwise become a float tensor,
        # which an embedding layer cannot consume
        return (
            torch.tensor(X, dtype=torch.long),
            torch.tensor(y, dtype=torch.long),
        )

    # ---------------------------------------------
    # Step 10. Model Training Method
    # ---------------------------------------------
    def train_model(self, model, epochs=100, lr=0.01):
        """Train ``model`` on the full set of training sequences.

        Every epoch is one full-batch gradient step over ``X_train``.

        Args:
            model: Module whose forward pass returns ``(logits, hidden)``.
            epochs: Number of full-batch optimization steps.
            lr: Learning rate for the Adam optimizer.

        Raises:
            ValueError: If the corpus was too short to produce any
                training sequences.
        """
        if len(self.X_train) == 0:
            raise ValueError(
                "No training sequences available: the corpus must contain "
                f"more than seq_length ({self.seq_length}) words."
            )

        # CrossEntropyLoss: Standard loss for classification tasks
        # Combines softmax and negative log likelihood
        criterion = nn.CrossEntropyLoss()

        # Adam optimizer: Adaptive learning rate optimization algorithm
        optimizer = optim.Adam(model.parameters(), lr=lr)

        # Set model to training mode (enables dropout, batch norm, etc.)
        model.train()

        for epoch in range(epochs):
            # Zero out gradients from previous iteration
            optimizer.zero_grad()

            # Forward pass: Get model predictions
            output, _ = model(self.X_train)

            # Calculate loss: How wrong are the predictions?
            loss = criterion(output, self.y_train)

            # Backward pass: Calculate gradients
            loss.backward()

            # Update model parameters based on gradients
            optimizer.step()

            # Print progress every 20 epochs
            if (epoch + 1) % 20 == 0:
                print(f"  Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}")

    def _fallback_word(self):
        """Return the in-vocabulary word used for padding and unknown words.

        Prefers 'the'; if the corpus never contained it, falls back to the
        most frequent corpus word so the lookup can never fail.

        Raises:
            ValueError: If the corpus produced no words at all.
        """
        if 'the' in self.word2idx:
            return 'the'
        if not self.words:
            raise ValueError("Cannot predict: the vocabulary is empty.")

        from collections import Counter
        return Counter(self.words).most_common(1)[0][0]

    # ---------------------------------------------
    # Step 11. Prediction Method
    # ---------------------------------------------
    def predict_next_word(self, model, input_phrase):
        """Predict the word that follows ``input_phrase``.

        The phrase is tokenized like the corpus, left-padded to
        ``seq_length`` words if too short and truncated to its last
        ``seq_length`` words if too long. Padding and out-of-vocabulary
        words use a fallback word that is guaranteed to be in the vocabulary.

        Args:
            model: Module whose forward pass returns ``(logits, hidden)``.
            input_phrase: Free-form text typed by the user.

        Returns:
            A ``(predicted_word, confidence)`` tuple, where ``confidence`` is
            the softmax probability of the predicted word.

        Raises:
            ValueError: If the vocabulary is empty.
        """
        # Set model to evaluation mode (disables dropout, etc.)
        model.eval()

        # Clean and tokenize the input phrase
        words = self._preprocess(input_phrase)

        # 'the' when available, otherwise the most common corpus word
        fallback = self._fallback_word()
        fallback_idx = self.word2idx[fallback]

        # Left-pad phrases shorter than the required sequence length
        if len(words) < self.seq_length:
            print(f"  Warning: Input phrase has {len(words)} words, padding to {self.seq_length} words")
            words = [fallback] * (self.seq_length - len(words)) + words

        # If phrase is longer, take only the last seq_length words
        words = words[-self.seq_length:]

        # Convert words to indices, substituting the fallback for unknown words
        indices = []
        for word in words:
            idx = self.word2idx.get(word)
            if idx is None:
                print(f"  Warning: Unknown word '{word}', using '{fallback}' instead")
                idx = fallback_idx
            indices.append(idx)

        # Create input tensor with batch dimension
        input_tensor = torch.tensor([indices], dtype=torch.long)

        # Make prediction without computing gradients (saves memory)
        with torch.no_grad():
            # Get model output (logits)
            output, _ = model(input_tensor)

            # Convert logits to probabilities using softmax
            probabilities = torch.softmax(output, dim=1)

            # Get the word with highest probability
            predicted_idx = torch.argmax(probabilities, dim=1).item()

            # Get confidence score for the prediction
            confidence = probabilities[0][predicted_idx].item()

        # Convert index back to word
        predicted_word = self.idx2word[predicted_idx]
        return predicted_word, confidence

# ---------------------------------------------
# Step 12. Main Function - Program Entry Point
# ---------------------------------------------
def main():
    """Train the RNN, LSTM and GRU predictors, then run an interactive prompt.

    Builds a SequencePredictor from the module-level CORPUS, trains each of
    its three models for 100 epochs, and then repeatedly asks the user for a
    phrase and prints every model's next-word prediction until the user
    types 'quit', 'exit' or 'q'.
    """
    heavy_rule = "=" * 70
    light_rule = "-" * 70

    print(heavy_rule)
    print("PyTorch Sequence Modeling Demo: RNN vs LSTM vs GRU")
    print(heavy_rule)
    print("\nInitializing models and preparing data...")

    # ---------------------------------------------
    # Step 12a. Initialize the Predictor
    # ---------------------------------------------
    # 32-dimensional word vectors, a 64-unit hidden state, and a context
    # window of 3 words for each prediction
    predictor = SequencePredictor(CORPUS, embedding_dim=32, hidden_dim=64, seq_length=3)

    # Summarize the prepared dataset
    print(f"\nVocabulary size: {predictor.vocab_size}")
    print(f"Training sequences: {len(predictor.X_train)}")
    print(f"Vocabulary: {predictor.vocab[:15]}...")

    # Display label paired with the model it names, in reporting order
    contenders = (
        ("RNN", predictor.rnn_model),
        ("LSTM", predictor.lstm_model),
        ("GRU", predictor.gru_model),
    )

    # ---------------------------------------------
    # Step 12b. Train All Three Models
    # ---------------------------------------------
    # Each model gets the same 100-epoch budget
    for label, model in contenders:
        print("\n" + light_rule)
        print(f"Training {label} Model...")
        print(light_rule)
        predictor.train_model(model, epochs=100)

    print("\n" + heavy_rule)
    print("Training Complete! Now you can test the models.")
    print(heavy_rule)

    # ---------------------------------------------
    # Step 12c. Interactive Prediction Loop
    # ---------------------------------------------
    # Keep prompting until the user asks to leave
    while True:
        print("\n" + light_rule)
        user_input = input("\nEnter a phrase (or 'quit' to exit): ").strip()

        # Exit commands end the session
        if user_input.lower() in ['quit', 'exit', 'q']:
            print("\nThank you for using the Sequence Modeling Demo!")
            break

        # Blank input: prompt again
        if not user_input:
            print("Please enter a valid phrase.")
            continue

        print(f"\nInput phrase: '{user_input}'")
        print("\nPredictions:")

        # Query the models in order, remembering each predicted word for
        # the sentence summary below
        completions = []
        for label, model in contenders:
            word, conf = predictor.predict_next_word(model, user_input)
            tag = f"{label}:"
            # Left-align the tag to 5 characters so the quotes line up
            print(f"  {tag:<5} '{word}' (confidence: {conf:.2%})")
            completions.append((tag, word))

        # Show the phrase completed with each model's prediction
        print("\nComplete predictions:")
        for tag, word in completions:
            print(f"  {tag:<5} '{user_input} {word}'")

# ---------------------------------------------
# Step 13. Program Entry Point
# ---------------------------------------------
# This ensures main() only runs when script is executed directly
# (not when imported as a module)
# Note: inside a notebook cell __name__ is also "__main__", so running the
# cell starts the training and the interactive prompt right away.
if __name__ == "__main__":
    main()

# Listing 5-8 A simple interactive example of sentiment analysis
This example uses Hugging Face Transformers to classify the sentiment of a sentence with DistilBERT, a lightweight version of BERT.

In [None]:
# ---------------------------------------------
# Step 1. Setup and Imports
# ---------------------------------------------
!pip install transformers --quiet

from transformers import pipeline

# ---------------------------------------------
# Step 2. Load a sentiment-analysis pipeline
#         (DistilBERT by default)
# ---------------------------------------------
# No model name is passed, so the library chooses its default checkpoint
classifier = pipeline("sentiment-analysis")

# ---------------------------------------------
# Step 3. Print User Instructions
# ---------------------------------------------
print("Enter a message to analyze its sentiment.\nType 'quit' to exit.\n")

# ---------------------------------------------
# Step 4. Interactive Input Loop
# ---------------------------------------------
while True:

    # ---------------------------------------------
    # Step 4a. Read the message, trimming surrounding whitespace
    # ---------------------------------------------
    s = input("Your message: ").strip()

    # ---------------------------------------------
    # Step 4b. Leave the loop on an exit keyword (any letter case)
    # ---------------------------------------------
    if s.lower() in ("quit", "exit"):
        break

    # ---------------------------------------------
    # Step 4c. Blank messages are rejected and the prompt repeats
    # ---------------------------------------------
    if s == "":
        print("Please type a non-empty message.\n")
        continue

    # ---------------------------------------------
    # Step 5. Run Transformer Inference
    # ---------------------------------------------
    # The pipeline returns a list; the single input's result is element 0
    result = classifier(s)[0]

    # ---------------------------------------------
    # Step 6. Extract and Present the Result
    # ---------------------------------------------
    # Score is scaled to a percentage for display
    label, score = result["label"], 100 * result["score"]
    print(f"→ {label} ({score:.1f}%)\n")


# Listing 5-9 A small question-answering example
This simple example lets you type context and then ask a question about it.

In [None]:
# ---------------------------------------------
# Step 1. Import the pipeline helper
# ---------------------------------------------
from transformers import pipeline

# ---------------------------------------------
# Step 2. Load a question-answering pipeline
#         (default pretrained model)
# ---------------------------------------------
qa = pipeline("question-answering")

# ---------------------------------------------
# Step 3. Collect user input
# ---------------------------------------------
# Surrounding whitespace is trimmed so a blank entry is detected below
context = input("Enter context: ").strip()
question = input("Now enter your question: ").strip()

# The question-answering pipeline raises a ValueError when either the
# context or the question is empty, so validate before running inference
if not context or not question:
    print("Both a context and a question are required.")
else:
    # ---------------------------------------------
    # Step 4. Run Transformer inference
    # ---------------------------------------------
    result = qa(question=question, context=context)

    # ---------------------------------------------
    # Step 5. Extract and display the answer
    # ---------------------------------------------
    print(f"Answer: {result['answer']} (score: {result['score']:.2f})")

# Listing 5-10 A compact implementation of the chat-with-my-data pattern
The following code asks the user to upload a PDF file through the browser. Once the file is uploaded, the script extracts the text from each page and stores it as a single context string. The user can then enter questions about the document. Each question, together with the document text, is passed to a pretrained question-answering model. The model uses attention to locate the most relevant span and returns an answer with a confidence score.

In [None]:
# ---------------------------------------------
# Step 1. Install required libraries (Colab)
# ---------------------------------------------
# transformers: pretrained Transformer models + pipelines
# PyMuPDF (fitz): PDF parsing and text extraction
!pip install transformers --quiet
!pip install PyMuPDF --quiet

# ---------------------------------------------
# Step 2. Import dependencies
# ---------------------------------------------
from transformers import pipeline
import fitz  # PyMuPDF
from google.colab import files

# ---------------------------------------------
# Step 3. Upload a PDF from your local machine
# ---------------------------------------------
# Colab returns a dict: {filename: bytes}
uploaded = files.upload()

# A cancelled upload leaves the dict empty; stop with a clear message
# instead of the bare StopIteration that next() would raise
if not uploaded:
    raise RuntimeError("No file was uploaded. Re-run this cell and choose a PDF.")

# Take the first uploaded file (name and raw bytes)
file_name, pdf_data = next(iter(uploaded.items()))

# ---------------------------------------------
# Step 4. Extract text from the uploaded PDF
# ---------------------------------------------
def extract_text_from_pdf_bytes(pdf_bytes):
    """Extract all page text from a PDF stored as raw bytes.

    Args:
        pdf_bytes: The complete contents of a PDF file.

    Returns:
        The text of every page, concatenated in page order. The string is
        empty when no page yields any text.
    """
    # The context manager closes the document even if a page fails to parse
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        # Join once rather than growing a string page by page
        return "".join(page.get_text() for page in doc)

context = extract_text_from_pdf_bytes(pdf_data)

# A PDF with no extractable text (for example, scanned pages) gives the
# model nothing to search; fail now rather than on every question
if not context.strip():
    raise ValueError(
        f"No text could be extracted from '{file_name}'. "
        "Image-only PDFs need OCR before they can be queried."
    )

# ---------------------------------------------
# Step 5. Initialize a question-answering pipeline
# ---------------------------------------------
qa = pipeline("question-answering")

# ---------------------------------------------
# Step 6. Interactive Q&A loop over the PDF text
# ---------------------------------------------
print("\nPDF loaded. Ask me anything about its contents. Type 'quit' to exit.")

while True:
    # -----------------------------------------
    # Step 6a. Read and validate the question
    # -----------------------------------------
    question = input("\nYour question: ").strip()

    if question.lower() == "quit":
        print("Exiting Q&A.")
        break

    if not question:
        print("Please enter a valid question.")
        continue

    # -----------------------------------------
    # Step 6b. Run extractive QA inference
    # -----------------------------------------
    try:
        result = qa(question=question, context=context)

        # -------------------------------------
        # Step 6c. Present the answer span
        # -------------------------------------
        print(f"Answer: {result['answer']} (score: {result['score']:.2f})")

    except Exception as e:
        # -------------------------------------
        # Step 6d. Basic error handling
        # -------------------------------------
        # Deliberately broad: this is the top-level interactive boundary, so
        # report the failure and keep the session alive for the next question
        print(f"Could not answer the question. Reason: {e}")