# Chapter 5 Neuron Building Blocks
Before a neural network can recognize a digit, detect objects, understand a sentence, translate text, or complete your email, it needs to *see*, *remember*, *interpret*, and *compress* information. There are four foundational deep learning architectures that make this possible: CNNs, RNNs, Transformers, and Autoencoders. Chapter 5 explains these architectures through examples and visual explanations.


# Listing 5-1 Reconstructing Images from the MNIST Dataset Using an Autoencoder
This listing builds a small autoencoder that learns to compress and reconstruct handwritten digits from the MNIST dataset.

In [None]:
# Autoencoder example: reconstructing MNIST digits in PyTorch

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import matplotlib.pyplot as plt

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# 1. Load MNIST
# The training split is downloaded into ./data on first use.
transform = transforms.ToTensor()
train_data = datasets.MNIST(root="data", train=True, download=True, transform=transform)
train_loader = DataLoader(dataset=train_data, batch_size=256, shuffle=True)

# 2. Define a simple fully connected autoencoder
class Autoencoder(nn.Module):
    """Fully connected autoencoder for 28x28 single-channel images.

    The encoder flattens the input and compresses it 784 -> 64 -> 16
    (the bottleneck); the decoder expands it back 16 -> 64 -> 784 and a
    sigmoid keeps every output pixel in [0, 1].
    """

    def __init__(self):
        super().__init__()
        n_pixels = 28 * 28
        hidden, bottleneck = 64, 16

        # Encoder layers are created before decoder layers so parameter
        # initialisation happens in the same order as the layer listing.
        encoder_layers = [
            nn.Flatten(),
            nn.Linear(n_pixels, hidden),
            nn.ReLU(),
            nn.Linear(hidden, bottleneck),
            nn.ReLU(),
        ]
        decoder_layers = [
            nn.Linear(bottleneck, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_pixels),
            nn.Sigmoid(),  # output pixels in [0, 1]
        ]
        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return the reconstruction of ``x`` reshaped to [batch, 1, 28, 28]."""
        flat_reconstruction = self.decoder(self.encoder(x))
        return flat_reconstruction.view(-1, 1, 28, 28)

model = Autoencoder().to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# 3. Train the autoencoder to reconstruct its input
num_epochs = 3

for epoch in range(num_epochs):
    running_loss = 0.0

    # Labels are ignored: the image itself is the reconstruction target.
    for imgs, _ in train_loader:
        imgs = imgs.to(device)

        outputs = model(imgs)
        loss = criterion(outputs, imgs)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight the batch-mean loss by batch size so the epoch figure is
        # a true per-image average even when the last batch is smaller.
        running_loss += loss.item() * len(imgs)

    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"Epoch {epoch + 1}: loss {epoch_loss:.4f}")

# 4. Visualize original and reconstructed digits
model.eval()
imgs, _ = next(iter(train_loader))
imgs = imgs[:8].to(device)

# Inference only: no gradient tracking, and results go back to the CPU for plotting.
with torch.no_grad():
    decoded = model(imgs).cpu()

imgs = imgs.cpu()

plt.figure(figsize=(12, 3))
# Row 0 (subplots 1-8) shows the originals, row 1 (subplots 9-16) the reconstructions.
for row, batch in enumerate((imgs, decoded)):
    for i in range(8):
        ax = plt.subplot(2, 8, row * 8 + i + 1)
        ax.imshow(batch[i].squeeze(), cmap="gray")
        ax.axis("off")

plt.suptitle("Original digits (top) vs reconstructed digits (bottom)")
plt.tight_layout()
plt.show()


# Listing 5-2 Using reconstruction error from an autoencoder to detect anomalies
This program reuses the MNIST setup and teaches the network what “normal” looks like by training it only on digits 0 through 7. Later, we evaluate it on the full test set, including digits 8 and 9, and see how the reconstruction error changes.

In [None]:
# Using reconstruction error from an autoencoder to detect anomalies

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import numpy as np

# 0. Device
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# 1. Define a simple fully connected autoencoder

class Autoencoder(nn.Module):
    """Small dense autoencoder: 784 -> 64 -> 16 -> 64 -> 784.

    Inputs are flattened by the encoder; outputs pass through a sigmoid and
    are reshaped back to [batch, 1, 28, 28].
    """

    def __init__(self):
        super().__init__()
        input_dim = 28 * 28

        # Encoder: 784 -> 64 -> 16 (bottleneck)
        self.encoder = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=input_dim, out_features=64),
            nn.ReLU(),
            nn.Linear(in_features=64, out_features=16),
            nn.ReLU(),
        )
        # Decoder: 16 -> 64 -> 784
        self.decoder = nn.Sequential(
            nn.Linear(in_features=16, out_features=64),
            nn.ReLU(),
            nn.Linear(in_features=64, out_features=input_dim),
            nn.Sigmoid(),  # output pixels in [0, 1]
        )

    def forward(self, x):
        """Encode then decode ``x`` and return it in image layout."""
        code = self.encoder(x)
        flat = self.decoder(code)
        return flat.view(-1, 1, 28, 28)

# 2. Load MNIST train and test sets

transform = transforms.ToTensor()

train_data = datasets.MNIST(root="data", train=True, transform=transform, download=True)
test_data = datasets.MNIST(root="data", train=False, transform=transform, download=True)

# 3. Keep only digits 0–7 for training (normal data)

train_mask = train_data.targets < 8
# The pixel tensor is read directly from the dataset object, so it is
# scaled to [0, 1] by hand and given an explicit channel dimension.
x_train_normal = (train_data.data[train_mask].float() / 255.0).unsqueeze(1)  # [N, 1, 28, 28]

normal_loader = DataLoader(dataset=x_train_normal, batch_size=256, shuffle=True)

# 4. Create and train a new autoencoder on normal digits only

model = Autoencoder().to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    # The loader wraps a bare tensor, so each batch is just the images (no labels).
    for imgs in normal_loader:
        imgs = imgs.to(device)

        recon = model(imgs)
        loss = criterion(recon, imgs)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight the batch-mean loss by batch size to get a per-image epoch average.
        running_loss += loss.item() * len(imgs)

    epoch_loss = running_loss / len(normal_loader.dataset)
    print(f"Epoch {epoch + 1}: loss {epoch_loss:.4f}")

# 5. Score the entire test set by reconstruction error

x_test = (test_data.data.float() / 255.0).unsqueeze(1)   # [N, 1, 28, 28]
y_test = test_data.targets                               # digit labels

test_ds = TensorDataset(x_test, y_test)
test_loader = DataLoader(dataset=test_ds, batch_size=256, shuffle=False)

model.eval()
all_errors, all_labels = [], []

with torch.no_grad():
    for imgs, labels in test_loader:
        imgs = imgs.to(device)
        recon = model(imgs)

        # One score per image: squared error averaged over channel, height and width.
        squared_diff = (imgs - recon).pow(2)
        err = squared_diff.mean(dim=(1, 2, 3)).cpu()

        all_errors.append(err)
        all_labels.append(labels)

# shuffle=False above keeps errors and labels in the same (dataset) order.
errors = torch.cat(all_errors).numpy()
labels = torch.cat(all_labels).numpy()

# 6. Compare reconstruction error for normal digits (0–7) and unusual digits (8–9)

normal_mask = labels < 8
anom_mask = ~normal_mask  # everything that is not 0–7, i.e. digits 8 and 9

# Threshold at the 95th percentile of all test errors (normal and unusual together)
threshold = np.percentile(errors, q=95)

plt.figure(figsize=(8, 4))
for mask, series_label, extra_style in (
    (normal_mask, "Digits 0–7", {}),
    (anom_mask, "Digits 8–9", {"linestyle": "--"}),
):
    plt.hist(errors[mask], bins=40, histtype="step", label=series_label, **extra_style)
plt.axvline(threshold, color="black", linestyle=":", label="Threshold")

plt.xlabel("Reconstruction error")
plt.ylabel("Count")
plt.title("Autoencoder anomaly detection on MNIST")
plt.legend()
plt.tight_layout()
plt.show()


# Listing 5-3 Using a pretrained YOLOv5 model to detect objects in an image
This example lets you upload an image and count the objects in it using a pre-trained YOLOv5 object-detection model.

In [None]:
# Set up YOLOv5 and install dependencies
# The model is later loaded with torch.hub.load('.', ..., source='local'),
# so the notebook has to change into the cloned repository first.
!git clone https://github.com/ultralytics/yolov5  # clone
%cd yolov5
%pip install -r requirements.txt

# Imports
import torch
import cv2
from collections import Counter
import matplotlib.pyplot as plt
from PIL import Image
from google.colab import files
import io

# Upload an image
uploaded = files.upload()
image_path = next(iter(uploaded.keys()))  # take the first uploaded file

# Load pretrained model from the local clone (the current directory)
model = torch.hub.load('.', 'yolov5m', source='local')

# Run inference
results = model(image_path)

# Count detected classes: one row per detection, 'name' holds the class label
detections = results.pandas().xyxy[0]
class_names = detections['name'].tolist()
counts = Counter()
counts.update(class_names)

print("\nDetected Objects:")
for label in counts:
    print(f"- {label}: {counts[label]}")

# Display image with bounding boxes
results.render()  # draws the boxes onto results.ims in place
img = Image.fromarray(results.ims[0])

plt.figure(figsize=(10, 6))
plt.imshow(img)
plt.title("Detected Objects")
plt.axis('off')
plt.show()


# Listing 5-4 Video object detection implemented as repeated image detection using YOLOv5
This code runs YOLOv5 on an uploaded video inside a Colab notebook and prints a compact per-second table. With a few additional steps, the same pattern can support traffic monitoring, wildlife observation, or automated inspection on a conveyor belt.

In [None]:
# --- Suppress nonessential warnings (including AMP FutureWarnings) ---
# Installed before anything else runs so the per-frame inference output stays readable.
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# --- Setup ---
# The model is later loaded with torch.hub.load('.', ..., source='local'),
# so the notebook has to change into the cloned repository first.
!git clone https://github.com/ultralytics/yolov5
%cd yolov5
%pip install -r requirements.txt

import torch
import cv2
from collections import Counter, defaultdict
from google.colab import files
import math

# --- Load pretrained model ---
model = torch.hub.load('.', 'yolov5m', source='local')
model.conf = 0.25  # confidence threshold (optional)

# --- Upload video ---
print("Please upload a video file (e.g., .mp4)")
uploaded = files.upload()
video_source = next(iter(uploaded))  # name of the first uploaded file

cap = cv2.VideoCapture(video_source)
if not cap.isOpened():
    raise RuntimeError(f"Could not open video source: {video_source}")

# --- Determine FPS (fallback if missing) ---
# A zero, NaN or sub-1 value is treated as unusable and replaced with 30 fps.
fps = cap.get(cv2.CAP_PROP_FPS)
fps_is_usable = bool(fps) and not math.isnan(fps) and fps >= 1
if not fps_is_usable:
    fps = 30.0

# --- Limit runtime for notebook safety ---
MAX_SECONDS = 15
max_frames = int(MAX_SECONDS * fps)

# Store per-second MAX counts (non-accumulating):
# per_second_max[second][label] is the largest number of `label` objects
# seen in any single frame that falls inside that second.
per_second_max = defaultdict(Counter)

frame_idx = 0  # 0-based
try:
    while frame_idx < max_frames:
        ret, frame_bgr = cap.read()
        if not ret:
            break  # end of video (or a read error)

        second = int(frame_idx / fps)
        frame_idx += 1

        # OpenCV uses BGR; YOLO expects RGB
        frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)

        # Run inference
        results = model(frame_rgb)
        detections = results.pandas().xyxy[0]
        frame_counts = Counter(detections["name"].tolist())

        # Look the second up unconditionally so it is registered even when
        # the frame has no detections; otherwise empty seconds would be
        # missing from the final table instead of showing "(none)".
        second_counts = per_second_max[second]

        # Keep the maximum count seen in any frame during this second
        for label, count in frame_counts.items():
            second_counts[label] = max(second_counts[label], count)
finally:
    # Release the capture handle even if inference fails part-way through.
    cap.release()

# --- Final per-second table ---
print("\nPer-second detection summary (max per-frame counts):\n")

# One row per recorded second, in chronological order; labels are listed
# from most to least frequent.
for second in sorted(per_second_max):
    counts = per_second_max[second]
    if counts:
        summary = ", ".join(f"{k}={v}" for k, v in counts.most_common())
    else:
        summary = "(none)"
    print(f"t={second:2d}s | {summary}")


# Listing 5-5 Removing backgrounds with a segmentation CNN
This example uses a pretrained DeepLabV3 network, which preserves the spatial structure of the image and assigns a class label to each pixel. The result is a detailed map that shows which parts of the scene belong to the subject and which parts are background. The code uses this map to remove the background of the image.

In [None]:
# CNN example: Remove the background with DeepLabV3 segmentation
# Upload one image (a child, a person, etc.) and the CNN will output a transparent PNG.

import io
import numpy as np
import torch
from torchvision import models
from PIL import Image
from google.colab import files
from IPython.display import display

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# 1. Load pretrained DeepLabV3 model and its transforms
# The preprocessing comes from the same weights object as the model.
weights = models.segmentation.DeepLabV3_ResNet50_Weights.DEFAULT
model = models.segmentation.deeplabv3_resnet50(weights=weights)
model = model.to(device).eval()
preprocess = weights.transforms()

# 2. Upload an image
print("Please upload a photo")
uploaded = files.upload()
filename = next(iter(uploaded))  # name of the first uploaded file

# Decode from the uploaded bytes and force three colour channels.
image_bytes = io.BytesIO(uploaded[filename])
img = Image.open(image_bytes).convert("RGB")

# 3. Run the image through the CNN
input_tensor = preprocess(img).unsqueeze(0).to(device)  # add a batch dimension

with torch.no_grad():
    output = model(input_tensor)["out"][0]  # [num_classes, H, W]

# Highest-scoring class per pixel
labels = torch.argmax(output, dim=0).cpu().numpy()

# 4. Build a foreground mask: anything not background (class 0)
foreground_mask = labels != 0

# 5. Make the background transparent (RGBA + alpha channel)
# The photo is resized to the label map's size so mask and pixels line up.
mask_height, mask_width = labels.shape[0], labels.shape[1]
img_resized = img.resize((mask_width, mask_height)).convert("RGBA")
img_np = np.array(img_resized, copy=True)

# Opaque (255) where a subject was found, fully transparent (0) elsewhere.
alpha = (foreground_mask * 255).astype(np.uint8)
img_np[..., 3] = alpha

# 6. Display the result
result_pil = Image.fromarray(img_np)
print("Showing result…")
display(result_pil)

# Optional: save a PNG with transparency
out_name = "background_removed.png"
result_pil.save(out_name)
print("Saved:", out_name)


# Listing 5-6 A simple LSTM model that trains on historical NVDA prices and plots the network’s forecast against the actual market data
This code trains on historical data and evaluates the LSTM on the last part of that same history. It is not forecasting future dates beyond the dataset. It is learning to map a window of recent prices to the next price inside the same historical period. This setup is useful for learning and for checking whether the network has captured basic trends.  

In [None]:
# pip install yfinance matplotlib scikit-learn torch --quiet
import yfinance as yf
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

def get_timeseries(ticker, start, end, seq_len=20):
    """Download closing prices and turn them into sliding-window samples.

    Args:
        ticker: Symbol passed to ``yf.download``.
        start: Start date passed to ``yf.download``.
        end: End date passed to ``yf.download``.
        seq_len: Number of consecutive scaled prices in each input window.

    Returns:
        A tuple ``(X, y, scaler, dates)``:
        ``X`` holds one window of ``seq_len`` scaled prices per sample,
        ``y`` holds the scaled price that immediately follows each window,
        ``scaler`` is the fitted ``MinMaxScaler`` (needed to convert
        predictions back to prices), and ``dates`` are the index entries
        of the ``y`` targets.

    Raises:
        ValueError: If the download yields ``seq_len`` or fewer prices, so
            not even one window/target pair can be formed.
    """
    data = yf.download(ticker, start=start, end=end)
    prices = data[['Close']].values.astype('float32')

    # Fail early with a readable message instead of an opaque error further
    # down (empty download, bad ticker, or a date range that is too short).
    if len(prices) <= seq_len:
        raise ValueError(
            f"Need more than {seq_len} closing prices to build a window, "
            f"got {len(prices)} for {ticker!r} between {start} and {end}."
        )

    # NOTE: the scaler is fit on the whole series, including any part the
    # caller later holds out for evaluation.
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(prices)

    # Stack the windows into one NumPy array first: building a tensor from a
    # Python list of arrays is slow and makes PyTorch emit a warning.
    n_windows = len(scaled) - seq_len
    X = np.stack([scaled[i:i + seq_len] for i in range(n_windows)])
    y = scaled[seq_len:]  # the value right after each window
    return torch.tensor(X), torch.tensor(y), scaler, data.index[seq_len:]

class PriceLSTM(nn.Module):
    """Single-feature LSTM followed by a linear layer that outputs one value.

    Expects batch-first input, i.e. ``[batch, seq_len, 1]``, and returns one
    prediction per sequence computed from the last time step.
    """

    def __init__(self, hidden_size=50):
        super().__init__()
        self.lstm = nn.LSTM(input_size=1, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Return the prediction made from the LSTM output at the final step."""
        sequence_out, _state = self.lstm(x)
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

def train_model(ticker='NVDA', start='2024-01-01', end='2027-12-31', epochs=200):
    """Train a ``PriceLSTM`` on the first 80% of the windows for ``ticker``.

    The samples are kept in chronological order, so the held-out 20% is the
    most recent part of the series. Training uses the whole training set as
    a single batch for ``epochs`` steps.

    Returns:
        ``(model, X_test, y_test, scaler, test_dates)``.
    """
    X, y, scaler, dates = get_timeseries(ticker, start, end)

    # Chronological split: no shuffling.
    cut = int(0.8 * len(X))
    X_train, y_train = X[:cut], y[:cut]
    X_test, y_test = X[cut:], y[cut:]

    net = PriceLSTM()
    loss_fn = nn.MSELoss()
    opt = optim.Adam(net.parameters(), lr=0.001)

    for _ in range(epochs):
        opt.zero_grad()
        predictions = net(X_train)
        batch_loss = loss_fn(predictions, y_train)
        batch_loss.backward()
        opt.step()

    return net, X_test, y_test, scaler, dates[cut:]

def evaluate_model(model, X_test, y_test, scaler, dates, ticker):
    """Plot the model's predictions for ``X_test`` against the actual values.

    Both series are passed through ``scaler.inverse_transform`` before
    plotting so the y-axis is in the scaler's original units.
    """
    model.eval()
    with torch.no_grad():
        preds = model(X_test)

    predicted = scaler.inverse_transform(preds.numpy())
    actual = scaler.inverse_transform(y_test)

    plt.figure(figsize=(12, 6))
    for series, name, style in (
        (actual, "Actual", {"color": "black"}),
        (predicted, "Predicted", {"linestyle": "--", "color": "gray"}),
    ):
        plt.plot(dates, series, label=name, **style)
    plt.title(f"{ticker} Price Forecast")
    plt.xlabel("Date")
    plt.ylabel("Price")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

# Example run
# Train on 2018–2023 history, then plot the held-out tail of that same period.
model, X_test, y_test, scaler, dates = train_model(
    "NVDA", "2018-01-01", "2023-12-31", epochs=50
)
evaluate_model(model, X_test, y_test, scaler, dates, ticker="NVDA")

# Listing 5-7 A simple character-level prediction model
The following example shows a character-level RNN that learns to predict the next letter in a short phrase typed by the user. The model trains live in your notebook and begins to recognize patterns within seconds.


In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import ipywidgets as widgets
from IPython.display import display, clear_output

# ---- 1. Model definition in PyTorch ----
class SimpleCharRNN(nn.Module):
    """Embedding -> RNN -> linear head that scores the next character.

    Input is a ``(batch, seq_len)`` tensor of character indices; output is a
    ``(batch, vocab_size)`` tensor of logits computed from the last time step.
    """

    def __init__(self, vocab_size, embed_dim=8, hidden_size=32):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.rnn = nn.RNN(input_size=embed_dim, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Return next-character logits for each sequence in ``x``."""
        vectors = self.embedding(x)            # (batch, seq_len, embed_dim)
        states, _final = self.rnn(vectors)     # (batch, seq_len, hidden_size)
        final_state = states[:, -1, :]         # (batch, hidden_size)
        return self.fc(final_state)            # (batch, vocab_size)

def build_and_predict_rnn(user_text):
    """Train a tiny next-character model on ``user_text`` and print its guesses.

    Every adjacent character pair in the text becomes one training sample
    (input: one character, target: the character that follows it). After
    training, the predicted successor of each character except the last is
    printed as ``<char> → <predicted char>``.

    Args:
        user_text: The phrase to learn from.

    Raises:
        ValueError: If ``user_text`` has fewer than 2 characters, because no
            input/target pair can be formed.
    """
    if len(user_text) < 2:
        raise ValueError(
            "user_text must contain at least 2 characters to form an input/target pair."
        )

    # ---- 2. Preprocessing ----
    # `chars` doubles as the index -> character table; c2i is its inverse.
    chars = sorted(set(user_text))
    c2i = {c: i for i, c in enumerate(chars)}
    vocab_size = len(chars)

    # Convert characters to integer indices
    seq = [c2i[c] for c in user_text]
    # Inputs: all but last char
    X = torch.tensor(seq[:-1], dtype=torch.long).unsqueeze(1)   # (batch, seq_len=1)
    # Targets: next char index
    y = torch.tensor(seq[1:], dtype=torch.long)                 # (batch,)

    # ---- 3. Model setup ----
    model = SimpleCharRNN(vocab_size=vocab_size, embed_dim=8, hidden_size=32)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    # ---- 4. Training ----
    model.train()
    epochs = 300
    for _ in range(epochs):
        optimizer.zero_grad()
        logits = model(X)            # (batch, vocab_size)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()

    # ---- 5. Predictions ----
    model.eval()
    print("\nCharacter Predictions:")
    with torch.no_grad():
        # X already holds exactly the characters to predict from, so a single
        # batched pass replaces one forward call per character. argmax on the
        # logits picks the same index a softmax would.
        next_ids = model(X).argmax(dim=-1).tolist()
    for ch, next_idx in zip(user_text[:-1], next_ids):
        print(f"{ch} → {chars[next_idx]}")

# ---- 6. Interactive Widget Interface ----
text_input = widgets.Text(description='Text:', value="Hello, RNN World!")
run_button = widgets.Button(description='Predict')
output = widgets.Output()

def on_run_clicked(b):
    """Button callback: retrain on the current text and show the predictions.

    Everything printed here is captured by the ``output`` widget, which is
    cleared first so results from earlier clicks do not pile up.
    """
    with output:
        clear_output()
        user_text = text_input.value.strip()
        if len(user_text) >= 3:
            build_and_predict_rnn(user_text)
        else:
            print("Please enter at least 3 characters.")

run_button.on_click(on_run_clicked)
display(widgets.VBox(children=[text_input, run_button, output]))


# Listing 5-8 A simple interactive example of sentiment analysis
This example uses Hugging Face Transformers to classify the sentiment of a sentence with DistilBERT, a lightweight version of BERT.

In [None]:
# Analyzing Sentiment with a Transformer (interactive)
!pip install transformers --quiet

from transformers import pipeline

# Load a sentiment-analysis pipeline (DistilBERT).
# The checkpoint is named explicitly instead of relying on the library's
# implicit default: that avoids the "No model was supplied" warning and
# keeps results stable if the default ever changes.
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

print("Enter a message to analyze its sentiment.")
print("Type 'quit' to exit.\n")

while True:
    s = input("Your message: ").strip()
    if s.lower() in {"quit", "exit"}:
        break
    if not s:
        print("Please type a non-empty message.\n")
        continue
    # The pipeline returns a list with one result dict per input string.
    result = classifier(s)[0]
    label = result["label"]
    score = result["score"] * 100  # show the confidence as a percentage
    print(f"→ {label} ({score:.1f}%)\n")


# Listing 5-9 A small question-answering example
This simple example lets you type context and then ask a question about it.

In [None]:
from transformers import pipeline

# The checkpoint is named explicitly instead of relying on the library's
# implicit default, which avoids the "No model was supplied" warning.
qa = pipeline(
    "question-answering",
    model="distilbert/distilbert-base-cased-distilled-squad",
)

context = input("Enter context: ").strip()
question = input("Now enter your question: ").strip()

# The pipeline rejects empty strings with an error, so check up front and
# give a readable message instead.
if not context or not question:
    print("Please enter both a non-empty context and a non-empty question.")
else:
    result = qa(question=question, context=context)
    print(f"Answer: {result['answer']} (score: {result['score']:.2f})")


# Listing 5-10 A compact implementation of the chat-with-my-data pattern
The following code asks the user to upload a PDF file through the browser. Once the file is uploaded, the script extracts the text from each page and stores it as a single context string. The user can then enter questions about the document. Each question, together with the document text, is passed to a pretrained question-answering model. The model uses attention to locate the most relevant span and returns an answer with a confidence score.

In [None]:
# Install required libraries
!pip install transformers
!pip install PyMuPDF

from transformers import pipeline
import fitz  # PyMuPDF
from IPython.display import display
from google.colab import files
import io

# Ask user to upload a PDF file
uploaded = files.upload()

# Take the first uploaded file: its name and its raw bytes
file_name, pdf_data = next(iter(uploaded.items()))

# Extract text from the PDF
def extract_text_from_pdf_bytes(pdf_bytes):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_bytes: The raw bytes of a PDF file.

    Returns:
        One string containing the text of all pages.
    """
    # The context manager closes the document even if a page fails to
    # extract; join avoids rebuilding the string once per page.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)

context = extract_text_from_pdf_bytes(pdf_data)

# A PDF with no extractable text (for example, scanned pages) would make
# every question fail inside the pipeline, so stop here with a clear reason.
if not context.strip():
    raise ValueError(
        f"No extractable text found in {file_name!r}; "
        "scanned or image-only PDFs need OCR before they can be queried."
    )

# Initialize the QA pipeline
# The checkpoint is named explicitly instead of relying on the library's
# implicit default, which avoids the "No model was supplied" warning.
qa = pipeline(
    "question-answering",
    model="distilbert/distilbert-base-cased-distilled-squad",
)

# Interactive Q&A loop
print("\nPDF loaded. Ask me anything about its contents. Type 'quit' to exit.")

while True:
    question = input("\nYour question: ").strip()
    if question.lower() == "quit":
        print("Exiting Q&A.")
        break
    if not question:
        print("Please enter a valid question.")
        continue

    # Broad catch on purpose: this is the top level of an interactive loop,
    # and one failed question should not end the session.
    try:
        result = qa(question=question, context=context)
    except Exception as e:
        print(f"Could not answer the question. Reason: {e}")
    else:
        print(f"Answer: {result['answer']} (score: {result['score']:.2f})")
