# Convolutional Neural Network

A convolutional neural network (CNN) is a type of feedforward neural network that learns features via filter (or kernel) optimization. This type of deep learning network has been applied to process and make predictions from many different types of data including text, images and audio. Convolution-based networks are the de-facto standard in deep learning-based approaches to computer vision and image processing, and have only recently been replaced—in some cases—by newer deep learning architectures such as the transformer.
Vanishing gradients and exploding gradients, seen during backpropagation in earlier neural networks, are prevented by the regularization that comes from using shared weights over fewer connections. For example, each neuron in a fully connected layer would require 10,000 weights to process an image of 100 × 100 pixels. With cascaded convolution (or cross-correlation) kernels, however, only 25 weights per convolutional layer are required to process 5 × 5 tiles. Higher-layer features are extracted from wider context windows than lower-layer features.

## Simple Form

In [2]:
# Import Libraries

import numpy as np

In [26]:
# Fake "Image": 1 channel, 4x4 pixels, 2D array

# Toy 4x4 input, one row of pixels per line
image = np.array(
    [
        [1, 2, 0, 1],
        [0, 1, 3, 1],
        [2, 2, 1, 0],
        [1, 0, 1, 3],
    ]
)

# 2x2 identity filter (kernel): [[1, 0],
#                                [0, 1]]
# With it, every output value is top-left pixel + bottom-right pixel
# of the window it covers.
kernel = np.eye(2, dtype=int)

In [27]:
# Convolution without padding or stride

def convolve2d(image, kernel, stride=1, padding=0):
    """Slide `kernel` over a 2D `image` and return the resulting feature map.

    The kernel is applied as-is (it is not flipped), so strictly speaking
    this computes a cross-correlation, which is the operation deep learning
    frameworks call "convolution".

    Args:
        image: 2D array-like input of shape (h, w).
        kernel: 2D array-like filter of shape (kh, kw).
        stride: Step, in pixels, between neighbouring windows (default 1).
        padding: Number of zero pixels added to every side of the image
            before sliding the kernel (default 0, i.e. no padding).

    Returns:
        A float array of shape
        ((h + 2 * padding - kh) // stride + 1,
         (w + 2 * padding - kw) // stride + 1).

    Raises:
        ValueError: If an input is not 2D, `stride` < 1, `padding` < 0, or
            the kernel does not fit inside the (padded) image.
    """
    # Accept nested lists as well as arrays
    image = np.asarray(image)
    kernel = np.asarray(kernel)

    if image.ndim != 2 or kernel.ndim != 2:
        raise ValueError("image and kernel must both be 2D")
    if stride < 1:
        raise ValueError("stride must be at least 1")
    if padding < 0:
        raise ValueError("padding must not be negative")

    if padding:
        # Surround the image with a border of zeros
        image = np.pad(image, padding)

    # Get dimensions of (padded) image and kernel
    h, w = image.shape
    kh, kw = kernel.shape
    if kh > h or kw > w:
        # Fail loudly instead of returning an empty feature map
        raise ValueError("kernel must not be larger than the (padded) image")

    # Calculate output dimensions
    out_h = (h - kh) // stride + 1
    out_w = (w - kw) // stride + 1
    output = np.zeros((out_h, out_w))

    # Perform convolution
    for i in range(out_h):
        for j in range(out_w):
            top, left = i * stride, j * stride
            # Extract the region of interest from the image
            region = image[top:top + kh, left:left + kw]
            # Perform element-wise multiplication and sum the result
            output[i, j] = np.sum(region * kernel)
    return output

In [28]:
# Apply ReLU activation function
# ReLU: Rectified Linear Unit
# It replaces negative values with zero and keeps positive values unchanged
# This is a common activation function in neural networks
# It helps to introduce non-linearity in the model
# and allows the model to learn complex patterns

def relu(x):
    """Rectified Linear Unit: element-wise max(0, x).

    Negative entries become zero; non-negative entries pass through unchanged.
    Works on scalars and on arrays of any shape.
    """
    floor = 0  # lower bound applied to every element
    return np.maximum(floor, x)

In [39]:
# Apply max pooling
# Max pooling is a down-sampling technique used in convolutional neural networks
# It reduces the spatial dimensions of the input while retaining important features
# It slides a fixed-size window (e.g., 2x2) across the input and keeps the maximum value from each window position
# This helps to reduce the number of parameters and computations in the network
# It also helps to make the model more robust to small translations in the input

def max_pooling(feature_map, pool_size=2, stride=None):
    """Down-sample a 2D feature map by taking the maximum of each window.

    Args:
        feature_map: 2D array-like input of shape (h, w).
        pool_size: Side length of the square pooling window (default 2).
        stride: Step between neighbouring windows. Defaults to `pool_size`,
            which gives non-overlapping windows.

    Returns:
        A float array holding one maximum per window. Rows and columns that
        do not fill a complete window are dropped, so a 3x3 input with the
        default 2x2 window produces a 1x1 output. An input smaller than the
        window produces an empty array.

    Raises:
        ValueError: If `feature_map` is not 2D, or `pool_size` or `stride`
            is smaller than 1.
    """
    feature_map = np.asarray(feature_map)
    if feature_map.ndim != 2:
        raise ValueError("feature_map must be 2D")
    if stride is None:
        stride = pool_size
    if pool_size < 1 or stride < 1:
        raise ValueError("pool_size and stride must be at least 1")

    # Get dimensions of the feature map
    h, w = feature_map.shape
    # Number of complete windows along each axis (never negative)
    pooled_h = max(0, (h - pool_size) // stride + 1)
    pooled_w = max(0, (w - pool_size) // stride + 1)
    pooled = np.zeros((pooled_h, pooled_w))

    # Perform max pooling
    for i in range(pooled_h):
        for j in range(pooled_w):
            top, left = i * stride, j * stride
            # Extract the pooling window from the feature map
            region = feature_map[top:top + pool_size, left:left + pool_size]
            # Take the maximum value from the window
            pooled[i, j] = np.max(region)

    return pooled
    
    

In [40]:
# Run through steps
# Stage 1: slide the kernel over the image to build the feature map
conv_out = convolve2d(image, kernel)
# Stage 2: zero out any negative responses
relu_out = relu(conv_out)
# Stage 3: down-sample by keeping the maximum of each pooling window
pooled_out = max_pooling(relu_out)

In [44]:
# Print results

# conv_out, relu_out and pooled_out must already exist (run the previous cell first)
for label, value in (
    ("Convolution Output:\n", conv_out),    # Convolution result
    ("After ReLU:\n", relu_out),            # After ReLU activation
    ("After Max Pooling:\n", pooled_out),   # After max pooling
):
    print(label, value)

Convolution Output:
 [[2. 5. 1.]
 [2. 2. 3.]
 [2. 3. 4.]]
After ReLU:
 [[2. 5. 1.]
 [2. 2. 3.]
 [2. 3. 4.]]
After Max Pooling:
 [[5.]]


## PyTorch Form

In [45]:
# Import Libraries

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

In [46]:
# Transform images to tensor and normalize to [0, 1] range

# NOTE(review): "trnasform" is a misspelling of "transform". The name is kept
# as-is because the dataset-loading cell passes it as `transform=trnasform`;
# rename both places together if this is cleaned up.
trnasform = transforms.ToTensor()


In [47]:
# Download and load MNIST dataset
# MNIST is a dataset of handwritten digits (0-9) commonly used for image classification tasks
# It contains 60,000 training images and 10,000 test images

# Training split of MNIST, stored under ./data (downloaded on first use)
train_data = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=trnasform,
)
# Serve the training set in shuffled mini-batches of 64 samples
train_loader = DataLoader(dataset=train_data, batch_size=64, shuffle=True)

100.0%
100.0%
100.0%
100.0%


In [60]:
# CNN Model Definition

class SimpleCNN(nn.Module):
    """Minimal CNN: one 3x3 convolution, ReLU, 2x2 max pooling, one linear layer.

    Args:
        in_channels: Number of channels in the input images (default 1).
        num_classes: Number of output logits (default 10).
        image_size: Height and width of the square input images
            (default 28, the MNIST image size).

    With the defaults, the layers are identical to a hard-coded
    Conv2d(1, 8, 3) -> MaxPool2d(2, 2) -> Linear(8 * 13 * 13, 10) stack.
    """

    def __init__(self, in_channels=1, num_classes=10, image_size=28):
        super().__init__()
        out_channels = 8
        kernel_size = 3

        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size)  # 3x3 kernel, 8 feature maps
        self.pool = nn.MaxPool2d(2, 2)                                              # 2x2 max pooling with stride 2

        # An unpadded convolution shrinks each side by kernel_size - 1; the
        # pooling then halves it (rounding down): 28 -> 26 -> 13 by default.
        pooled_size = (image_size - kernel_size + 1) // 2
        self.fc1 = nn.Linear(out_channels * pooled_size * pooled_size, num_classes)  # Fully connected layer

    def forward(self, x):
        """Map a batch of images (N, in_channels, H, W) to logits (N, num_classes)."""
        x = self.pool(torch.relu(self.conv1(x)))  # Convolution + ReLU + Max Pooling
        # Flatten everything except the batch dimension. Unlike view(-1, k),
        # this can never fold a wrongly sized image into extra batch rows;
        # a size mismatch surfaces as a shape error in fc1 instead.
        x = torch.flatten(x, start_dim=1)
        x = self.fc1(x)                           # Fully connected layer
        return x

In [61]:
# Initialize the model, loss function, and optimizer

# Cross-entropy loss for multi-class classification (takes raw logits)
criterion = nn.CrossEntropyLoss()

# Network to train
model = SimpleCNN()

# Adam optimizer over all model parameters, learning rate 0.001
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)


In [62]:
# Training Loop

# Single optimisation step on the first mini-batch (demonstration only)
for images, labels in train_loader:
    optimizer.zero_grad()              # Clear gradients left from any earlier step

    outputs = model(images)            # Forward pass: logits for the batch
    loss = criterion(outputs, labels)  # Compare logits with the true labels

    loss.backward()                    # Backpropagate to compute gradients
    optimizer.step()                   # Apply the parameter update
    break                              # Stop after one batch

# Shape of the logits produced by the forward pass
print("Output logits for 64 images:", outputs.shape)

Output logits for 64 images: torch.Size([64, 10])


## PyTorch With GPU

In [64]:
# Import Libraries

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

In [65]:
# Transform images to tensor and normalize to [0, 1] range

# NOTE(review): "trnasform" is a misspelling of "transform". It is left unchanged
# because the MNIST loading cell in this section refers to it as
# `transform=trnasform`; both must be renamed together.
trnasform = transforms.ToTensor()


In [66]:
# Download and load MNIST dataset
# MNIST is a dataset of handwritten digits (0-9) commonly used for image classification tasks
# It contains 60,000 training images and 10,000 test images

# MNIST training split, cached in ./data and fetched if missing
train_data = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=trnasform,
)
# Shuffled mini-batches of 64 images
train_loader = DataLoader(dataset=train_data, batch_size=64, shuffle=True)

In [67]:
# GPU Device Check
# Check if GPU is available and set device accordingly

# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Report which device was selected
print("Using device:", device)

Using device: cuda


In [68]:
# CNN Model Definition

class SimpleCNN(nn.Module):
    """Small CNN classifier: 3x3 convolution -> ReLU -> 2x2 max pooling -> linear.

    Args:
        in_channels: Channels per input image (default 1).
        num_classes: Size of the output logit vector (default 10).
        image_size: Side length of the square input images (default 28,
            the MNIST image size).

    The default arguments reproduce the fixed architecture
    Conv2d(1, 8, 3) -> MaxPool2d(2, 2) -> Linear(8 * 13 * 13, 10).
    """

    def __init__(self, in_channels=1, num_classes=10, image_size=28):
        super().__init__()
        out_channels = 8
        kernel_size = 3

        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size)  # 3x3 kernel, 8 feature maps
        self.pool = nn.MaxPool2d(2, 2)                                              # 2x2 max pooling with stride 2

        # Side length after an unpadded convolution (minus kernel_size - 1)
        # followed by 2x2 pooling (halved, rounded down): 28 -> 26 -> 13.
        pooled_size = (image_size - kernel_size + 1) // 2
        self.fc1 = nn.Linear(out_channels * pooled_size * pooled_size, num_classes)  # Fully connected layer

    def forward(self, x):
        """Return logits of shape (N, num_classes) for a batch (N, in_channels, H, W)."""
        x = self.pool(torch.relu(self.conv1(x)))  # Convolution + ReLU + Max Pooling
        # Keep the batch dimension and flatten the rest. view(-1, k) could
        # silently turn an unexpected image size into a different batch size;
        # here a mismatch is reported by fc1 as a shape error.
        x = torch.flatten(x, start_dim=1)
        x = self.fc1(x)                           # Fully connected layer
        return x

In [69]:
# Initialize the model, loss function, and optimizer

# Loss function for multi-class classification
criterion = nn.CrossEntropyLoss()

# Build the network, then place its parameters on the selected device
model = SimpleCNN()
model = model.to(device)

# Adam optimizer with learning rate 0.001, tracking the (already moved) parameters
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)


In [70]:
# Training Loop

# One optimisation step on the first mini-batch (demonstration only)
for images, labels in train_loader:
    # Inputs and targets must live on the same device as the model
    images = images.to(device)
    labels = labels.to(device)

    optimizer.zero_grad()              # Clear gradients left from any earlier step

    outputs = model(images)            # Forward pass: logits for the batch
    loss = criterion(outputs, labels)  # Compare logits with the true labels

    loss.backward()                    # Backpropagate to compute gradients
    optimizer.step()                   # Apply the parameter update
    break                              # Stop after one batch

# Shape of the logits produced by the forward pass
print("Output logits for 64 images:", outputs.shape)

Output logits for 64 images: torch.Size([64, 10])
