## Deep Learning (AI5100): Assignment-3

Name - ARIF KHAN PATHAN (CS23MTECH11024)

## 1) Self-Attention for Object Recognition with CNNs

In [None]:
# Importing required packages
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

In [None]:
# self-attention mechanism
class SelfAttention(nn.Module):
    """Self-attention over the spatial positions of a 2-D feature map.

    Every spatial location attends to every other location. Queries and keys
    come from 1x1 convolutions that shrink the channel count to
    ``in_channels // 8`` (but never below 1); values keep all ``in_channels``
    channels. The attended features are scaled by the learnable scalar
    ``gamma`` and added back to the input. ``gamma`` starts at zero, so a
    freshly constructed module returns its input unchanged.

    Args:
        in_channels: Number of channels of the input feature map.
    """

    def __init__(self, in_channels):
        super(SelfAttention, self).__init__()
        # in_channels // 8 is 0 for fewer than 8 channels, which would leave
        # the query/key projections without any output channel; keep >= 1.
        reduced_channels = max(1, in_channels // 8)

        # Query convolutional layer
        self.query_conv = nn.Conv2d(in_channels, reduced_channels, kernel_size=1)

        # Key convolutional layer
        self.key_conv = nn.Conv2d(in_channels, reduced_channels, kernel_size=1)

        # Value convolutional layer
        self.value_conv = nn.Conv2d(in_channels, in_channels, kernel_size=1)

        # Scaling factor gamma, initialised to zero
        self.gamma = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        """Apply spatial self-attention with a residual connection.

        Args:
            x: Tensor of shape (batch, in_channels, height, width).

        Returns:
            Tensor with the same shape as ``x``: ``gamma * attended + x``.
        """
        batch_size, _, height, width = x.size()
        num_positions = height * width

        # (batch, positions, reduced_channels)
        queries = self.query_conv(x).view(batch_size, -1, num_positions).permute(0, 2, 1)
        # (batch, reduced_channels, positions)
        keys = self.key_conv(x).view(batch_size, -1, num_positions)

        # energy[b, i, j] scores query position i against key position j;
        # softmax over j makes each row a distribution over key positions.
        energy = torch.bmm(queries, keys)
        attention = F.softmax(energy, dim=-1)

        # (batch, in_channels, positions)
        values = self.value_conv(x).view(batch_size, -1, num_positions)

        # Weighted sum of values per query position, then back to image layout
        attended = torch.bmm(values, attention.permute(0, 2, 1))
        return self.gamma * attended.view(x.size()) + x


In [None]:
# Define the CNN architecture with self-attention
class CNNWithSelfAttention(nn.Module):
    """Five-layer CNN for 3-channel images with self-attention between convolutions.

    Four 3x3 convolutions (3 -> 16 -> 32 -> 64 -> 128 channels) are each
    followed by ReLU and a SelfAttention block. A final 3x3 convolution maps
    to 10 channels, which are globally average-pooled into one score per class.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor: each conv keeps the spatial size (padding=1)
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
        self.sa1 = SelfAttention(16)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.sa2 = SelfAttention(32)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.sa3 = SelfAttention(64)
        self.conv4 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.sa4 = SelfAttention(128)

        # Classifier: one output channel per class
        self.conv5 = nn.Conv2d(128, 10, kernel_size=3, padding=1)

        # Global average pooling collapses every channel map to a single value
        self.global_avg_pool = nn.AdaptiveAvgPool2d(1)

    def forward(self, x):
        """Return per-class log-probabilities of shape (batch, 10)."""
        stages = (
            (self.conv1, self.sa1),
            (self.conv2, self.sa2),
            (self.conv3, self.sa3),
            (self.conv4, self.sa4),
        )
        for conv, attend in stages:
            x = attend(F.relu(conv(x)))

        # Class maps -> one score per class -> log-probabilities
        pooled = self.global_avg_pool(self.conv5(x))
        scores = pooled.flatten(1)
        return F.log_softmax(scores, dim=1)


In [None]:
# Load CIFAR-10 dataset
# Preprocessing: convert images to tensors, then normalise every channel
# with mean 0.5 and standard deviation 0.5.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

# Training split (downloaded to ./data on first use)
train_dataset = datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)
# Held-out test split
test_dataset = datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transform,
)

# Mini-batch iterators of 64 samples, reshuffled every epoch
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=True)


In [None]:
# Initialize the model, optimizer, and loss function
# Training configuration
num_epochs = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model, Adam optimizer and cross-entropy loss
model = CNNWithSelfAttention()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Run the model on the GPU when one is available
model.to(device)

# Training loop: one full pass over train_loader per epoch
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for data, targets in train_loader:
        data = data.to(device)
        targets = targets.to(device)

        # Standard step: clear gradients, forward, backward, update
        optimizer.zero_grad()
        loss = criterion(model(data), targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    # Report the mean batch loss of this epoch
    print(f'Epoch {epoch+1}, Loss: {running_loss/len(train_loader):.4f}')

# Persist the learned weights
torch.save(model.state_dict(), 'cnn_with_self_attention.pth')


In [None]:
# Set the model to evaluation mode
model.eval()
correct = 0
total = 0

# Disable gradient calculation for evaluation
with torch.no_grad():
    for images, labels in test_loader:
        # The model was moved to `device` before training, so the batch must
        # be moved there too; otherwise the forward pass fails on CUDA with a
        # device-mismatch error.
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        # Predicted class = index of the highest score in each row
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# Calculate the accuracy (in percent) of the network on the entire test dataset
accuracy_CNN = 100 * correct / total

# Print the accuracy
print('Accuracy : %d %%' % accuracy_CNN)


## 2) Object Recognition with Vision Transformer

In [None]:
# Importing required packages
import torch
import torch.nn as nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from torch import bmm
from torch.optim import Adam
from torch.nn import Parameter

# Number of input image channels (3 = RGB); read by VisionTransformer as the
# in_channels of its patch-embedding convolution.
num_channels = 3

In [None]:
# Multihead Attention mechanism
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are linearly projected, split into ``num_heads``
    heads of size ``embed_dim // num_heads``, attended independently, then
    concatenated and passed through a final linear layer.

    Args:
        embed_dim: Size of the last dimension of the inputs and the output.
        num_heads: Number of attention heads; must divide ``embed_dim``.

    Raises:
        ValueError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, embed_dim, num_heads):
        super(MultiHeadAttention, self).__init__()
        # Without this check the per-head reshape in forward() either fails
        # with an opaque view() error or silently produces a wrong layout.
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by num_heads ({num_heads})"
            )
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        self.query_conv = nn.Linear(embed_dim, embed_dim)
        self.key_conv = nn.Linear(embed_dim, embed_dim)
        self.value_conv = nn.Linear(embed_dim, embed_dim)
        self.output = nn.Linear(embed_dim, embed_dim)

    def _split_heads(self, projected, batch_size):
        """Reshape (batch, seq_len, embed_dim) to (batch, num_heads, seq_len, head_dim)."""
        per_head = projected.view(batch_size, -1, self.num_heads, self.head_dim)
        return per_head.permute(0, 2, 1, 3)

    def forward(self, query, key, value):
        """Attend ``query`` over ``key``/``value``.

        Args:
            query: Tensor of shape (batch, query_len, embed_dim).
            key: Tensor of shape (batch, key_len, embed_dim).
            value: Tensor of shape (batch, key_len, embed_dim).

        Returns:
            Tensor of shape (batch, query_len, embed_dim).
        """
        batch_size = query.shape[0]

        # Project the inputs and split the embedding into heads
        Q = self._split_heads(self.query_conv(query), batch_size)
        K = self._split_heads(self.key_conv(key), batch_size)
        V = self._split_heads(self.value_conv(value), batch_size)

        # Scaled dot-product scores: (batch, num_heads, query_len, key_len)
        scores = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.head_dim**0.5

        # Softmax over key positions gives the attention weights
        attention = torch.softmax(scores, dim=-1)

        # Weighted sum of the values for each head
        context = torch.matmul(attention, V)

        # Put heads back next to each other: (batch, query_len, embed_dim)
        context = context.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.embed_dim)

        # Final output projection
        return self.output(context)


In [None]:
# Define the Vision Transformer (ViT) Encoder block with Custom Multi-Head Attention
class EncoderBlock(nn.Module):
    """Transformer encoder layer built on the custom MultiHeadAttention.

    Two sub-layers, multi-head self-attention and a two-layer ReLU MLP, are
    each wrapped as ``LayerNorm(x + Dropout(sublayer(x)))``.

    Args:
        embed_dim: Size of the token embeddings.
        num_heads: Number of attention heads.
        mlp_dim: Hidden width of the feed-forward network.
        dropout: Dropout probability applied to each sub-layer output.
    """

    def __init__(self, embed_dim, num_heads, mlp_dim, dropout=0.1):
        super().__init__()
        # Attention sub-layer and its normalisation
        self.attn = MultiHeadAttention(embed_dim, num_heads)
        self.norm1 = nn.LayerNorm(embed_dim)

        # Feed-forward sub-layer and its normalisation
        expand = nn.Linear(embed_dim, mlp_dim)
        project = nn.Linear(mlp_dim, embed_dim)
        self.mlp = nn.Sequential(expand, nn.ReLU(), project)
        self.norm2 = nn.LayerNorm(embed_dim)

        # One dropout module shared by both residual branches
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run self-attention then the MLP; the output has the shape of ``x``."""
        # Residual self-attention (query, key and value are all x)
        attended = self.norm1(x + self.dropout(self.attn(x, x, x)))

        # Residual feed-forward network
        return self.norm2(attended + self.dropout(self.mlp(attended)))


In [None]:
# Class for Vision transformer
class VisionTransformer(nn.Module):
    """Vision Transformer classifier.

    The image is cut into non-overlapping ``patch_size`` x ``patch_size``
    patches, each embedded by a strided convolution. A learnable class token
    is prepended, learnable positional embeddings are added, and the sequence
    passes through ``num_layers`` EncoderBlocks. The class-token output is
    layer-normalised and mapped to ``num_classes`` logits.

    The number of input channels is taken from the module-level
    ``num_channels``.

    Args:
        image_size: Side length of the (square) input images.
        patch_size: Side length of each patch; must divide ``image_size``.
        num_classes: Number of output classes.
        embed_dim: Token embedding size.
        num_heads: Attention heads per encoder block.
        num_layers: Number of encoder blocks.
        mlp_dim: Hidden width of each encoder block's MLP.
        dropout: Dropout probability inside the encoder blocks.
    """

    def __init__(self, image_size=224, patch_size=16, num_classes=10, embed_dim = 768, num_heads = 8, num_layers = 6, mlp_dim = 3072, dropout=0.1):
        super(VisionTransformer, self).__init__()
        assert image_size % patch_size == 0, "Image size must be divisible by patch size."
        num_patches = (image_size // patch_size) ** 2

        # Patch embedding layer: kernel == stride, so each patch is embedded
        # independently of its neighbours
        self.patch_embedding = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=patch_size)
        self.cls_token = Parameter(torch.randn(1, 1, embed_dim))  # Class token

        # Positional embeddings: one per patch plus one for the class token
        self.positional_embedding = Parameter(torch.randn(1, num_patches + 1, embed_dim))

        # Encoder blocks
        self.encoder = nn.ModuleList([
            EncoderBlock(embed_dim, num_heads, mlp_dim, dropout)
            for _ in range(num_layers)
        ])

        # Classification head
        self.norm = nn.LayerNorm(embed_dim)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        """Classify a batch of images.

        Args:
            x: Tensor of shape (batch, num_channels, image_size, image_size).

        Returns:
            Unnormalised class scores of shape (batch, num_classes).
        """
        batch_size = x.shape[0]

        # (batch, embed_dim, grid, grid) -> (batch, embed_dim, num_patches)
        # -> (batch, num_patches, embed_dim)
        x = self.patch_embedding(x).flatten(2).transpose(1, 2)

        # Prepend one copy of the class token to every sequence in the batch
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x = x + self.positional_embedding  # Add positional embeddings

        # Transformer encoder blocks
        for encoder_block in self.encoder:
            x = encoder_block(x)

        # Take only the first token (class token) output and apply the classification layer
        x = self.norm(x[:, 0])
        x = self.fc(x)
        return x


In [None]:
# CIFAR-10 dataset loading and preprocessing
# Preprocessing for the Vision Transformer: upscale the images to 224 pixels
# (the default image_size of VisionTransformer), convert them to tensors and
# normalise every channel with mean 0.5 and standard deviation 0.5.
transform = transforms.Compose(
    [
        transforms.Resize(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

# Training split (downloaded to ./data on first use)
train_dataset = datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)
# Held-out test split
test_dataset = datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transform,
)

# Mini-batch iterators of 64 samples, reshuffled every epoch
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=True)


In [None]:
# Select the compute device: the CUDA GPU when available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
# Initialize and train the Vision Transformer model with Custom Multi-Head Attention
# Vision Transformer with default hyper-parameters, placed on the chosen device
model2 = VisionTransformer().to(device)

# Adam optimizer and cross-entropy loss
optimizer = torch.optim.Adam(model2.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Training loop: one full pass over train_loader per epoch
num_epochs = 10

for epoch in range(num_epochs):
    model2.train()
    running_loss = 0.0
    for data, targets in train_loader:
        data = data.to(device)
        targets = targets.to(device)

        # Standard step: clear gradients, forward, backward, update
        optimizer.zero_grad()
        loss = criterion(model2(data), targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    # Report the mean batch loss of this epoch
    print(f'Epoch {epoch+1}, Loss: {running_loss/len(train_loader):.4f}')

# Persist the learned weights
torch.save(model2.state_dict(), 'vision_transformer.pth')


In [None]:
# Set the model to evaluation mode
model2.eval()
correct = 0
total = 0
# Disable gradient calculation for evaluation
with torch.no_grad():
    for images, labels in test_loader:
        # model2 lives on `device`, so the batch must be moved there too;
        # otherwise the forward pass fails on CUDA with a device-mismatch
        # error.
        images, labels = images.to(device), labels.to(device)
        outputs = model2(images)
        # Predicted class = index of the highest score in each row
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# Calculate the accuracy (in percent) of the network on the entire test dataset
accuracy_VIT = 100 * correct / total

# Print the accuracy
print('Accuracy : %d %%' % accuracy_VIT)

In [None]:
# Side-by-side summary of both models' test accuracy, two decimals each
summary_rows = (
    ('Accuracy of CNN with Self-Attention on test images: %.2f %%', accuracy_CNN),
    ('Accuracy of the Vision Transformer on test images: %.2f %%', accuracy_VIT),
)
for template, accuracy in summary_rows:
    print(template % accuracy)