In [None]:
import torch
import torch.nn as nn

class MultiheadAttentionEinsum(nn.Module):
    def __init__(self, embedding_dim, num_heads):
        super(MultiheadAttentionEinsum, self).__init__()
        self.num_heads = num_heads
        self.head_dim = embedding_dim // num_heads

        self.q_linear = nn.Linear(embedding_dim, embedding_dim)
        self.k_linear = nn.Linear(embedding_dim, embedding_dim)
        self.v_linear = nn.Linear(embedding_dim, embedding_dim)
        self.fc_out = nn.Linear(embedding_dim, embedding_dim)

    def forward(self, query, key, value):

        #use matrix multiplication for getting the scores
        scores = torch.matmul(query, key.transpose(-2, -1))
        attn_weights = torch.nn.functional.softmax(scores, dim=-1)
        #scaling the QKT by the root of the number of columns of Q
        scale = size(Q, 2)
        scores /= scale
        #apply another matrix multiplication on QKT/(d_q)^1/2 and V
        attended_values = torch.matmul(attn_weights, value)

        # Linear projection
        out = self.fc_out(attended_values)
        return out

In [None]:
import torch
import torch.nn as nn

class TransformerEncoderLayer(nn.Module):
    def __init__(self, embedding_dim, num_heads):
        super(TransformerEncoderLayer, self).__init__()
        self.multihead_attention = MultiheadAttentionEinsum(embed_dim=embedding_dim, num_heads=num_heads)
        self.feed_forward = nn.Sequential(
            nn.Linear(embedding_dim, 2048),
            nn.ReLU(),
            nn.Linear(2048, embedding_dim)
        )
        self.layer_norm1 = nn.LayerNorm(embedding_dim)
        self.layer_norm2 = nn.LayerNorm(embedding_dim)

    def forward(self, x):
        residual = x
        x = self.layer_norm1(x)
        x = x.permute(1, 0, 2)  # (seq_len, batch_size, embedding_dim)
        attn_output = self.multihead_attention(x, x, x)[0]  # self-attention
        x = attn_output + residual
        x = x.permute(1, 0, 2)  # (batch_size, seq_len, embedding_dim)

        residual = x
        x = self.layer_norm2(x)
        x = self.feed_forward(x)
        x = x + residual

        return x


In [None]:
class VisionTransformer(nn.Module):
    def __init__(self, num_classes, patch_size, embedding_dim, num_heads, num_layers):
        super(VisionTransformer, self).__init__()
        self.patch_embedding = nn.Conv2d(3, embedding_dim, kernel_size=patch_size, stride=patch_size)
        self.positional_encoding = nn.Parameter(torch.randn(1, 14 * 14 + 1, embedding_dim))
        self.transformer_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model=embedding_dim, nhead=num_heads) for _ in range(num_layers)
        ])
        self.fc = nn.Linear(embedding_dim, num_classes)

    def forward(self, x):
        batch_size = x.size(0)
        x = self.patch_embedding(x)
        x = x.flatten(2).transpose(1, 2)
        x = torch.cat((x, self.positional_encoding.repeat(batch_size, 1, 1)), dim=1)
        for layer in self.transformer_layers:
            x = layer(x)
        x = x.mean(dim=1)
        x = self.fc(x)
        return x

In [None]:
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters
num_epochs = 10
batch_size = 64
learning_rate = 0.001
num_classes = 10
patch_size = 16
embedding_dim = 128
num_heads = 8
num_layers = 3

# CIFAR-10 dataset preprocessing
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Load CIFAR-10 dataset
train_dataset = CIFAR10(root='./data', train=True, download=True, transform=transform)
test_dataset = CIFAR10(root='./data', train=False, download=True, transform=transform)

# Data loaders
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:01<00:00, 102766486.88it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [None]:
# Initialize the model
model = VisionTransformer(num_classes, patch_size, embedding_dim, num_heads, num_layers).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
total_steps = len(train_loader)
for epoch in range(num_epochs):

    model.train()  # Set the model to training mode
    total_loss = 0.0
    #loop for going through training data
    for step, (images, labels) in enumerate(train_loader):
        #set gradients to zero again
        optimizer.zero_grad()
        outputs = model(images.to(device))
        loss = criterion(outputs, labels.to(device))
        loss.backward()
        optimizer.step()
        #sum of the losses
        total_loss += loss.item()
        print(f"Epoch [{epoch + 1}/{num_epochs}], Step [{step + 1}/{total_steps}], Loss: {loss.item():.4f}")

    # print the average loss for each of the epochs
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Average Loss: {avg_loss:.4f}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch [4/10], Step [482/782], Loss: 1.7195
Epoch [4/10], Step [483/782], Loss: 1.5033
Epoch [4/10], Step [484/782], Loss: 1.7031
Epoch [4/10], Step [485/782], Loss: 1.6472
Epoch [4/10], Step [486/782], Loss: 1.4310
Epoch [4/10], Step [487/782], Loss: 1.7250
Epoch [4/10], Step [488/782], Loss: 1.5021
Epoch [4/10], Step [489/782], Loss: 1.5107
Epoch [4/10], Step [490/782], Loss: 1.7015
Epoch [4/10], Step [491/782], Loss: 1.7823
Epoch [4/10], Step [492/782], Loss: 1.6388
Epoch [4/10], Step [493/782], Loss: 1.5925
Epoch [4/10], Step [494/782], Loss: 1.5908
Epoch [4/10], Step [495/782], Loss: 1.4444
Epoch [4/10], Step [496/782], Loss: 1.6298
Epoch [4/10], Step [497/782], Loss: 1.5968
Epoch [4/10], Step [498/782], Loss: 1.7389
Epoch [4/10], Step [499/782], Loss: 1.4400
Epoch [4/10], Step [500/782], Loss: 1.3617
Epoch [4/10], Step [501/782], Loss: 1.4708
Epoch [4/10], Step [502/782], Loss: 1.8355
Epoch [4/10], Step [503/782], Lo

In [None]:
#Testing Phase
model.eval()
with torch.no_grad():
    correct = 0
    total = 0

    #iterate through the test data
    for images, labels in test_loader:
        outputs = model(images.to(device))
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels.to(device)).sum().item()
    #evaluation metric == accuracy
    accuracy = 100 * correct / total
    print(f'Test Accuracy of the model on the {total} test images: {accuracy:.2f}%')

Test Accuracy of the model on the 10000 test images: 51.63%


#Hyperparameters

In this section I try to change some hyperparameters to see if the model gives us better results. Since the train loss wassn't decreasing stable during the training phase I'm going to set a lower learning rate to hopefully fix this problem.

In [None]:
learning_rate = 0.0001
batch_size = 256

In [None]:
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Initialize the model
model = VisionTransformer(num_classes, patch_size, embedding_dim, num_heads, num_layers).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
total_steps = len(train_loader)
for epoch in range(num_epochs):

    model.train()  # Set the model to training mode
    total_loss = 0.0

    for step, (images, labels) in enumerate(train_loader):
        #set gradients to zero
        optimizer.zero_grad()
        outputs = model(images.to(device))
        loss = criterion(outputs, labels.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        print(f"Epoch [{epoch + 1}/{num_epochs}], Step [{step + 1}/{total_steps}], Loss: {loss.item():.4f}")

    #print the average loss for each of the epochs
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Average Loss: {avg_loss:.4f}")


Epoch [1/10], Step [1/196], Loss: 2.3229
Epoch [1/10], Step [2/196], Loss: 2.2655
Epoch [1/10], Step [3/196], Loss: 2.2748
Epoch [1/10], Step [4/196], Loss: 2.2619
Epoch [1/10], Step [5/196], Loss: 2.2236
Epoch [1/10], Step [6/196], Loss: 2.2199
Epoch [1/10], Step [7/196], Loss: 2.1917
Epoch [1/10], Step [8/196], Loss: 2.2138
Epoch [1/10], Step [9/196], Loss: 2.2035
Epoch [1/10], Step [10/196], Loss: 2.2184
Epoch [1/10], Step [11/196], Loss: 2.1698
Epoch [1/10], Step [12/196], Loss: 2.2083
Epoch [1/10], Step [13/196], Loss: 2.1788
Epoch [1/10], Step [14/196], Loss: 2.1666
Epoch [1/10], Step [15/196], Loss: 2.1531
Epoch [1/10], Step [16/196], Loss: 2.1439
Epoch [1/10], Step [17/196], Loss: 2.1527
Epoch [1/10], Step [18/196], Loss: 2.1263
Epoch [1/10], Step [19/196], Loss: 2.1342
Epoch [1/10], Step [20/196], Loss: 2.1575
Epoch [1/10], Step [21/196], Loss: 2.0897
Epoch [1/10], Step [22/196], Loss: 2.1023
Epoch [1/10], Step [23/196], Loss: 2.0873
Epoch [1/10], Step [24/196], Loss: 2.0833
E

The results got better but still not acceptable. So here I apply L2 regularization by adding a L2 term to the loss function so the optimizer prevents weights from getting tpp large by trying to minimize the loss.

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)

In [None]:
for epoch in range(num_epochs):

    #model's training mode
    model.train()
    total_loss = 0.0
    #loop for iterating through the training data
    for step, (images, labels) in enumerate(train_loader):
        #set the gradients to zero
        optimizer.zero_grad()
        outputs = model(images.to(device))
        loss = criterion(outputs, labels.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        print(f"Epoch [{epoch + 1}/{num_epochs}], Step [{step + 1}/{total_steps}], Loss: {loss.item():.4f}")

    #print the average loss for each of the epochs
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Average Loss: {avg_loss:.4f}")


Epoch [1/10], Step [1/196], Loss: 1.3333
Epoch [1/10], Step [2/196], Loss: 1.4093
Epoch [1/10], Step [3/196], Loss: 1.4742
Epoch [1/10], Step [4/196], Loss: 1.4634
Epoch [1/10], Step [5/196], Loss: 1.5776
Epoch [1/10], Step [6/196], Loss: 1.4621
Epoch [1/10], Step [7/196], Loss: 1.4020
Epoch [1/10], Step [8/196], Loss: 1.4517
Epoch [1/10], Step [9/196], Loss: 1.4373
Epoch [1/10], Step [10/196], Loss: 1.4292
Epoch [1/10], Step [11/196], Loss: 1.4057
Epoch [1/10], Step [12/196], Loss: 1.4760
Epoch [1/10], Step [13/196], Loss: 1.5560
Epoch [1/10], Step [14/196], Loss: 1.4191
Epoch [1/10], Step [15/196], Loss: 1.5159
Epoch [1/10], Step [16/196], Loss: 1.4721
Epoch [1/10], Step [17/196], Loss: 1.3624
Epoch [1/10], Step [18/196], Loss: 1.3247
Epoch [1/10], Step [19/196], Loss: 1.3791
Epoch [1/10], Step [20/196], Loss: 1.3698
Epoch [1/10], Step [21/196], Loss: 1.4231
Epoch [1/10], Step [22/196], Loss: 1.3963
Epoch [1/10], Step [23/196], Loss: 1.3895
Epoch [1/10], Step [24/196], Loss: 1.3921
E

In [None]:
#Testing Phase
model.eval()
with torch.no_grad():
    correct = 0
    total = 0

    #iterate through the test data
    for images, labels in test_loader:
        outputs = model(images.to(device))
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels.to(device)).sum().item()
    #evaluation metric == accuracy
    accuracy = 100 * correct / total
    print(f'Test Accuracy of the model on the {total} test images: {accuracy:.2f}%')

Test Accuracy of the model on the 10000 test images: 57.48%


Applying all those changes on the model, we got a better result for accuracy. Accuracy has improved about 6 percent.

#Pre-trained models

For testing a pre-trained model on our dataset. I'm using vgg 19 here.

In [None]:
import torchvision
model = torchvision.models.vgg19(pretrained=True)
model.classifier[-1] = nn.Linear(in_features=4096, out_features=num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

Downloading: "https://download.pytorch.org/models/vgg19-dcbb9e9d.pth" to /root/.cache/torch/hub/checkpoints/vgg19-dcbb9e9d.pth
100%|██████████| 548M/548M [00:06<00:00, 95.1MB/s]


In [None]:
batch_size = 128
num_epochs = 5
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
model = model.to(device)
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    #iterate through training data
    for step, (images, labels) in enumerate(train_loader):
        #set gradients to zero
        optimizer.zero_grad()
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        print(f"Epoch [{epoch + 1}/{num_epochs}], Step [{step + 1}/{len(train_loader)}], Loss: {loss.item():.4f}")
    #print the average loss for each of the epochs
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Average Loss: {avg_loss:.4f}")


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch [1/5], Step [1/391], Loss: 2.3719


  return F.conv2d(input, weight, bias, self.stride,
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch [1/5], Step [2/391], Loss: 2.4631
Epoch [1/5], Step [3/391], Loss: 2.6426
Epoch [1/5], Step [4/391], Loss: 2.3419
Epoch [1/5], Step [5/391], Loss: 2.4737
Epoch [1/5], Step [6/391], Loss: 2.3130
Epoch [1/5], Step [7/391], Loss: 2.3077
Epoch [1/5], Step [8/391], Loss: 2.6169
Epoch [1/5], Step [9/391], Loss: 2.3051
Epoch [1/5], Step [10/391], Loss: 2.3060
Epoch [1/5], Step [11/391], Loss: 2.3027
Epoch [1/5], Step [12/391], Loss: 2.3116
Epoch [1/5], Step [13/391], Loss: 2.3159
Epoch [1/5], Step [14/391], Loss: 2.3256
Epoch [1/5], Step [15/391], Loss: 2.3000
Epoch [1/5], Step [16/391], Loss: 2.3122
Epoch [1/5], Step [17/391], Loss: 2.2693
Epoch [1/5], Step [18/391], Loss: 3.3598
Epoch [1/5], Step [19/391], Loss: 2.3070
Epoch [1/5], Step [20/391], Loss: 2.3154
Epoch [1/5], Step [21/391], Loss: 2.3008
Epoch [1/5], Step [22/391], Loss: 2.3058
Epoch [1/5], Step [23/391], Loss: 2.3004
Epoch [1/5], Step [24/391], Loss: 2.3157
Epoch [1/5], Step [25/391], Loss: 2.3298
Epoch [1/5], Step [26/39

As you can see the amount of loss during each epochs is not decreasing enough. Also during each of the epochs the loss of steps isn't being reduced in a stable manner. I wasn't able to iterate the training loop for more epochs but these are the results for a few number of epochs for this model.

In [None]:
torch.cuda.empty_cache()