In [None]:
# Import required libraries

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import torchvision
from torchvision import datasets, transforms
import numpy as np
import random
import matplotlib.pyplot as plt

In [None]:
# Record the installed PyTorch version so the run can be reproduced later.
torch.__version__

'2.6.0+cu124'

In [None]:
# device

In [None]:
# Shell out to the NVIDIA driver tool to list visible GPUs; the command is absent on CPU-only hosts.
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


In [None]:
# Pick the compute device: use CUDA when PyTorch can see a GPU, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Show which device was selected above.
device

'cpu'

In [None]:
# Set the seed

In [None]:
# Seed every random number generator the notebook imports so that weight
# initialisation, DataLoader shuffling and any NumPy/`random` sampling are
# repeatable across "Restart & Run All".
SEED = 42
random.seed(SEED)                 # Python's built-in RNG
np.random.seed(SEED)              # NumPy's global RNG (was previously left unseeded)
torch.manual_seed(SEED)           # PyTorch default generator
torch.cuda.manual_seed_all(SEED)  # every visible CUDA device

In [None]:
## Setting the hyperparameters

In [None]:
# Tunable settings for data loading, optimisation and the ViT architecture.
BATCH_SIZE = 258  # samples per mini-batch; NOTE(review): possibly a typo for 256 — confirm before changing
EPOCHS = 10  # number of passes of the training loop
LEARNING_RATE = 3e-4  # step size handed to the Adam optimizer
PATCH_SIZE = 4  # kernel size and stride of the patch-projection convolution
NUM_CLASSES = 10  # width of the classification head
IMAGE_SIZE = 32  # image side length used to derive the number of patches
CHANNELS = 3  # input channels of the patch-projection convolution
EMBED_DIM = 256  # token embedding width used throughout the encoder
NUM_HEADS = 8  # attention heads per encoder layer
MLP_DIM = 512  # hidden width of each encoder layer's MLP
DROP_RATE = 0.1  # dropout probability for attention and MLP
DEPTH = 6  # number of stacked encoder layers

In [None]:
# IMAGE TRANSFORMATION

In [None]:
# Convert images to float tensors, then normalise each of the three colour
# channels with mean 0.5 and std 0.5.
# The per-channel statistics are written as explicit 3-tuples: the previous
# `(0.5)` was a parenthesised float (not a tuple) that only worked through
# broadcasting and hid the per-channel intent.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

In [None]:
# Getting the dataset

In [None]:
# CIFAR-10 training split, stored under ./data (downloaded if missing) and
# passed through `transform` on access.
train_dataset = datasets.CIFAR10(
    root="data",
    train=True,
    download=True,
    transform=transform,
)

100%|██████████| 170M/170M [00:02<00:00, 77.7MB/s]


In [None]:
# CIFAR-10 held-out split (train=False), stored under ./data and passed
# through the same `transform` as the training data.
test_dataset = datasets.CIFAR10(
    root="data",
    train=False,
    download=True,
    transform=transform,
)

In [None]:
# Show the training dataset summary (size, root, split, transform).
train_dataset

Dataset CIFAR10
    Number of datapoints: 50000
    Root location: data
    Split: Train
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=0.5, std=0.5)
           )

In [None]:
# Show the test dataset summary (size, root, split, transform).
test_dataset

Dataset CIFAR10
    Number of datapoints: 10000
    Root location: data
    Split: Test
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=0.5, std=0.5)
           )

In [None]:
# Number of examples in the training and test splits.
len(train_dataset), len(test_dataset)

(50000, 10000)

In [None]:
## Converting our dataset to dataloaders

In [None]:
# Mini-batch iterator over the training split; reshuffled each time it is iterated.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
# Mini-batch iterator over the test split; order is kept fixed (no shuffling).
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [None]:
# Number of mini-batches the training loader yields per epoch.
len(train_loader)

194

In [None]:
## VIT

In [None]:
class PatchEmbedding(nn.Module):
  """Turn an image batch into a token sequence for the Transformer encoder.

  A convolution whose kernel size and stride both equal ``patch_size`` embeds
  each non-overlapping patch into ``embed_dim`` channels. A learnable class
  token is prepended and a learnable positional embedding is added.

  Args:
    img_size: Side length used to size the positional embedding
      (``(img_size // patch_size) ** 2`` patches plus one class token).
    patch_size: Side length of each square patch.
    in_channels: Number of channels in the input images.
    embed_dim: Width of each output token.
  """

  def __init__(self,img_size,patch_size,in_channels,embed_dim):
    super().__init__()
    self.patch_size = patch_size
    # Strided convolution: one output position per patch.
    self.proj = nn.Conv2d(in_channels, embed_dim,
                          kernel_size=patch_size, stride=patch_size)
    patches_per_side = img_size // patch_size
    seq_len = 1 + patches_per_side ** 2  # +1 for the class token
    self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))
    self.pos_embed = nn.Parameter(torch.randn(1, seq_len, embed_dim))

  def forward(self,x:torch.Tensor):
    """Embed ``x`` of shape (B, C, H, W) into tokens of shape (B, 1 + N, embed_dim).

    N must match the number of patches the module was built for, otherwise
    adding the positional embedding fails to broadcast.
    """
    batch = x.size(0)
    # (B, D, H', W') -> (B, D, N) -> (B, N, D)
    patch_tokens = self.proj(x).flatten(2).transpose(1, 2)
    cls_tokens = self.cls_token.expand(batch, -1, -1)
    tokens = torch.cat((cls_tokens, patch_tokens), dim=1)
    return tokens + self.pos_embed

In [None]:
class MLP(nn.Module):
    """Position-wise feed-forward network: Linear -> GELU -> Dropout -> Linear -> Dropout.

    Args:
        in_feature: Input width; the output has the same width.
        hidden_feature: Width of the hidden layer.
        drop_rate: Dropout probability, applied after the activation and
            again after the output projection.
    """

    def __init__(self, in_feature, hidden_feature, drop_rate):
        super().__init__()
        self.fc1 = nn.Linear(in_feature, hidden_feature)
        # Project back to the input width so the result can be added to a residual.
        self.fc2 = nn.Linear(hidden_feature, in_feature)
        self.dropout = nn.Dropout(drop_rate)

    def forward(self, x):
        """Apply the feed-forward network along the last dimension of ``x``."""
        hidden = self.dropout(F.gelu(self.fc1(x)))
        return self.dropout(self.fc2(hidden))


In [None]:
class TransformerEncoderLayer(nn.Module):
    """Pre-norm Transformer encoder block.

    Applies ``x + SelfAttention(LayerNorm(x))`` followed by
    ``x + MLP(LayerNorm(x))``.

    Args:
        embed_dim: Token width (input and output).
        num_heads: Number of attention heads.
        mlp_dim: Hidden width of the feed-forward sub-layer.
        drop_rate: Dropout probability for attention and the MLP.
    """

    def __init__(self, embed_dim, num_heads, mlp_dim, drop_rate):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=drop_rate, batch_first=True)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.mlp = MLP(embed_dim, mlp_dim, drop_rate)

    def forward(self, x):
        """Transform ``x`` of shape (batch, tokens, embed_dim); the shape is preserved."""
        # Normalise once and reuse the same tensor for query, key and value.
        # The original called norm1 three times, tripling the LayerNorm work
        # and handing MultiheadAttention three distinct tensors.
        normed = self.norm1(x)
        # The attention weights are never used, so don't ask for them.
        attn_out, _ = self.attn(normed, normed, normed, need_weights=False)
        x = x + attn_out
        x = x + self.mlp(self.norm2(x))
        return x


In [None]:
class VisionTransformer(nn.Module):
  """Vision Transformer classifier.

  Pipeline: patch embedding -> ``depth`` encoder layers -> LayerNorm ->
  linear head applied to the class token (sequence position 0).

  Args:
    img_size: Input image side length.
    patch_size: Side length of each square patch.
    in_channels: Number of input image channels.
    num_classes: Number of output logits.
    embed_dim: Token width.
    depth: Number of stacked encoder layers.
    num_head: Attention heads per encoder layer.
    mlp_dim: Hidden width of each encoder layer's MLP.
    drop_rate: Dropout probability inside the encoder layers.
  """

  def __init__(self,img_size,patch_size,in_channels,num_classes,embed_dim,depth,num_head,mlp_dim,drop_rate):
    super().__init__()
    self.patch_embed = PatchEmbedding(img_size,patch_size,in_channels,embed_dim)
    layers = []
    for _ in range(depth):
      layers.append(TransformerEncoderLayer(embed_dim, num_head, mlp_dim, drop_rate))
    self.encoder = nn.Sequential(*layers)
    self.norm = nn.LayerNorm(embed_dim)
    self.head = nn.Linear(embed_dim,num_classes)

  def forward(self,x):
    """Return class logits of shape (batch, num_classes) for an image batch ``x``."""
    tokens = self.patch_embed(x)
    encoded = self.norm(self.encoder(tokens))
    # Position 0 holds the class token prepended by the patch embedding.
    return self.head(encoded[:, 0])

In [None]:
# instantiate model

In [None]:
# Build the ViT from the hyperparameters above and move it to the selected device.
model = VisionTransformer(
    img_size=IMAGE_SIZE,
    patch_size=PATCH_SIZE,
    in_channels=CHANNELS,
    num_classes=NUM_CLASSES,
    embed_dim=EMBED_DIM,
    depth=DEPTH,
    num_head=NUM_HEADS,
    mlp_dim=MLP_DIM,
    drop_rate=DROP_RATE,
)
model = model.to(device)

In [None]:
# Display the module tree of the instantiated model.
model

VisionTransformer(
  (patch_embed): PatchEmbedding(
    (proj): Conv2d(3, 256, kernel_size=(4, 4), stride=(4, 4))
  )
  (encoder): Sequential(
    (0): TransformerEncoderLayer(
      (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
      )
      (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (mlp): MLP(
        (fc1): Linear(in_features=256, out_features=512, bias=True)
        (fc2): Linear(in_features=512, out_features=256, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (1): TransformerEncoderLayer(
      (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
      )
      (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (mlp): MLP

In [None]:
# Define loss function and optimizer

In [None]:
# Cross-entropy on the class logits; Adam updates every model parameter
# with the configured learning rate.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(
    params=model.parameters(),
    lr=LEARNING_RATE,
)

In [None]:
# Display the optimizer's parameter groups and settings.
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.0003
    maximize: False
    weight_decay: 0
)

In [None]:
# training LOOP

In [None]:
def train(model,loader,optimizer,criterion):
  """Run one training epoch and return ``(mean_loss, accuracy)``.

  Args:
    model: Module to optimise; switched to training mode.
    loader: Iterable yielding ``(inputs, targets)`` batches.
    optimizer: Optimizer already bound to ``model``'s parameters.
    criterion: Loss callable ``criterion(outputs, targets)``; assumed to
      return the batch-mean loss, since each batch loss is re-weighted by
      the batch size before averaging.

  Returns:
    Tuple of (average loss per sample, fraction of correct predictions),
    both computed over the samples actually seen in this epoch.

  Raises:
    ZeroDivisionError: If ``loader`` yields no samples.
  """
  model.train()
  # Send batches to wherever the model lives instead of relying on a
  # notebook-global `device`.
  first_param = next(model.parameters(), None)
  run_device = first_param.device if first_param is not None else torch.device("cpu")
  total_loss, correct, seen = 0.0, 0, 0

  for x ,y in loader:
    x,y = x.to(run_device), y.to(run_device)
    optimizer.zero_grad()
    out = model(x)
    loss = criterion(out,y)
    loss.backward()
    optimizer.step()
    batch_size = x.size(0)
    # Weight by batch size so a smaller final batch does not skew the mean.
    total_loss += loss.item() * batch_size
    correct += (out.argmax(1) == y).sum().item()
    seen += batch_size
  # Fix: the accumulated loss is now returned. Previously an untouched
  # `train_loss` variable (always 0) was returned instead.
  return total_loss / seen, correct / seen




In [None]:
def eveluate(model,loader):
  """Return the classification accuracy of ``model`` over ``loader``.

  The model is switched to evaluation mode and the forward passes run under
  ``torch.inference_mode()`` so no gradients are tracked.

  Args:
    model: Module producing per-class scores of shape (batch, classes).
    loader: Iterable yielding ``(inputs, targets)`` batches.

  Returns:
    Fraction of samples whose arg-max prediction equals the target,
    computed over the samples actually seen.

  Raises:
    ZeroDivisionError: If ``loader`` yields no samples.
  """
  model.eval()
  # Send batches to wherever the model lives instead of relying on a
  # notebook-global `device`.
  first_param = next(model.parameters(), None)
  run_device = first_param.device if first_param is not None else torch.device("cpu")
  correct, seen = 0, 0
  with torch.inference_mode():
    for x,y in loader:
      x,y = x.to(run_device), y.to(run_device)
      out = model(x)
      correct += (out.argmax(dim=1) == y).sum().item()
      seen += x.size(0)
  return correct / seen


In [None]:
from tqdm.auto import tqdm

In [None]:
## training
# Train for EPOCHS epochs, evaluating on the test split after each one and
# keeping the per-epoch accuracies for later inspection/plotting.
train_accuracies = []
test_accuracies = []
for epoch in tqdm(range(EPOCHS), desc="epochs"):
  train_loss,train_accuracy = train(model,train_loader,optimizer,criterion)
  test_acc = eveluate(model,test_loader)
  train_accuracies.append(train_accuracy)
  test_accuracies.append(test_acc)
  # Report the accuracies alongside the loss: they were computed every epoch
  # but never shown, leaving the loss as the only progress signal.
  print(f"Epoch: {epoch+1}/{EPOCHS},train loss {train_loss:.4f},train acc {train_accuracy:.4f},test acc {test_acc:.4f}")

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch: 1/10,train loss 0.3374
Epoch: 2/10,train loss 0.4794
Epoch: 3/10,train loss 0.5357
Epoch: 4/10,train loss 0.5743
Epoch: 5/10,train loss 0.6047
Epoch: 6/10,train loss 0.6359
Epoch: 7/10,train loss 0.6606
Epoch: 8/10,train loss 0.6873
Epoch: 9/10,train loss 0.7139
Epoch: 10/10,train loss 0.7341
