<a href="https://colab.research.google.com/github/DrKalam/Demo/blob/main/ViT_CIfar10_dataset_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
 import torch
 import torch.nn as nn
 import  torch.nn.functional as F
 import torch.optim as optim
 from torch.utils.data import DataLoader
 import torchvision
 from torchvision import datasets , transforms
 import numpy as np
 import random
 import matplotlib.pyplot as plt



# 2. Set up device-agnostic code


In [None]:
torch.__version__

'2.6.0+cu124'

In [None]:
torchvision.__version__

'0.21.0+cu124'

In [None]:
!nvidia-smi

Thu Jul 24 15:57:51 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   46C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
torch.cuda.is_available()

True

# 3. Set the seed

In [None]:
# One seed for every random number generator available to this notebook, so a
# fresh "Restart & Run All" reproduces the same results.
SEED = 42
random.seed(SEED)                 # Python's built-in RNG
np.random.seed(SEED)              # NumPy's global RNG (previously left unseeded)
torch.manual_seed(SEED)           # PyTorch default generator
torch.cuda.manual_seed_all(SEED)  # every CUDA device; silently ignored without a GPU

# 4. Setting hyperparameters

In [None]:
# --- Optimisation ---
BATCH_SIZE = 128       # samples per mini-batch in both DataLoaders
EPOCH = 10             # number of passes over the training set
LEARNING_RATE = 3e-4   # step size handed to the optimizer

# --- Input data ---
IMAGE_SIZE = 32        # side length of the (square) input images
CHANNELS = 3           # colour channels per image
NUM_CLASSES = 10       # size of the classifier output

# --- Vision Transformer architecture ---
PATCH_SIZE = 4         # side length of each square patch
EMBED_DIM = 256        # width of every token embedding
NUM_HEAD = 8           # attention heads per encoder layer
DEPTH = 6              # number of stacked encoder layers
MLP_DIM = 512          # hidden width of the feed-forward block in each layer
DROP_RATE = 0.1        # dropout probability used in attention and MLP



# 5. Define image transformation operations


In [None]:
# Per-channel normalisation statistics written as real 3-tuples (one entry per
# RGB channel). The earlier `(0.5)` was only a parenthesised float, so the
# transform worked purely through scalar broadcasting.
NORM_MEAN = (0.5, 0.5, 0.5)
NORM_STD = (0.5, 0.5, 0.5)

transform = transforms.Compose([
    transforms.ToTensor(),                      # image -> float tensor
    transforms.Normalize(NORM_MEAN, NORM_STD),  # (x - mean) / std per channel
])

# 6. Getting the dataset

In [None]:
# Training split of CIFAR-10, stored under ./data and downloaded if missing.
train_dataset = datasets.CIFAR10(
    root="data",
    train=True,
    download=True,
    transform=transform,
)

100%|██████████| 170M/170M [00:03<00:00, 42.8MB/s]


In [None]:
# Held-out test split of CIFAR-10; same storage location and preprocessing.
test_dataset = datasets.CIFAR10(
    root="data",
    train=False,
    download=True,
    transform=transform,
)

In [None]:
train_dataset

Dataset CIFAR10
    Number of datapoints: 50000
    Root location: data
    Split: Train
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=0.5, std=0.5)
           )

In [None]:
test_dataset

Dataset CIFAR10
    Number of datapoints: 10000
    Root location: data
    Split: Test
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=0.5, std=0.5)
           )

# 7. Converting datasets into DataLoaders
Split the data into mini-batches (of 128 images each).

In [None]:

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [None]:
# Training batches are reshuffled; test batches keep the dataset order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [None]:
# Sanity check: show both loader objects, then how many batches each one
# yields (len() of a DataLoader is its number of batches) at BATCH_SIZE.
print(f"DataLoader : {train_loader , test_loader}")
print(f"Lenght of train_loader : {len(train_loader)} batches of { BATCH_SIZE}....")
print(f"Lenght of test_loader : {len(test_loader)} batches of { BATCH_SIZE}....")


DataLoader : (<torch.utils.data.dataloader.DataLoader object at 0x7a5ef42c7550>, <torch.utils.data.dataloader.DataLoader object at 0x7a5ef43366d0>)
Lenght of train_loader : 391 batches of 128....
Lenght of test_loader : 79 batches of 128....


# Building Vision Transformer components

In [None]:

class PatchEmbedding(nn.Module):
    """Convert an image batch into a token sequence with a leading class token.

    A convolution whose kernel size and stride both equal ``patch_size`` embeds
    each non-overlapping patch. A learnable class token is then prepended and a
    learnable positional embedding is added to every token.

    Args:
        img_size: Image side length used to size the positional embedding;
            the number of patches is ``(img_size // patch_size) ** 2``.
        patch_size: Side length of one square patch.
        in_channels: Number of channels in the input images.
        embed_dim: Width of each token embedding.
    """

    def __init__(self, img_size, patch_size, in_channels, embed_dim):
        super().__init__()
        self.patch_size = patch_size
        # kernel == stride, so every patch is seen exactly once.
        self.proj = nn.Conv2d(in_channels, embed_dim, patch_size, patch_size)

        patches_per_side = img_size // patch_size
        seq_len = patches_per_side * patches_per_side + 1  # patches + class token
        self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.randn(1, seq_len, embed_dim))

    def forward(self, x: torch.Tensor):
        """Return tokens of shape (B, 1 + N, E) for an image batch ``x``."""
        batch = x.shape[0]
        feature_map = self.proj(x)                                   # (B, E, H/P, W/P)
        patch_tokens = feature_map.flatten(start_dim=2).permute(0, 2, 1)  # (B, N, E)
        cls = self.cls_token.expand(batch, -1, -1)                   # (B, 1, E)
        return torch.cat([cls, patch_tokens], dim=1) + self.pos_embed


# Define the MLP

In [None]:
class MLP(nn.Module):
    """Feed-forward block: Linear -> GELU -> Dropout -> Linear -> Dropout.

    The output has the same feature width as the input; the same dropout
    module is applied after both linear layers.

    Args:
        in_features: Width of the input (and of the output).
        hidden_features: Width of the hidden layer.
        drop_rate: Dropout probability.
    """

    def __init__(self, in_features, hidden_features, drop_rate):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.fc2 = nn.Linear(hidden_features, in_features)
        self.dropout = nn.Dropout(drop_rate)

    def forward(self, x):
        """Apply the two-layer feed-forward transform to ``x``."""
        hidden = F.gelu(self.fc1(x))
        hidden = self.dropout(hidden)
        projected = self.fc2(hidden)
        return self.dropout(projected)

# Building the transformer encoder layer

In [None]:
class TransformerEncorderLayer(nn.Module):
    """Pre-norm Transformer encoder block: self-attention then MLP, each residual.

    Args:
        embed_dim: Width of every token embedding.
        num_heads: Number of attention heads.
        mlp_dim: Hidden width of the feed-forward block.
        drop_rate: Dropout probability used in attention and in the MLP.
    """

    def __init__(self, embed_dim, num_heads, mlp_dim, drop_rate):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=drop_rate, batch_first=True)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.mlp = MLP(embed_dim, mlp_dim, drop_rate)

    def forward(self, x):
        """Return ``x`` after one attention sub-layer and one MLP sub-layer."""
        # Normalise once and reuse the same tensor as query, key and value:
        # avoids two redundant LayerNorm passes and lets MultiheadAttention
        # recognise the call as self-attention.
        normed = self.norm1(x)
        # The attention weights are never used, so skip computing them.
        attn_out, _ = self.attn(normed, normed, normed, need_weights=False)
        x = x + attn_out
        x = x + self.mlp(self.norm2(x))
        return x

# Creating the VisionTransformer class
## By combining the three components above

In [None]:

class VisionTransformer(nn.Module):
    """Vision Transformer classifier: patch embedding, encoder stack, linear head.

    Args:
        img_size: Side length of the input images.
        patch_size: Side length of one square patch.
        in_channels: Number of input image channels.
        num_classes: Number of output logits.
        embed_dim: Width of every token embedding.
        depth: Number of encoder layers.
        num_heads: Attention heads per encoder layer.
        mlp_dim: Hidden width of each encoder layer's MLP.
        drop_rate: Dropout probability passed to every encoder layer.
    """

    def __init__(self, img_size, patch_size, in_channels, num_classes,
                 embed_dim, depth, num_heads, mlp_dim, drop_rate):
        super().__init__()
        self.patch_embed = PatchEmbedding(img_size, patch_size, in_channels, embed_dim)

        layers = []
        for _ in range(depth):
            layers.append(TransformerEncorderLayer(embed_dim, num_heads, mlp_dim, drop_rate))
        self.encoder = nn.Sequential(*layers)

        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)  # classifier on the class token

    def forward(self, x):
        """Return class logits of shape (batch, num_classes) for image batch ``x``."""
        tokens = self.patch_embed(x)
        tokens = self.encoder(tokens)
        tokens = self.norm(tokens)
        # Only the first token (the class token) feeds the classifier.
        return self.head(tokens[:, 0])

# Instantiate the model

In [None]:
# Build the network from the hyperparameters above and move it to the target device.
model = VisionTransformer(
    img_size=IMAGE_SIZE,
    patch_size=PATCH_SIZE,
    in_channels=CHANNELS,
    num_classes=NUM_CLASSES,
    embed_dim=EMBED_DIM,
    depth=DEPTH,
    num_heads=NUM_HEAD,
    mlp_dim=MLP_DIM,
    drop_rate=DROP_RATE,
)
model = model.to(device)







In [None]:
model

VisionTransformer(
  (patch_embed): PatchEmbedding(
    (proj): Conv2d(3, 256, kernel_size=(4, 4), stride=(4, 4))
  )
  (encoder): Sequential(
    (0): TransformerEncorderLayer(
      (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
      )
      (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (mlp): MLP(
        (fc1): Linear(in_features=256, out_features=512, bias=True)
        (fc2): Linear(in_features=512, out_features=256, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (1): TransformerEncorderLayer(
      (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
      )
      (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (mlp): M

## 9. Defining a loss function and optimizer

In [None]:
# Cross-entropy on the raw logits; Adam updates every model parameter.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
optimizer


Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.0003
    maximize: False
    weight_decay: 0
)

In [None]:
criterion


CrossEntropyLoss()

## 10. Defining a training loop function

In [None]:
def train(model, loader, optimizer, criterion, device=None):
    """Run one training epoch and return ``(mean_loss, accuracy)``.

    Args:
        model: Network to optimise; it is switched to training mode.
        loader: Iterable yielding ``(inputs, targets)`` batches.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Loss called as ``criterion(logits, targets)``. Its value is
            re-weighted by the batch size, so it is assumed to be a batch mean.
        device: Device each batch is moved to. Defaults to the device of the
            model's first parameter (CPU if the model has no parameters), so
            the function no longer depends on a notebook-level global.

    Returns:
        ``(mean_loss, accuracy)`` averaged over the samples actually processed;
        accuracy is a fraction in [0, 1]. Returns ``(0.0, 0.0)`` when the
        loader yields no batches.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    loss_sum, n_correct, n_seen = 0.0, 0, 0
    for x, y in loader:
        # Move the batch to the device the model lives on.
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        out = model(x)            # 1. forward pass (raw logits)
        loss = criterion(out, y)  # 2. loss for this batch
        loss.backward()           # 3. backpropagation
        optimizer.step()          # 4. parameter update

        batch = x.size(0)
        loss_sum += loss.item() * batch
        n_correct += (out.argmax(dim=1) == y).sum().item()
        # Count what was really seen: with drop_last or a sampler this differs
        # from len(loader.dataset), which would skew both averages.
        n_seen += batch

    if n_seen == 0:
        return 0.0, 0.0
    return loss_sum / n_seen, n_correct / n_seen



In [None]:
def evaluate(model, loader, device=None):
    """Return the classification accuracy of ``model`` over ``loader``.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        loader: Iterable yielding ``(inputs, targets)`` batches.
        device: Device each batch is moved to. Defaults to the device of the
            model's first parameter (CPU if the model has no parameters), so
            the function no longer depends on a notebook-level global.

    Returns:
        Fraction in [0, 1] of correctly classified samples among those actually
        processed; ``0.0`` when the loader yields no batches.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    n_correct, n_seen = 0, 0
    with torch.inference_mode():  # no autograd bookkeeping during evaluation
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            out = model(x)
            n_correct += (out.argmax(dim=1) == y).sum().item()
            # Count what was really seen rather than trusting len(loader.dataset).
            n_seen += x.size(0)

    if n_seen == 0:
        return 0.0
    return n_correct / n_seen

### Training

In [None]:
from tqdm.auto import tqdm

In [None]:


# Per-epoch accuracy history (fractions in [0, 1]), kept for later plotting.
train_accuracies, test_accuracies = [], []

for epoch in tqdm(range(EPOCH)):
    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    test_acc = evaluate(model, test_loader)

    train_accuracies.append(train_acc)
    test_accuracies.append(test_acc)

    # The accuracies are fractions, so use the `%` presentation type (which
    # multiplies by 100). The old `:.2f}%` printed e.g. "0.37%" for 37%.
    print(f"Epoch: {epoch+1}/{EPOCH}, "
          f"Train loss: {train_loss:.4f}, "
          f"Train acc: {train_acc:.2%}, "
          f"Test acc: {test_acc:.2%}")


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch: 1/10, Train loss: 1.7331, Train acc: 0.37%, Test acc: 0.48%
Epoch: 2/10, Train loss: 1.3834, Train acc: 0.50%, Test acc: 0.53%
Epoch: 3/10, Train loss: 1.2328, Train acc: 0.56%, Test acc: 0.57%
Epoch: 4/10, Train loss: 1.1271, Train acc: 0.60%, Test acc: 0.58%
Epoch: 5/10, Train loss: 1.0379, Train acc: 0.63%, Test acc: 0.61%
Epoch: 6/10, Train loss: 0.9662, Train acc: 0.65%, Test acc: 0.62%
Epoch: 7/10, Train loss: 0.8885, Train acc: 0.68%, Test acc: 0.62%
Epoch: 8/10, Train loss: 0.8157, Train acc: 0.71%, Test acc: 0.61%
Epoch: 9/10, Train loss: 0.7457, Train acc: 0.74%, Test acc: 0.63%
Epoch: 10/10, Train loss: 0.6707, Train acc: 0.76%, Test acc: 0.63%


In [None]:
train_accuracies

[0.37082,
 0.50392,
 0.55842,
 0.59672,
 0.62786,
 0.65416,
 0.68238,
 0.70788,
 0.73518,
 0.76212]

In [None]:

test_accuracies


In [None]:
# Plot the per-epoch train and test accuracy on a single set of axes.
fig, ax = plt.subplots()
ax.plot(train_accuracies, label="Train Accuracy")
ax.plot(test_accuracies, label="Test Accuracy")
ax.set_xlabel("Epochs")
ax.set_ylabel("Accuracy")
ax.set_title("Training and Test Accuracy")
ax.legend()
plt.show()

In [None]:
import random

In [None]:
len(test_dataset)



In [None]:
test_dataset[0][0].unsqueeze(dim=0).shape

In [None]:
def predict_and_plot_grid(model,
                          dataset,
                          classes,
                          grid_size=3,
                          device=None):
    """Show a ``grid_size`` x ``grid_size`` grid of random samples with predictions.

    Each panel shows one randomly chosen sample (drawn with replacement), titled
    with its true and predicted class names; the title is green when the
    prediction is correct and red otherwise.

    Args:
        model: Classifier returning logits for a batch of images.
        dataset: Indexable collection of ``(image_tensor, label)`` pairs.
        classes: Sequence mapping a label index to its class name.
        grid_size: Number of rows and columns in the figure.
        device: Device the input is moved to. Defaults to the device of the
            model's first parameter (CPU if the model has no parameters).
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    # squeeze=False keeps `axes` two-dimensional even when grid_size == 1, so
    # the axes[i, j] indexing below always works.
    fig, axes = plt.subplots(grid_size, grid_size, figsize=(9, 9), squeeze=False)
    for i in range(grid_size):
        for j in range(grid_size):
            idx = random.randint(0, len(dataset) - 1)
            img, true_label = dataset[idx]
            with torch.inference_mode():
                logits = model(img.unsqueeze(dim=0).to(device))
            predicted_label = logits.argmax(dim=1).item()

            # Map pixel values back for display; assumes the images were
            # normalised with mean 0.5 and std 0.5.
            npimg = (img / 2 + 0.5).cpu().numpy()
            ax = axes[i, j]
            ax.imshow(np.transpose(npimg, (1, 2, 0)))  # channels-first -> channels-last
            is_correct = classes[true_label] == classes[predicted_label]
            # The old title had a stray ", " right after the line break.
            ax.set_title(
                f"Truth: {classes[true_label]}\nPredicted: {classes[predicted_label]}",
                fontsize=10,
                c="g" if is_correct else "r",
            )
            ax.axis("off")
    plt.tight_layout()
    plt.show()

In [None]:
# Visualise predictions on random test images, labelled with the dataset's class names.
predict_and_plot_grid(
    model=model,
    dataset=test_dataset,
    classes=train_dataset.classes,
    grid_size=3,
)