In [2]:
%pip install torchvision

Collecting torchvision
  Downloading torchvision-0.20.1-cp312-cp312-win_amd64.whl.metadata (6.2 kB)
Downloading torchvision-0.20.1-cp312-cp312-win_amd64.whl (1.6 MB)
   ---------------------------------------- 0.0/1.6 MB ? eta -:--:--
   ---------------------------------------- 1.6/1.6 MB 8.3 MB/s eta 0:00:00
Installing collected packages: torchvision
Successfully installed torchvision-0.20.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
import os

import numpy as np
import pandas as pd
import torch.nn as nn
import torchvision.transforms as transforms
from PIL import Image
from torch.utils.data import Dataset, DataLoader

# Load the dataset.
# The CSV location can be overridden through the FER2013_CSV environment variable;
# the fallback is the original location so existing setups keep working unchanged.
FER2013_CSV = os.environ.get("FER2013_CSV", "C:/Users/dang0/Downloads/fer2013.csv")
data = pd.read_csv(FER2013_CSV)

# Fail early with a readable message if the file is not in the expected layout:
# these are the columns read by the dataset class and the train/validation split below.
missing_columns = {"emotion", "pixels", "Usage"} - set(data.columns)
assert not missing_columns, f"{FER2013_CSV} is missing columns: {sorted(missing_columns)}"

# Human-readable class names.
# NOTE(review): presumably indexed by the integer 'emotion' label — confirm against the dataset docs.
emotion_labels = ["Angry", "Disgust", "Fear", "Happy", "Sad", "Surprise", "Neutral"]

class FER2013Dataset(Dataset):
    """Map-style dataset backed by a DataFrame with 'pixels' and 'emotion' columns.

    Each item is built from one row: the 'pixels' string (space-separated
    integers) is parsed, reshaped to 48x48, cast to uint8 and wrapped in a PIL
    image; the 'emotion' value is returned as a plain ``int`` label.

    Args:
        data: DataFrame holding the rows to serve (accessed positionally).
        transform: Optional callable applied to the PIL image before it is returned.
    """

    def __init__(self, data, transform=None):
        self.transform = transform
        self.data = data

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``."""
        record = self.data.iloc[idx]

        # The pixel values are stored as one space-separated string per row.
        raw_values = np.fromstring(record['pixels'], dtype=int, sep=' ')
        face = Image.fromarray(raw_values.reshape(48, 48).astype(np.uint8))

        if self.transform:
            face = self.transform(face)

        return face, int(record['emotion'])

# Data transformations: augment the training images so the fine-tuned model
# generalises better.
# NOTE(review): the pretrained vit_b_16 loaded in the next cell is built for
# 224x224 inputs, while this pipeline emits 48x48 tensors — confirm that the
# images are resized (or the model adapted) before the forward pass.

# Per-channel normalisation statistics for the 3-channel tensors produced below.
NORM_MEAN = (0.5, 0.5, 0.5)
NORM_STD = (0.5, 0.5, 0.5)

# Training pipeline: random augmentation, then tensor conversion + normalisation.
transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=3),
    transforms.RandomHorizontalFlip(p=0.5),  # Flip faces horizontally
    transforms.RandomRotation(10),           # Rotate the image slightly
    transforms.RandomCrop(44),               # Random crop to add variance
    transforms.Resize((48, 48)),             # Resize back to 48x48
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD)
])

# Evaluation pipeline: same preprocessing but no random augmentation, so
# validation metrics are deterministic and measured on undistorted images.
eval_transform = transforms.Compose([
    transforms.Grayscale(num_output_channels=3),
    transforms.Resize((48, 48)),
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD)
])

# Initialize dataset and dataloader
train_data = FER2013Dataset(data[data['Usage'] == 'Training'], transform=transform)
val_data = FER2013Dataset(data[data['Usage'] == 'PublicTest'], transform=eval_transform)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
val_loader = DataLoader(val_data, batch_size=64, shuffle=False)


In [14]:
from torchvision.models import ViT_B_16_Weights, vit_b_16

# Load pre-trained ViT model.
# `pretrained=True` is deprecated since torchvision 0.13; the explicit weights
# enum below selects the same ImageNet checkpoint without the deprecation warning.
model = vit_b_16(weights=ViT_B_16_Weights.IMAGENET1K_V1)

print(model)



VisionTransformer(
  (conv_proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
  (encoder): Encoder(
    (dropout): Dropout(p=0.0, inplace=False)
    (layers): Sequential(
      (encoder_layer_0): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (dropout): Dropout(p=0.0, inplace=False)
        (ln_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Dropout(p=0.0, inplace=False)
          (3): Linear(in_features=3072, out_features=768, bias=True)
          (4): Dropout(p=0.0, inplace=False)
        )
      )
      (encoder_layer_1): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_a