First, let's use PyTorch's implementation for vision transformers.

In [1]:
!pip install timm

Collecting timm
  Downloading timm-0.9.16-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->timm)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->timm)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->timm)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->timm)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->timm)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch->timm)
  Using cach

In [2]:
import timm
import torch
import os
import numpy as np
from sklearn.model_selection import train_test_split
from torch import nn
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Subset
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from google.colab import drive
from tqdm import tqdm

In [3]:
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# %cd "/content/drive/My Drive/DL Project"
%cd /content/drive/MyDrive/'Georgia Tech'/'CS 7643'/'DL Project'

/content/drive/.shortcut-targets-by-id/1Vhu7c9TtXf-INnjjgmUD94b4o35wSPWM/DL Project


In [9]:
base_dir = 'code/data/original_data'

In [10]:
# Define transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize images to 224x224
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalization
])

# Load dataset from directory
dataset = datasets.ImageFolder(base_dir, transform=transform)

# Split dataset into training and validation sets
train_idx, val_idx = train_test_split(np.arange(len(dataset)), test_size=0.2, random_state=42)

# Create training and validation subsets
train_dataset = Subset(dataset, train_idx)
val_dataset = Subset(dataset, val_idx)

# Data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [11]:
model = timm.create_model('vit_base_patch16_224', pretrained=True, num_classes=20)

if torch.cuda.is_available():
    model = model.cuda()

# Optimizer and loss function
optimizer = Adam(model.parameters(), lr=0.001)
criterion = CrossEntropyLoss()

In [12]:
# Training function
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for inputs, labels in tqdm(train_loader):
            if torch.cuda.is_available():
                inputs, labels = inputs.cuda(), labels.cuda()

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * inputs.size(0)
        print(f'Epoch {epoch+1} Loss: {running_loss / len(train_loader.dataset)}')

# Train the model
train_model(model, train_loader, criterion, optimizer)

100%|██████████| 101/101 [22:02<00:00, 13.09s/it]


Epoch 1 Loss: 3.3735139740969333


100%|██████████| 101/101 [01:12<00:00,  1.39it/s]


Epoch 2 Loss: 2.526746186128109


100%|██████████| 101/101 [01:12<00:00,  1.38it/s]


Epoch 3 Loss: 2.289847045231516


100%|██████████| 101/101 [01:13<00:00,  1.38it/s]


Epoch 4 Loss: 2.063830394886771


100%|██████████| 101/101 [01:14<00:00,  1.36it/s]


Epoch 5 Loss: 1.9586362900825591


100%|██████████| 101/101 [01:14<00:00,  1.36it/s]


Epoch 6 Loss: 1.8178379998322385


100%|██████████| 101/101 [01:14<00:00,  1.35it/s]


Epoch 7 Loss: 1.7364352158855667


100%|██████████| 101/101 [01:14<00:00,  1.35it/s]


Epoch 8 Loss: 1.7094669438117938


100%|██████████| 101/101 [01:14<00:00,  1.36it/s]


Epoch 9 Loss: 1.6831271931464378


100%|██████████| 101/101 [01:14<00:00,  1.36it/s]

Epoch 10 Loss: 1.6334991752916586





In [13]:
def evaluate_model(model, val_loader):
    model.eval()
    correct = total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            if torch.cuda.is_available():
                inputs, labels = inputs.cuda(), labels.cuda()

            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f'Validation Accuracy: {accuracy}%')

# Evaluate the model
evaluate_model(model, val_loader)

Validation Accuracy: 44.48574969021065%


Next, let's test out HuggingFace's transformer implementation.

In [None]:
# import torch
# import numpy as np
# from torchvision import datasets, transforms
# from torch.utils.data import DataLoader, Subset
# from transformers import ViTImageProcessor, ViTForImageClassification, ViTConfig
# from sklearn.model_selection import train_test_split

# # Data setup
# dataset_path = base_dir
# # transform = transforms.Compose([
# #     transforms.Resize((224, 224)),  # Resize all images to the size expected by ViT
# #     transforms.ToTensor(),          # Convert images to tensors
# #     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize the images
# # ])
# full_dataset = datasets.ImageFolder(dataset_path, transform=None)

# # Split the dataset into training and validation
# train_idx, val_idx = train_test_split(np.arange(len(full_dataset)), test_size=0.2, random_state=42)
# train_dataset = Subset(full_dataset, train_idx)
# val_dataset = Subset(full_dataset, val_idx)

# # Initialize the image processor
# image_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')

# # Custom collate function using the image processor
# def collate_fn(batch):
#     images, labels = zip(*batch)
#     processed_images = torch.stack([image_processor(images=x, return_tensors="pt").pixel_values.squeeze(0) for x in images])
#     labels = torch.tensor(labels)
#     return processed_images, labels

# # Data loaders with custom collate function
# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
# val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

In [None]:
# # Model setup
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# # Load the configuration from the pre-trained model
# config = ViTConfig.from_pretrained('google/vit-base-patch16-224', num_labels=20)

# # Now load the pre-trained model with the updated configuration
# # Use `ignore_mismatched_sizes=True` to ignore the size mismatches in classifier layers
# model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224', config=config, ignore_mismatched_sizes=True)
# # Setup the model for your specific number of classes
# model.classifier = torch.nn.Linear(model.config.hidden_size, 20)
# model.num_labels = 20
# model.config.num_labels = 20

# model.to(device)

# # Optimizer
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# # Training function
# def train(model, train_loader, optimizer, device):
#     model.train()
#     total_loss = 0
#     for images, labels in train_loader:
#         images, labels = images.to(device), labels.to(device)

#         optimizer.zero_grad()
#         outputs = model(images).logits
#         loss = torch.nn.functional.cross_entropy(outputs, labels)
#         loss.backward()
#         optimizer.step()

#         total_loss += loss.item()
#     print(f"Average training loss: {total_loss / len(train_loader)}")

# # Evaluation function
# def evaluate(model, val_loader, device):
#     model.eval()
#     total = 0
#     correct = 0
#     with torch.no_grad():
#         for images, labels in val_loader:
#             images, labels = images.to(device), labels.to(device)
#             outputs = model(images).logits
#             _, predicted = torch.max(outputs, 1)
#             total += labels.size(0)
#             correct += (predicted == labels).sum().item()

#     accuracy = correct / total
#     print(f'Validation Accuracy: {accuracy * 100:.2f}%')

In [None]:
# # Training loop without evaluation each epoch
# num_epochs = 10
# for epoch in range(num_epochs):
#     print(f"Epoch {epoch+1}/{num_epochs}")
#     train(model, train_loader, optimizer, device)

# # Evaluate after training is complete
# print("Evaluating model after training...")
# evaluate(model, val_loader, device)