# Download Gdown

In [1]:
!pip3 install gdown
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
!pip install transformers scikit-learn pillow pandas numpy

Collecting gdown
  Using cached gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Using cached gdown-5.2.0-py3-none-any.whl (18 kB)
Installing collected packages: gdown
Successfully installed gdown-5.2.0
Looking in indexes: https://download.pytorch.org/whl/cu128
Collecting torch
  Downloading https://download.pytorch.org/whl/cu128/torch-2.7.1%2Bcu128-cp313-cp313-win_amd64.whl.metadata (27 kB)
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu128/torchvision-0.22.1%2Bcu128-cp313-cp313-win_amd64.whl.metadata (6.3 kB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu128/torchaudio-2.7.1%2Bcu128-cp313-cp313-win_amd64.whl.metadata (6.8 kB)
Downloading https://download.pytorch.org/whl/cu128/torch-2.7.1%2Bcu128-cp313-cp313-win_amd64.whl (3273.0 MB)
   ---------------------------------------- 0.0/3.3 GB ? eta -:--:--
   ---------------------------------------- 0.0/3.3 GB 39.4 MB/s eta 0:01:23
   ---------------------------------------- 0.0/3.3 GB 32.8 M

# Import Library

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader, Subset
from torch.optim import AdamW
from transformers import ViTModel, ViTConfig
from transformers import AutoImageProcessor
from torch.amp import GradScaler, autocast
from torch.optim.lr_scheduler import LambdaLR, CosineAnnealingLR
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, recall_score, accuracy_score, precision_score, f1_score
from sklearn.preprocessing import LabelEncoder
from PIL import Image
import gdown

# Download csv file

In [4]:
gdown.download(id='1aviFFVPuV7jI1i8C6d8k4nSO73B-yaiJ', output='training.csv', quiet=False)
gdown.download(id='1SRDz3tFS7GUWafccRZnDmVAogSfYrCFl', output='testing.csv', quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1aviFFVPuV7jI1i8C6d8k4nSO73B-yaiJ
From (redirected): https://drive.google.com/uc?id=1aviFFVPuV7jI1i8C6d8k4nSO73B-yaiJ&confirm=t&uuid=87726402-83c3-40dc-af3a-e33099a43e91
To: C:\Users\aloys\PycharmProjects\NAIC Competition\training.csv
100%|██████████| 521M/521M [00:14<00:00, 36.5MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1SRDz3tFS7GUWafccRZnDmVAogSfYrCFl
To: C:\Users\aloys\PycharmProjects\NAIC Competition\testing.csv
100%|██████████| 87.2M/87.2M [00:02<00:00, 36.5MB/s]


'testing.csv'

# Load our own dataset

In [24]:
class DataSetLoader(Dataset):
    def __init__(self, csv_file, transform=None):
        self.data = pd.read_csv(csv_file)
        self.transform = transform

        self.label_map = {label: idx for idx, label in enumerate(sorted(self.data['label'].unique()))}
        self.data['label'] = self.data['label'].map(self.label_map)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        pixel_str = self.data['pixels'].iloc[idx]
        image_flat = np.fromstring(pixel_str, sep=',', dtype=np.uint8)

        image = image_flat.reshape((224, 224, 3))
        image = Image.fromarray(image)

        if self.transform:
            image = self.transform(image)

        label = self.data['label'].iloc[idx]

        return image, label

# Setup our device and introduce our pre-trained model

In [25]:
# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Cuda available: ", torch.cuda.is_available())

processor = AutoImageProcessor.from_pretrained('google/vit-base-patch16-224')

Cuda available:  True


Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


# Setup and separate training and validation data

In [30]:
# Data transforms
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3),
    transforms.ToTensor(),
    transforms.Normalize(mean=processor.image_mean, std=processor.image_std)
])

val_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=processor.image_mean, std=processor.image_std)
])

df = pd.read_csv('training.csv')
labels = df['label']

indices = list(range(len(df)))
train_indices, val_indices = train_test_split(indices, test_size=0.2, stratify=labels, random_state=42)

train_dataset = DataSetLoader('training.csv', train_transform)
val_dataset = DataSetLoader('testing.csv', val_transform)

train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True
)

val_loader = DataLoader(
    val_dataset,
    batch_size=16,
    shuffle=False
)

# Setup and customize our ViT Model

In [27]:
# Load Google ViT model
config = ViTConfig.from_pretrained('google/vit-base-patch16-224')
vit = ViTModel.from_pretrained('google/vit-base-patch16-224')

# Custom head
class ViTForClassification(nn.Module):
    def __init__(self, vit_model, num_classes=8):
        super().__init__()
        self.vit = vit_model
        self.classifier = nn.Sequential(
            nn.Linear(vit.config.hidden_size, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes),
        )

    def forward(self, x):
        outputs = self.vit(pixel_values=x)
        cls_token = outputs.last_hidden_state[:, 0, :]
        return self.classifier(cls_token)

model = ViTForClassification(vit).to(device)

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Hyperparameters tuning

> We setup our class weights, epochs number, early stopping mechanism, optimizer and learning rate scheduler to optimize our model performance and provide better generalization through reducing overfitting


In [31]:
class_weights = compute_class_weight('balanced', classes=np.unique(labels), y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)


num_epochs = 20
warmup_epochs = 5
early_stopping_patience = 5
epochs_no_improvement = 0
best_val_loss = float('inf')
early_stopping = False

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=1e-4)
scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs - warmup_epochs, eta_min=1e-6)
criterion = nn.CrossEntropyLoss(weight=class_weights, label_smoothing=0.1)
scaler = GradScaler()

def lr_lambda(current_step):
    if current_step < warmup_epochs:
        return float(current_step) / float(max(1, warmup_epochs))
    else:
        return 1.0

scheduler_warmup = LambdaLR(optimizer, lr_lambda)


# Training and validation process

In [32]:
for epoch in range(num_epochs):
    if epoch == 3:
        for param in model.vit.encoder.layer[-4:].parameters():
            param.requires_grad = True
    elif epoch == 6:
        for param in model.vit.encoder.layer.parameters():
            param.requires_grad = True

    model.train()
    train_loss, train_correct, train_total = 0.0, 0, 0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        with autocast('cuda'):
            outputs = model(inputs)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()

        train_loss += loss.item() * inputs.size(0)
        _, predicted = torch.max(outputs.data, 1)
        train_total += labels.size(0)
        train_correct += (predicted == labels).sum().item()

    if epoch < warmup_epochs:
        scheduler_warmup.step()

    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    all_preds = []
    all_labels = []

    for inputs, labels in val_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        with autocast('cuda'):
            outputs = model(inputs)
            loss = criterion(outputs, labels)

        val_loss += loss.item() * inputs.size(0)
        _, predicted = torch.max(outputs.data, 1)
        val_total += inputs.size(0)
        val_correct += (predicted == labels).sum().item()
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    epoch_train_loss = train_loss / train_total
    epoch_train_acc = 100 * train_correct / train_total
    epoch_val_loss = val_loss / val_total
    epoch_val_acc = 100 * val_correct / val_total
    val_f1 = f1_score(all_labels, all_preds, average='macro')
    acc = accuracy_score(all_labels, all_preds)
    prec = precision_score(all_labels, all_preds, average='macro', zero_division=0)
    rec = recall_score(all_labels, all_preds, average='macro', zero_division=0)
    cm = confusion_matrix(all_labels, all_preds)
    scheduler.step()

    if epoch_val_loss < best_val_loss:
        best_val_loss = epoch_val_loss
        epochs_no_improvement = 0
        #torch.save(model.state_dict(), f'best_resnet34_weights.pth')
    else:
        epochs_no_improvement += 1
        if epochs_no_improvement >= early_stopping_patience:
            print('Early stopping')
            early_stopping = True
            break

    print(f"Epoch [{epoch+1}/{num_epochs}]")
    print(f"Train Loss: {epoch_train_loss:.4f} | Train Accuracy: {epoch_train_acc:.2f}%")
    print(f"Test Loss: {epoch_val_loss:.4f} | Test Accuracy: {epoch_val_acc:.2f}%")

Epoch [1/20]
Train Loss: 2.0949 | Train Accuracy: 15.99%
Test Loss: 2.0408 | Test Accuracy: 20.62%
Epoch [2/20]
Train Loss: 1.8348 | Train Accuracy: 36.26%
Test Loss: 1.6266 | Test Accuracy: 54.38%
Epoch [3/20]
Train Loss: 1.2352 | Train Accuracy: 75.65%
Test Loss: 1.0753 | Test Accuracy: 78.75%
Epoch [4/20]
Train Loss: 0.7319 | Train Accuracy: 93.63%
Test Loss: 0.7105 | Test Accuracy: 88.75%
Epoch [5/20]
Train Loss: 0.5838 | Train Accuracy: 97.39%
Test Loss: 0.6042 | Test Accuracy: 96.25%
Epoch [6/20]
Train Loss: 0.5476 | Train Accuracy: 99.58%
Test Loss: 0.6000 | Test Accuracy: 95.62%
Epoch [7/20]
Train Loss: 0.5313 | Train Accuracy: 99.27%
Test Loss: 0.6062 | Test Accuracy: 95.62%
Epoch [8/20]
Train Loss: 0.5184 | Train Accuracy: 99.79%
Test Loss: 0.5527 | Test Accuracy: 96.25%
Epoch [9/20]
Train Loss: 0.5192 | Train Accuracy: 99.37%
Test Loss: 0.5922 | Test Accuracy: 95.62%
Epoch [10/20]
Train Loss: 0.5090 | Train Accuracy: 99.79%
Test Loss: 0.5612 | Test Accuracy: 95.62%
Epoch [11

# Display performance statistics of our model

In [33]:
print(f"Accuracy: {acc * 100:.2f}%")
print(f"Precision: {prec * 100:.2f}%")
print(f"Recall: {rec * 100:.2f}%")
print(f"F1-Score: {val_f1 * 100:.2f}%")
print("Confusion Matrix: ")
print(cm)

Accuracy: 96.88%
Precision: 97.26%
Recall: 96.88%
F1-Score: 96.92%
Confusion Matrix: 
[[19  0  0  0  0  1  0  0]
 [ 0 21  0  0  0  0  0  0]
 [ 0  0 20  0  0  0  0  0]
 [ 0  0  0 20  0  0  0  0]
 [ 0  0  0  0 17  3  0  0]
 [ 0  1  0  0  0 19  0  0]
 [ 0  0  0  0  0  0 20  0]
 [ 0  0  0  0  0  0  0 19]]
