<a href="https://colab.research.google.com/github/Aravindh4404/FYPSeagullClassification01/blob/main/VIT3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the dataset and checkpoint paths used
# later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install timm



In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split
import numpy as np
import random
import matplotlib.pyplot as plt
from datetime import datetime
import timm  # For using pretrained Vision Transformer models
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Seed PyTorch, NumPy and Python's `random` (in that order) with one shared
# value so repeated runs draw the same random numbers.
SEED = 42
for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
    seed_fn(SEED)

# Mount Google Drive if using Google Colab
from google.colab import drive
drive.mount('/content/drive')

# Checkpoints are written to a folder on Drive whose name carries today's date
# (YYYYMMDD); the folder is created if it does not exist yet.
today = datetime.now()
date_str = today.strftime('%Y%m%d')
checkpoint_folder = f'/content/drive/My Drive/FYP/VIT2_HQ2_{date_str}/'
os.makedirs(checkpoint_folder, exist_ok=True)

# Preprocessing shared by every split: resize to the 224x224 input used by the
# ViT model, convert to a tensor, then normalise each channel with mean 0.5 and
# std 0.5 (which maps ToTensor's [0, 1] range onto [-1, 1]).
IMAGE_SIZE = (224, 224)
NORM_MEAN = [0.5, 0.5, 0.5]
NORM_STD = [0.5, 0.5, 0.5]

# Training pipeline: random flip / rotation / colour jitter are applied between
# the resize and the tensor conversion.
train_steps = [
    transforms.Resize(IMAGE_SIZE),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD),
]
transform_train = transforms.Compose(train_steps)

# Validation / test pipeline: deterministic, no augmentation.
eval_steps = [
    transforms.Resize(IMAGE_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD),
]
transform_val_test = transforms.Compose(eval_steps)

# Load datasets
data_path = '/content/drive/My Drive/FYP/Dataset/HQ2/train'
test_data_path = '/content/drive/My Drive/FYP/Dataset/HQ2/test'

# The training folder is opened twice: once with the augmenting transform and
# once with the deterministic one. Splitting a single augmented ImageFolder
# would make the validation subset inherit the random flip / rotation / colour
# jitter, which adds noise to the metrics that drive the LR scheduler and the
# best-checkpoint selection.
train_view = datasets.ImageFolder(data_path, transform=transform_train)
val_view = datasets.ImageFolder(data_path, transform=transform_val_test)
test_dataset = datasets.ImageFolder(test_data_path, transform=transform_val_test)

# Split the dataset into 80% training and 20% validation
train_size = int(0.8 * len(train_view))
val_size = len(train_view) - train_size
train_dataset, val_split = random_split(train_view, [train_size, val_size])
# Re-use the indices drawn by random_split, but read those samples through the
# non-augmented view. Both views list the same folder, so the indices are
# expected to address the same files.
val_dataset = torch.utils.data.Subset(val_view, val_split.indices)

# Create data loaders (only the training loader is shuffled)
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Set the device to GPU if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Enhanced Vision Transformer model with custom attention pooling and classifier head
class EnhancedViT(nn.Module):
    """Pre-trained ViT backbone with attention pooling and an MLP classifier.

    The backbone's own classification head is replaced by an identity. The
    token embeddings returned by ``forward_features`` are pooled with a learned
    softmax attention (one scalar score per token) and the pooled vector is
    classified by a small MLP.
    """

    def __init__(self, dropout_rate=0.3, hidden_dim=512, num_classes=2):
        """
        Initializes the enhanced ViT model.
        - Loads a pre-trained ViT backbone.
        - Removes the original classification head.
        - Adds an attention mechanism to pool patch tokens.
        - Adds a custom MLP classifier head.

        Args:
            dropout_rate: dropout probability used twice in the classifier head.
            hidden_dim: width of the hidden layer of the classifier head.
            num_classes: number of output logits (default 2, binary
                classification).
        """
        super(EnhancedViT, self).__init__()
        # Load a pre-trained ViT model from timm
        self.vit = timm.create_model('vit_base_patch16_224', pretrained=True)

        # Determine the embedding dimension BEFORE replacing the head: the
        # fallback reads the original head's in_features, which no longer
        # exists once the head has been swapped for nn.Identity.
        if hasattr(self.vit, 'embed_dim'):
            self.embed_dim = self.vit.embed_dim
        else:
            self.embed_dim = self.vit.head.in_features

        # Remove the original classification head
        self.vit.head = nn.Identity()

        # Attention mechanism: compute an attention score for each token.
        # Kept inside nn.Sequential so the state_dict keys stay unchanged.
        self.attention_layer = nn.Sequential(
            nn.Linear(self.embed_dim, 1)  # Outputs a scalar score per token
        )

        # Custom classifier head: LayerNorm -> Dropout -> Linear -> ReLU -> Dropout -> Linear
        self.classifier = nn.Sequential(
            nn.LayerNorm(self.embed_dim),
            nn.Dropout(dropout_rate),
            nn.Linear(self.embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, x):
        """
        Forward pass:
          1. Extract token embeddings via ViT's forward_features.
          2. Compute attention scores for each token.
          3. Aggregate tokens via a weighted sum.
          4. Classify the aggregated feature vector.

        Args:
            x: batch of images accepted by the backbone.

        Returns:
            Logits of shape [batch, num_classes].
        """
        # Expected shape: [batch, num_tokens, embed_dim]
        tokens = self.vit.forward_features(x)
        if tokens.dim() == 2:
            # If the backbone already returned one pooled vector per image
            # ([batch, embed_dim]), treat it as a single token so the pooling
            # below reduces to the identity instead of summing over features.
            tokens = tokens.unsqueeze(1)
        # Compute attention scores for each token; shape: [batch, num_tokens, 1]
        attn_scores = self.attention_layer(tokens)
        # Normalize attention scores using softmax along the token dimension
        attn_weights = torch.softmax(attn_scores, dim=1)
        # Weighted sum of token embeddings -> [batch, embed_dim]
        weighted_feature = torch.sum(attn_weights * tokens, dim=1)
        # Pass the aggregated features through the classifier head
        return self.classifier(weighted_feature)

# Initialize the enhanced ViT model and send it to the device
model = EnhancedViT().to(device)

# Cross-entropy loss on the raw logits; AdamW applies decoupled weight decay
# (not an L2 penalty added to the loss).
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0001, weight_decay=1e-4)
# mode='max' because the scheduler is stepped with validation accuracy: the
# learning rate is multiplied by 0.1 after 3 epochs without improvement.
# The `verbose` flag is deprecated in recent PyTorch releases, so it is not
# passed here.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=3)

def _plot_metric_curve(ax, values, title, ylabel):
    """Draw one per-epoch metric curve on ``ax`` with a title, axis labels and a grid."""
    ax.plot(values)
    ax.set_xlabel('Epochs')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.grid(True)


# Training loop with learning rate scheduler and checkpoint saving
def train(model, train_loader, val_loader, criterion, optimizer, scheduler, epochs=20):
    """Train ``model``, validate after every epoch, checkpoint and plot the curves.

    Relies on the module-level ``device``, ``checkpoint_folder`` and
    ``date_str`` and on the sibling ``validate`` function.

    Args:
        model: network to optimise (already on ``device``).
        train_loader: iterable of (inputs, labels) training batches.
        val_loader: iterable of (inputs, labels) validation batches.
        criterion: loss applied to (outputs, labels).
        optimizer: optimizer updating ``model``'s parameters.
        scheduler: stepped once per epoch with the validation accuracy.
        epochs: number of passes over ``train_loader``.

    Returns:
        dict with the per-epoch lists ``train_loss``, ``val_accuracy``,
        ``val_precision``, ``val_recall`` and ``val_f1``.
    """
    best_acc = 0.0
    train_losses = []
    val_accuracies = []
    val_precisions = []
    val_recalls = []
    val_f1s = []

    # The checkpoint paths do not depend on the epoch, so build them once.
    # Both files are overwritten in place as training progresses.
    final_model_path = os.path.join(checkpoint_folder, f"final_model_vit_{date_str}.pth")
    best_model_path = os.path.join(checkpoint_folder, f"best_model_vit_{date_str}.pth")

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Average of the per-batch losses for this epoch
        train_loss = running_loss / len(train_loader)
        train_losses.append(train_loss)
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {train_loss:.6f}")

        # Validate the model and compute metrics
        val_acc, val_precision, val_recall, val_f1 = validate(model, val_loader, criterion)
        val_accuracies.append(val_acc)
        val_precisions.append(val_precision)
        val_recalls.append(val_recall)
        val_f1s.append(val_f1)

        # Step the scheduler on the validation accuracy and report any change
        # of learning rate explicitly, so it is visible in the log regardless
        # of how the scheduler itself was configured.
        lr_before = optimizer.param_groups[0]['lr']
        scheduler.step(val_acc)
        lr_after = optimizer.param_groups[0]['lr']
        if lr_after != lr_before:
            print(f"Learning rate changed from {lr_before:.2e} to {lr_after:.2e}")

        # Save the "final" model at the end of every epoch, so the latest
        # weights survive an interrupted session.
        torch.save(model.state_dict(), final_model_path)
        print(f"Final model (epoch {epoch+1}) saved at {final_model_path}")

        # Save the best model if the accuracy improves
        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(model.state_dict(), best_model_path)
            print(f"New best model saved with accuracy: {best_acc:.2f}% at {best_model_path}")

    # Plot training loss and validation accuracy / precision / recall (2x2 grid)
    fig, axes = plt.subplots(2, 2, figsize=(20, 10))
    panels = [
        (train_losses, 'Training Loss', 'Loss'),
        (val_accuracies, 'Validation Accuracy', 'Accuracy (%)'),
        (val_precisions, 'Validation Precision', 'Precision'),
        (val_recalls, 'Validation Recall', 'Recall'),
    ]
    for ax, (values, title, ylabel) in zip(axes.flat, panels):
        _plot_metric_curve(ax, values, title, ylabel)
    fig.tight_layout()
    plt.show()

    # Plot F1 score separately
    _, f1_ax = plt.subplots(figsize=(10, 5))
    _plot_metric_curve(f1_ax, val_f1s, 'Validation F1 Score', 'F1 Score')
    plt.show()

    print("Training complete!")

    return {
        'train_loss': train_losses,
        'val_accuracy': val_accuracies,
        'val_precision': val_precisions,
        'val_recall': val_recalls,
        'val_f1': val_f1s,
    }

# Validation loop returning metrics
def validate(model, loader, criterion):
    """Evaluate ``model`` on ``loader`` and return classification metrics.

    Runs in eval mode without gradient tracking and uses the module-level
    ``device``. ``criterion`` is accepted but never used here.

    Returns:
        (accuracy, precision, recall, f1): accuracy is a percentage; the other
        three come from scikit-learn with ``average='binary'``.
    """
    model.eval()
    y_true = []
    y_pred = []
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch_inputs, batch_labels in loader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)
            logits = model(batch_inputs)
            # Predicted class = index of the largest logit in each row
            batch_pred = torch.max(logits.data, 1)[1]
            n_seen += batch_labels.size(0)
            n_correct += (batch_pred == batch_labels).sum().item()
            y_true.extend(batch_labels.cpu().numpy())
            y_pred.extend(batch_pred.cpu().numpy())

    accuracy = 100 * n_correct / n_seen
    precision, recall, f1 = (
        metric(y_true, y_pred, average='binary')
        for metric in (precision_score, recall_score, f1_score)
    )
    print(f'Validation Accuracy: {accuracy:.2f}%, Precision: {precision:.2f}, Recall: {recall:.2f}, F1 Score: {f1:.2f}')
    return accuracy, precision, recall, f1

# Test function to evaluate on the test set
def test(model, loader, criterion):
    """Print the mean batch loss and the accuracy of ``model`` on ``loader``.

    Runs in eval mode without gradient tracking and uses the module-level
    ``device``. Nothing is returned; the result is only printed.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch_inputs, batch_labels in loader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)
            logits = model(batch_inputs)
            loss_sum += criterion(logits, batch_labels).item()

            # Predicted class = index of the largest logit in each row
            batch_pred = torch.max(logits.data, 1)[1]
            n_seen += batch_labels.size(0)
            n_correct += (batch_pred == batch_labels).sum().item()

    accuracy = 100 * n_correct / n_seen
    # The reported loss is the average of the per-batch losses
    print(f'Test Loss: {loss_sum/len(loader):.6f}, Test Accuracy: {accuracy:.2f}%')

# Step 1: fine-tune the model for 20 epochs; the scheduler is stepped on the
# validation accuracy inside train().
train(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    epochs=20,
)

# Step 2: report loss and accuracy on the held-out test set, using the weights
# the model holds after the last epoch.
test(model=model, loader=test_loader, criterion=criterion)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]



Epoch [1/20], Loss: 0.442076
Validation Accuracy: 95.39%, Precision: 0.96, Recall: 0.98, F1 Score: 0.97
Final model (epoch 1) saved at /content/drive/My Drive/FYP/VIT2_HQ2_20250207/final_model_vit_20250207.pth
New best model saved with accuracy: 95.39% at /content/drive/My Drive/FYP/VIT2_HQ2_20250207/best_model_vit_20250207.pth
Epoch [2/20], Loss: 0.242328
Validation Accuracy: 96.05%, Precision: 0.95, Recall: 1.00, F1 Score: 0.98
Final model (epoch 2) saved at /content/drive/My Drive/FYP/VIT2_HQ2_20250207/final_model_vit_20250207.pth
New best model saved with accuracy: 96.05% at /content/drive/My Drive/FYP/VIT2_HQ2_20250207/best_model_vit_20250207.pth
Epoch [3/20], Loss: 0.206850
Validation Accuracy: 90.13%, Precision: 0.95, Recall: 0.93, F1 Score: 0.94
Final model (epoch 3) saved at /content/drive/My Drive/FYP/VIT2_HQ2_20250207/final_model_vit_20250207.pth
Epoch [4/20], Loss: 0.119347
Validation Accuracy: 92.76%, Precision: 0.92, Recall: 1.00, F1 Score: 0.96
Final model (epoch 4) save