<a href="https://colab.research.google.com/github/1489685175z-coder/Bird_Speicy_Classification/blob/main/Bird_Speicy_Classify.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# Project: Fine-Grained Bird Species Classification on CUB-200-2011
# Compare: Pretrained ResNet-18 vs Pretrained ViT

# =============================================================================
# 1. Install dependencies (if needed in Colab)
# =============================================================================
!pip install -q transformers datasets torch torchvision scikit-learn matplotlib pillow tqdm evaluate
!pip install seaborn
# =============================================================================
# 2. Import libraries
# =============================================================================
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
from torchvision import models, transforms
from transformers import ViTForImageClassification
from datasets import load_dataset
from tqdm import tqdm
from sklearn.metrics import accuracy_score, top_k_accuracy_score, classification_report, confusion_matrix
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import time
import seaborn as sns
import pandas as pd



In [9]:
# =============================================================================
# 3. Device & Hyperparameters
# =============================================================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

BATCH_SIZE = 32  # Compromise between originals (16 for ViT, 64 for ResNet)
NUM_EPOCHS = 30   # Balanced for comparison
NUM_CLASSES = 200

models_config = [
    {"name": "resnet18", "pretrained": True, "lr": 0.001},
    {"name": "vit", "pretrained": True, "lr": 3e-5}
]


Using device: cuda


In [10]:
# =============================================================================
# 4. Load Dataset from Hugging Face
# =============================================================================
print("Loading dataset...")
hf_dataset = load_dataset("bentrevett/caltech-ucsd-birds-200-2011")

class_names = hf_dataset['train'].features['label'].names  # Get class names for reporting

class CUBDataset(torch.utils.data.Dataset):
    def __init__(self, hf_dataset, transform=None):
        self.hf_dataset = hf_dataset
        self.transform = transform

    def __len__(self):
        return len(self.hf_dataset)

    def __getitem__(self, idx):
        item = self.hf_dataset[idx]
        image = item['image'].convert('RGB')
        label = item['label']
        if self.transform:
            image = self.transform(image)
        return image, label

train_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

test_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

full_train_ds = CUBDataset(hf_dataset['train'], train_transform)
test_ds = CUBDataset(hf_dataset['test'], test_transform)

train_size = int(0.85 * len(full_train_ds))
val_size = len(full_train_ds) - train_size
train_ds, val_ds = random_split(full_train_ds, [train_size, val_size])

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

print(f"Train samples: {len(train_ds)}, Val samples: {len(val_ds)}, Test samples: {len(test_ds)}")
print(f"Num classes: {NUM_CLASSES}, Example classes: {class_names[:5]}...")


Loading dataset...
Train samples: 5094, Val samples: 900, Test samples: 5794
Num classes: 200, Example classes: ['001.Black_footed_Albatross', '002.Laysan_Albatross', '003.Sooty_Albatross', '004.Groove_billed_Ani', '005.Crested_Auklet']...


In [11]:
# =============================================================================
# 5. Train and Evaluate Functions (Unified for both models)
# =============================================================================
def train_epoch(model, loader, criterion, optimizer, scaler, device, model_name):
    model.train()
    total_loss, correct, total = 0.0, 0, 0
    for images, labels in tqdm(loader, desc="Training"):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        with torch.cuda.amp.autocast(enabled=True):
            if model_name == "vit":
                outputs = model(pixel_values=images).logits
            else:
                outputs = model(images)
            loss = criterion(outputs, labels)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item() * images.size(0)
        pred = outputs.argmax(dim=1)
        correct += (pred == labels).sum().item()
        total += images.size(0)
    return total_loss / total, correct / total

def evaluate(model, loader, criterion, device, model_name, is_test=False):
    model.eval()
    total_loss, correct, total = 0.0, 0, 0
    all_preds, all_labels, all_logits = [], [], []
    with torch.no_grad():
        for images, labels in tqdm(loader, desc="Evaluating"):
            images, labels = images.to(device), labels.to(device)
            if model_name == "vit":
                outputs = model(pixel_values=images).logits
            else:
                outputs = model(images)
            loss = criterion(outputs, labels)
            total_loss += loss.item() * images.size(0)
            pred = outputs.argmax(dim=1)
            correct += (pred == labels).sum().item()
            total += images.size(0)
            all_preds.extend(pred.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            if is_test:
                all_logits.extend(outputs.cpu().numpy())  # For top-5
    acc = correct / total
    loss = total_loss / total
    top5_acc = top_k_accuracy_score(all_labels, all_logits, k=5) if is_test else None
    return loss, acc, top5_acc, all_preds, all_labels


In [None]:
# =============================================================================
# 6. Main Training Loop
# =============================================================================
all_history = {}
test_results = {}
for config in models_config:
    name = config["name"]
    print(f"\n=== Training {name} ===")
    if name == "resnet18":
        model = models.resnet18(pretrained=config["pretrained"])
        model.fc = nn.Linear(model.fc.in_features, NUM_CLASSES)
        # Freeze all except last layer and classifier (as in original ResNet)
        for param in model.parameters():
            param.requires_grad = False
        for param in model.layer4.parameters():
            param.requires_grad = True
        for param in model.fc.parameters():
            param.requires_grad = True

    elif name == "vit":
        model = ViTForImageClassification.from_pretrained(
            "google/vit-base-patch16-224-in21k",
            num_labels=NUM_CLASSES,
            ignore_mismatched_sizes=True
        )
        # No freezing in original ViT, so train all with small LR

    model = model.to(device)
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    optimizer = optim.AdamW(model.parameters(), lr=config["lr"], weight_decay=0.01)
    scaler = torch.cuda.amp.GradScaler(enabled=True)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2)

    history = {"train_loss": [], "train_acc": [], "val_loss": [], "val_acc": [], "train_time": []}
    best_val_acc = 0.0
    t_start = time.time()

    for epoch in range(NUM_EPOCHS):
        tr_loss, tr_acc = train_epoch(model, train_loader, criterion, optimizer, scaler, device, name)
        va_loss, va_acc, _, _, _ = evaluate(model, val_loader, criterion, device, name)
        scheduler.step(va_acc)
        history["train_loss"].append(tr_loss)
        history["train_acc"].append(tr_acc)
        history["val_loss"].append(va_loss)
        history["val_acc"].append(va_acc)
        epoch_time = time.time() - t_start
        history["train_time"].append(epoch_time)
        print(f"Epoch {epoch+1}/{NUM_EPOCHS} | Train Loss: {tr_loss:.4f} Acc: {tr_acc:.4f} | Val Loss: {va_loss:.4f} Acc: {va_acc:.4f}")

        if va_acc > best_val_acc:
            best_val_acc = va_acc
            torch.save(model.state_dict(), f"best_{name}.pth")

    total_train_time = time.time() - t_start
    print(f"Total training time for {name}: {total_train_time:.1f}s")

    # Load best model for test
    model.load_state_dict(torch.load(f"best_{name}.pth", map_location=device))
    te_loss, te_acc, te_top5_acc, te_preds, te_labels = evaluate(model, test_loader, criterion, device, name, is_test=True)
    report = classification_report(te_labels, te_preds, target_names=class_names, output_dict=True)
    macro_f1 = report['macro avg']['f1-score']

    test_results[name] = {
        "acc": te_acc,
        "top5_acc": te_top5_acc,
        "macro_f1": macro_f1,
        "train_time": total_train_time,
        "preds": te_preds,
        "labels": te_labels
    }
    all_history[name] = history

    # Per-model error analysis (top-5 confused pairs)
    errors = [(true, pred) for true, pred in zip(te_labels, te_preds) if true != pred]
    if errors:
        most_common_errors = Counter(errors).most_common(5)
        print(f"Top 5 confused pairs for {name} (True → Predicted):")
        for (true_label, pred_label), count in most_common_errors:
            true_name = class_names[true_label]
            pred_name = class_names[pred_label]
            print(f"  {true_name} → {pred_name} : {count} times")


=== Training resnet18 ===


  scaler = torch.cuda.amp.GradScaler(enabled=True)
  with torch.cuda.amp.autocast(enabled=True):
Training:  16%|█▋        | 26/160 [00:09<00:35,  3.72it/s]

In [None]:

# =============================================================================
# 7. Compare Models
# =============================================================================
print("\n=== Model Comparison ===")
comparison_data = {
    "Model": list(test_results.keys()),
    "Test Accuracy": [f"{test_results[m]['acc']:.4f}" for m in test_results],
    "Test Top-5 Accuracy": [f"{test_results[m]['top5_acc']:.4f}" for m in test_results],
    "Macro F1-Score": [f"{test_results[m]['macro_f1']:.4f}" for m in test_results],
    "Total Train Time (s)": [f"{test_results[m]['train_time']:.1f}" for m in test_results]
}
print(pd.DataFrame(comparison_data).to_markdown(index=False))  # Requires import pandas as pd if needed

# Plot combined curves
plt.figure(figsize=(12, 5))
for name in all_history:
    plt.subplot(1, 2, 1)
    plt.plot(all_history[name]["train_loss"], label=f"{name} train_loss")
    plt.plot(all_history[name]["val_loss"], label=f"{name} val_loss")
    plt.title("Loss Curves")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(all_history[name]["train_acc"], label=f"{name} train_acc")
    plt.plot(all_history[name]["val_acc"], label=f"{name} val_acc")
    plt.title("Accuracy Curves")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.legend()
plt.show()

In [None]:
# =============================================================================
# 8. Plot Confusion Matrices for Both Models
# =============================================================================
print("\n=== Plotting Confusion Matrices ===")

for name in test_results:
    y_true = test_results[name]['labels']   # list or np.array of true labels
    y_pred = test_results[name]['preds']    # list or np.array of predicted labels

    # Compute Confusion Matrice (200x200)
    cm = confusion_matrix(y_true, y_pred)

    # Normalized (row-normalized to 1, easier to see error distribution)
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    cm_normalized = np.nan_to_num(cm_normalized)


    plt.figure(figsize=(10, 9))
    sns.heatmap(
        cm_normalized,
        cmap='Blues',
        annot=False,
        xticklabels=False,
        yticklabels=False,
        square=False,
        cbar_kws={'label': 'Normalized', 'shrink': 0.7}
    )
    plt.title(f'Normalized Confusion Matrix - {name.upper()}', fontsize=12)
    plt.ylabel('True', fontsize=10)
    plt.xlabel('Predicted', fontsize=10)
    plt.tight_layout(pad=0.5)
    plt.savefig(f'cm_{name}_small.png', dpi=200, bbox_inches='tight')
    plt.show()