<a href="https://colab.research.google.com/github/Ananya10-Coder/Image-Classification/blob/main/ImageClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Install Required Libraries
# Notebook shell escape (not plain Python): installs torch, torchvision,
# transformers, datasets and evaluate, which the imports below rely on.
!pip install torch torchvision transformers datasets evaluate

# Step 2: Import Libraries
import torch
from torchvision import transforms
from transformers import ViTForImageClassification, ViTFeatureExtractor, AdamW
from datasets import load_dataset
import numpy as np
import evaluate

# Step 3: Load a Small Subset of the MNIST Dataset
# Fetch MNIST through Hugging Face's datasets library
mnist_dataset = load_dataset("mnist")

# Shuffle each split with a fixed seed, then keep only the first 100 training
# examples and the first 20 test examples so the run stays small.
shuffled_train_split = mnist_dataset["train"].shuffle(seed=42)
shuffled_test_split = mnist_dataset["test"].shuffle(seed=42)
small_train_dataset = shuffled_train_split.select(range(100))
small_test_dataset = shuffled_test_split.select(range(20))

# Step 4: Preprocess the Data
# Load the image processor that belongs to the pre-trained Vision Transformer
# (ViT) checkpoint. ViTImageProcessor replaces the deprecated
# ViTFeatureExtractor and provides the same image_mean / image_std attributes.
from transformers import ViTImageProcessor

model_name = "google/vit-base-patch16-224"
feature_extractor = ViTImageProcessor.from_pretrained(model_name)

# Define a transformation pipeline: resize images to 224x224, convert them to
# tensors, and normalize them with the checkpoint's own mean and std.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=feature_extractor.image_mean,
        std=feature_extractor.image_std,
    ),
])

# Apply the transformation to the dataset
def preprocess_images(examples):
    """Add a "pixel_values" entry holding the transformed images of a batch.

    Every item of examples["image"] is converted to RGB and passed through
    the module-level `transform` pipeline. The batch is updated in place and
    also returned.
    """
    transformed_images = []
    for image in examples["image"]:
        rgb_image = image.convert("RGB")
        transformed_images.append(transform(rgb_image))
    examples["pixel_values"] = transformed_images
    return examples

# Run the batched preprocessing over both subsets (train first, then test).
small_train_dataset, small_test_dataset = [
    subset.map(preprocess_images, batched=True)
    for subset in (small_train_dataset, small_test_dataset)
]

# Step 5: Load Pre-trained ViT Model
# The classification head is created for 10 labels; mismatched checkpoint
# weights for that head are skipped instead of raising an error.
vit_load_options = {
    "num_labels": 10,  # MNIST has 10 classes
    "ignore_mismatched_sizes": True,  # Ignore size mismatch in the final layer
}
model = ViTForImageClassification.from_pretrained(model_name, **vit_load_options)

# Step 6: Prepare Data for Training
from torch.utils.data import DataLoader

# Expose only the columns the training and evaluation loops read. Without
# this, the raw "image" column would also be formatted and collated for every
# batch even though nothing consumes it.
model_input_columns = ["pixel_values", "label"]

# Create PyTorch DataLoaders for the datasets (training batches are shuffled)
train_dataloader = DataLoader(
    small_train_dataset.with_format("torch", columns=model_input_columns),
    batch_size=8,
    shuffle=True,
)
test_dataloader = DataLoader(
    small_test_dataset.with_format("torch", columns=model_input_columns),
    batch_size=8,
)

# Step 7: Define Evaluation Metrics
# Load the "accuracy" metric through the `evaluate` library; evaluate_model
# later calls accuracy_metric.compute(predictions=..., references=...) on it.
accuracy_metric = evaluate.load("accuracy")

# Step 8: Fine-Tune the Model
# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer and loss function.
# The AdamW class exported by transformers is deprecated in favour of
# torch.optim.AdamW. eps and weight_decay are passed explicitly to keep the
# values the transformers optimizer used by default (torch's own defaults are
# eps=1e-8 and weight_decay=1e-2).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=5e-5,
    eps=1e-6,
    weight_decay=0.0,
)
loss_fn = torch.nn.CrossEntropyLoss()

# Fine-tuning loop
num_epochs = 3  # Number of epochs for fine-tuning
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0  # Sum of per-batch losses, averaged after the epoch

    for batch in train_dataloader:
        pixel_values = batch["pixel_values"].to(device)
        labels = batch["label"].to(device)

        # Forward pass
        outputs = model(pixel_values=pixel_values)
        logits = outputs.logits

        # Compute loss
        loss = loss_fn(logits, labels)
        total_loss += loss.item()

        # Backward pass and optimization; gradients are cleared first so they
        # do not accumulate across batches
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Print average per-batch loss for the epoch
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")

# Step 9: Evaluate the Model
def evaluate_model(model, dataloader):
    """Compute, print and return the accuracy of `model` over `dataloader`.

    Args:
        model: Classifier that is called as model(pixel_values=...) and whose
            output exposes a `.logits` attribute.
        dataloader: Iterable of batches containing "pixel_values" and "label"
            tensors.

    Returns:
        The "accuracy" value produced by the module-level `accuracy_metric`
        (the same number that is printed, before it is scaled to a percentage).
    """
    model.eval()
    # Run on whichever device the model's parameters live on, so the function
    # does not depend on a global `device` variable.
    model_device = next(model.parameters()).device
    predictions = []
    references = []

    # Gradients are never needed during evaluation, so disable them once for
    # the whole loop.
    with torch.no_grad():
        for batch in dataloader:
            pixel_values = batch["pixel_values"].to(model_device)
            logits = model(pixel_values=pixel_values).logits

            preds = torch.argmax(logits, dim=-1)
            predictions.extend(preds.cpu().tolist())
            # Labels are only compared on the host; no device transfer needed.
            references.extend(batch["label"].tolist())

    # Calculate accuracy
    result = accuracy_metric.compute(predictions=predictions, references=references)
    accuracy = result["accuracy"]
    print(f"Accuracy: {accuracy * 100:.2f}%")
    return accuracy

# Evaluate on the test set
print("Evaluating on the test set...")
evaluate_model(model, test_dataloader)

# Step 10: Make Predictions on a Single Image
from PIL import Image

# Load a sample from the test set. The row is fetched once so that its
# columns (including the large "pixel_values" entry) are not materialized
# twice just to read the image and the label.
sample = small_test_dataset[0]
sample_image = sample["image"]
sample_label = sample["label"]

# Preprocess the image and add a leading batch dimension
pixel_values = transform(sample_image.convert("RGB")).unsqueeze(0).to(device)

# Make a prediction. Eval mode is set explicitly so this step does not depend
# on an earlier step having left the model in eval mode.
model.eval()
with torch.no_grad():
    outputs = model(pixel_values=pixel_values)
    logits = outputs.logits
    pred = torch.argmax(logits, dim=-1).item()

print(f"Predicted Label: {pred}")
print(f"Actual Label: {sample_label}")

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/6.97k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/15.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/2.60M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/60000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]



Epoch 1/3, Loss: 1.9681
Epoch 2/3, Loss: 0.9715
Epoch 3/3, Loss: 0.3829
Evaluating on the test set...
Accuracy: 90.00%
Predicted Label: 5
Actual Label: 5
