In [1]:
from datasets import load_dataset
from evaluate import load as evaluate_load
from torchvision import transforms
import torch
import wandb

In [2]:
# Open the Weights & Biases run; the Trainer further down is configured with
# report_to='wandb', so its logs are sent to this project.
WANDB_PROJECT = "vit_baseline_dog"
wandb.init(project=WANDB_PROJECT)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33myepluovozz[0m ([33myepluovozz-the-australian-national-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# Load the Stanford Dogs dataset by its repository id. The result is indexed by
# split name ('train' / 'test') in the cells below.
DATASET_ID = 'amaye15/stanford-dogs'
datasets = load_dataset(DATASET_ID)

In [4]:
# Per-channel statistics used to normalise the RGB tensors (the standard
# ImageNet mean / std).
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Resize every image to 224x224, convert it to a float tensor and normalise it.
# NOTE(review): the checkpoint's own image processor (loaded in a later cell)
# may declare different mean/std values — confirm these statistics match what
# the pretrained backbone expects.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])


def preprocess_images(example):
    """Turn a batch of images into normalised tensors.

    Args:
        example: batched mapping whose ``'pixel_values'`` entry is a list of
            image objects exposing ``.convert`` (PIL-style).

    Returns:
        The same mapping, with ``'pixel_values'`` replaced by the list of
        tensors produced by ``transform``. Other entries are left untouched.
    """
    example['pixel_values'] = [
        transform(image.convert("RGB")) for image in example['pixel_values']
    ]
    return example


# Apply the transform lazily, when rows are accessed, instead of `map`-ing it
# over the whole dataset: `map` would write every 3x224x224 float tensor into
# the on-disk cache as nested lists (many GB) and force each batch to be
# rebuilt from Python lists. With `with_transform` the tensors are produced on
# the fly and can be stacked directly by the data collator.
# This relies on the Trainer being created with remove_unused_columns=False so
# the raw image column is still present when the transform runs.
dataset = datasets.with_transform(preprocess_images)

Map:   0%|          | 0/14406 [00:00<?, ? examples/s]

Map:   0%|          | 0/6174 [00:00<?, ? examples/s]

In [5]:
# Class names of the 'label' feature of the train split; the position of a name
# in this list is used as its class index below.
label_feature = datasets['train'].features['label']
labels = label_feature.names

In [6]:
# Class-name <-> class-index lookups handed to the model config.
# Indices are kept as integers (not strings) so that label2id[name] can be used
# directly as a class index and id2label can be indexed with a predicted id.
id2label = dict(enumerate(labels))
label2id = {name: index for index, name in id2label.items()}

In [7]:
#import evaluate
#accuracy = evaluate.load("accuracy")

In [7]:
import numpy as np


# Metric objects already loaded, keyed by metric name, so that repeated
# evaluation passes do not re-resolve the metric every time.
_METRIC_CACHE = {}


def _load_metric_once(name):
    """Return the metric called ``name``, loading it only on first use."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate_load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute top-1 accuracy, top-5 accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: pair ``(predictions, labels)``. Either element may be a
            tuple, in which case only its first item is used. Both are NumPy
            arrays: ``predictions`` holds per-class scores with the class axis
            last, ``labels`` holds one class index per example.

    Returns:
        dict with the keys ``"accuracy"``, ``"top5_accuracy"`` and
        ``"f1_score"`` (weighted-average F1), each a Python float.
    """
    predictions, labels = eval_pred

    # Some models return several outputs; the scores are the first one.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    if isinstance(labels, tuple):
        labels = labels[0]

    scores = torch.from_numpy(predictions)
    targets = torch.from_numpy(labels)

    # Top-1: the highest-scoring class must equal the label.
    predicted_classes = scores.argmax(dim=-1)
    accuracy = (predicted_classes == targets).float().mean().item()

    # Top-k (k=5): the label must appear among the k highest-scoring classes.
    # k is clamped so the metric still works with fewer than 5 classes.
    k = min(5, scores.shape[-1])
    topk_classes = scores.topk(k, dim=-1)[1]
    topk_hits = (topk_classes == targets.view(-1, 1)).any(dim=-1).float()
    top5_accuracy = topk_hits.mean().item()

    # Weighted F1 via the `evaluate` metric, loaded once and reused.
    metric_f1 = _load_metric_once("f1")
    f1_score = metric_f1.compute(
        predictions=predicted_classes.numpy(),
        references=targets.numpy(),
        average="weighted",
    )['f1']

    return {
        "accuracy": accuracy,
        "top5_accuracy": top5_accuracy,
        "f1_score": f1_score
    }

In [8]:
from transformers import AutoImageProcessor, AutoModelForImageClassification

# Single source of truth for the checkpoint id: the model and its image
# processor are both loaded from this name so they cannot drift apart.
model_name = "google/vit-base-patch16-224-in21k"

# Load the pretrained backbone with a classification head sized for this
# dataset's classes and labelled with the class-name mappings built above.
model = AutoModelForImageClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

print(model)

# Image processor belonging to the same checkpoint; it is handed to the Trainer
# in the training cell.
preprocessor = AutoImageProcessor.from_pretrained(model_name)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fe

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


In [9]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="vit-baseline-dog",
    # Keep every dataset column: the image column must reach the collator.
    remove_unused_columns=False,
    # Evaluate and checkpoint on the same step cadence, which
    # load_best_model_at_end needs in order to restore the best checkpoint.
    eval_strategy="steps",
    save_strategy="steps",
    eval_steps=28,
    save_steps=28,
    learning_rate=5e-4,
    # Effective train batch per device = 64 * 4 accumulation steps = 256.
    per_device_train_batch_size=64,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=64,
    num_train_epochs=5,
    warmup_ratio=0.1,
    logging_steps=10,
    # After training, reload the checkpoint with the highest eval accuracy
    # (the "accuracy" key returned by compute_metrics).
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to='wandb'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    # `tokenizer=` is the deprecated name for this argument; `processing_class`
    # is its replacement and takes the image processor.
    processing_class=preprocessor,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Step,Training Loss,Validation Loss,Accuracy,Top5 Accuracy,F1 Score
28,4.5045,3.714836,0.636378,0.937642,0.606877
56,2.6918,2.135589,0.74668,0.959346,0.728469
84,1.3154,1.309558,0.767412,0.961613,0.756467
112,0.8403,0.981152,0.796242,0.966148,0.792991
140,0.4442,0.870798,0.783771,0.964853,0.779808
168,0.3684,0.79365,0.792679,0.968416,0.790486
196,0.2282,0.77241,0.797862,0.965339,0.7975
224,0.1762,0.71539,0.815517,0.971817,0.817102
252,0.1242,0.699528,0.819728,0.971655,0.820117
280,0.1114,0.673231,0.825721,0.972951,0.826064


TrainOutput(global_step=280, training_loss=1.134519409281867, metrics={'train_runtime': 8540.5641, 'train_samples_per_second': 8.434, 'train_steps_per_second': 0.033, 'total_flos': 5.542503214161199e+18, 'train_loss': 1.134519409281867, 'epoch': 4.95575221238938})