In [1]:
import evaluate
import json
import numpy as np
import os
import pandas as pd
import pyarrow as pa
import requests
import torch

from datasets import load_dataset, load_from_disk, Dataset, Features, Array3D
from io import BytesIO
from transformers import AutoProcessor, ViTFeatureExtractor, ViTForImageClassification, Trainer, TrainingArguments, default_data_collator
from typing import Tuple
from PIL import Image

In [2]:
train_save_path = "./processed-datasets/train"
val_save_path = "./processed-datasets/val"
test_save_path = "./processed-datasets/test"

train_dataset = load_from_disk(train_save_path)

val_dataset = load_from_disk(val_save_path)

In [3]:
num_classes = train_dataset.features["label"].num_classes

In [4]:
# Name of model as named in the HuggingFace Hub
model_name = "google/vit-base-patch16-224-in21k"

# Download model from model hub
model = ViTForImageClassification.from_pretrained(model_name, num_labels=num_classes)

# Download feature extractor from hub
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# K for top accuracy metric
k_for_top_acc = 3

# Compute metrics function for binary classification
acc_metric = evaluate.load("accuracy", module_type="metric")

def compute_metrics(eval_pred):
    predicted_probs, labels = eval_pred
    # Accuracy
    predicted_labels = np.argmax(predicted_probs, axis=1)
    acc = acc_metric.compute(predictions=predicted_labels, references=labels)
    # Top-K Accuracy
    top_k_indexes = [np.argpartition(row, -k_for_top_acc)[-k_for_top_acc:] for row in predicted_probs]
    top_k_classes = [top_k_indexes[i][np.argsort(row[top_k_indexes[i]])] for i, row in enumerate(predicted_probs)]
    top_k_classes = np.flip(np.array(top_k_classes), 1)
    acc_k = {
        f"accuracy_k" : sum([label in predictions for predictions, label in zip(top_k_classes, labels)]) / len(labels)
    }
    # Merge metrics
    acc.update(acc_k)
    return acc

In [6]:
# Change labels
id2label = {key:train_dataset.features["label"].names[index] for index,key in enumerate(model.config.id2label.keys())}
label2id = {train_dataset.features["label"].names[index]:value for index,value in enumerate(model.config.label2id.values())}
model.config.id2label = id2label
model.config.label2id = label2id

In [7]:
model_dir = "./model"
output_data_dir = "./outputs"

# Total number of training epochs to perform
num_train_epochs = 1
# The batch size per GPU/TPU core/CPU for training
per_device_train_batch_size = 32
# The batch size per GPU/TPU core/CPU for evaluation
per_device_eval_batch_size = 64
# The initial learning rate for AdamW optimizer
learning_rate = 2e-5
# Number of steps used for a linear warmup from 0 to learning_rate
warmup_steps = 500
# The weight decay to apply to all layers except all bias and LayerNorm weights in AdamW optimizer
weight_decay = 0.01

main_metric_for_evaluation = "accuracy"

In [8]:
# Define training args
training_args = TrainingArguments(
    output_dir = model_dir,
    num_train_epochs = num_train_epochs,
    per_device_train_batch_size = per_device_train_batch_size,
    per_device_eval_batch_size = per_device_eval_batch_size,
    warmup_steps = warmup_steps,
    weight_decay = weight_decay,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    logging_strategy = "epoch",
    logging_dir = f"{output_data_dir}/logs",
    learning_rate = float(learning_rate),
    load_best_model_at_end = True,
    metric_for_best_model = main_metric_for_evaluation,
)

# Create Trainer instance
trainer = Trainer(
    model = model,
    args = training_args,
    compute_metrics = compute_metrics,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    data_collator = default_data_collator,
    tokenizer = feature_extractor
)

In [12]:
#trainer.train()     no training

  0%|          | 0/44 [00:00<?, ?it/s]

{'loss': 2.2771, 'learning_rate': 1.76e-06, 'epoch': 1.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 2.2733914852142334, 'eval_accuracy': 0.1825, 'eval_accuracy_k': 0.44, 'eval_runtime': 203.9205, 'eval_samples_per_second': 1.962, 'eval_steps_per_second': 0.034, 'epoch': 1.0}
{'train_runtime': 2129.6733, 'train_samples_per_second': 0.657, 'train_steps_per_second': 0.021, 'train_loss': 2.27710637179288, 'epoch': 1.0}


TrainOutput(global_step=44, training_loss=2.27710637179288, metrics={'train_runtime': 2129.6733, 'train_samples_per_second': 0.657, 'train_steps_per_second': 0.021, 'train_loss': 2.27710637179288, 'epoch': 1.0})

In [13]:
trainer.save_model(model_dir)

In [13]:
# Load dataset
test_dataset = load_from_disk(test_save_path)


    
# Create Trainer instance
trainer = Trainer(
    model=model,
    compute_metrics=compute_metrics,
    data_collator=default_data_collator,
    tokenizer=feature_extractor
)

# Evaluate model
eval_results = trainer.evaluate(eval_dataset=test_dataset)


eval_results


  0%|          | 0/25 [00:00<?, ?it/s]

{'eval_loss': 2.2984514236450195,
 'eval_accuracy': 0.1,
 'eval_accuracy_k': 0.355,
 'eval_runtime': 157.1596,
 'eval_samples_per_second': 1.273,
 'eval_steps_per_second': 0.159}