<a href="https://colab.research.google.com/github/Du5TCh3N/learningCLIP/blob/main/Finetune_ViT_Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This code is adapted from the following YouTube tutorial:
https://www.youtube.com/watch?v=qU7wO02urYU

In [None]:
%%capture
!pip install datasets transformers torch

In [None]:
from datasets import load_dataset

# Load the CIFAR-10 training split (downloaded on first use, then cached).
# `ignore_verifications` is intentionally not passed: the argument is deprecated
# in favour of `verification_mode`, and the library default already performs
# the checks that `ignore_verifications=False` asked for.
dataset_train = load_dataset('cifar10', split='train')

dataset_train

In [None]:
# Load the CIFAR-10 test split with the library's default verification
# settings, i.e. the same way the training split is loaded. The deprecated
# `ignore_verifications` argument is intentionally not passed.
dataset_test = load_dataset('cifar10', split='test')

dataset_test

In [None]:
# ClassLabel feature describing the 'label' column; `.names` maps ids to names.
labels = dataset_train.features['label']
# Read the class count from the dataset schema instead of pulling every label
# of the training split into a Python set just to count the distinct values.
num_classes = labels.num_classes

num_classes, labels

Inspect an individual training example:

In [None]:
# First training record; its 'img' and 'label' fields are inspected in the next cells.
dataset_train[0]

In [None]:
# Display the image stored under 'img' for the first training record.
dataset_train[0]['img']

In [None]:
# Integer class id of the first training record, next to its readable name.
first_label = dataset_train[0]['label']
first_label, labels.names[first_label]

# Load ViT Feature Extractor

In [None]:
 from transformers import AutoImageProcessor

 # Checkpoint shared by the image processor and the model.
 # Alternative plain-ViT checkpoint: 'google/vit-base-patch16-224-in21k'
 model_name_or_path = 'laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K'

 # AutoImageProcessor selects the processor class that matches the checkpoint.
 # The checkpoint above is a CLIP model, so hard-coding ViTImageProcessor would
 # push a CLIP preprocessing config through a processor it was not written for.
 feature_extractor = AutoImageProcessor.from_pretrained(model_name_or_path)

 feature_extractor

In [None]:
# Run the image processor on the first training image and show what it returns.
first_image = dataset_train[0]['img']
example = feature_extractor(first_image, return_tensors='pt')
example

In [None]:
# Shape of the 'pixel_values' tensor produced for the single example above.
example['pixel_values'].shape

The size of the tensor differs from that of the original image because the image processor resizes its input.

In [None]:
import torch

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

In [None]:
def preprocess(batch):
  """Turn a batch of raw examples into model inputs.

  The images under ``batch['img']`` are passed through the notebook-level
  ``feature_extractor`` (asking for PyTorch tensors), and ``batch['label']``
  is stored on the result under the ``'label'`` key.

  Returns the object produced by ``feature_extractor`` with the labels added.
  """
  processed = feature_extractor(batch['img'], return_tensors='pt')
  processed['label'] = batch['label']
  return processed

In [None]:
# Register `preprocess` as the transform of both splits.
prepared_train, prepared_test = (
    split.with_transform(preprocess) for split in (dataset_train, dataset_test)
)

In this section, we are going to build the Trainer, which is a feature-complete training and evaluation loop for PyTorch, optimized for HuggingFace Transformers.

We need to define all of the arguments that it will include:


*   training and testing dataset
*   feature extractor
*   model
*   collate function
*   evaluation metric
*   ... other training arguments



In [None]:
def collate_fn(batch):
  """Merge a list of examples into a single batch dict.

  Each example's ``'pixel_values'`` tensor is stacked along a new leading
  dimension, and the ``'label'`` values are gathered into one tensor that is
  returned under the ``'labels'`` key.
  """
  pixel_values = []
  targets = []
  for example in batch:
    pixel_values.append(example['pixel_values'])
    targets.append(example['label'])
  return {
      'pixel_values': torch.stack(pixel_values),
      'labels': torch.tensor(targets),
  }

In [None]:
import numpy as np


def compute_metrics(p):
  """Compute top-1 accuracy for one evaluation pass.

  Args:
    p: object exposing ``predictions`` (per-class scores, one row per
      example) and ``label_ids`` (the reference class ids).

  Returns:
    ``{'accuracy': float}`` -- the fraction of examples whose highest-scoring
    class equals the reference label.
  """
  # Accuracy is computed directly with NumPy: `datasets.load_metric` is
  # deprecated and needs a metric-script download, while the result for
  # plain accuracy is just the mean of the element-wise matches.
  predicted_ids = np.argmax(p.predictions, axis=1)
  reference_ids = np.asarray(p.label_ids)
  return {'accuracy': float(np.mean(predicted_ids == reference_ids))}

In [None]:
from transformers import TrainingArguments

# Configuration for the Trainer run, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints and metrics are written; nothing is pushed to the Hub.
    output_dir='./cifar',
    push_to_hub=False,
    # Optimisation settings.
    num_train_epochs=4,
    per_device_train_batch_size=16,
    learning_rate=2e-4,
    # Evaluate and checkpoint on the same 100-step schedule, keep at most two
    # checkpoints, and reload the best one when training finishes.
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    # Log every 10 steps.
    logging_steps=10,
    # Keep every dataset column: `preprocess` reads the raw 'img' column.
    remove_unused_columns=False,
)

In [None]:
from transformers import AutoModelForImageClassification

# Human-readable class names, index-aligned with the integer label ids.
# Stored under its own name so that `labels` (the ClassLabel feature defined
# earlier) is not rebound to a list, which would break `labels.names` when
# earlier cells are re-run.
label_names = dataset_train.features['label'].names

# AutoModelForImageClassification resolves the architecture from the
# checkpoint's own config. Forcing ViTForImageClassification onto a CLIP
# checkpoint does not match the checkpoint's architecture, so its pretrained
# weights would not be used.
model = AutoModelForImageClassification.from_pretrained(
    model_name_or_path,
    num_labels = len(label_names),
    # Store the id <-> name mapping in the config so saved checkpoints report
    # class names instead of generic LABEL_i placeholders.
    id2label = dict(enumerate(label_names)),
    label2id = {name: idx for idx, name in enumerate(label_names)},
)

In [None]:
from transformers import Trainer

# Assemble the Trainer from the pieces defined in the cells above.
trainer = Trainer(
    model=model,
    args=training_args,
    # Data: transformed splits plus the function that batches their examples.
    train_dataset=prepared_train,
    eval_dataset=prepared_test,
    data_collator=collate_fn,
    # Metric reported at each evaluation.
    compute_metrics=compute_metrics,
    # The image processor is handed over through the `tokenizer` argument.
    tokenizer=feature_extractor,
)

In [None]:
# Run fine-tuning and keep the returned result, which carries the metrics.
train_results = trainer.train()

# Save the model first, then log and save the training metrics and,
# finally, the trainer state.
trainer.save_model()
train_metrics = train_results.metrics
trainer.log_metrics("train", train_metrics)
trainer.save_metrics("train", train_metrics)
trainer.save_state()