In [1]:
!pip install evaluate
! pip install -U accelerate
! pip install -U transformers




[notice] A new release of pip is available: 23.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import evaluate
import json
import numpy as np
import os
import pandas as pd
import pyarrow as pa
import requests
import torch

from datasets import load_dataset, load_from_disk, Dataset, Features, Array3D
from io import BytesIO
from transformers import AutoProcessor, ViTFeatureExtractor, ViTForImageClassification, Trainer, TrainingArguments, default_data_collator
from typing import Tuple
from PIL import Image

In [3]:
# The directory where our images are saved in folders by category
images_dir = "2750"

# The output directory of the processed datasets
train_save_path = "./processed-datasets/train"
val_save_path = "./processed-datasets/val"
test_save_path = "./processed-datasets/test"


# Sizes of dataset splits
val_size = 0.2
test_size = 0.1

# Name of model as named in the HuggingFace Hub
model_name = "google/vit-base-patch16-224-in21k"

In [4]:
dataset = load_dataset("imagefolder", data_dir=images_dir, split='train')

Resolving data files:   0%|          | 0/2000 [00:00<?, ?it/s]

Downloading and preparing dataset imagefolder/default to C:/Users/hp/.cache/huggingface/datasets/imagefolder/default-4fcfc1c3340bae97/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f...


Downloading data files:   0%|          | 0/2000 [00:00<?, ?it/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset imagefolder downloaded and prepared to C:/Users/hp/.cache/huggingface/datasets/imagefolder/default-4fcfc1c3340bae97/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f. Subsequent calls will reuse this data.


In [5]:
def split_dataset(
    dataset: Dataset,
    val_size: float=0.2,
    test_size: float=0.1
) -> Tuple[Dataset, Dataset, Dataset]:
    """
    Returns a tuple with three random train, validation and test subsets by splitting the passed dataset.
    Size of the validation and test sets defined as a fraction of 1 with the `val_size` and `test_size` arguments.
    """

    print("Splitting dataset into train, validation and test sets...")

    # Split dataset into train and (val + test) sets
    split_size = round(val_size + test_size, 3)
    dataset = dataset.train_test_split(shuffle=True, test_size=split_size)

    # Split (val + test) into val and test sets
    split_ratio = round(test_size / (test_size + val_size), 3)
    val_test_sets = dataset['test'].train_test_split(shuffle=True, test_size=split_ratio)

    train_dataset = dataset["train"]
    val_dataset = val_test_sets["train"]
    test_dataset = val_test_sets["test"]
    return train_dataset, val_dataset, test_dataset


# Split dataset into train and test sets
train_dataset, val_dataset, test_dataset = split_dataset(dataset, val_size, test_size)

Splitting dataset into train, validation and test sets...


In [6]:
def process_examples(examples, image_processor):
    """Processor helper function. Used to process batches of images using the
    passed image_processor.

    Parameters
    ----------
    examples
        A batch of image examples.

    image_processor
        A HuggingFace image processor for the selected model.

    Returns
    -------
    examples 
        A batch of processed image examples.
    """
    
    # Get batch of images
    images = examples['image']

    # Preprocess
    inputs = image_processor(images=images)
    # Add pixel_values
    examples['pixel_values'] = inputs['pixel_values']

    return examples


def apply_processing(
    model_name: str,
    train_dataset: Dataset,
    val_dataset: Dataset,
    test_dataset: Dataset
) -> Tuple[Dataset, Dataset, Dataset]:
    """
    Apply model's image AutoProcessor to transform train, validation and test subsets.
    Returns train, validation and test datasets with `pixel_values` in torch tensor type.
    """

    # Extend the features 
    features = Features({
        **train_dataset.features,
        'pixel_values': Array3D(dtype="float32", shape=(3, 224, 224)),
    })

    # Instantiate image_processor
    image_processor = AutoProcessor.from_pretrained(model_name)
    
    # Preprocess images
    train_dataset = train_dataset.map(process_examples, batched=True, features=features, fn_kwargs={"image_processor": image_processor})
    val_dataset = val_dataset.map(process_examples, batched=True, features=features, fn_kwargs={"image_processor": image_processor})
    test_dataset = test_dataset.map(process_examples, batched=True, features=features, fn_kwargs={"image_processor": image_processor})

    # Set to torch format for training
    train_dataset.set_format('torch', columns=['pixel_values', 'label'])
    val_dataset.set_format('torch', columns=['pixel_values', 'label'])
    test_dataset.set_format('torch', columns=['pixel_values', 'label'])
    
    # Remove unused column
    train_dataset = train_dataset.remove_columns("image")
    val_dataset = val_dataset.remove_columns("image")
    test_dataset = test_dataset.remove_columns("image")
    
    return train_dataset, val_dataset, test_dataset


# Apply AutoProcessor
train_dataset, val_dataset, test_dataset = apply_processing(model_name, 
train_dataset, val_dataset, test_dataset)


Map:   0%|          | 0/1400 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [7]:
# Save train, validation and test preprocessed datasets
train_dataset.save_to_disk(train_save_path, num_shards=1)

val_dataset.save_to_disk(val_save_path, num_shards=1)

test_dataset.save_to_disk(test_save_path, num_shards=1)

Saving the dataset (0/1 shards):   0%|          | 0/1400 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/400 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/200 [00:00<?, ? examples/s]

In [8]:
train_dataset = load_from_disk(train_save_path)

val_dataset = load_from_disk(val_save_path)

In [9]:
num_classes = train_dataset.features["label"].num_classes

In [10]:
# Download model from model hub
model = ViTForImageClassification.from_pretrained(model_name, num_labels=num_classes)

# Download feature extractor from hub
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# K for top accuracy metric
k_for_top_acc = 3

# Compute metrics function for binary classification
acc_metric = evaluate.load("accuracy", module_type="metric")

def compute_metrics(eval_pred):
    predicted_probs, labels = eval_pred
    # Accuracy
    predicted_labels = np.argmax(predicted_probs, axis=1)
    acc = acc_metric.compute(predictions=predicted_labels, references=labels)
    # Top-K Accuracy
    top_k_indexes = [np.argpartition(row, -k_for_top_acc)[-k_for_top_acc:] for row in predicted_probs]
    top_k_classes = [top_k_indexes[i][np.argsort(row[top_k_indexes[i]])] for i, row in enumerate(predicted_probs)]
    top_k_classes = np.flip(np.array(top_k_classes), 1)
    acc_k = {
        f"accuracy_k" : sum([label in predictions for predictions, label in zip(top_k_classes, labels)]) / len(labels)
    }
    # Merge metrics
    acc.update(acc_k)
    return acc

In [12]:
# Change labels
id2label = {key:train_dataset.features["label"].names[index] for index,key in enumerate(model.config.id2label.keys())}
label2id = {train_dataset.features["label"].names[index]:value for index,value in enumerate(model.config.label2id.values())}
model.config.id2label = id2label
model.config.label2id = label2id

In [17]:
model_dir = "./model"
output_data_dir = "./outputs"

# Total number of training epochs to perform
num_train_epochs = 1
# The batch size per GPU/TPU core/CPU for training
per_device_train_batch_size = 32
# The batch size per GPU/TPU core/CPU for evaluation
per_device_eval_batch_size = 64
# The initial learning rate for AdamW optimizer
learning_rate = 2e-5
# Number of steps used for a linear warmup from 0 to learning_rate
warmup_steps = 500
# The weight decay to apply to all layers except all bias and LayerNorm weights in AdamW optimizer
weight_decay = 0.01

main_metric_for_evaluation = "accuracy"

In [18]:
import accelerate
import transformers

transformers.__version__, accelerate.__version__

('4.33.1', '0.22.0')

In [19]:
# Define training args
training_args = TrainingArguments(
    output_dir = model_dir,
    num_train_epochs = num_train_epochs,
    per_device_train_batch_size = per_device_train_batch_size,
    per_device_eval_batch_size = per_device_eval_batch_size,
    warmup_steps = warmup_steps,
    weight_decay = weight_decay,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    logging_strategy = "epoch",
    logging_dir = f"{output_data_dir}/logs",
    learning_rate = float(learning_rate),
    load_best_model_at_end = True,
    metric_for_best_model = main_metric_for_evaluation,
)

# Create Trainer instance
trainer = Trainer(
    model = model,
    args = training_args,
    compute_metrics = compute_metrics,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    data_collator = default_data_collator,
    tokenizer = feature_extractor
)

In [20]:
trainer.train()    

  0%|          | 0/44 [00:00<?, ?it/s]

{'loss': 2.3006, 'learning_rate': 1.76e-06, 'epoch': 1.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 2.286520004272461, 'eval_accuracy': 0.1175, 'eval_accuracy_k': 0.405, 'eval_runtime': 177.0542, 'eval_samples_per_second': 2.259, 'eval_steps_per_second': 0.04, 'epoch': 1.0}
{'train_runtime': 2751.0231, 'train_samples_per_second': 0.509, 'train_steps_per_second': 0.016, 'train_loss': 2.3006241538307886, 'epoch': 1.0}


TrainOutput(global_step=44, training_loss=2.3006241538307886, metrics={'train_runtime': 2751.0231, 'train_samples_per_second': 0.509, 'train_steps_per_second': 0.016, 'train_loss': 2.3006241538307886, 'epoch': 1.0})

In [21]:

  log_history = pd.DataFrame(trainer.state.log_history)
log_history = log_history.fillna(0)
log_history = log_history.groupby(['epoch']).sum()
log_history

Unnamed: 0_level_0,loss,learning_rate,step,eval_loss,eval_accuracy,eval_accuracy_k,eval_runtime,eval_samples_per_second,eval_steps_per_second,train_runtime,train_samples_per_second,train_steps_per_second,total_flos,train_loss
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1.0,2.3006,2e-06,132,2.28652,0.1175,0.405,177.0542,2.259,0.04,2751.0231,0.509,0.016,1.084966e+17,2.300624


In [24]:
trainer.save_model(model_dir)

In [25]:
# Load dataset
test_dataset = load_from_disk(test_save_path)

# Load trained model
model = ViTForImageClassification.from_pretrained('./model')

# Load feature extractor
feature_extractor = ViTFeatureExtractor.from_pretrained('./model')
    
# Create Trainer instance
trainer = Trainer(
    model=model,
    compute_metrics=compute_metrics,
    data_collator=default_data_collator,
    tokenizer=feature_extractor
)

# Evaluate model
eval_results = trainer.evaluate(eval_dataset=test_dataset)

# Writes eval_result to file which can be accessed later
with open(os.path.join(output_data_dir, "eval_results.json"), "w") as writer:
    print(f"Logging evaluation results at {output_data_dir}/eval_results.json")
    writer.write(json.dumps(eval_results))

print(json.dumps(eval_results, indent=4))



  0%|          | 0/25 [00:00<?, ?it/s]

Logging evaluation results at ./outputs/eval_results.json
{
    "eval_loss": 2.2785730361938477,
    "eval_accuracy": 0.14,
    "eval_accuracy_k": 0.435,
    "eval_runtime": 101.7511,
    "eval_samples_per_second": 1.966,
    "eval_steps_per_second": 0.246
}


In [71]:
for name, module in model.named_modules():
    print(name)
    
    


vit
vit.embeddings
vit.embeddings.patch_embeddings
vit.embeddings.patch_embeddings.projection
vit.embeddings.dropout
vit.encoder
vit.encoder.layer
vit.encoder.layer.0
vit.encoder.layer.0.attention
vit.encoder.layer.0.attention.attention
vit.encoder.layer.0.attention.attention.query
vit.encoder.layer.0.attention.attention.key
vit.encoder.layer.0.attention.attention.value
vit.encoder.layer.0.attention.attention.dropout
vit.encoder.layer.0.attention.output
vit.encoder.layer.0.attention.output.dense
vit.encoder.layer.0.attention.output.dropout
vit.encoder.layer.0.intermediate
vit.encoder.layer.0.intermediate.dense
vit.encoder.layer.0.intermediate.intermediate_act_fn
vit.encoder.layer.0.output
vit.encoder.layer.0.output.dense
vit.encoder.layer.0.output.dropout
vit.encoder.layer.0.layernorm_before
vit.encoder.layer.0.layernorm_after
vit.encoder.layer.1
vit.encoder.layer.1.attention
vit.encoder.layer.1.attention.attention
vit.encoder.layer.1.attention.attention.query
vit.encoder.layer.1.atte

In [78]:
selected_layer = 'vit.encoder.layer[0]'