In [None]:
import zipfile
from google.colab import drive

# Mount Google Drive so the training archive stored there is readable from this runtime.
drive.mount('/content/drive/')

# Unpack the dataset into /tmp. The context manager closes the archive handle
# even if extraction fails part-way through (the manual open/close did not).
with zipfile.ZipFile("/content/drive/My Drive/subclass_training_data.zip", 'r') as zip_ref:
    zip_ref.extractall("/tmp")


Mounted at /content/drive/


In [None]:
# Pinned to the versions this notebook was last executed with, so a fresh runtime
# resolves the same APIs. `%pip` installs into the kernel's own environment.
%pip install -q transformers==4.25.1 datasets==2.7.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 30.1 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.7.1-py3-none-any.whl (451 kB)
[K     |████████████████████████████████| 451 kB 74.5 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 76.8 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 66.7 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 81.

In [None]:
from transformers import ViTFeatureExtractor, ViTForImageClassification, TrainingArguments, Trainer
import numpy as np
from datasets import Image, Dataset, load_metric
from torchvision.transforms import (CenterCrop, 
                                    Compose, 
                                    Normalize, 
                                    RandomHorizontalFlip,
                                    RandomResizedCrop, 
                                    Resize, 
                                    ToTensor)                        
from torch.utils.data import DataLoader
import torch
import os


DATASET_DIR = '/tmp/subclass_training_data/train'


# NOTE: loading via `load_dataset('imagefolder', ...)` was tried first but hung
# (cause not investigated), so the file list is assembled by hand below.
# dataset = load_dataset('imagefolder', data_dir='D:/Homework Assignments/NNDL/project/subclass_training_data', cache_dir='./')

training_images = []  # path of every training image
training_labels = []  # integer class id per image, parallel to training_images
label_names = []      # class-folder names; a name's position in this list is its class id

directory = os.fsencode(DATASET_DIR)  # kept so any other cell referencing it still works

# os.listdir returns entries in arbitrary order; sorting makes the
# folder -> class-id assignment identical on every run.
class_folders = sorted(
    entry for entry in os.listdir(DATASET_DIR)
    # Skip stray non-directory entries that would make the inner listdir fail.
    if os.path.isdir(os.path.join(DATASET_DIR, entry))
)

for label_id, folder_name in enumerate(class_folders):
    label_names.append(folder_name)
    folder_path = f'{DATASET_DIR}/{folder_name}'
    for filename in sorted(os.listdir(folder_path)):
        training_images.append(f'{folder_path}/{filename}')
        # enumerate supplies the id directly instead of an O(n) list.index lookup per file.
        training_labels.append(label_id)

# The 'image' column starts as path strings and is cast to the Image feature type.
dataset = Dataset.from_dict({'image': training_images, 'label': training_labels}).cast_column("image", Image())

# Dataset.shuffle returns a new dataset rather than shuffling in place, so the
# result must be re-assigned (previously it was discarded, making this a no-op).
dataset = dataset.shuffle(seed=42)
print(dataset[0])
print(label_names)


### FROM VIT CARD https://huggingface.co/google/vit-base-patch16-224 ###
# url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
# image = dataset[0]['image']

# Hold out 10% of the images for validation. The explicit seed fixes the split so
# that re-running the notebook evaluates on the same held-out images.
splits = dataset.train_test_split(test_size=0.1, seed=42)
train_ds = splits['train']
val_ds = splits['test']

# print(train_ds['label'])

# Two-way lookup between integer class ids and class names, built from label_names
# (a name's index in that list is its id).
id2label = dict(enumerate(label_names))
label2id = {label: idx for idx, label in id2label.items()}

print(id2label)
print(label2id)

# Load the preprocessing config and the weights from the same checkpoint so the
# image size / normalisation statistics always belong to the model being fine-tuned.
MODEL_CHECKPOINT = 'google/vit-base-patch16-224-in21k'

feature_extractor = ViTFeatureExtractor.from_pretrained(MODEL_CHECKPOINT)
model = ViTForImageClassification.from_pretrained(MODEL_CHECKPOINT,
                                                  # Derived from the data instead of a hard-coded 89,
                                                  # so it cannot drift from id2label/label2id.
                                                  num_labels=len(label_names),
                                                  id2label=id2label,
                                                  label2id=label2id)
# inputs = feature_extractor(images=image, return_tensors="pt")

print(feature_extractor.size)
# (height, width) pair read from the feature extractor's size mapping.
resize_seq = tuple(feature_extractor.size[dim] for dim in ('height', 'width'))

# Normalisation uses the mean/std carried by the feature extractor.
normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)

# Training pipeline: random resized crop and horizontal flip, then tensor conversion + normalisation.
_train_transforms = Compose([
    RandomResizedCrop(resize_seq),
    RandomHorizontalFlip(),
    ToTensor(),
    normalize,
])

# Validation pipeline: resize and centre-crop, then tensor conversion + normalisation.
_val_transforms = Compose([
    Resize(resize_seq),
    CenterCrop(resize_seq),
    ToTensor(),
    normalize,
])


def _with_pixel_values(examples, pipeline):
    """Store `pipeline` applied to each RGB-converted entry of examples['image'] under 'pixel_values'."""
    converted = []
    for picture in examples['image']:
        converted.append(pipeline(picture.convert("RGB")))
    examples['pixel_values'] = converted
    return examples


def train_transforms(examples):
    """Batch transform for the training split (uses the training pipeline)."""
    return _with_pixel_values(examples, _train_transforms)


def val_transforms(examples):
    """Batch transform for the validation split (uses the validation pipeline)."""
    return _with_pixel_values(examples, _val_transforms)


# Register the transform function on each split.
train_ds.set_transform(train_transforms)
val_ds.set_transform(val_transforms)

# print(train_ds[:2])

# Name of the metric used to select the best checkpoint.
metric_name = "accuracy"

args = TrainingArguments(
    output_dir='nndl_checkpoints_etc',
    logging_dir='logs',
    # Optimisation settings
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=4,
    # Evaluate and checkpoint once per epoch, then reload the best-scoring checkpoint.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    # NOTE(review): presumably needed so the raw 'image' column is still present
    # when the dataset transforms run — confirm.
    remove_unused_columns=False,
)
     
def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Accuracy is calculated directly with numpy rather than through
    `datasets.load_metric`, which is deprecated and has to download a
    builder script before it can be used.

    Args:
        eval_pred: A pair that unpacks into (predictions, labels). The class
            scores are reduced with argmax over axis 1; labels are the
            reference class ids, one per row of predictions.

    Returns:
        dict: {"accuracy": fraction of rows whose argmax equals the label}.
    """
    predictions, labels = eval_pred
    predicted_ids = np.argmax(predictions, axis=1)
    return {"accuracy": float(np.mean(predicted_ids == np.asarray(labels)))}

def collate_fn(examples):
    """Merge a list of per-example dicts into one batch dict.

    Each example's "pixel_values" tensor is stacked along a new leading
    dimension, and the "label" values are gathered into a single tensor.

    Returns:
        dict with keys "pixel_values" (stacked tensor) and "labels" (tensor of labels).
    """
    images = []
    targets = []
    for item in examples:
        images.append(item["pixel_values"])
        targets.append(item["label"])
    return {"pixel_values": torch.stack(images), "labels": torch.tensor(targets)}

# Bundle the model, hyper-parameters, both data splits and the batching/metric helpers.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=collate_fn,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    # The feature extractor is handed over through the `tokenizer` argument.
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

# trainer.train()


# train_dataloader = DataLoader(train_ds, collate_fn=collate_fn, batch_size=4)

# batch = next(iter(train_dataloader))
# for k,v in batch.items():
#   if isinstance(v, torch.Tensor):
#     print(k, v.shape)



# outputs = model(**inputs)
# logits = outputs.logits
# # model predicts one of the 1000 ImageNet classes
# predicted_class_idx = logits.argmax(-1).item()
# print("Predicted class:", model.config.id2label[predicted_class_idx])



{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=8x8 at 0x7FF0BBE27F10>, 'label': 0}
['eft', 'oystercatcher, oyster catcher', 'green lizard, Lacerta viridis', 'junco, snowbird', 'Chihuahua', 'Sealyham terrier, Sealyham', 'basset, basset hound', 'pelican', 'soft-coated wheaten terrier', 'bulbul', 'loggerhead, loggerhead turtle, Caretta caretta', 'thunder snake, worm snake, Carphophis amoenus', 'American chameleon, anole, Anolis carolinensis', 'African crocodile, Nile crocodile, Crocodylus niloticus', 'toy terrier', 'triceratops', 'bald eagle, American eagle, Haliaeetus leucocephalus', 'Dandie Dinmont, Dandie Dinmont terrier', 'European gallinule, Porphyrio porphyrio', 'cock', 'alligator lizard', 'Blenheim spaniel', 'jay', 'silky terrier, Sydney silky', 'redshank, Tringa totanus', 'red-backed sandpiper, dunlin, Erolia alpina', 'ruddy turnstone, Arenaria interpres', 'giant schnauzer', 'Pekinese, Pekingese, Peke', 'Lhasa, Lhasa apso', 'brambling, Fringilla montifringilla', 

Downloading:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'height': 224, 'width': 224}


  metric = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [None]:
# Training run: rebuilds the arguments with 4 epochs, wraps the current `model`
# in a new Trainer and starts training.
args = TrainingArguments(
    output_dir='nndl_checkpoints_etc',
    logging_dir='logs',
    # Optimisation settings
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=4,
    # Evaluate and checkpoint once per epoch, then reload the best-scoring checkpoint.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    remove_unused_columns=False,
)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=collate_fn,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 9829
  Num Epochs = 4
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 3932
  Number of trainable parameters = 85867097


Epoch,Training Loss,Validation Loss,Accuracy
1,3.8722,3.655452,0.172004
2,3.6498,3.491529,0.207685
3,3.5557,3.3922,0.233303
4,3.4865,3.354999,0.240622


***** Running Evaluation *****
  Num examples = 1093
  Batch size = 4
Saving model checkpoint to nndl_checkpoints_etc/checkpoint-983
Configuration saved in nndl_checkpoints_etc/checkpoint-983/config.json
Model weights saved in nndl_checkpoints_etc/checkpoint-983/pytorch_model.bin
Image processor saved in nndl_checkpoints_etc/checkpoint-983/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 1093
  Batch size = 4
Saving model checkpoint to nndl_checkpoints_etc/checkpoint-1966
Configuration saved in nndl_checkpoints_etc/checkpoint-1966/config.json
Model weights saved in nndl_checkpoints_etc/checkpoint-1966/pytorch_model.bin
Image processor saved in nndl_checkpoints_etc/checkpoint-1966/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 1093
  Batch size = 4
Saving model checkpoint to nndl_checkpoints_etc/checkpoint-2949
Configuration saved in nndl_checkpoints_etc/checkpoint-2949/config.json
Model weights saved in nndl_checkpoints_etc/checkpoint-2

TrainOutput(global_step=3932, training_loss=3.617821539163832, metrics={'train_runtime': 1616.0859, 'train_samples_per_second': 24.328, 'train_steps_per_second': 2.433, 'total_flos': 3.0490507191757455e+18, 'train_loss': 3.617821539163832, 'epoch': 4.0})

In [None]:
# Second training run with the same settings. It is handed the same `model`
# object as the run above, so training continues from those weights.
# NOTE(review): the output directory is unchanged, so checkpoint folders written
# here share paths with the earlier run's (e.g. checkpoint-983) — confirm that
# overwriting them is intended.
args = TrainingArguments(
    output_dir='nndl_checkpoints_etc',
    logging_dir='logs',
    # Optimisation settings
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=4,
    # Evaluate and checkpoint once per epoch, then reload the best-scoring checkpoint.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    remove_unused_columns=False,
)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=collate_fn,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 9829
  Num Epochs = 4
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 3932
  Number of trainable parameters = 85867097


Epoch,Training Loss,Validation Loss,Accuracy
1,3.4512,3.300677,0.225069
2,3.2683,3.171093,0.249771
3,3.2063,3.078944,0.276304
4,3.1678,3.048329,0.283623


***** Running Evaluation *****
  Num examples = 1093
  Batch size = 4
Saving model checkpoint to nndl_checkpoints_etc/checkpoint-983
Configuration saved in nndl_checkpoints_etc/checkpoint-983/config.json
Model weights saved in nndl_checkpoints_etc/checkpoint-983/pytorch_model.bin
Image processor saved in nndl_checkpoints_etc/checkpoint-983/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 1093
  Batch size = 4
Saving model checkpoint to nndl_checkpoints_etc/checkpoint-1966
Configuration saved in nndl_checkpoints_etc/checkpoint-1966/config.json
Model weights saved in nndl_checkpoints_etc/checkpoint-1966/pytorch_model.bin
Image processor saved in nndl_checkpoints_etc/checkpoint-1966/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 1093
  Batch size = 4
Saving model checkpoint to nndl_checkpoints_etc/checkpoint-2949
Configuration saved in nndl_checkpoints_etc/checkpoint-2949/config.json
Model weights saved in nndl_checkpoints_etc/checkpoint-2

TrainOutput(global_step=3932, training_loss=3.2615244371602397, metrics={'train_runtime': 1611.8935, 'train_samples_per_second': 24.391, 'train_steps_per_second': 2.439, 'total_flos': 3.0490507191757455e+18, 'train_loss': 3.2615244371602397, 'epoch': 4.0})