In [3]:
%%capture
! pip install datasets transformers

In [4]:
# Work from the project folder: the relative paths used later ("train.csv",
# "test.csv", "./vit-base-mask-finetuned") resolve against it.
# NOTE(review): this relies on the kernel already sitting in the parent
# directory (the output shows /content/drive/MyDrive) - presumably set up by
# an earlier cell; confirm before running on a fresh kernel.
%cd MaskDetection/

/content/drive/MyDrive/MaskDetection


In [5]:
import torch
import torchvision.transforms as transforms
import os
import pandas as pd
from skimage import io
from torch.utils.data import (
    Dataset,
)

from transformers import ViTFeatureExtractor

# Checkpoint identifier shared by the preprocessor loaded here and the
# classification model loaded later in the notebook.
model_name_or_path = 'google/vit-base-patch16-224-in21k'
# Module-level preprocessor: FeatureExtractor.__call__ applies it to every
# image, and it is also handed to the Trainer as `tokenizer`.
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name_or_path)

class FeatureExtractor(object):
    """Callable transform that preprocesses an image and attaches its label.

    Delegates to the module-level ``feature_extractor`` and stores the
    target under the ``"labels"`` key of the result.
    """

    def __call__(self, image, target):
        """Return the preprocessed ``image`` with ``target`` added as ``"labels"``.

        Args:
            image: Image passed straight through to ``feature_extractor``.
            target: Label stored unchanged under ``"labels"``.

        Returns:
            The mapping produced by ``feature_extractor(image,
            return_tensors='pt')`` with the extra ``"labels"`` entry.
        """
        encoded = feature_extractor(image, return_tensors='pt')
        encoded["labels"] = target
        return encoded

class MaskDataset(Dataset):
    """Image dataset described by a CSV annotation file.

    Rows are read positionally: column 0 holds the image path (joined onto
    ``root_dir``) and column 1 holds the label, converted with ``int()``.

    Args:
        csv_file: Path to the annotation CSV, read with ``pandas.read_csv``.
        root_dir: Directory prefix joined in front of every image path.
        transform: Optional callable invoked as ``transform(image, label)``;
            its return value becomes the sample. When ``None``, the raw
            ``(image, label)`` pair is returned instead.
    """

    def __init__(self, csv_file, root_dir, transform=None):
        self.annotations = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        """Return the number of annotation rows."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Load the image and label of row ``index``.

        Returns:
            ``transform(image, label)`` when a transform was supplied,
            otherwise the tuple ``(image, label)`` where ``label`` is a
            ``torch.Tensor`` built from the integer label.
        """
        img_path = os.path.join(self.root_dir, self.annotations.iloc[index, 0])
        image = io.imread(img_path)
        y_label = torch.tensor(int(self.annotations.iloc[index, 1]))

        # `transform` defaults to None in __init__; calling it unconditionally
        # raised "TypeError: 'NoneType' object is not callable". Fall back to
        # the untransformed pair so the documented default is usable.
        if self.transform is None:
            return image, y_label

        return self.transform(image, y_label)

# Training and evaluation splits. Image paths in each CSV are used as-is
# (empty root_dir), and every sample goes through its own FeatureExtractor.
train_ds = MaskDataset(csv_file="train.csv", root_dir="", transform=FeatureExtractor())
test_ds = MaskDataset(csv_file="test.csv", root_dir="", transform=FeatureExtractor())

Downloading:   0%|          | 0.00/160 [00:00<?, ?B/s]

In [6]:
# Sanity check: display the first training sample (indexing calls __getitem__).
train_ds[0]

{'pixel_values': tensor([[[[-0.2471, -0.2471, -0.2471,  ..., -0.3882, -0.3882, -0.3882],
          [-0.2471, -0.2471, -0.2471,  ..., -0.3882, -0.3882, -0.3882],
          [-0.2471, -0.2471, -0.2471,  ..., -0.3882, -0.3882, -0.3882],
          ...,
          [ 0.9137,  0.9137,  0.9137,  ...,  0.9608,  0.9608,  0.9608],
          [ 0.9137,  0.9137,  0.9137,  ...,  0.9608,  0.9608,  0.9608],
          [ 0.9137,  0.9137,  0.9137,  ...,  0.9608,  0.9608,  0.9608]],

         [[-0.6078, -0.6078, -0.6078,  ..., -0.5686, -0.5686, -0.5686],
          [-0.6078, -0.6078, -0.6078,  ..., -0.5686, -0.5686, -0.5686],
          [-0.6078, -0.6078, -0.6078,  ..., -0.5686, -0.5686, -0.5686],
          ...,
          [ 0.8824,  0.8824,  0.8824,  ...,  0.8118,  0.8118,  0.8118],
          [ 0.8824,  0.8824,  0.8824,  ...,  0.8118,  0.8118,  0.8118],
          [ 0.8824,  0.8824,  0.8824,  ...,  0.8118,  0.8118,  0.8118]],

         [[-0.7333, -0.7333, -0.7333,  ..., -0.6157, -0.6157, -0.6157],
          [-0

In [7]:
import numpy as np
from datasets import load_metric

# Module-level accuracy metric loaded by name; compute_metrics calls its
# `compute` method with the predicted class ids and the reference labels.
metric = load_metric("accuracy")
def compute_metrics(p):
    """Score predictions with the module-level ``metric``.

    Args:
        p: Object exposing ``predictions`` (reduced to class ids with an
            argmax along axis 1) and ``label_ids`` (the references).

    Returns:
        Whatever ``metric.compute`` returns for the predicted class ids
        versus ``p.label_ids``.
    """
    predicted_ids = np.argmax(p.predictions, axis=1)
    return metric.compute(predictions=predicted_ids, references=p.label_ids)

Downloading:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

In [8]:
from transformers import ViTForImageClassification

# Class names; a name's position in this list is the class id used to build
# the id2label / label2id mappings below.
# NOTE(review): assumes the CSV labels use 0 = "No Mask", 1 = "Mask" - confirm.
labels = ["No Mask", "Mask"]

# Load the checkpoint with a classification head sized to the label list.
model = ViTForImageClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(labels),
    id2label={str(idx): name for idx, name in enumerate(labels)},
    label2id={name: str(idx) for idx, name in enumerate(labels)},
)

Downloading:   0%|          | 0.00/502 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/330M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
from transformers import TrainingArguments
from transformers import EarlyStoppingCallback

# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    output_dir="./vit-base-mask-finetuned",
    # Optimisation
    num_train_epochs=4,
    per_device_train_batch_size=16,
    learning_rate=2e-4,
    fp16=True,
    # Evaluation, checkpointing and logging cadence (in steps)
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    logging_steps=10,
    save_total_limit=4,  # Only last 4 models are saved. Older ones are deleted.
    load_best_model_at_end=True,
    # Data handling and reporting
    remove_unused_columns=False,
    push_to_hub=False,
    report_to='tensorboard',
)

PyTorch: setting up devices


In [14]:
import torch

def collate_fn(batch):
    """Merge a list of samples into one batch dictionary.

    Args:
        batch: Sequence of mappings, each with a ``'pixel_values'`` entry
            (only its first element is used) and a ``'labels'`` entry.

    Returns:
        Dict with ``'pixel_values'`` (the per-sample tensors stacked along a
        new leading dimension) and ``'labels'`` (a tensor built from the
        per-sample labels).
    """
    pixel_values = []
    labels = []
    for example in batch:
        # Index 0 drops the leading dimension each sample carries.
        pixel_values.append(example['pixel_values'][0])
        labels.append(example['labels'])

    return {
        'pixel_values': torch.stack(pixel_values),
        'labels': torch.tensor(labels),
    }

In [15]:
from transformers import Trainer

# Wire the model, data, batching and metric together. Early stopping halts
# training after 3 consecutive evaluations without improvement.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    tokenizer=feature_extractor,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

Using amp half precision backend


In [16]:
# Run fine-tuning with the Trainer configured above.
train_results = trainer.train()
# Save the model to the Trainer's output directory.
trainer.save_model()
# Print the training metrics returned by train(), then write them to disk.
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
# Persist the Trainer state alongside the saved model.
trainer.save_state()

***** Running training *****
  Num examples = 14428
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3608


Step,Training Loss,Validation Loss,Accuracy
100,0.0214,0.155222,0.961346
200,0.0415,0.061964,0.986374
300,0.0372,0.058919,0.984983
400,0.0738,0.173725,0.927419
500,0.0096,0.039475,0.989433
600,0.0584,0.17512,0.949944
700,0.0097,0.045691,0.987764


***** Running Evaluation *****
  Num examples = 3596
  Batch size = 8
Saving model checkpoint to ./vit-base-mask-finetuned/checkpoint-100
Configuration saved in ./vit-base-mask-finetuned/checkpoint-100/config.json
Model weights saved in ./vit-base-mask-finetuned/checkpoint-100/pytorch_model.bin
Configuration saved in ./vit-base-mask-finetuned/checkpoint-100/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 3596
  Batch size = 8
Saving model checkpoint to ./vit-base-mask-finetuned/checkpoint-200
Configuration saved in ./vit-base-mask-finetuned/checkpoint-200/config.json
Model weights saved in ./vit-base-mask-finetuned/checkpoint-200/pytorch_model.bin
Configuration saved in ./vit-base-mask-finetuned/checkpoint-200/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 3596
  Batch size = 8
Saving model checkpoint to ./vit-base-mask-finetuned/checkpoint-300
Configuration saved in ./vit-base-mask-finetuned/checkpoint-300/config.json
Model weights sa

Step,Training Loss,Validation Loss,Accuracy
100,0.0214,0.155222,0.961346
200,0.0415,0.061964,0.986374
300,0.0372,0.058919,0.984983
400,0.0738,0.173725,0.927419
500,0.0096,0.039475,0.989433
600,0.0584,0.17512,0.949944
700,0.0097,0.045691,0.987764
800,0.0465,0.040168,0.989711


***** Running Evaluation *****
  Num examples = 3596
  Batch size = 8
Saving model checkpoint to ./vit-base-mask-finetuned/checkpoint-800
Configuration saved in ./vit-base-mask-finetuned/checkpoint-800/config.json
Model weights saved in ./vit-base-mask-finetuned/checkpoint-800/pytorch_model.bin
Configuration saved in ./vit-base-mask-finetuned/checkpoint-800/preprocessor_config.json
Deleting older checkpoint [vit-base-mask-finetuned/checkpoint-400] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./vit-base-mask-finetuned/checkpoint-500 (score: 0.03947526961565018).
Saving model checkpoint to ./vit-base-mask-finetuned
Configuration saved in ./vit-base-mask-finetuned/config.json
Model weights saved in ./vit-base-mask-finetuned/pytorch_model.bin
Configuration saved in ./vit-base-mask-finetuned/preprocessor_config.json


***** train metrics *****
  epoch                    =        0.89
  total_flos               = 923776502GF
  train_loss               =       0.057
  train_runtime            =  0:40:10.40
  train_samples_per_second =      23.943
  train_steps_per_second   =       1.497


In [17]:
# Evaluate the loaded model on test_ds, then print and save the metrics.
# NOTE(review): test_ds is also the Trainer's eval_dataset, which drives
# early stopping and best-checkpoint selection, so this score does not come
# from an untouched hold-out set.
metrics = trainer.evaluate(test_ds)
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

***** Running Evaluation *****
  Num examples = 3596
  Batch size = 8


***** eval metrics *****
  epoch                   =       0.89
  eval_accuracy           =     0.9894
  eval_loss               =     0.0395
  eval_runtime            = 0:00:36.81
  eval_samples_per_second =     97.685
  eval_steps_per_second   =     12.224


In [18]:
# Archive the whole training output directory (recursively) into one zip file.
!zip -r vit-base-mask-finetuned.zip vit-base-mask-finetuned/

  adding: vit-base-mask-finetuned/ (stored 0%)
  adding: vit-base-mask-finetuned/runs/ (stored 0%)
  adding: vit-base-mask-finetuned/runs/Feb17_12-52-55_0ef67e2ff67c/ (stored 0%)
  adding: vit-base-mask-finetuned/runs/Feb17_12-52-55_0ef67e2ff67c/events.out.tfevents.1645102429.0ef67e2ff67c.84.0 (deflated 58%)
  adding: vit-base-mask-finetuned/runs/Feb17_12-52-55_0ef67e2ff67c/1645102429.3300188/ (stored 0%)
  adding: vit-base-mask-finetuned/runs/Feb17_12-52-55_0ef67e2ff67c/1645102429.3300188/events.out.tfevents.1645102429.0ef67e2ff67c.84.1 (deflated 62%)
  adding: vit-base-mask-finetuned/runs/Feb17_13-36-51_0ef67e2ff67c/ (stored 0%)
  adding: vit-base-mask-finetuned/runs/Feb17_13-36-51_0ef67e2ff67c/events.out.tfevents.1645105012.0ef67e2ff67c.84.2 (deflated 65%)
  adding: vit-base-mask-finetuned/runs/Feb17_13-36-51_0ef67e2ff67c/1645105012.531838/ (stored 0%)
  adding: vit-base-mask-finetuned/runs/Feb17_13-36-51_0ef67e2ff67c/1645105012.531838/events.out.tfevents.1645105012.0ef67e2ff67c.84.