In [2]:
import torch
from torch import nn

In [None]:
# Hyperparameters, presumably for a hand-written Vision Transformer (they are
# not referenced by any other cell visible in this notebook -- TODO confirm).
learning_rate = 1e-4
num_classes = 10
patch_size = 4
img_size = 28
in_channels = 3
num_heads = 8
dropout = 0.001
# Was 786, which does not split evenly across num_heads (786 / 8 = 98.25).
# Multi-head attention requires the embedding width to be divisible by the
# number of heads, so use 768 (= 8 * 96).
hidden_dim = 768


In [1]:
import timm

In [4]:
# Show every timm model name matching the "vit*" pattern; the bare last
# expression is rendered as the cell output.
print("Available Vision Transformer Models: ")
timm.list_models("vit*")

Available Vision Transformer Models: 


['vit_base_patch8_224',
 'vit_base_patch14_dinov2',
 'vit_base_patch14_reg4_dinov2',
 'vit_base_patch16_18x2_224',
 'vit_base_patch16_224',
 'vit_base_patch16_224_miil',
 'vit_base_patch16_384',
 'vit_base_patch16_clip_224',
 'vit_base_patch16_clip_384',
 'vit_base_patch16_clip_quickgelu_224',
 'vit_base_patch16_gap_224',
 'vit_base_patch16_plus_240',
 'vit_base_patch16_reg4_gap_256',
 'vit_base_patch16_rpn_224',
 'vit_base_patch16_siglip_224',
 'vit_base_patch16_siglip_256',
 'vit_base_patch16_siglip_384',
 'vit_base_patch16_siglip_512',
 'vit_base_patch16_xp_224',
 'vit_base_patch32_224',
 'vit_base_patch32_384',
 'vit_base_patch32_clip_224',
 'vit_base_patch32_clip_256',
 'vit_base_patch32_clip_384',
 'vit_base_patch32_clip_448',
 'vit_base_patch32_clip_quickgelu_224',
 'vit_base_patch32_plus_256',
 'vit_base_r26_s32_224',
 'vit_base_r50_s16_224',
 'vit_base_r50_s16_384',
 'vit_base_resnet26d_224',
 'vit_base_resnet50d_224',
 'vit_giant_patch14_224',
 'vit_giant_patch14_clip_224',
 

In [5]:
import math
import torch
from torch import nn


class NewGELUActivation(nn.Module):
    """
    Tanh-based approximation of the Gaussian Error Linear Unit (GELU).

    This is the variant currently used in the Google BERT repo (identical to
    OpenAI GPT). See the GELU paper: https://arxiv.org/abs/1606.08415

    Taken from https://github.com/huggingface/transformers/blob/main/src/transformers/activations.py
    """

    def forward(self, input):
        # gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))
        scale = math.sqrt(2.0 / math.pi)
        cubic_term = 0.044715 * torch.pow(input, 3.0)
        gate = 1.0 + torch.tanh(scale * (input + cubic_term))
        return 0.5 * input * gate



In [6]:
class PatchEmbeddings(nn.Module):
    """
    Cut an image into square patches and embed each patch as a vector.

    A single convolution whose kernel size and stride both equal the patch size
    performs the patch split and the projection to ``hidden_size`` in one step.

    Args:
        config: mapping providing ``"image_size"``, ``"patch_size"``,
            ``"num_channels"`` and ``"hidden_size"``.
    """

    def __init__(self, config):
        super().__init__()
        # Copy the relevant settings onto the module under the same names.
        for key in ("image_size", "patch_size", "num_channels", "hidden_size"):
            setattr(self, key, config[key])
        # Patches per side, squared, gives the total patch count.
        patches_per_side = self.image_size // self.patch_size
        self.num_patches = patches_per_side ** 2
        # Non-overlapping conv: one output position per patch, hidden_size channels.
        self.projection = nn.Conv2d(
            in_channels=self.num_channels,
            out_channels=self.hidden_size,
            kernel_size=self.patch_size,
            stride=self.patch_size,
        )

    def forward(self, x):
        """
        Map (batch_size, num_channels, image_size, image_size) to
        (batch_size, num_patches, hidden_size).
        """
        feature_map = self.projection(x)
        # Collapse the two spatial axes, then put the patch axis before channels.
        return feature_map.flatten(start_dim=2).transpose(1, 2)

In [7]:
from datasets import load_dataset

In [8]:
import os

# Root folder of the image-folder dataset. The original cell hard-coded an
# absolute path on the author's machine; it is kept as the default so existing
# behavior is unchanged, but can now be overridden by setting the
# BRAIN_TUMOR_DATA_DIR environment variable before starting the kernel.
data_dir = os.environ.get(
    "BRAIN_TUMOR_DATA_DIR",
    r"C:\Users\abdullah\projects\Brain Tumor\Brain-Tumor-Image-Classification-Project\data\Brain Tumor Classification (MRI)",
)
dataset = load_dataset("imagefolder", data_dir=data_dir)
dataset

Resolving data files:   0%|          | 0/2870 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/394 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/2870 [00:00<?, ?files/s]

Downloading data:   0%|          | 0/394 [00:00<?, ?files/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 2870
    })
    validation: Dataset({
        features: ['image', 'label'],
        num_rows: 394
    })
})

In [11]:
# Peek at a single training example (index 2800) to see its 'image' and 'label' fields.
dataset['train'][2800]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x512>,
 'label': 3}

In [12]:
# Build the class-name <-> id lookup tables from the dataset's label feature.
# Ids are stored as strings in both directions.
labels = dataset["train"].features["label"].names
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}
print(label2id)
print(id2label)

{'glioma_tumor': '0', 'meningioma_tumor': '1', 'no_tumor': '2', 'pituitary_tumor': '3'}
{'0': 'glioma_tumor', '1': 'meningioma_tumor', '2': 'no_tumor', '3': 'pituitary_tumor'}


In [13]:
from transformers import AutoImageProcessor

In [14]:
# Checkpoint identifier; reused by the cell that loads the classification model.
checkpoint = "google/vit-base-patch16-224-in21k"
# Image processor for this checkpoint; its image_mean / image_std / size are
# read by the torchvision transform cell.
image_processor = AutoImageProcessor.from_pretrained(checkpoint)



preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

In [15]:
from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor

# Normalise with the mean / std stored on the image processor.
normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)

# Crop size: a single "shortest_edge" value when the processor defines one,
# otherwise an explicit (height, width) pair.
if "shortest_edge" in image_processor.size:
    size = image_processor.size["shortest_edge"]
else:
    size = (image_processor.size["height"], image_processor.size["width"])

# Random crop + resize, convert to tensor, then normalise.
_transforms = Compose(
    [
        RandomResizedCrop(size),
        ToTensor(),
        normalize,
    ]
)

In [16]:
def transforms(examples):
    """
    Turn a batch of images into model-ready tensors.

    Each entry of ``examples["image"]`` is converted to RGB and passed through
    ``_transforms``; the results are stored under ``"pixel_values"`` and the
    ``"image"`` entry is removed. The same (mutated) batch is returned.
    """
    pixel_values = []
    for img in examples["image"]:
        rgb_image = img.convert("RGB")
        pixel_values.append(_transforms(rgb_image))
    examples["pixel_values"] = pixel_values
    del examples["image"]
    return examples

In [17]:
# Attach the `transforms` function to the dataset and rebind `dataset` to the result.
# NOTE(review): `dataset` holds both the train and validation splits, and
# `_transforms` includes RandomResizedCrop, so validation images presumably get
# the same random cropping as training images -- confirm this is intended.
dataset = dataset.with_transform(transforms)

In [18]:
from transformers import DefaultDataCollator

# Collator instance handed to the Trainer (as `data_collator=`) to assemble batches.
data_collator = DefaultDataCollator()

In [4]:
import numpy as np


def compute_metrics(eval_pred):
    """
    Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: pair ``(predictions, labels)``. ``predictions`` holds one row
            of per-class scores per example; ``labels`` holds the true class ids.

    Returns:
        dict with a single ``"accuracy"`` entry: the fraction of examples whose
        highest-scoring class equals the label.
    """
    predictions, labels = eval_pred
    predicted_ids = np.argmax(predictions, axis=1)
    # The original called `accuracy_score`, which is never imported in this
    # notebook and would raise NameError on the first evaluation. Accuracy is
    # the mean of the element-wise matches, so compute it directly with NumPy.
    matches = predicted_ids == np.asarray(labels)
    return dict(accuracy=float(np.mean(matches)))

In [24]:
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer

# Load the pretrained checkpoint with a classification head sized to this
# dataset's label set. The cell output reports that classifier.weight and
# classifier.bias are newly initialized, so the model must be fine-tuned
# before its predictions are meaningful.
model = AutoModelForImageClassification.from_pretrained(
    checkpoint,
    # One output class per dataset label.
    num_labels=len(labels),
    # Label maps built in the earlier cell (ids are strings there).
    id2label=id2label,
    label2id=label2id,
)



model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
# Fine-tuning configuration.
# NOTE(review): no evaluation schedule is configured here and the training log
# below shows only loss entries, so `eval_dataset`, `compute_metrics` and
# `metric_for_best_model` appear to go unused during `train()` -- confirm.
training_args = TrainingArguments(
    # NOTE(review): the "food" name looks like a tutorial leftover for a
    # brain-tumor classifier -- confirm before renaming (checkpoints land here).
    output_dir="my_awesome_food_model",
    # Keep columns the model signature does not list; the on-the-fly transform
    # presumably still needs the raw "image" column -- TODO confirm.
    remove_unused_columns=False,
    save_strategy="epoch",
    learning_rate=5e-5,
    # 16 samples per step x 4 accumulation steps = 64 samples per optimizer update.
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    warmup_ratio=0.1,
    logging_steps=10,
    metric_for_best_model="accuracy",
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    # The image processor is supplied through the `tokenizer` argument.
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

  0%|          | 0/135 [00:00<?, ?it/s]

{'loss': 1.3621, 'grad_norm': 0.6295940279960632, 'learning_rate': 3.571428571428572e-05, 'epoch': 0.22}
{'loss': 1.1915, 'grad_norm': 1.2261874675750732, 'learning_rate': 4.75206611570248e-05, 'epoch': 0.44}
{'loss': 0.947, 'grad_norm': 1.285302996635437, 'learning_rate': 4.338842975206612e-05, 'epoch': 0.67}
{'loss': 0.7672, 'grad_norm': 3.0426669120788574, 'learning_rate': 3.925619834710744e-05, 'epoch': 0.89}
{'loss': 0.6362, 'grad_norm': 2.086291551589966, 'learning_rate': 3.512396694214876e-05, 'epoch': 1.11}
{'loss': 0.5645, 'grad_norm': 1.5295226573944092, 'learning_rate': 3.099173553719008e-05, 'epoch': 1.33}
{'loss': 0.4915, 'grad_norm': 2.0500593185424805, 'learning_rate': 2.6859504132231405e-05, 'epoch': 1.56}
{'loss': 0.4359, 'grad_norm': 1.4621026515960693, 'learning_rate': 2.272727272727273e-05, 'epoch': 1.78}
{'loss': 0.4127, 'grad_norm': 2.313804864883423, 'learning_rate': 1.859504132231405e-05, 'epoch': 2.0}
{'loss': 0.3938, 'grad_norm': 1.969889760017395, 'learning_r

TrainOutput(global_step=135, training_loss=0.6224458659136737, metrics={'train_runtime': 212.581, 'train_samples_per_second': 40.502, 'train_steps_per_second': 0.635, 'total_flos': 6.672179904948634e+17, 'train_loss': 0.6224458659136737, 'epoch': 3.0})