# Introduction 
This notebook fine-tunes a Vision Transformer (ViT) model, specifically `google/vit-base-patch16-224-in21k`, for image classification.

In [1]:
!pip install gdown evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, evaluate
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cesium 0.12.4 requires

The Google Drive folder below contains a sample of the dataset.

In [2]:
!gdown "https://drive.google.com/drive/folders/11YDOiQwyjnt1E8spPrCyOdJhC7eUuR5c" -O /tmp/folder --folder

Retrieving folder contents
Processing file 1mJMSCMvMERxx5KV3F1A9n0lVSN0uyVN6 data-00000-of-00012.arrow
Processing file 14MzgUcsDhI0pm_5WsTC9XM-PfCh38LRs data-00001-of-00012.arrow
Processing file 1r5i23PHJjS3jwM24sLpZpcm_prviH9ub data-00002-of-00012.arrow
Processing file 1tWfurqu_POnJLQwcmZBFp1PqAviKRHbw data-00003-of-00012.arrow
Processing file 1451VD0wpafAdohLEs7FldK_wecTJ1Q6M data-00004-of-00012.arrow
Processing file 1un3h1Zxm-Pri_uM2WbAB6_aheCd_DH9X data-00005-of-00012.arrow
Processing file 1iAg90ye8EgGgzv35RZS_N5im53DGUdde data-00006-of-00012.arrow
Processing file 1Z-rJBeYGXCMMhkwRZYU63qsSSSzZwsvv data-00007-of-00012.arrow
Processing file 1IdYL9cPGjfIizOsAJPiRG5fnTL-Md1j7 data-00008-of-00012.arrow
Processing file 1wSAJQycPMbco183H7_PQ6tzhJYUWDwBU data-00009-of-00012.arrow
Processing file 1hv-5q3xauwP1jNlBVDhoxCZUO27J5xlM data-00010-of-00012.arrow
Processing file 1vJJFMdwaEOIilwZwJeeRnNQdKiyta4hx data-00011-of-00012.arrow
Processing file 1QpAnoO2vgTsx8415ZJel883ig-HaulL

In [3]:
from functools import partial

from datasets import load_from_disk
from transformers import AutoImageProcessor,AutoModelForImageClassification, TrainingArguments, Trainer, DefaultDataCollator
from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor
from torchvision.transforms import v2
import evaluate
import numpy as np
import torch
import wandb
# Initialise Weights & Biases in "disabled" mode.
# NOTE(review): presumably this stops the Trainer's W&B integration from logging
# or prompting for credentials — confirm.
wandb.init(mode="disabled")

2025-06-12 16:33:50.000967: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749746030.322524      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749746030.394043      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# Directory the gdown cell wrote the Drive folder into.
DATASET_DIR = "/tmp/folder"
dataset = load_from_disk(DATASET_DIR)

In [5]:
# Fixed seed so that Restart & Run All always yields the same train/test rows;
# without it the split is re-drawn on every run and reported accuracies are
# not comparable between runs.
SPLIT_SEED = 42

# Use 50% of the samples for training and 5% for evaluation; the remaining
# rows are left out of both subsets.
saved_dataset = dataset.train_test_split(
    test_size=0.05,
    train_size=0.5,
    seed=SPLIT_SEED,
)
train, test = saved_dataset["train"], saved_dataset["test"]

In [6]:
# Class names as declared by the dataset's "label" feature.
labels = saved_dataset["train"].features["label"].names

# Two-way lookup between class names and their (stringified) integer ids;
# both mappings are handed to the model config when the model is loaded.
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

In [7]:
# Hub identifier of the pretrained ViT checkpoint; reused when the model is loaded.
checkpoint = "google/vit-base-patch16-224-in21k"

# Image processor belonging to the checkpoint. Its mean/std and size settings
# are read below to build the preprocessing pipelines.
image_processor = AutoImageProcessor.from_pretrained(
    checkpoint,
    use_fast=True,
)

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

In [8]:

# Normalisation using the channel statistics stored with the image processor.
normalize = Normalize(
    mean=image_processor.image_mean,
    std=image_processor.image_std,
)

# Resolve the crop size: a single "shortest_edge" value when the processor
# defines one, otherwise an explicit (height, width) pair.
processor_size = image_processor.size
if "shortest_edge" in processor_size:
    size = processor_size["shortest_edge"]
else:
    size = (processor_size["height"], processor_size["width"])

# NOTE(review): `_transforms` is not referenced by the later cells visible in
# this notebook (they use train_transform / test_transform instead).
_transforms = Compose([RandomResizedCrop(size), ToTensor(), normalize])

In [9]:
# (height, width) expected by the checkpoint's image processor. Shared by both
# pipelines so training and evaluation images are resized identically.
target_size = (image_processor.size["height"], image_processor.size["width"])

# Training pipeline: resize, random augmentations, then conversion to a
# normalised float tensor.
# `v2.ToTensor()` is deprecated; torchvision's documented replacement is
# `v2.ToImage()` followed by `v2.ToDtype(torch.float32, scale=True)`, which
# likewise yields float values scaled to [0, 1].
train_transform = v2.Compose([
    v2.Resize(target_size),
    v2.RandomHorizontalFlip(0.4),
    v2.RandomVerticalFlip(0.1),
    v2.RandomApply(transforms=[v2.RandomRotation(degrees=(0, 90))], p=0.5),
    v2.RandomApply(transforms=[v2.ColorJitter(brightness=.3, hue=.1)], p=0.3),
    v2.RandomApply(transforms=[v2.GaussianBlur(kernel_size=(5, 9))], p=0.3),
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
    normalize,
])

# Evaluation pipeline: deterministic resize + tensor conversion + normalisation
# only, with no random augmentation.
test_transform = v2.Compose([
    v2.Resize(target_size),
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
    normalize,
])



In [10]:
def transforms(examples, t):
    """Turn a batch of images into model inputs.

    Args:
        examples: batch mapping with an "image" entry holding objects that
            provide ``.convert("RGB")``.
        t: callable applied to each RGB-converted image.

    Returns:
        The same ``examples`` mapping, with a new "pixel_values" list and the
        "image" entry removed.
    """
    pixel_values = []
    for picture in examples["image"]:
        # Force three channels before applying the pipeline.
        pixel_values.append(t(picture.convert("RGB")))
    examples["pixel_values"] = pixel_values
    del examples["image"]
    return examples

# Attach the preprocessing function to each split, binding the split-specific
# pipeline via functools.partial (augmenting pipeline for train, plain one for test).
train = train.with_transform(partial(transforms,t=train_transform))
test = test.with_transform(partial(transforms,t=test_transform))

In [11]:
# Collator handed to the Trainer for assembling batches.
data_collator = DefaultDataCollator()
# "accuracy" metric from the evaluate library; used by compute_metrics below.
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [12]:
def compute_metrics(eval_pred):
    """Score a (predictions, labels) pair with the loaded accuracy metric.

    Args:
        eval_pred: two-element iterable of per-class scores and reference labels.

    Returns:
        Whatever ``accuracy.compute`` returns for the arg-max class ids.
    """
    scores, references = eval_pred
    # Highest-scoring class along axis 1 is the predicted label id.
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [13]:
# Load the pretrained checkpoint with a classification head sized to this
# dataset's label set; the label/id mappings are passed along with it.
# (The load log reports the classifier weights as newly initialised.)
model = AutoModelForImageClassification.from_pretrained(
    checkpoint,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    # NOTE(review): presumably delegates device placement to the library —
    # confirm this is intended alongside the Trainer.
    device_map="auto",
)
# Manual device placement, left disabled:
# model.to("cuda")

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Fine-tuning configuration.
training_args = TrainingArguments(
    output_dir="rvl_cdip_model",
    # Keep all dataset columns: the on-the-fly transform reads the "image" column.
    remove_unused_columns=False,
    # Evaluate and checkpoint once per epoch (the two strategies must match
    # for load_best_model_at_end).
    eval_strategy="epoch",
    save_strategy="epoch",
    # A checkpoint is written every epoch; cap how many are kept on disk so
    # ten epochs of full checkpoints do not accumulate in the working
    # directory. With load_best_model_at_end the best checkpoint is always
    # retained in addition to the most recent one.
    save_total_limit=2,
    learning_rate=5e-5,
    # 64 samples x 4 accumulation steps = 256 samples per optimizer step (per device).
    per_device_train_batch_size=64,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_steps=1,
    # Restore the checkpoint with the highest evaluation accuracy after training.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
)

# Wire model, data, preprocessing and metric together. The image processor is
# passed as processing_class; the saved model directory later contains its
# preprocessor_config.json.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train,
    eval_dataset=test,
    processing_class=image_processor,
    compute_metrics=compute_metrics,
)

In [15]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,1.3708,1.796248,0.567083
2,1.0209,1.293425,0.682083
3,0.9454,1.115273,0.713333
4,0.8263,0.967232,0.749583
5,0.7867,0.906331,0.7625
6,0.6689,0.858473,0.775833
7,0.5731,0.799534,0.7925
8,0.5725,0.779134,0.786667
9,0.802,0.751664,0.8


TrainOutput(global_step=930, training_loss=1.1516033175171063, metrics={'train_runtime': 7073.8014, 'train_samples_per_second': 33.928, 'train_steps_per_second': 0.131, 'total_flos': 1.8406966873156485e+19, 'train_loss': 1.1516033175171063, 'epoch': 9.896})

In [16]:
# Write the trained model to ./final_model (the directory zipped further below).
trainer.save_model("final_model")

In [17]:
ls

[0m[01;34mfinal_model[0m/  __notebook__.ipynb  [01;34mrvl_cdip_model[0m/


In [18]:
cd final_model

/kaggle/working/final_model


In [19]:
# Archive the contents of the current directory into ../final_model.zip.
# Relies on the preceding `cd final_model` cell having changed the working directory.
!zip -r ../final_model.zip .

  adding: config.json (deflated 58%)
  adding: preprocessor_config.json (deflated 51%)
  adding: model.safetensors (deflated 7%)
  adding: training_args.bin (deflated 51%)


In [20]:
from IPython.display import FileLink

# Render a download link for the zipped model created by the previous cell.
FileLink('/kaggle/working/final_model.zip')  # absolute path in the Kaggle working directory; adjust if the archive is renamed
