In [10]:
!pip install transformers datasets torch




In [11]:
from datasets import load_dataset

train_ds, test_ds = load_dataset("cifar10", split=["train[:5000]", "test[:2000]"])
splits = train_ds.train_test_split(test_size=0.1)
train_ds, val_ds = splits["train"], splits["test"]

In [19]:
from transformers import ViTImageProcessor, ViTForImageClassification
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score
import numpy as np

In [20]:
model_name = "google/vit-base-patch16-224"
processor = ViTImageProcessor.from_pretrained(model_name)

In [39]:
def preprocess(examples):
    processed = processor(
        images=[image.convert("RGB") for image in examples["img"]],
        return_tensors="pt"
    )
    return {
        "pixel_values": processed["pixel_values"],
        "label": examples["label"]
    }


In [38]:
print(type(train_ds[0]["pixel_values"]))


<class 'list'>


In [31]:
train_ds = train_ds.map(preprocess, batched=True)
val_ds = val_ds.map(preprocess, batched=True)
test_ds = test_ds.map(preprocess, batched=True)


Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [40]:
train_ds = train_ds.map(preprocess, batched=True).remove_columns("img")
train_ds.set_format("torch", columns=["pixel_values", "label"])

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

In [42]:
model = ViTForImageClassification.from_pretrained(
    model_name,
    num_labels=10,
    id2label={i: str(i) for i in range(10)},
    ignore_mismatched_sizes=True
)


Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
training_args = TrainingArguments(
    output_dir="./vit-cifar10",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    logging_dir="./logs",
    remove_unused_columns=False
)



In [44]:
import torch
def collate_fn(batch):
    return {
        "pixel_values": torch.stack([x["pixel_values"] for x in batch]),
        "labels": torch.tensor([x["label"] for x in batch])
    }

# Define accuracy metric
def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return {"accuracy": accuracy_score(predictions, eval_pred.label_ids)}

In [45]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    tokenizer=processor,
)

# Start training
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss


TypeError: expected Tensor as element 0 in argument 0, but got list