In [1]:
from datasets import load_dataset

# Load CIFAR-10 dataset from Hugging Face datasets
dataset = load_dataset("cifar10")

In [2]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['img', 'label'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['img', 'label'],
        num_rows: 10000
    })
})


In [3]:
print(dataset['train'])

Dataset({
    features: ['img', 'label'],
    num_rows: 50000
})


In [4]:
print(dataset['train'][0])

{'img': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32 at 0x7FF71C2409A0>, 'label': 0}


img  - PIL image 32x32
label - int

In [5]:
train_dataset = dataset["train"].shuffle(seed=42).select(range(500))  # Select 500 examples for training
test_dataset = dataset["test"].shuffle(seed=42).select(range(500))  # Select 500 examples for testing

In [6]:
train_dataset[0]

{'img': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=32x32>,
 'label': 1}

In [7]:
from transformers import AutoImageProcessor
image_processor = AutoImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')

In [10]:
examples= train_dataset[0]
transformed = image_processor(examples['img'], return_tensors='pt')
print(transformed["pixel_values"].shape)

torch.Size([1, 3, 224, 224])


In [11]:
# def transform(examples):
#     transformed = image_processor(examples['img'], return_tensors='pt')
#     # transformed['labels'] = examples['label']
#     return transformed

In [19]:
def transform(example_batch):
    inputs = image_processor([x for x in example_batch['img']], return_tensors='pt')
    inputs['labels'] = example_batch['label']
    return inputs

In [20]:
# transformed_dataset = dataset.map(transform)
train_dataset = train_dataset.with_transform(transform)
test_dataset = test_dataset.with_transform(transform)

In [21]:
train_dataset[0:2]

{'pixel_values': tensor([[[[-0.7490, -0.7490, -0.7490,  ...,  0.2941,  0.2941,  0.2941],
          [-0.7490, -0.7490, -0.7490,  ...,  0.2941,  0.2941,  0.2941],
          [-0.7490, -0.7490, -0.7490,  ...,  0.2941,  0.2941,  0.2941],
          ...,
          [-0.5451, -0.5451, -0.5451,  ...,  0.0196,  0.0196,  0.0196],
          [-0.5451, -0.5451, -0.5451,  ...,  0.0196,  0.0196,  0.0196],
          [-0.5451, -0.5451, -0.5451,  ...,  0.0196,  0.0196,  0.0196]],

         [[-0.7804, -0.7804, -0.7804,  ...,  0.1529,  0.1529,  0.1529],
          [-0.7804, -0.7804, -0.7804,  ...,  0.1529,  0.1529,  0.1529],
          [-0.7804, -0.7804, -0.7804,  ...,  0.1529,  0.1529,  0.1529],
          ...,
          [-0.6471, -0.6471, -0.6471,  ..., -0.1451, -0.1451, -0.1451],
          [-0.6471, -0.6471, -0.6471,  ..., -0.1451, -0.1451, -0.1451],
          [-0.6471, -0.6471, -0.6471,  ..., -0.1451, -0.1451, -0.1451]],

         [[-0.8039, -0.8039, -0.8039,  ..., -0.3333, -0.3333, -0.3333],
          [-0

In [22]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [23]:
from torch.utils.data import DataLoader
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)

In [33]:
for batch in train_loader:
    for p in batch['pixel_values']:
        print(p)
        print(p.shape)
        print('---------')
    for l in batch["labels"]:
        print(l)
        print("-------")
    
    inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
    print(inputs)
    print("Inputs -------")
    
    print(inputs['pixel_values'].shape)
    break

tensor([[[-0.6706, -0.6706, -0.6706,  ..., -0.6392, -0.6392, -0.6392],
         [-0.6706, -0.6706, -0.6706,  ..., -0.6392, -0.6392, -0.6392],
         [-0.6706, -0.6706, -0.6706,  ..., -0.6392, -0.6392, -0.6392],
         ...,
         [-0.4510, -0.4510, -0.4510,  ..., -0.3333, -0.3333, -0.3333],
         [-0.4510, -0.4510, -0.4510,  ..., -0.3333, -0.3333, -0.3333],
         [-0.4510, -0.4510, -0.4510,  ..., -0.3333, -0.3333, -0.3333]],

        [[-0.7020, -0.7020, -0.7020,  ..., -0.5686, -0.5686, -0.5686],
         [-0.7020, -0.7020, -0.7020,  ..., -0.5686, -0.5686, -0.5686],
         [-0.7020, -0.7020, -0.7020,  ..., -0.5686, -0.5686, -0.5686],
         ...,
         [-0.1765, -0.1765, -0.1765,  ..., -0.0196, -0.0196, -0.0196],
         [-0.1765, -0.1765, -0.1765,  ..., -0.0196, -0.0196, -0.0196],
         [-0.1765, -0.1765, -0.1765,  ..., -0.0196, -0.0196, -0.0196]],

        [[-0.9373, -0.9373, -0.9373,  ..., -0.5922, -0.5922, -0.5922],
         [-0.9373, -0.9373, -0.9373,  ..., -0

In [26]:
from transformers import ViTForImageClassification, ViTFeatureExtractor, ViTImageProcessor
from torch.optim import AdamW
import torch.nn as nn

model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224-in21k', num_labels=10)
model.to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=5e-5)



Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
for epoch in range(1):  # Example: 3 epochs
    model.train()
    for batch in train_loader:        
        inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
        # inputs = batch['pixel_values'].to(device)
        labels = batch['labels'].to(device)
        optimizer.zero_grad()
        outputs = model(pixel_values = inputs['pixel_values'])
        
        loss = loss_fn(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        print(f"Batch {epoch}, Loss: {loss.item()}")
    print(f"Epoch {epoch}, Loss: {loss.item()}")

IndexError: too many indices for tensor of dimension 4