In [1]:
import torch
print(torch.cuda.is_available())  # Should return True
print(torch.cuda.get_device_name(0))  # Should display your GPU name


True
NVIDIA GeForce RTX 4060 Laptop GPU


In [1]:
import torch
import numpy as np
from transformers import ViTForImageClassification, ViTFeatureExtractor, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Set dataset directory (WSL2 path)
dataset_path = r"D:\01 STUDY MATERIAL\ai project\Breast-Splitted"

# Load dataset using ImageFolder
dataset = load_dataset('imagefolder', data_dir=dataset_path)

# Load feature extractor
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')

# Define transformation function
def transform(example):
    example['pixel_values'] = feature_extractor(example['image'], return_tensors='pt')['pixel_values'][0]
    return example

# Apply transformations
dataset = dataset.map(transform, remove_columns=['image'])

# Define label mappings
labels = dataset['train'].features['label'].names
num_labels = len(labels)
id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in enumerate(labels)}

# Load ViT model with size mismatch handling
model = ViTForImageClassification.from_pretrained(
    'google/vit-base-patch16-224',
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True  # Added to avoid size mismatch error
)

# Define metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    return {'accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1}

# Define training arguments
training_args = TrainingArguments(
    output_dir='./vit_model',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    fp16=True if torch.cuda.is_available() else False,  # Enable mixed precision if GPU is available
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['val'],
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Save trained model
trainer.save_model('./vit_trained')

  from .autonotebook import tqdm as notebook_tqdm
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyError: 'val'

In [3]:
import torch
import numpy as np
from transformers import ViTForImageClassification, ViTFeatureExtractor, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os

# Define paths
data_dir = 'D:/01 STUDY MATERIAL/ai project/Breast-Splitted'
train_dir = os.path.join(data_dir, 'train')
test_dir = os.path.join(data_dir, 'test')

# Load the dataset
dataset = load_dataset('imagefolder', data_files={
    'train': os.path.join(train_dir, '**/*.png'),
    'test': os.path.join(test_dir, '**/*.png')
})

# Split train dataset into train and validation sets (80-20 split)
dataset = DatasetDict({
    'train': dataset['train'].train_test_split(test_size=0.2, seed=42)['train'],
    'val': dataset['train'].train_test_split(test_size=0.2, seed=42)['test'],
    'test': dataset['test']
})

# Load feature extractor
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')

# Preprocessing function
def preprocess(examples):
    # Directly access lists of images and labels
    images = examples['image']
    inputs = feature_extractor(images, return_tensors='pt')
    inputs['labels'] = examples['label']
    return inputs

# Apply preprocessing
dataset = dataset.map(preprocess, batched=True)

# Get labels
labels = dataset['train'].features['label'].names
num_labels = len(labels)
id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in enumerate(labels)}

# Load ViT model
model = ViTForImageClassification.from_pretrained(
    'google/vit-base-patch16-224',
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

# Define metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    return {'accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1}

# Define training arguments
training_args = TrainingArguments(
    output_dir='./vit_model',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    fp16=True if torch.cuda.is_available() else False,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['val'],
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate on the test set
metrics = trainer.evaluate(dataset['test'])
print(metrics)


Map: 100%|██████████| 3796/3796 [00:59<00:00, 63.54 examples/s]
Map: 100%|██████████| 949/949 [00:14<00:00, 64.00 examples/s]
Map: 100%|██████████| 1583/1583 [00:24<00:00, 63.42 examples/s]
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0022,0.111603,0.971549,0.974474,0.984825,0.979623
2,0.0379,0.056198,0.986301,0.989394,0.990895,0.990144
3,0.0003,0.062891,0.984194,0.980597,0.996965,0.988713
4,0.0389,0.056224,0.984194,0.992355,0.984825,0.988576
5,0.0,0.058119,0.985248,0.983508,0.995448,0.989442


{'eval_loss': 0.09825846552848816, 'eval_accuracy': 0.9709412507896399, 'eval_precision': 0.9797235023041475, 'eval_recall': 0.9779208831646734, 'eval_f1': 0.9788213627992634, 'eval_runtime': 117.6481, 'eval_samples_per_second': 13.455, 'eval_steps_per_second': 1.683, 'epoch': 5.0}
