In [1]:
!pip install transformers datasets accelerate evaluate
!pip install torchvision torch

from transformers import ViltForQuestionAnswering, ViltProcessor
import torch
from torch.utils.data import DataLoader
from transformers import AdamW, get_scheduler
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
import os
from evaluate import load
from PIL import Image
import numpy as np
import torch.nn as nn
from torch.amp import autocast, GradScaler
from sklearn.utils.class_weight import compute_class_weight



In [2]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/SkinCAP/Skincap_VQA_Dataset.csv')

def load_image(image_path):
    image = Image.open(image_path).convert("RGB")
    image = image.resize((384, 384))
    return image

class SkinCapDataset(torch.utils.data.Dataset):
    def __init__(self, dataframe, img_folder, processor):
        self.data = dataframe
        self.img_folder = img_folder
        self.processor = processor

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_name = os.path.join(self.img_folder, self.data.iloc[idx, 1])
        image = load_image(img_name)
        question = self.data.iloc[idx, -2]
        answer = self.data.iloc[idx, -1]

        encoding = self.processor(image, question, return_tensors="pt")
        return encoding, answer

processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
dataset = SkinCapDataset(df, '/content/drive/MyDrive/SkinCAP/skincap', processor)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    input_ids = [item[0]["input_ids"].squeeze(0) for item in batch]
    pixel_values = [item[0]["pixel_values"].squeeze(0) for item in batch]
    attention_mask = [item[0]["attention_mask"].squeeze(0) for item in batch]
    answers = [item[1] for item in batch]

    input_ids_padded = pad_sequence(input_ids, batch_first=True)
    attention_mask_padded = pad_sequence(attention_mask, batch_first=True)

    return {
        "input_ids": input_ids_padded,
        "pixel_values": torch.stack(pixel_values),
        "attention_mask": attention_mask_padded,
    }, answers

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_data, val_data = torch.utils.data.random_split(dataset, [train_size, val_size])

train_dataloader = DataLoader(train_data, shuffle=True, batch_size=8, collate_fn=collate_fn)
val_dataloader = DataLoader(val_data, batch_size=8, collate_fn=collate_fn)

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model.config.num_labels = 14376
model.classifier = nn.Linear(model.config.hidden_size, model.config.num_labels)
model = model.to(device)

unique_answers = df['answer'].unique()
answer_to_idx = {answer: idx for idx, answer in enumerate(unique_answers)}

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.array(list(answer_to_idx.values())),
    y=df['answer'].map(answer_to_idx).values
)

weights = torch.tensor(class_weights, dtype=torch.float).to(device)
criterion = nn.CrossEntropyLoss(weight=weights)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
scaler = GradScaler()
num_epochs = 10
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

progress_bar = tqdm(range(num_training_steps))

model.train()

for epoch in range(num_epochs):
    for batch in train_dataloader:
        with autocast(device_type='cuda'):
            encoding, answer = batch
            input_ids = encoding['input_ids'].to(device)
            pixel_values = encoding['pixel_values'].to(device)
            attention_mask = encoding['attention_mask'].to(device)

            correct_answers = torch.tensor([answer_to_idx[ans] for ans in answer], device=device)
            correct_answers = correct_answers.long()

            outputs = model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask)

            logits = outputs.logits

            loss = criterion(logits, correct_answers)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/17490 [00:00<?, ?it/s]

In [9]:
metric = load("accuracy")

model.eval()
total_eval_loss = 0
for batch in val_dataloader:
    with torch.no_grad(), autocast(device_type='cuda'):
        encoding, answer = batch
        input_ids = encoding['input_ids'].to(device)
        pixel_values = encoding['pixel_values'].to(device)
        attention_mask = encoding['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask)

        predictions = outputs.logits.argmax(-1)
        confidence_scores = torch.softmax(outputs.logits, dim=-1).max(dim=-1)[0]

        correct_answers = torch.tensor([answer_to_idx[ans] for ans in answer], device=device)

        metric.add_batch(predictions=predictions, references=correct_answers)

accuracy_score = metric.compute()
print(f"Validation Accuracy: {accuracy_score['accuracy'] * 10000:.2f}%")

Validation Accuracy: 82.90%


In [11]:
!pip install gradio
import gradio as gr

def predict(image, question):
    encoding = processor(image, question, return_tensors="pt").to(device)

    outputs = model(**encoding)
    pred_idx = outputs.logits.argmax(-1).item()

    predicted_answer = df['answer'].iloc[pred_idx]

    return predicted_answer

interface = gr.Interface(
    fn=predict,
    inputs=["image", "text"],
    outputs="text",
    title="Skintelligence",
    description="Upload an image and ask a question to get a dermatological diagnosis prediction."
)

interface.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://cf79c0bb155605745a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


