In [6]:
!pip install transformers rouge_score evaluate datasets

Collecting transformers
  Using cached transformers-4.46.2-py3-none-any.whl.metadata (44 kB)
Collecting evaluate
  Using cached evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets
  Using cached datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Using cached transformers-4.46.2-py3-none-any.whl (10.0 MB)
Using cached evaluate-0.4.3-py3-none-any.whl (84 kB)
Using cached datasets-3.1.0-py3-none-any.whl (480 kB)
Installing collected packages: transformers, datasets, evaluate
Successfully installed datasets-3.1.0 evaluate-0.4.3 transformers-4.46.2


In [7]:
pip install transformer-utils

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import torch
from PIL import Image
import pandas as pd
from tqdm import tqdm
from transformers import VisionEncoderDecoderModel, GPT2TokenizerFast, ViTImageProcessor, AutoTokenizer
from PIL import UnidentifiedImageError
from torchvision.transforms import ToTensor
import concurrent.futures
import functools
import evaluate
from torch.utils.data import DataLoader

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


In [2]:
encoder_model = "microsoft/swin-base-patch4-window7-224-in22k"
decoder_model = "gpt2"

model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_model, decoder_model
).to(device)

tokenizer = GPT2TokenizerFast.from_pretrained(decoder_model)
image_processor = ViTImageProcessor.from_pretrained(encoder_model)

# Configure tokenizer
if "gpt2" in decoder_model:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.eos_token_id = tokenizer.eos_token_id
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.decoder_start_token_id = tokenizer.bos_token_id

max_length = 128  # Maximum length for captions

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.crossattention.c_attn.bias', 'h.0.crossattention.c_attn.weight', 'h.0.crossattention.c_proj.bias', 'h.0.crossattention.c_proj.weight', 'h.0.crossattention.q_attn.bias', 'h.0.crossattention.q_attn.weight', 'h.0.ln_cross_attn.bias', 'h.0.ln_cross_attn.weight', 'h.1.crossattention.c_attn.bias', 'h.1.crossattention.c_attn.weight', 'h.1.crossattention.c_proj.bias', 'h.1.crossattention.c_proj.weight', 'h.1.crossattention.q_attn.bias', 'h.1.crossattention.q_attn.weight', 'h.1.ln_cross_attn.bias', 'h.1.ln_cross_attn.weight', 'h.10.crossattention.c_attn.bias', 'h.10.crossattention.c_attn.weight', 'h.10.crossattention.c_proj.bias', 'h.10.crossattention.c_proj.weight', 'h.10.crossattention.q_attn.bias', 'h.10.crossattention.q_attn.weight', 'h.10.ln_cross_attn.bias', 'h.10.ln_cross_attn.weight', 'h.11.crossattention.c_attn.bias', 'h.11.crossattention.c_attn.weight', 'h.11.crossat

In [7]:
def process_single_image(image_path, caption):
    """Process a single image and return the result or None if failed"""
    try:
        if not os.path.exists(image_path):
            return None

        image = Image.open(image_path).convert('RGB')
        image_tensor = ToTensor()(image)

        if image_tensor.shape[0] != 3:
            return None

        return {
            'image': image_tensor,
            'sentences': [{'raw': caption}]
        }
    except Exception:
        return None

def load_dataset_parallel(csv_path, image_folder, max_samples=None, num_workers=4):
    """Load dataset using parallel processing"""
    print(f"Reading CSV file: {csv_path}")
    df = pd.read_csv(csv_path)

    if max_samples:
        df = df.head(max_samples)

    total = len(df)
    dataset = []

    # Create list of (image_path, caption) tuples
    image_caption_pairs = [
        (os.path.join(image_folder, row['name']), row['caption'])
        for _, row in df.iterrows()
    ]

    # Process images in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        results = list(tqdm(
            executor.map(lambda x: process_single_image(*x), image_caption_pairs),
            total=len(image_caption_pairs),
            desc="Processing images"
        ))

    # Filter out None results and create final dataset
    dataset = [item for item in results if item is not None]

    print(f"Successfully loaded {len(dataset)} valid images out of {total} total entries")
    return dataset

In [8]:
# Set paths

train_csv = "D:/Dataset_ROCO/ROCO/train/radiology/traindata.csv"
train_img_folder = "D:/Dataset_ROCO/ROCO/train/radiology/images"
val_csv = "D:/Dataset_ROCO/ROCO/validation/radiology/valdata.csv"
val_img_folder = "D:/Dataset_ROCO/ROCO/validation/radiology/images"
test_csv = "D:/Dataset_ROCO/ROCO/test/radiology/testdata.csv"
test_img_folder = "D:/Dataset_ROCO/ROCO/test/radiology/images"

print("Loading training dataset...")
train_ds = load_dataset_parallel(
    train_csv,
    train_img_folder,
    max_samples=10000,
    num_workers=2
)

print("\nLoading validation dataset...")
valid_ds = load_dataset_parallel(
    val_csv,
    val_img_folder,
    max_samples=2000,
    num_workers=2
)

print("\nLoading test dataset...")
test_ds = load_dataset_parallel(
    test_csv,
    test_img_folder,
    max_samples=2000,
    num_workers=2
)

Loading training dataset...
Reading CSV file: D:/Dataset_ROCO/ROCO/train/radiology/traindata.csv


Processing images:  38%|███▊      | 3844/10000 [00:26<01:23, 73.50it/s] 

: 

In [None]:
'''import pickle

def save_dataset(dataset, filename):
    with open(filename, 'wb') as f:
        pickle.dump(dataset, f)

def load_saved_dataset(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)

# After processing, save datasets
save_dataset(train_ds, 'train_dataset.pkl')
save_dataset(valid_ds, 'valid_dataset.pkl')
save_dataset(test_ds, 'test_dataset.pkl')

# In future runs, load pre-processed datasets
train_ds = load_saved_dataset('train_dataset.pkl')
valid_ds = load_saved_dataset('valid_dataset.pkl')
test_ds = load_saved_dataset('test_dataset.pkl')'''

"import pickle\n\ndef save_dataset(dataset, filename):\n    with open(filename, 'wb') as f:\n        pickle.dump(dataset, f)\n\ndef load_saved_dataset(filename):\n    with open(filename, 'rb') as f:\n        return pickle.load(f)\n\n# After processing, save datasets\nsave_dataset(train_ds, 'train_dataset.pkl')\nsave_dataset(valid_ds, 'valid_dataset.pkl')\nsave_dataset(test_ds, 'test_dataset.pkl')\n\n# In future runs, load pre-processed datasets\ntrain_ds = load_saved_dataset('train_dataset.pkl')\nvalid_ds = load_saved_dataset('valid_dataset.pkl')\ntest_ds = load_saved_dataset('test_dataset.pkl')"

In [None]:
max_length = 128

def preprocess(items):
    try:

        pixel_values = image_processor(items["image"], return_tensors="pt").pixel_values

        pixel_values = pixel_values.squeeze(0)


        caption = items["sentences"][0]["raw"]
        target = tokenizer(
            caption,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        labels = target["input_ids"].squeeze(0)

        return {
            'pixel_values': pixel_values,
            'labels': labels
        }
    except Exception as e:
        print(f"Error in preprocessing: {str(e)}")
        return None

def safe_preprocess_dataset(dataset):
    processed_dataset = []
    for item in tqdm(dataset, desc="Preprocessing"):
        try:
            processed_item = preprocess(item)
            if processed_item is not None:
                processed_dataset.append(processed_item)
        except Exception as e:
            print(f"Skipping item due to error: {str(e)}")
            continue
    return processed_dataset

In [None]:
def collate_fn(batch):
    try:
        # Filter out None values if any
        batch = [b for b in batch if b is not None]
        if not batch:
            raise ValueError("Empty batch after filtering")

        # Stack the tensors
        pixel_values = torch.stack([item['pixel_values'] for item in batch])
        labels = torch.stack([item['labels'] for item in batch])

        return {
            'pixel_values': pixel_values.to(device),
            'labels': labels.to(device)
        }
    except Exception as e:
        print(f"Error in collate_fn: {str(e)}")
        return None

In [None]:
print("Preprocessing training dataset...")
train_dataset = safe_preprocess_dataset(train_ds)
print("Preprocessing validation dataset...")
valid_dataset = safe_preprocess_dataset(valid_ds)
print("Preprocessing test dataset...")
test_dataset = safe_preprocess_dataset(test_ds)

Preprocessing training dataset...


NameError: name 'safe_preprocess_dataset' is not defined

In [None]:
batch_size = 16
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
    #num_workers=2
)
valid_dataloader = DataLoader(
    valid_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
    #num_workers=2
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
    #num_workers=2
)

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Load metrics
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

In [22]:
num_epochs = 10
best_val_loss = float('inf')

for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0

    print(f"\nEpoch {epoch+1}/{num_epochs}")
    for batch_idx, batch in enumerate(tqdm(train_dataloader)):
        try:
            if batch is None:
                continue

            outputs = model(pixel_values=batch['pixel_values'], labels=batch['labels'])
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_train_loss += loss.item()

            # Validate every 100 batches
            if (batch_idx + 1) % 100 == 0:
                model.eval()
                val_loss = 0
                with torch.no_grad():
                    for val_batch in valid_dataloader:
                        if val_batch is None:
                            continue
                        val_outputs = model(pixel_values=val_batch['pixel_values'],
                                         labels=val_batch['labels'])
                        val_loss += val_outputs.loss.item()

                avg_val_loss = val_loss / len(valid_dataloader)
                print(f"\nStep {batch_idx+1}: Validation Loss = {avg_val_loss:.4f}")

                if avg_val_loss < best_val_loss:
                    best_val_loss = avg_val_loss
                    model.save_pretrained(f"best_model_epoch_{epoch+1}_batch_{batch_idx+1}")

                model.train()

        except Exception as e:
            print(f"Error in batch {batch_idx}: {str(e)}")
            continue

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"\nEpoch {epoch+1} - Average Training Loss: {avg_train_loss:.4f}")


Epoch 1/10


 16%|█▌        | 99/625 [01:04<05:21,  1.64it/s]


Step 100: Validation Loss = 0.9284


 32%|███▏      | 199/625 [02:34<04:28,  1.59it/s]  


Step 200: Validation Loss = 0.8960


 48%|████▊     | 299/625 [04:04<03:26,  1.58it/s]  


Step 300: Validation Loss = 0.8753


 64%|██████▍   | 399/625 [05:34<02:23,  1.57it/s]


Step 400: Validation Loss = 0.8614


 80%|███████▉  | 499/625 [07:05<01:20,  1.57it/s]


Step 500: Validation Loss = 0.8495


 96%|█████████▌| 599/625 [08:35<00:16,  1.57it/s]


Step 600: Validation Loss = 0.8390


100%|██████████| 625/625 [09:18<00:00,  1.12it/s]



Epoch 1 - Average Training Loss: 0.9863

Epoch 2/10


 16%|█▌        | 99/625 [01:02<05:35,  1.57it/s]


Step 100: Validation Loss = 0.8320


 32%|███▏      | 199/625 [02:33<04:30,  1.58it/s]  


Step 200: Validation Loss = 0.8261


 48%|████▊     | 299/625 [04:03<03:27,  1.57it/s]  


Step 300: Validation Loss = 0.8195


 64%|██████▍   | 399/625 [05:33<02:23,  1.57it/s]


Step 400: Validation Loss = 0.8138


 80%|███████▉  | 499/625 [07:02<01:19,  1.58it/s]


Step 500: Validation Loss = 0.8099


 96%|█████████▌| 599/625 [08:32<00:16,  1.57it/s]


Step 600: Validation Loss = 0.8054


100%|██████████| 625/625 [09:15<00:00,  1.13it/s]



Epoch 2 - Average Training Loss: 0.8565

Epoch 3/10


 16%|█▌        | 99/625 [01:02<05:32,  1.58it/s]


Step 100: Validation Loss = 0.8031


 32%|███▏      | 199/625 [02:32<04:30,  1.58it/s]  


Step 200: Validation Loss = 0.8011


 48%|████▊     | 299/625 [04:01<03:27,  1.57it/s]  


Step 300: Validation Loss = 0.7992


 64%|██████▍   | 399/625 [05:31<02:23,  1.58it/s]


Step 400: Validation Loss = 0.7931


 80%|███████▉  | 499/625 [07:01<01:19,  1.58it/s]


Step 500: Validation Loss = 0.7900


 96%|█████████▌| 599/625 [08:30<00:16,  1.57it/s]


Step 600: Validation Loss = 0.7882


100%|██████████| 625/625 [09:13<00:00,  1.13it/s]



Epoch 3 - Average Training Loss: 0.8039

Epoch 4/10


 16%|█▌        | 100/625 [01:28<1:11:54,  8.22s/it]


Step 100: Validation Loss = 0.7882


 32%|███▏      | 199/625 [02:31<04:30,  1.57it/s]  


Step 200: Validation Loss = 0.7868


 48%|████▊     | 299/625 [04:00<03:28,  1.56it/s]  


Step 300: Validation Loss = 0.7844


 64%|██████▍   | 399/625 [05:31<02:25,  1.55it/s]


Step 400: Validation Loss = 0.7829


 80%|███████▉  | 499/625 [07:02<01:20,  1.57it/s]


Step 500: Validation Loss = 0.7809


 96%|█████████▌| 600/625 [08:59<03:27,  8.31s/it]


Step 600: Validation Loss = 0.7818


100%|██████████| 625/625 [09:15<00:00,  1.13it/s]



Epoch 4 - Average Training Loss: 0.7626

Epoch 5/10


 16%|█▌        | 100/625 [01:29<1:12:47,  8.32s/it]


Step 100: Validation Loss = 0.7826


 32%|███▏      | 200/625 [02:59<58:47,  8.30s/it]  


Step 200: Validation Loss = 0.7816


 48%|████▊     | 300/625 [04:28<44:55,  8.30s/it]


Step 300: Validation Loss = 0.7817


 64%|██████▍   | 399/625 [05:31<02:24,  1.57it/s]


Step 400: Validation Loss = 0.7761


 80%|███████▉  | 499/625 [07:03<01:21,  1.55it/s]


Step 500: Validation Loss = 0.7758


 96%|█████████▌| 600/625 [08:59<03:27,  8.29s/it]


Step 600: Validation Loss = 0.7764


100%|██████████| 625/625 [09:15<00:00,  1.13it/s]



Epoch 5 - Average Training Loss: 0.7264

Epoch 6/10


 16%|█▌        | 100/625 [01:29<1:12:42,  8.31s/it]


Step 100: Validation Loss = 0.7814


 32%|███▏      | 200/625 [02:58<58:45,  8.29s/it]  


Step 200: Validation Loss = 0.7818


 48%|████▊     | 300/625 [04:28<44:57,  8.30s/it]


Step 300: Validation Loss = 0.7789


 64%|██████▍   | 400/625 [05:57<31:05,  8.29s/it]


Step 400: Validation Loss = 0.7763


 80%|███████▉  | 499/625 [07:00<01:20,  1.56it/s]


Step 500: Validation Loss = 0.7744


 96%|█████████▌| 600/625 [08:57<03:27,  8.30s/it]


Step 600: Validation Loss = 0.7768


100%|██████████| 625/625 [09:13<00:00,  1.13it/s]



Epoch 6 - Average Training Loss: 0.6921

Epoch 7/10


 16%|█▌        | 100/625 [01:29<1:12:35,  8.30s/it]


Step 100: Validation Loss = 0.7836


 32%|███▏      | 200/625 [02:58<58:45,  8.30s/it]  


Step 200: Validation Loss = 0.7838


 48%|████▊     | 300/625 [04:28<45:02,  8.32s/it]


Step 300: Validation Loss = 0.7857


 64%|██████▍   | 400/625 [05:57<31:09,  8.31s/it]


Step 400: Validation Loss = 0.7804


 80%|████████  | 500/625 [07:26<17:17,  8.30s/it]


Step 500: Validation Loss = 0.7801


 96%|█████████▌| 600/625 [08:56<03:27,  8.29s/it]


Step 600: Validation Loss = 0.7793


100%|██████████| 625/625 [09:12<00:00,  1.13it/s]



Epoch 7 - Average Training Loss: 0.6601

Epoch 8/10


 16%|█▌        | 100/625 [01:29<1:12:38,  8.30s/it]


Step 100: Validation Loss = 0.7885


 32%|███▏      | 200/625 [02:58<58:46,  8.30s/it]  


Step 200: Validation Loss = 0.7902


 48%|████▊     | 300/625 [04:27<44:59,  8.31s/it]


Step 300: Validation Loss = 0.7885


 64%|██████▍   | 400/625 [05:57<31:04,  8.28s/it]


Step 400: Validation Loss = 0.7858


 80%|████████  | 500/625 [07:26<17:17,  8.30s/it]


Step 500: Validation Loss = 0.7869


 96%|█████████▌| 600/625 [08:55<03:27,  8.30s/it]


Step 600: Validation Loss = 0.7860


100%|██████████| 625/625 [09:11<00:00,  1.13it/s]



Epoch 8 - Average Training Loss: 0.6284

Epoch 9/10


 16%|█▌        | 100/625 [01:29<1:12:48,  8.32s/it]


Step 100: Validation Loss = 0.8024


 32%|███▏      | 200/625 [02:59<58:50,  8.31s/it]  


Step 200: Validation Loss = 0.7993


 48%|████▊     | 300/625 [04:28<44:56,  8.30s/it]


Step 300: Validation Loss = 0.8018


 64%|██████▍   | 400/625 [05:57<31:05,  8.29s/it]


Step 400: Validation Loss = 0.7994


 80%|████████  | 500/625 [07:27<17:16,  8.29s/it]


Step 500: Validation Loss = 0.7972


 96%|█████████▌| 600/625 [08:56<03:27,  8.30s/it]


Step 600: Validation Loss = 0.8004


100%|██████████| 625/625 [09:12<00:00,  1.13it/s]



Epoch 9 - Average Training Loss: 0.5979

Epoch 10/10


 16%|█▌        | 100/625 [01:29<1:12:46,  8.32s/it]


Step 100: Validation Loss = 0.8175


 32%|███▏      | 200/625 [02:58<58:46,  8.30s/it]  


Step 200: Validation Loss = 0.8166


 48%|████▊     | 300/625 [04:28<44:55,  8.29s/it]


Step 300: Validation Loss = 0.8146


 64%|██████▍   | 400/625 [05:57<31:03,  8.28s/it]


Step 400: Validation Loss = 0.8061


 80%|████████  | 500/625 [07:26<17:19,  8.31s/it]


Step 500: Validation Loss = 0.8117


 96%|█████████▌| 600/625 [08:56<03:27,  8.32s/it]


Step 600: Validation Loss = 0.8090


100%|██████████| 625/625 [09:12<00:00,  1.13it/s]


Epoch 10 - Average Training Loss: 0.5677





In [24]:
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'config': model.config
}, 'D:/Dataset_ROCO/ROCO/model.pth')


### model1 after loading new model

In [25]:
checkpoint = torch.load('D:/Dataset_ROCO/ROCO/model.pth')
model1 = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_model, decoder_model
).to(device)
model1.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

  checkpoint = torch.load('D:/Dataset_ROCO/ROCO/model.pth')
Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.crossattention.c_attn.bias', 'h.0.crossattention.c_attn.weight', 'h.0.crossattention.c_proj.bias', 'h.0.crossattention.c_proj.weight', 'h.0.crossattention.q_attn.bias', 'h.0.crossattention.q_attn.weight', 'h.0.ln_cross_attn.bias', 'h.0.ln_cross_attn.weight', 'h.1.crossattention.c_attn.bias', 'h.1.crossattention.c_attn.weight', 'h.1.crossattention.c_proj.bias', 'h.1.crossattention.c_proj.weight', 'h.1.crossattention.q_attn.bias', 'h.1.crossattention.q_attn.weight', 'h.1.ln_cross_attn.bias', 'h.1.ln_cross_attn.weight', 'h.10.crossattention.c_attn.bias', 'h.10.crossattention.c_attn.weight', 'h.10.crossattention.c_proj.bias', 'h.10.crossattention.c_proj.weight', 'h.10.crossattention.q_attn.bias', 'h.10.crossattention.q_attn.weight', 'h.10.ln_cross_attn.bias', 'h.10.ln_cross_attn.weight', 'h.11.crossattention.c_at

In [26]:
if "gpt2" in decoder_model:
    tokenizer.pad_token = tokenizer.eos_token
    model1.config.eos_token_id = tokenizer.eos_token_id
    model1.config.pad_token_id = tokenizer.pad_token_id
    model1.config.decoder_start_token_id = tokenizer.bos_token_id

# Make sure the model is in the correct device
model1 = model1.to(device)

In [29]:
optimizer = torch.optim.Adagrad(model.parameters(), lr=0.01)


In [30]:
num_epochs = 5
best_val_loss = float('inf')

for epoch in range(num_epochs):
    model1.train()
    total_train_loss = 0

    print(f"\nEpoch {epoch+1}/{num_epochs}")
    for batch_idx, batch in enumerate(tqdm(train_dataloader)):
        try:
            if batch is None:
                continue

            outputs = model1(pixel_values=batch['pixel_values'], labels=batch['labels'])
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_train_loss += loss.item()

            # Validate every 100 batches
            if (batch_idx + 1) % 100 == 0:
                model1.eval()
                val_loss = 0
                with torch.no_grad():
                    for val_batch in valid_dataloader:
                        if val_batch is None:
                            continue
                        val_outputs = model1(pixel_values=val_batch['pixel_values'],
                                         labels=val_batch['labels'])
                        val_loss += val_outputs.loss.item()

                avg_val_loss = val_loss / len(valid_dataloader)
                print(f"\nStep {batch_idx+1}: Validation Loss = {avg_val_loss:.4f}")

                if avg_val_loss < best_val_loss:
                    best_val_loss = avg_val_loss
                    model1.save_pretrained(f"best_model1_epoch_{epoch+1}_batch_{batch_idx+1}")

                model1.train()

        except Exception as e:
            print(f"Error in batch {batch_idx}: {str(e)}")
            continue

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"\nEpoch {epoch+1} - Average Training Loss: {avg_train_loss:.4f}")


Epoch 1/5


  0%|          | 2/625 [00:00<01:57,  5.30it/s]

Error in batch 0: CUDA out of memory. Tried to allocate 24.00 MiB. GPU 0 has a total capacity of 15.99 GiB of which 104.50 MiB is free. Of the allocated memory 14.17 GiB is allocated by PyTorch, and 501.34 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Error in batch 1: CUDA out of memory. Tried to allocate 24.00 MiB. GPU 0 has a total capacity of 15.99 GiB of which 146.07 MiB is free. Of the allocated memory 14.14 GiB is allocated by PyTorch, and 529.60 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


  1%|          | 4/625 [00:00<01:31,  6.79it/s]

Error in batch 2: CUDA out of memory. Tried to allocate 24.00 MiB. GPU 0 has a total capacity of 15.99 GiB of which 92.24 MiB is free. Of the allocated memory 14.14 GiB is allocated by PyTorch, and 528.25 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Error in batch 3: CUDA out of memory. Tried to allocate 18.00 MiB. GPU 0 has a total capacity of 15.99 GiB of which 175.05 MiB is free. Of the allocated memory 14.06 GiB is allocated by PyTorch, and 601.69 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


  1%|          | 5/625 [00:00<01:23,  7.44it/s]

: 

In [27]:
def evaluate_model(model, dataloader, split="test"):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []

    print(f"\nEvaluating on {split} set...")
    with torch.no_grad():
        for batch in tqdm(dataloader):
            try:
                if batch is None:
                    continue

                outputs = model.generate(
                    batch['pixel_values'],
                    max_length=max_length,
                    num_beams=4,
                    early_stopping=True
                )

                # Compute loss
                model_outputs = model(pixel_values=batch['pixel_values'],
                                   labels=batch['labels'])
                total_loss += model_outputs.loss.item()

                # Store predictions and labels
                all_preds.extend(outputs.cpu().numpy())
                all_labels.extend(batch['labels'].cpu().numpy())

            except Exception as e:
                print(f"Error in evaluation: {str(e)}")
                continue

    # Decode predictions and compute metrics
    pred_str = tokenizer.batch_decode(all_preds, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(all_labels, skip_special_tokens=True)

    rouge_scores = rouge.compute(predictions=pred_str, references=label_str)
    bleu_score = bleu.compute(predictions=pred_str, references=label_str)

    metrics = {
        f"{split}_loss": total_loss / len(dataloader),
        f"{split}_rouge1": rouge_scores['rouge1'],
        f"{split}_rouge2": rouge_scores['rouge2'],
        f"{split}_rougeL": rouge_scores['rougeL'],
        f"{split}_bleu": bleu_score['bleu']
    }

    return metrics, pred_str

# Evaluate on test set
test_metrics, generated_captions = evaluate_model(model, test_dataloader)
print("\nTest Results:")
for metric, value in test_metrics.items():
    print(f"{metric}: {value:.4f}")

# Save some example predictions
print("\nSample Generated Captions:")
for i in range(min(5, len(generated_captions))):
    print(f"Caption {i+1}: {generated_captions[i]}")


Evaluating on test set...


  0%|          | 0/125 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  1%|          | 1/125 [00:01<03:31,  1.71s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  2%|▏         | 2/125 [00:02<02:52,  1.40s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  2%|▏         | 3/125 [00:04<02:40,  1.32s/it]The attention mask and the pad token id were not set. As a consequence


Test Results:
test_loss: 0.8623
test_rouge1: 0.1587
test_rouge2: 0.0476
test_rougeL: 0.1409
test_bleu: 0.0156

Sample Generated Captions:
Caption 1:  Computed tomography (CT) scan of the abdomen showing a large mass in the right adrenal gland.

Caption 2:  Computed tomography (CT) scan of the abdomen showing a large mass in the right adrenal gland.

Caption 3:  Contrast-enhanced computed tomography (CT) scan of the abdomen and pelvis shows a large heterogeneously enhancing mass in the right adrenal gland (arrow).

Caption 4:  Chest X-ray showing bilateral pleural effusions.

Caption 5:  Chest X-ray of the patient showing a large right-sided pleural effusion

