In [1]:
%load_ext autoreload
%autoreload 2

In [4]:
!pip install -r requirements.txt

Collecting clip@ git+https://github.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1 (from -r requirements.txt (line 18))
  Cloning https://github.com/openai/CLIP.git (to revision dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1) to /tmp/pip-install-6wtv59y7/clip_df97596d93b242949454a234f5151ba5
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-install-6wtv59y7/clip_df97596d93b242949454a234f5151ba5
  Running command git rev-parse -q --verify 'sha^dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1'
  Running command git fetch -q https://github.com/openai/CLIP.git dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting absl-py==2.2.2 (from -r requirements.txt (line 1))
  Downloading absl_py-2.2.2-py3-none-any.whl.metadata (2.6 kB)
Collecting accelerate==1.6.0 (from -r requirements.txt (line 2))
  Downl

In [1]:
!apt-get update -y
!apt-get install -y openjdk-17-jre-headless

Get:1 http://archive.ubuntu.com/ubuntu jammy InRelease [270 kB]
Get:2 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]      
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1581 B]
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1804 kB]
Get:5 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]        
Get:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy/main amd64 Packages [33.2 kB]
Get:9 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [3098 kB]
Get:10 http://archive.ubuntu.com/ubuntu jammy/universe amd64 Packages [17.5 MB]
Get:11 http://security.ubuntu.com/ubuntu jammy-security/multiverse amd64 Packages [48.5 kB]
Get:12 http://security.ubuntu.com/ubuntu jammy-security/res

In [4]:
import os
import time
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm
from transformers import get_scheduler
from torch.optim.lr_scheduler import ReduceLROnPlateau
from custom_tokenizers import Tokenizer
from configs.config import DataConfig, EncoderConfig, DecoderConfig, PaliGemmaConfig
from decoder_layers import KVCache, PaliGemmaForConditionalGeneration
from dataloaders import CustomDataLoader
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

2025-07-03 06:19:53.310670: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-03 06:19:53.321143: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751523593.334046    2631 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751523593.338048    2631 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1751523593.348615    2631 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [5]:
def train(
    model,
    dataloader,
    optimizer,
    device,
    epoch,
    grad_accumulation_steps=1,
    max_grad_norm=1.0,
    use_amp=False,
):
    model.train()
    kv_cache = KVCache()
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
    config = DecoderConfig()

    total_loss = 0.0

    for step, batch in enumerate(tqdm(dataloader, desc=f"Epoch {epoch}")):
        input_ids, pixel_values, attention_mask = batch
        input_ids = input_ids.to(device)
        pixel_values = pixel_values.to(device)
        attention_mask = attention_mask.to(device)
        with torch.cuda.amp.autocast(enabled=use_amp):
            outputs = model(
                input_ids=input_ids,
                pixel_values=pixel_values,
                attention_mask=attention_mask
            )
            
            logits = outputs["logits"]
            shift_logits = logits[:, :-1, :].contiguous()
            shift_labels = input_ids[:, 1:].contiguous()
            
            loss = torch.nn.functional.cross_entropy(
                shift_logits.view(-1, config.vocab_size),
                shift_labels.view(-1),
                label_smoothing=0.1
            )

        scaler.scale(loss).backward()

        if (step + 1) % grad_accumulation_steps == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        total_loss += loss.item()

    return total_loss / len(dataloader)

@torch.no_grad()
def validate(model, dataloader, device, use_amp=False):
    model.eval()
    total_loss = 0.0
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
    config = DecoderConfig()

    for batch in tqdm(dataloader, desc="Validating"):
        input_ids, pixel_values, attention_mask = batch
        input_ids = input_ids.to(device)
        pixel_values = pixel_values.to(device)
        attention_mask = attention_mask.to(device)

        with torch.cuda.amp.autocast(enabled=use_amp):
            outputs = model(
                input_ids=input_ids,
                pixel_values=pixel_values,
                attention_mask=attention_mask
            )
            logits = outputs["logits"]  # shape: [B, T, V]
            shift_logits = logits[:, :-1, :].contiguous()
            shift_labels = input_ids[:, 1:].contiguous()
            
            loss = torch.nn.functional.cross_entropy(
                shift_logits.view(-1, config.vocab_size),
                shift_labels.view(-1),
                label_smoothing=0.1
            )

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [6]:
def main():
    # Hyperparameters
    epochs = 10
    batch_size = 64
    learning_rate = 3e-5
    grad_accumulation_steps = 1
    use_amp = True
    patience = 30

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print("Using device:", device)

    # === Load Config ===
    text_config = DecoderConfig()
    vision_config = EncoderConfig()
    text_config.vocab_size += 1
    image_token_index = text_config.vocab_size - 1
    config = PaliGemmaConfig(
        text_config=text_config,
        vision_config=vision_config,
        image_token_index=image_token_index,
    )

    # === Load Model ===
    model = PaliGemmaForConditionalGeneration(config).to(device)

    # Tokenizer
    tokenizer = Tokenizer(DataConfig())

    # === Dataloader ===
    train_loader = CustomDataLoader(
        split="train",
        batch_size=batch_size,
        num_workers=1,
        tokenizer=tokenizer,
        shuffle=True
    )
    val_loader = CustomDataLoader(
        split="val",
        batch_size=batch_size,
        num_workers=1,
        tokenizer=tokenizer,
        shuffle=False
    )

    # === Optimizer and Scheduler ===
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    num_training_steps = epochs * len(train_loader) // grad_accumulation_steps
    # lr_scheduler = get_scheduler(
    #     "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
    # )
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, threshold=0.0001, threshold_mode='abs')

    best_val_loss = float("inf")
    patience_counter = 0

    # === Training Loop ===
    start_time = time.time()
    train_loss = []
    val_loss = []
    for epoch in range(1, epochs + 1):
        avg_train_loss = train(
            model,
            train_loader,
            optimizer,
            # lr_scheduler,
            device,
            epoch,
            grad_accumulation_steps,
            use_amp=use_amp,
        )
        avg_val_loss = validate(model, val_loader, device, use_amp)
        lr_scheduler.step(avg_val_loss)
        train_loss.append(avg_train_loss)
        val_loss.append(avg_val_loss)
        print(f"Epoch {epoch} - Train Loss: {avg_train_loss:.4f} - Val Loss: {avg_val_loss:.4f}")

        # Save best model
        if avg_val_loss < best_val_loss:
            print("Validation loss improved. Saving model...")
            best_val_loss = avg_val_loss
            patience_counter = 0
            os.makedirs("checkpoints", exist_ok=True)
            torch.save(model.state_dict(), f"checkpoints/experiment_6.pt")
        else:
            patience_counter += 1
            print(f"No improvement. Patience: {patience_counter}/{patience}")
            if patience_counter >= patience:
                print("Early stopping triggered.")
                break
    save_dir = "plot"
    os.makedirs(save_dir, exist_ok=True)

    # Plotting
    epochs = list(range(1, len(train_loss) + 1))
    plt.figure(figsize=(8, 6))
    plt.plot(epochs, train_loss, label='Train Loss', marker='o')
    plt.plot(epochs, val_loss, label='Validation Loss', marker='x')
    plt.title("Training and Validation Loss per Epoch")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.grid(True)

    # Save the plot
    plot_path = os.path.join(save_dir, "experiment_6.png")
    plt.savefig(plot_path)
    plt.close()

    # Calculate elapsed time
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Elapsed time: {elapsed_time} seconds")

In [7]:
if __name__ == "__main__":
    main()

Using device: cuda


Epoch 1: 100%|██████████| 33/33 [00:35<00:00,  1.06s/it]
Validating: 100%|██████████| 5/5 [00:05<00:00,  1.04s/it]


Epoch 1 - Train Loss: 3.8154 - Val Loss: 2.3447
Validation loss improved. Saving model...


Epoch 2: 100%|██████████| 33/33 [00:34<00:00,  1.04s/it]
Validating: 100%|██████████| 5/5 [00:05<00:00,  1.02s/it]


Epoch 2 - Train Loss: 2.0775 - Val Loss: 1.9575
Validation loss improved. Saving model...


Epoch 3: 100%|██████████| 33/33 [00:34<00:00,  1.05s/it]
Validating: 100%|██████████| 5/5 [00:05<00:00,  1.03s/it]


Epoch 3 - Train Loss: 1.8767 - Val Loss: 1.7503
Validation loss improved. Saving model...


Epoch 4: 100%|██████████| 33/33 [00:34<00:00,  1.05s/it]
Validating: 100%|██████████| 5/5 [00:05<00:00,  1.07s/it]


Epoch 4 - Train Loss: 1.6936 - Val Loss: 1.6201
Validation loss improved. Saving model...


Epoch 5: 100%|██████████| 33/33 [00:34<00:00,  1.05s/it]
Validating: 100%|██████████| 5/5 [00:05<00:00,  1.03s/it]


Epoch 5 - Train Loss: 1.5928 - Val Loss: 1.5533
Validation loss improved. Saving model...


Epoch 6: 100%|██████████| 33/33 [00:34<00:00,  1.03s/it]
Validating: 100%|██████████| 5/5 [00:05<00:00,  1.12s/it]


Epoch 6 - Train Loss: 1.5428 - Val Loss: 1.5148
Validation loss improved. Saving model...


Epoch 7: 100%|██████████| 33/33 [00:34<00:00,  1.04s/it]
Validating: 100%|██████████| 5/5 [00:05<00:00,  1.03s/it]


Epoch 7 - Train Loss: 1.5102 - Val Loss: 1.4876
Validation loss improved. Saving model...


Epoch 8: 100%|██████████| 33/33 [00:34<00:00,  1.04s/it]
Validating: 100%|██████████| 5/5 [00:05<00:00,  1.07s/it]


Epoch 8 - Train Loss: 1.4866 - Val Loss: 1.4668
Validation loss improved. Saving model...


Epoch 9: 100%|██████████| 33/33 [00:34<00:00,  1.04s/it]
Validating: 100%|██████████| 5/5 [00:05<00:00,  1.03s/it]


Epoch 9 - Train Loss: 1.4655 - Val Loss: 1.4497
Validation loss improved. Saving model...


Epoch 10: 100%|██████████| 33/33 [00:33<00:00,  1.02s/it]
Validating: 100%|██████████| 5/5 [00:05<00:00,  1.05s/it]


Epoch 10 - Train Loss: 1.4489 - Val Loss: 1.4359
Validation loss improved. Saving model...
Elapsed time: 416.77386713027954 seconds


In [7]:
from PIL import Image
import clip
import torch
from torch.nn import functional as F
from tqdm import tqdm
import argparse
from dataloaders import CustomDataLoader
from custom_tokenizers import Tokenizer
from types import SimpleNamespace
from decoder_layers import KVCache, PaliGemmaForConditionalGeneration
from configs.config import DataConfig, EncoderConfig, DecoderConfig, PaliGemmaConfig
from metrics import compute_scores

In [8]:
import torch.nn.functional as F

def greedy_generate(
    model,
    tokenizer,
    pixel_values,        # shape: (B, 2, C, H, W)
    image_token_index,   # placeholder index for image tokens
    max_length=60,
    eos_token_id=2,
    pad_token_id=0,
    device="cuda",
):
    model.eval()
    batch_size = pixel_values.size(0)
    eos_token_id = eos_token_id or tokenizer.eos_token_id

    # Encode images
    B, N, C, H, W = pixel_values.shape
    pixel_values = pixel_values.view(B * N, C, H, W).to(device)
    pixel_values = pixel_values.to(dtype=next(model.vision_tower.parameters()).dtype, device=device)
    vision_features = model.vision_tower(pixel_values)
    vision_features = vision_features.view(B, N, *vision_features.shape[1:])
    image_features = torch.cat([vision_features[:, 0], vision_features[:, 1]], dim=1)
    image_features = model.multi_modal_projector(image_features)  # shape: [B, Seq, Hidden]

    # Initialize sequence
    input_ids = torch.full((batch_size, 1), image_token_index, dtype=torch.long, device=device)
    attention_mask = torch.ones_like(input_ids)

    # Initialize done mask
    is_done = torch.zeros(batch_size, dtype=torch.bool, device=device)

    for _ in range(max_length):
        # Embedding
        input_embeds = model.language_model.get_input_embeddings()(input_ids)

        # Merge image and text
        merged_input, attn_mask, pos_ids = model._merge_input_ids_with_image_features(
            input_ids=input_ids,
            image_features=image_features,
            inputs_embeds=input_embeds,
            attention_mask=attention_mask,
            kv_cache=None
        )

        # Forward pass
        outputs = model.language_model(
            inputs_embeds=merged_input,
            attention_mask=attn_mask,
            position_ids=pos_ids,
        )

        logits = outputs["logits"]
        next_token_logits = logits[:, -1, :]  # Last token logits
        next_token = torch.argmax(next_token_logits, dim=-1).unsqueeze(1)  # Greedy pick

        input_ids = torch.cat([input_ids, next_token], dim=1)
        attention_mask = torch.cat([attention_mask, torch.ones_like(next_token)], dim=1)

        # Mark sequences that have ended
        is_done = is_done | (next_token.squeeze(1) == eos_token_id)
        if is_done.all():
            break

    return input_ids

def beam_search_generate(
    model,
    tokenizer,
    pixel_values,        # shape: (B, 2, C, H, W)
    image_token_index,   # placeholder index for image tokens
    max_length=60,
    num_beams=5,
    eos_token_id=2,
    pad_token_id=0,
    device="cuda",
):
    model.eval()
    batch_size = pixel_values.size(0)
    eos_token_id = eos_token_id or tokenizer.eos_token_id

    # Encode images
    B, N, C, H, W = pixel_values.shape
    pixel_values = pixel_values.view(B * N, C, H, W).to(device)
    pixel_values = pixel_values.to(dtype=next(model.vision_tower.parameters()).dtype, device="cuda")
    vision_features = model.vision_tower(pixel_values)
    vision_features = vision_features.view(B, N, *vision_features.shape[1:])
    image_features = torch.cat([vision_features[:, 0], vision_features[:, 1]], dim=1)
    image_features = model.multi_modal_projector(image_features)  # shape: [B, Seq, Hidden]

    # Init input_ids with image token index
    input_ids = torch.full((batch_size * num_beams, 1), image_token_index, dtype=torch.long, device=device)
    attention_mask = torch.ones_like(input_ids, device=device)

    # Expand inputs for each beam
    image_features = image_features.unsqueeze(1).repeat(1, num_beams, 1, 1)
    image_features = image_features.view(batch_size * num_beams, *image_features.shape[2:])

    beam_scores = torch.zeros((batch_size, num_beams), device=device)
    beam_scores[:, 1:] = -1e9  # mask beams other than first
    beam_scores = beam_scores.view(-1)  # shape: [B * num_beams]

    sequences = input_ids
    is_done = [False] * batch_size

    for step in range(max_length):
        # Embedding
        input_embeds = model.language_model.get_input_embeddings()(sequences)
        
        # Merge image + text features
        merged_input, attn_mask, pos_ids = model._merge_input_ids_with_image_features(
            input_ids = sequences, image_features = image_features, inputs_embeds = input_embeds, attention_mask = attention_mask, kv_cache=None
        )

        # Forward pass
        outputs = model.language_model(
            inputs_embeds=merged_input,
            attention_mask=attn_mask,
            position_ids=pos_ids,
        )
        logits = outputs["logits"]  # shape: [B * num_beams, Seq_len, Vocab]
        next_token_logits = logits[:, -1, :]  # take last token only
        next_token_log_probs = F.log_softmax(next_token_logits, dim=-1)
        # next_token_log_probs[:, image_token_index] = -1e9


        # Add current beam scores
        next_token_log_probs = next_token_log_probs + beam_scores[:, None]

        # Get top k * num_beams candidates
        vocab_size = next_token_log_probs.size(-1)
        next_token_log_probs = next_token_log_probs.view(batch_size, num_beams * vocab_size)
        topk_log_probs, topk_indices = torch.topk(next_token_log_probs, num_beams, dim=-1)

        # Prepare for next step
        beam_indices = topk_indices // vocab_size
        token_indices = topk_indices % vocab_size

        # Reorder sequences and image features
        sequences = sequences.view(batch_size, num_beams, -1)
        new_sequences = []
        for i in range(batch_size):
            new_sequences.append(sequences[i, beam_indices[i]])
        sequences = torch.stack(new_sequences).view(batch_size * num_beams, -1)
        sequences = torch.cat([sequences, token_indices.view(-1, 1)], dim=-1)

        # Update scores
        beam_scores = topk_log_probs.view(-1)

        # Update attention mask
        attention_mask = torch.cat([attention_mask, torch.ones_like(token_indices.view(-1, 1))], dim=1)

        # Check if all sequences have ended
        if eos_token_id is not None:
            for i in range(batch_size):
                done_for_beam = True
                for beam_id in range(num_beams):
                    token = sequences[i * num_beams + beam_id, -1]
                    if token != eos_token_id:
                        done_for_beam = False
                        break
                is_done[i] = done_for_beam
        
            if all(is_done):
                break

    # Reshape to [batch_size, num_beams, seq_len] and pick best beam
    sequences = sequences.view(batch_size, num_beams, -1)
    beam_scores = beam_scores.view(batch_size, num_beams)
    best_indices = torch.argmax(beam_scores, dim=1)

    best_sequences = []
    for i in range(batch_size):
        best_sequences.append(sequences[i, best_indices[i]])
    best_sequences = torch.stack(best_sequences)

    return best_sequences


def evaluate_model(model, tokenizer, dataloader, device, image_token_index, decoding, max_len=60, num_beams=5):
    model.eval()
    gts = {}
    res = {}

    with torch.no_grad():
        for i, batch in enumerate(tqdm(dataloader, desc="Evaluating")):
            input_ids, pixel_values, att_masks = batch
            pixel_values = pixel_values.to(device)

            if decoding == "greedy":
                print("generate with greedy")
                generated_ids = greedy_generate(
                    model,
                    tokenizer,
                    pixel_values=pixel_values,
                    image_token_index=image_token_index,
                    eos_token_id=tokenizer.eos_token_id,
                    device=device,
                    max_length=max_len,
                )
            else:
                print("generate with beam")
                generated_ids = beam_search_generate(
                    model,
                    tokenizer,
                    pixel_values=pixel_values,
                    image_token_index=image_token_index,
                    eos_token_id=tokenizer.eos_token_id,
                    device=device,
                    num_beams=num_beams,
                    max_length=max_len,
                )

            # Decode predictions
            decoded_preds = tokenizer.decode_batch(generated_ids)
            decoded_preds = [pred.strip().lower() for pred in decoded_preds]

            # Decode references
            references = input_ids
            if isinstance(references[0], torch.Tensor):
                references = [tokenizer.decode(ref).replace("<image>", "").strip().lower() for ref in references]

            batch_size = pixel_values.size(0)
            for j in range(batch_size):
                image_id = f"img_{i * batch_size + j}"
                gts[image_id] = [references[j]]  # list of refs
                res[image_id] = [decoded_preds[j]]  # model output
            for i in range(1):
                print(f"Pred: {decoded_preds[i]}")
                print(f"Ref: {references[i]}")
                print()

    scores = compute_scores(gts, res)

    # Optional print
    for metric, score in scores.items():
        print(f"{metric.upper()}: {score:.4f}")

    return scores, gts, res

In [9]:
config = PaliGemmaConfig(text_config=DecoderConfig(), vision_config=EncoderConfig())
model = PaliGemmaForConditionalGeneration(config)
model.load_state_dict(torch.load("checkpoints/experiment_2.pt", map_location="cuda"))
model.eval()

tokenizer = Tokenizer(DataConfig())

device = "cuda" if torch.cuda.is_available() else "cpu"
model_clip, preprocess = clip.load("ViT-B/32", device=device)
model = model.to(device)

tokenizer = Tokenizer(DataConfig())

test_loader = CustomDataLoader(
    split="test",
    batch_size=64,
    num_workers=1,
    tokenizer=tokenizer,
    shuffle=True
)

scores, gts, res = evaluate_model(model, tokenizer, test_loader, device, image_token_index=763, decoding="greedy")
scores

100%|████████████████████████████████████████| 338M/338M [00:02<00:00, 167MiB/s]
Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]

generate with greedy


Evaluating:  10%|█         | 1/10 [00:03<00:28,  3.21s/it]

Pred: <image> 5th acute bony structures reveal masses replacement exam most compatible with tip projects over the cardiac silhouette is a small t-spine osteophytes changes throughout both lungs are clear bilaterally more this could represent sequelae <bos> frontal view reveals granulomas throughout both lungs are clear bilaterally more this could represent sequelae <bos> frontal view reveals granulomas throughout both lungs are clear
Ref: <bos> unchanged cardiomegaly . there is <unk> interstitial prominence bilaterally . unchanged vascular appearance . there is patchy retrocardiac opacity . negative for pneumothorax . <eos>

generate with greedy


Evaluating:  20%|██        | 2/10 [00:04<00:16,  2.11s/it]

Pred: <image> normally inflated without evidence of frontal view reveals granulomas throughout both lungs are clear bilaterally more this could represent sequelae <bos> frontal view reveals granulomas throughout both lungs are clear bilaterally more this could represent sequelae <bos> frontal view reveals granulomas throughout both lungs are clear bilaterally more this could represent sequelae <bos> frontal view reveals granulomas throughout both lungs
Ref: <bos> the cardiomediastinal silhouette is within normal limits . there is rounded calcified density within the left lower lobe most consistent with granuloma . <unk> lungs are clear without evidence of focal opacification . no pneumothorax or large pleural effusion . no acute bone abnormality . <eos>

generate with greedy


Evaluating:  30%|███       | 3/10 [00:05<00:12,  1.76s/it]

Pred: <image> normally inflated without evidence of frontal view reveals granulomas throughout both lungs are clear bilaterally more this could represent sequelae <bos> frontal view reveals granulomas throughout both lungs are clear bilaterally more this could represent sequelae <bos> frontal view reveals granulomas throughout both lungs are clear bilaterally more this could represent sequelae <bos> frontal view reveals granulomas throughout both lungs
Ref: <bos> no focal lung consolidation . heart size and pulmonary vascularity are within normal limits . no pneumothorax or pleural effusion . osseous structures are grossly intact . <eos>

generate with greedy


Evaluating:  40%|████      | 4/10 [00:07<00:09,  1.60s/it]

Pred: <image> normally inflated without evidence of frontal view reveals granulomas throughout both lungs are clear bilaterally more this could represent sequelae <bos> frontal view reveals granulomas throughout both lungs are clear bilaterally more this could represent sequelae <bos> frontal view reveals granulomas throughout both lungs are clear bilaterally more this could represent sequelae <bos> frontal view reveals granulomas throughout both lungs
Ref: <bos> cardiac and mediastinal contours are within normal limits . the lungs are clear . bony structures are intact . <eos>

generate with greedy


Evaluating:  50%|█████     | 5/10 [00:08<00:07,  1.51s/it]

Pred: <image> normally inflated without evidence of frontal view reveals granulomas throughout both lungs are clear bilaterally more images apices could represent sequelae <bos> frontal view reveals granulomas throughout both lungs are clear bilaterally more this could represent sequelae <bos> frontal view reveals granulomas throughout both lungs are clear bilaterally more this could represent sequelae <bos> frontal view reveals granulomas throughout both
Ref: <bos> heart size is normal . there are xxxx opacities which appear to xxxx xxxx above the right xxxx fissure . there is mild thickening in the fissure . no pneumothorax . no large pleural effusions . <eos>

generate with greedy


Evaluating:  60%|██████    | 6/10 [00:09<00:05,  1.45s/it]

Pred: <image> normally inflated without evidence of frontal view reveals granulomas throughout both lungs are clear bilaterally more this could represent sequelae <bos> frontal view reveals granulomas throughout both lungs are clear bilaterally more this could represent sequelae <bos> frontal view reveals granulomas throughout both lungs are clear bilaterally more this could represent sequelae <bos> frontal view reveals granulomas throughout both lungs
Ref: <bos> mediastinal contours are normal . lungs are clear . there is no pneumothorax or large pleural effusion . <eos>

generate with greedy


Evaluating:  70%|███████   | 7/10 [00:11<00:04,  1.42s/it]

Pred: <image> normally inflated without evidence of frontal view reveals granulomas throughout both lungs are clear bilaterally more this could represent sequelae <bos> frontal view reveals granulomas throughout both lungs are clear bilaterally more this could represent sequelae <bos> frontal view reveals granulomas throughout both lungs are clear bilaterally more this could represent sequelae <bos> frontal view reveals granulomas throughout both lungs
Ref: <bos> the heart is normal in size . the mediastinum is unremarkable . the lungs are clear . <eos>

generate with greedy


Evaluating:  80%|████████  | 8/10 [00:12<00:02,  1.40s/it]

Pred: <image> normally inflated without evidence of frontal view reveals granulomas throughout both lungs are clear bilaterally more this could represent sequelae <bos> frontal view reveals granulomas throughout both lungs are clear bilaterally more this could represent sequelae <bos> frontal view reveals granulomas throughout both lungs are clear bilaterally more this could represent sequelae <bos> frontal view reveals granulomas throughout both lungs
Ref: <bos> the cardiac contours are normal . the lungs are clear . thoracic spondylosis . <eos>

generate with greedy


Evaluating:  90%|█████████ | 9/10 [00:13<00:01,  1.38s/it]

Pred: <image> normally inflated without evidence of frontal view reveals granulomas throughout both lungs are clear bilaterally more this could represent sequelae <bos> frontal view reveals granulomas throughout both lungs are clear bilaterally more this could represent sequelae <bos> frontal view reveals granulomas throughout both lungs are clear bilaterally more this could represent sequelae <bos> frontal view reveals granulomas throughout both lungs
Ref: <bos> the heart pulmonary xxxx and mediastinum are within normal limits . there is no pleural effusion or pneumothorax . there is no focal air space opacity to suggest a pneumonia . <eos>

generate with greedy


Evaluating: 100%|██████████| 10/10 [00:14<00:00,  1.45s/it]

Pred: <image> <unk> cm nodular density projected over the cardiac silhouette is a small t-spine osteophytes changes throughout both lungs are clear bilaterally more this could represent sequelae <bos> frontal view reveals granulomas throughout both lungs are clear bilaterally more this could represent sequelae <bos> frontal view reveals granulomas throughout both lungs are clear bilaterally more images apices could represent sequelae <bos>
Ref: <bos> normal heart size and mediastinal contours . stable calcification in the left upper lobe xxxx representing a granuloma . no focal airspace opacities . no pleural effusion or pneumothorax . visualized osseous structures are unremarkable in appearance . <eos>






BLEU_1: 0.0719
BLEU_2: 0.0388
BLEU_3: 0.0240
BLEU_4: 0.0132
METEOR: 0.1057
ROUGE_L: 0.0840


{'BLEU_1': 0.0718856078584496,
 'BLEU_2': 0.03876937847659147,
 'BLEU_3': 0.024038623641290346,
 'BLEU_4': 0.013248570457362201,
 'METEOR': 0.1057116640117974,
 'ROUGE_L': np.float64(0.08402614183632913)}

In [10]:
import json
with open("gts_greed.json", "w") as file:
    json.dump(gts, file)
with open("res_greed.json", "w") as file:
    json.dump(res, file)