In [12]:
"""Dense_Closed.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1RAl-O0E1Ixj8rMU0lUYeF2ToSIOMOtiu
"""

# Install necessary packages
!pip install datasets
# pip install focal-loss --upgrade
# import focal_loss

import os
import re
from torch.utils.data import Dataset
import pandas as pd
from PIL import Image
from datasets import Dataset, DatasetDict
import os
import requests
import matplotlib.pyplot as plt
import random
import torch
import numpy as np
import torchvision.transforms as transforms

# Ensure NLTK data is downloaded for BLEU scoring
import nltk
nltk.download('punkt')

from google.colab import drive
drive.mount('/content/drive')

def set_seed(seed_value=42):
    """Set seed for reproducibility for PyTorch and NumPy.

    Args:
        seed_value (int): The seed value to set for random number generators.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(42)

path_project = '/content/drive/MyDrive/Project'

# 2) Load + preprocess + split
def vqa_rad_setup(path_project):
    json_file = "VQA_RAD Dataset Public.json"
    image_folder = "VQA_RAD Image Folder"
    json_path = os.path.join(path_project, json_file)
    image_dir  = os.path.join(path_project, image_folder)



    df = pd.read_json(json_path)
    df['image_path'] = df['image_name'].apply(lambda fn: os.path.join(image_dir, fn))

    unique_images  = df['image_path'].nunique()
    print(f"Number of unique images: {unique_images}")


    # 2) Normalize and filter to only CLOSED‑type questions
    df['answer_type']   = df['answer_type'].str.strip().str.lower()
    df['question_type'] = df['question_type'].str.strip().str.lower()
    df['answer']        = df['answer'].str.strip().str.lower()
    df_closed = df[df['answer_type']=='closed'].copy()
    print("Closed-QA rows:", len(df_closed))

    # — 3) original closed-QA (organ column will be NaN here)
    df_closed_qa = df_closed[['image_path','question','answer']].copy()
    df_closed_qa['image_organ'] = pd.NA

    # — 4) build the “template” for organ & question_type
    df_present = (
        df_closed[['image_path','image_organ','question_type']]  # include question_type
          .drop_duplicates(subset='image_path')
          .copy()
    )

    # — 5a) QA variant #1: ask about the question type
    df_q1 = df_present.assign(
        question = "What is the type of the question?",
        answer   = df_present['question_type']
    )[['image_path','question','answer','image_organ']]

    # — 5b) QA variant #2: ask about the organ
    df_q2 = df_present.assign(
        question = "Which organ is shown?",
        answer   = df_present['image_organ']
    )[['image_path','question','answer','image_organ']]

    # — 6) combine all three sets and shuffle
    df_all = pd.concat([df_closed_qa, df_q1, df_q2], ignore_index=True)
    df_all = df_all.sample(frac=1, random_state=42).reset_index(drop=True)
    print("Total QA rows:", len(df_all))

    return df_all

df_binary = vqa_rad_setup(path_project)

from transformers import CLIPProcessor, CLIPModel
from tqdm import tqdm

# 3) Instantiate your MedCLIP processor
##model_name = "openai/clip-vit-base-patch32"  # You can choose different CLIP model variants
#processor = CLIPProcessor.from_pretrained(model_name)


def densenet_processor():
    return transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        )
    ])

def transform_image_pre(image_path):
    image = Image.open(image_path)
    if image.mode != 'RGB':
        image = image.convert('RGB')
    return densenet_processor()(image)


# 6) Batch‑load all train images into one big tensor
image_tensors = []
failed_images = []

for img_path in tqdm(df_binary['image_path'], desc="Processing images"):
    try:
        tensor = transform_image_pre(img_path)
        image_tensors.append(tensor)
    except Exception as e:
        print(f"❌ Skipped {img_path}: {e}")
        failed_images.append(img_path)

full_image_tensor = torch.stack(image_tensors, dim=0)
print(f"\nProcessed {len(image_tensors)} images; skipped {len(failed_images)}.")
print("full_image_tensor.shape =", full_image_tensor.shape)

import re
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as cp
from collections import OrderedDict



def _bn_function_factory(norm, relu, conv):
    def bn_function(*inputs):
        concated_features = torch.cat(inputs, 1)
        bottleneck_output = conv(relu(norm(concated_features)))
        return bottleneck_output

    return bn_function




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Number of unique images: 314
Closed-QA rows: 1299
Total QA rows: 1899


Processing images: 100%|██████████| 1899/1899 [00:28<00:00, 67.24it/s]



Processed 1899 images; skipped 0.
full_image_tensor.shape = torch.Size([1899, 3, 224, 224])


In [18]:
import torch
from transformers import BertTokenizer

# 1) Device & tokenizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
print(f"Loaded BERT tokenizer with vocab size: {tokenizer.vocab_size}")

question_lengths = [len(tokenizer(q, add_special_tokens=True)['input_ids']) for q in df_binary['question']]
answer_lengths   = [len(tokenizer(a, add_special_tokens=True)['input_ids']) for a in df_binary['answer']]
import numpy as np

print(f"Question lengths: min={np.min(question_lengths)}, max={np.max(question_lengths)}, mean={np.mean(question_lengths):.1f}, 95th percentile={np.percentile(question_lengths, 95)}")
print(f"Answer lengths:   min={np.min(answer_lengths)}, max={np.max(answer_lengths)}, mean={np.mean(answer_lengths):.1f}, 95th percentile={np.percentile(answer_lengths, 95)}")


# 2) Max lengths
max_question_length = 28
max_answer_length   = 9

# 3) Prepare lists
q_input_ids, q_attention_masks = [], []
a_input_ids, a_attention_masks = [], []

# 4) Loop over your DataFrame
for q, a in zip(df_binary['question'], df_binary['answer']):
    # tokenize question
    q_tok = tokenizer(
        q,
        padding='max_length',
        truncation=True,
        max_length=max_question_length,
        return_tensors="pt"
    )
    q_input_ids.append(q_tok['input_ids'].squeeze(0))         # [seq_len]
    q_attention_masks.append(q_tok['attention_mask'].squeeze(0))

    # tokenize answer
    a_tok = tokenizer(
        a,
        padding='max_length',
        truncation=True,
        max_length=max_answer_length,
        return_tensors="pt",
        add_special_tokens=True
    )
    a_input_ids.append(a_tok['input_ids'].squeeze(0))         # [seq_len]
    a_attention_masks.append(a_tok['attention_mask'].squeeze(0))

# 5) Stack into big tensors
Question_ids   = torch.stack(q_input_ids)        # [N, max_question_length]
Question_mask  = torch.stack(q_attention_masks)  # [N, max_question_length]
Answer_ids     = torch.stack(a_input_ids)        # [N, max_answer_length]
Answer_mask    = torch.stack(a_attention_masks)  # [N, max_answer_length]


Loaded BERT tokenizer with vocab size: 30522
Question lengths: min=6, max=28, mean=10.4, 95th percentile=16.0
Answer lengths:   min=3, max=9, mean=3.2, 95th percentile=4.0


In [19]:

import torch
import torch.nn as nn

class BiLSTM(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, num_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout if num_layers > 1 else 0.0
        )
        # project bidirectional hidden to single vector
        self.fc = nn.Linear(hidden_dim * 2, hidden_dim)

    def forward(self, input_ids, attention_mask):
        # input_ids: [B, T], attention_mask: [B, T]
        embeds = self.embedding(input_ids)                      # [B, T, emb_dim]
        lengths = attention_mask.sum(dim=1).cpu()               # actual lengths
        packed = nn.utils.rnn.pack_padded_sequence(
            embeds, lengths, batch_first=True, enforce_sorted=False
        )
        packed_out, (h_n, _) = self.lstm(packed)
        # h_n: [num_layers*2, B, hidden_dim]
        # take last forward & backward layers
        h_fwd = h_n[-2]                                         # [B, hidden_dim]
        h_bwd = h_n[-1]                                         # [B, hidden_dim]
        h_cat = torch.cat([h_fwd, h_bwd], dim=1)                # [B, hidden_dim*2]
        out = self.fc(h_cat)                                    # [B, hidden_dim]
        return out

In [21]:


from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
print(tokenizer.vocab_size)

#from torchvision.models import densenet121 as tv_densenet121

#1) Load the torchvision DenseNet121 (with its pretrained weights)
#densenet = tv_densenet121(pretrained=True).to(device).eval()


from torchvision.models import densenet169 as tv_densenet169
densenet = tv_densenet169(pretrained=True).to(device).eval()

# 2) Wrap it for feature extraction just as before
feature_extractor = nn.Sequential(
    densenet.features,
    nn.ReLU(inplace=True),
    nn.AdaptiveAvgPool2d((1, 1)),
    nn.Flatten(start_dim=1),
).to(device).eval()

# 3) Your BiLSTM instance—make sure you **don't** leave a trailing comma!
text_model = BiLSTM(
    vocab_size = tokenizer.vocab_size,
    emb_dim=256,
    hidden_dim=256,
    num_layers=2,
    dropout=0.3
).to(device).eval()  # ← no comma here

print("📦 Tokenizer vocab size:", tokenizer.vocab_size)

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

# 1) Raw‐tensor Dataset & DataLoader
batch_size = 64
raw_ds     = TensorDataset(full_image_tensor, Question_ids, Question_mask)
raw_loader = DataLoader(raw_ds, batch_size=batch_size, shuffle=False)

import time
from tqdm.auto import tqdm

def extract_features_and_questions(loader, feature_extractor, text_model, device):
    all_img_feats  = []
    all_txt_feats  = []
    all_q_ids      = []
    all_q_mask     = []
    total_start    = time.time()


    for imgs, q_ids, q_mask in tqdm(loader, desc="Extracting features", unit="batch"):
        imgs   = imgs.to(device)
        q_ids  = q_ids.to(device)
        q_mask = q_mask.to(device)

        with torch.no_grad():
            img_feats = feature_extractor(imgs)
            txt_feats = text_model(q_ids, q_mask)

        all_img_feats.append(img_feats.cpu())
        all_txt_feats.append(txt_feats.cpu())
        all_q_ids.append(q_ids.cpu())
        all_q_mask.append(q_mask.cpu())

    #concatenate everything
    image_features = torch.cat(all_img_feats, dim=0)  # [N,1024]
    text_features  = torch.cat(all_txt_feats,  dim=0)  # [N,256]
    Question_ids   = torch.cat(all_q_ids,      dim=0)  # [N, T_q]
    Question_mask  = torch.cat(all_q_mask,     dim=0)  # [N, T_q]

    return image_features, text_features, Question_ids, Question_mask

# run it
image_features, text_features, Question_ids, Question_mask = \
    extract_features_and_questions(raw_loader, feature_extractor, text_model, device)

print("image_features:", image_features.shape)  # → [N,1024]
print("text_features: ", text_features.shape)   # → [N,256]

y_train_tensor = Answer_ids

# If your model expects decoder inputs shifted right (for teacher forcing),
# you can slice:
decoder_input_ids = y_train_tensor[:, :-1]   # [N, max_answer_length-1]
labels           = y_train_tensor[:, 1:]


# 4) Now split & wrap for your classifier
N_split = 75
Image_train   = image_features[N_split:]       # [train_N, 1024]
Image_test    = image_features[:N_split]       # [test_N, 1024]
Text_train    = text_features[N_split:]        # [train_N, 256]
Text_test     = text_features[:N_split]        # [test_N, 256]
Qids_train    = Question_ids[N_split:]         # [train_N, T_q]
Qids_test     = Question_ids[:N_split]         # [test_N, T_q]
Qmask_train   = Question_mask[N_split:]        # [train_N, T_q]
Qmask_test    = Question_mask[:N_split]        # [test_N, T_q]
Dec_in_train  = decoder_input_ids[N_split:]        # [train_N, T_a-1]
Dec_in_test   = decoder_input_ids[:N_split]        # [test_N, T_a-1]
Dec_lab_train = labels[N_split:]           # [train_N, T_a-1]
Dec_lab_test  = labels[:N_split]           # [test_N, T_a-1]

# 4) Build your TensorDatasets
train_ds = TensorDataset(
    Image_train,
    Text_train,
    Qids_train,
    Qmask_train,
    Dec_in_train,
    Dec_lab_train,
)
test_ds = TensorDataset(
    Image_test,
    Text_test,
    Qids_test,
    Qmask_test,
    Dec_in_test,
    Dec_lab_test,
)

# 5) Wrap in DataLoaders
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
test_loader  = DataLoader(test_ds,  batch_size=1,  shuffle=False)





30522
📦 Tokenizer vocab size: 30522


Extracting features:   0%|          | 0/60 [00:00<?, ?batch/s]

image_features: torch.Size([1899, 1024])
text_features:  torch.Size([1899, 256])


In [29]:

class ImprovedCrossAttentionFusionGenerator(nn.Module):
    def __init__(self, vocab_size, image_dim=1024, text_dim=256, fusion_dim=512,
                 dec_dim=512, num_heads=8, dropout=0.3):  # Keep increased dropout
        super().__init__()
        # Project image and text features
        self.image_proj = nn.Linear(image_dim, fusion_dim)
        self.text_proj = nn.Linear(text_dim, fusion_dim)

        # Token expanders (transform features into sequences)
        self.img_to_tokens = nn.Linear(fusion_dim, fusion_dim * 4)  # Expand to 4 tokens
        self.txt_to_tokens = nn.Linear(fusion_dim, fusion_dim * 4)  # Expand to 4 tokens

        # Co-attention between modalities
        self.img_txt_attn = nn.MultiheadAttention(
            embed_dim=fusion_dim,
            num_heads=num_heads//2,
            dropout=dropout,
            batch_first=True
        )
        self.txt_img_attn = nn.MultiheadAttention(
            embed_dim=fusion_dim,
            num_heads=num_heads//2,
            dropout=dropout,
            batch_first=True
        )

        # Fusion layer normalization
        self.fusion_ln = nn.LayerNorm(fusion_dim)

        # Token embedding for decoder
        self.embedding = nn.Embedding(vocab_size, dec_dim, padding_idx=0)

        # Cross-attention layer
        self.cross_attn = nn.MultiheadAttention(
            embed_dim=dec_dim,
            num_heads=num_heads,
            dropout=dropout,
            batch_first=True
        )

        # Decoder LSTM
        self.decoder_lstm = nn.LSTM(
            input_size=dec_dim,
            hidden_size=dec_dim,
            num_layers=2,
            batch_first=True,
            dropout=dropout
        )

        # Add layer norm and more dropout
        self.pre_output_ln = nn.LayerNorm(dec_dim)
        self.pre_output_dropout = nn.Dropout(dropout)

        # Final projection
        self.output_head = nn.Linear(dec_dim, vocab_size)

    def forward(self, img_feat, txt_feat, decoder_input_ids, temperature=0.1):
        B = img_feat.size(0)  # Batch size

        # Project image and text to same fusion space
        img_proj = self.image_proj(img_feat)    # [B, fusion_dim]
        txt_proj = self.text_proj(txt_feat)     # [B, fusion_dim]

        # Expand to token sequences
        img_tokens = self.img_to_tokens(img_proj).reshape(B, 4, -1)  # [B, 4, fusion_dim]
        txt_tokens = self.txt_to_tokens(txt_proj).reshape(B, 4, -1)  # [B, 4, fusion_dim]

        # Co-attention between modalities
        img_attn_to_txt, _ = self.img_txt_attn(img_tokens, txt_tokens, txt_tokens)
        txt_attn_to_img, _ = self.txt_img_attn(txt_tokens, img_tokens, img_tokens)

        # Combine and normalize
        fusion_memory = torch.cat([img_attn_to_txt, txt_attn_to_img], dim=1)  # [B, 8, fusion_dim]
        fusion_memory = self.fusion_ln(fusion_memory)

        # Embed decoder input tokens
        dec_emb = self.embedding(decoder_input_ids)  # [B, T, dec_dim]

        # Cross-attention: decoder attends to fused image+text features
        attn_output, _ = self.cross_attn(
            query=dec_emb,
            key=fusion_memory,
            value=fusion_memory
        )  # [B, T, dec_dim]

        # Feed through LSTM
        lstm_output, (_, _) = self.decoder_lstm(attn_output)  # [B, T, dec_dim]

        # Apply layer norm and dropout before final output
        lstm_output = self.pre_output_ln(lstm_output)
        lstm_output = self.pre_output_dropout(lstm_output)

        # Project to vocabulary logits
        logits = self.output_head(lstm_output)   # [B, T, vocab_size]

        # Apply temperature for generation diversity
        if temperature != 1.0:
            logits = logits / temperature

        return logits

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def evaluate_model(model, test_loader, tokenizer, device, temperature=0.9):  # Lower temperature for generation
    """
    Evaluate model on test set and calculate BLEU score
    """
    model.eval()
    all_bleu_scores = []

    # Store yes/no accuracy separately for analysis
    binary_correct = 0
    binary_total = 0

    # Use smoothing for BLEU calculation
    smoothie = SmoothingFunction().method4  # Changed to method4 for consistency

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            # Unpack batch
            img_feats, txt_feats, q_ids, q_mask, decoder_inputs, labels = [b.to(device) for b in batch]

            # Get question text for analysis
            question = tokenizer.decode(q_ids[0].cpu().tolist(), skip_special_tokens=True).lower()
            is_binary = any(question.startswith(prefix) for prefix in ["is ", "are ", "can ", "do ", "does "])

            # Reference answer (ground truth)
            reference_ids = labels[0].cpu().tolist()
            reference_ids = [id for id in reference_ids if id != 0]  # Remove padding
            reference = tokenizer.decode(reference_ids, skip_special_tokens=True).lower()

            # For evaluation, we generate one token at a time
            # Start with BOS token (101)
            current_input = torch.tensor([[101]], device=device)
            generated_ids = [101]

            # Generate tokens auto-regressively
            max_len = 8  # Maximum answer length
            for _ in range(max_len - 1):
                # Forward pass (with temperature)
                logits = model(img_feats, txt_feats, current_input, temperature)

                # Get the most likely next token
                next_token_id = logits[0, -1].argmax().item()

                # Stop if we hit the EOS token (102)
                if next_token_id == 102:
                    break

                # Add token to the sequence
                generated_ids.append(next_token_id)

                # Update input for next iteration
                current_input = torch.cat([
                    current_input,
                    torch.tensor([[next_token_id]], device=device)
                ], dim=1)

            # Get the predicted answer
            prediction = tokenizer.decode(generated_ids[1:], skip_special_tokens=True).lower()

            # Track binary question accuracy separately
            if is_binary:
                binary_total += 1
                if (prediction == "yes" and reference == "yes") or (prediction == "no" and reference == "no"):
                    binary_correct += 1

            # Convert IDs to tokens for BLEU calculation
            reference_tokens = tokenizer.convert_ids_to_tokens(reference_ids)
            generated_tokens = tokenizer.convert_ids_to_tokens(generated_ids[1:])  # Skip BOS token

            # Remove special tokens
            reference_tokens = [token for token in reference_tokens if token not in ['[CLS]', '[SEP]', '[PAD]']]
            generated_tokens = [token for token in generated_tokens if token not in ['[CLS]', '[SEP]', '[PAD]']]

            # Calculate BLEU score (with smoothing for short sequences)
            # If either sequence is empty, assign a score of 0
            if len(reference_tokens) == 0 or len(generated_tokens) == 0:
                bleu = 0
            else:
                bleu = sentence_bleu([reference_tokens], generated_tokens,
                                     smoothing_function=smoothie)

            all_bleu_scores.append(bleu)

    # Calculate average BLEU score
    avg_bleu = np.mean(all_bleu_scores)

    # Print binary accuracy if applicable
    if binary_total > 0:
        binary_acc = binary_correct / binary_total * 100
        print(f"Yes/No question accuracy: {binary_acc:.1f}% ({binary_correct}/{binary_total}) - for analysis only")

    return avg_bleu
# Simplify the train function to use the unified model approach
def train_vqa_model(model, train_loader, test_loader, tokenizer,
                    num_epochs=60, lr=5e-4, device='cuda'):
    """
    Train the VQA model and evaluate on test set
    Returns the model with best BLEU score and the score itself
    """
    # Move model to device
    model = model.to(device)

    # Define loss function (with label smoothing)
    criterion = nn.CrossEntropyLoss(ignore_index=0, label_smoothing=0.1)  # Add label smoothing

    # Define optimizer
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Learning rate scheduler
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=3, verbose=True
    )

    # Training loop
    best_bleu = 0.0
    best_model_state = None

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        # Training loop
        train_loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
        for batch in train_loop:
            # Unpack batch
            img_feats, txt_feats, _, _, decoder_inputs, labels = [b.to(device) for b in batch]

            # Forward pass
            logits = model(img_feats, txt_feats, decoder_inputs)

            # Calculate loss
            B, T, V = logits.shape
            loss = criterion(logits.view(-1, V), labels.view(-1))

            # Backward pass
            optimizer.zero_grad()
            loss.backward()

            # Gradient clipping to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            # Update weights
            optimizer.step()

            # Update statistics
            total_loss += loss.item()
            train_loop.set_postfix(loss=loss.item())

        # Calculate average loss
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

        # Update learning rate based on loss
        scheduler.step(avg_loss)

        # Evaluate on test set
        bleu_score, rouge1_score, rouge2_score, rougeL_score = evaluate_model(model, test_loader, tokenizer, device)

        # Print all scores
        print(f"BLEU score: {bleu_score:.4f}")
        print(f"ROUGE-1 score: {rouge1_score:.4f}")
        print(f"ROUGE-2 score: {rouge2_score:.4f}")
        print(f"ROUGE-L score: {rougeL_score:.4f}")

        # Save best model
        if bleu_score > best_bleu:
            best_bleu = bleu_score
            # Save model state to file
            torch.save(model.state_dict(), "best_vqa_model.pth")
            # Also keep a copy of the best model state in memory
            best_model_state = model.state_dict().copy()
            print(f"New best model saved with BLEU score: {best_bleu:.4f}")

    # Load the best model state before returning
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
        print(f"Restored best model with BLEU score: {best_bleu:.4f}")

    # Return best model and its BLEU score
    return model, best_bleu

# Update the evaluate_model function to use temperature scaling
def evaluate_model(model, test_loader, tokenizer, device, temperature=0.5):
    """
    Evaluate model on test set and calculate BLEU and ROUGE scores
    """
    model.eval()
    all_bleu_scores = []
    all_rouge1_scores = []
    all_rouge2_scores = []
    all_rougeL_scores = []

    # Store yes/no accuracy separately for analysis
    binary_correct = 0
    binary_total = 0

    # Use smoothing for BLEU calculation
    smoothie = SmoothingFunction().method4
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            # Unpack batch
            img_feats, txt_feats, q_ids, q_mask, _, labels = [b.to(device) for b in batch]

            # Get question text for analysis
            question = tokenizer.decode(q_ids[0].cpu().tolist(), skip_special_tokens=True).lower()
            is_binary = any(question.startswith(prefix) for prefix in ["is ", "are ", "can ", "do ", "does "])

            # Reference answer (ground truth)
            reference_ids = labels[0].cpu().tolist()
            reference_ids = [id for id in reference_ids if id != 0]  # Remove padding
            reference = tokenizer.decode(reference_ids, skip_special_tokens=True).lower()

            # For evaluation, we generate one token at a time
            # Start with BOS token (101)
            current_input = torch.tensor([[101]], device=device)
            generated_ids = [101]

            # Generate tokens auto-regressively
            max_len = 8  # Maximum answer length
            for _ in range(max_len - 1):
                # Forward pass with temperature scaling
                logits = model(img_feats, txt_feats, current_input)

                # Apply temperature scaling
                logits = logits / temperature

                # Get the most likely next token
                next_token_id = logits[0, -1].argmax().item()

                # Stop if we hit the EOS token (102)
                if next_token_id == 102:
                    break

                # Add token to the sequence
                generated_ids.append(next_token_id)

                # Update input for next iteration
                current_input = torch.cat([
                    current_input,
                    torch.tensor([[next_token_id]], device=device)
                ], dim=1)

            # Get the predicted answer
            prediction = tokenizer.decode(generated_ids[1:], skip_special_tokens=True).lower()

            # Track binary question accuracy separately
            if is_binary:
                binary_total += 1
                if (prediction == "yes" and reference == "yes") or (prediction == "no" and reference == "no"):
                    binary_correct += 1

            # Convert IDs to tokens for BLEU calculation
            reference_tokens = tokenizer.convert_ids_to_tokens(reference_ids)
            generated_tokens = tokenizer.convert_ids_to_tokens(generated_ids[1:])  # Skip BOS token

            # Remove special tokens
            reference_tokens = [token for token in reference_tokens if token not in ['[CLS]', '[SEP]', '[PAD]']]
            generated_tokens = [token for token in generated_tokens if token not in ['[CLS]', '[SEP]', '[PAD]']]

            # Calculate BLEU score (with smoothing for short sequences)
            if len(reference_tokens) == 0 or len(generated_tokens) == 0:
                bleu = 0
            else:
                bleu = sentence_bleu([reference_tokens], generated_tokens,
                                    smoothing_function=smoothie)

            all_bleu_scores.append(bleu)

            # Calculate ROUGE scores for this example
            rouge_scores = scorer.score(reference, prediction)
            all_rouge1_scores.append(rouge_scores['rouge1'].fmeasure)
            all_rouge2_scores.append(rouge_scores['rouge2'].fmeasure)
            all_rougeL_scores.append(rouge_scores['rougeL'].fmeasure)

    # Calculate average scores
    avg_bleu = np.mean(all_bleu_scores)
    avg_rouge1 = np.mean(all_rouge1_scores)
    avg_rouge2 = np.mean(all_rouge2_scores)
    avg_rougeL = np.mean(all_rougeL_scores)

    # Print the average ROUGE scores
    print(f"Average ROUGE-1 score: {avg_rouge1:.4f}")
    print(f"Average ROUGE-2 score: {avg_rouge2:.4f}")
    print(f"Average ROUGE-L score: {avg_rougeL:.4f}")

    # Print binary accuracy if applicable
    if binary_total > 0:
        binary_acc = binary_correct / binary_total * 100
        print(f"Yes/No question accuracy: {binary_acc:.1f}% ({binary_correct}/{binary_total}) - for analysis only")

    return avg_bleu, avg_rouge1, avg_rouge2, avg_rougeL

# Initialize your model (assuming it's already defined)
fusion_generator = ImprovedCrossAttentionFusionGenerator(
    vocab_size=tokenizer.vocab_size,
    image_dim=1024,
    text_dim=256,
    fusion_dim=512,
    dec_dim=512,
    num_heads=8,
    dropout=0.3
)

fusion_model, fusion_best_bleu, fusion_best_rouge1, fusion_best_rouge2, fusion_best_rougeL = train_vqa_model(
    model=fusion_generator,
    train_loader=train_loader,
    test_loader=test_loader,
    tokenizer=tokenizer,
    num_epochs=50,
    lr=1e-4,  # Lower learning rate
    device=device
)

# Final evaluation on test set
print("Running final evaluation on best model...")
fusion_model.eval()  # Ensure model is in evaluation mode

# Add debugging to see actual predictions
def debug_predictions(model, test_loader, tokenizer, device, num_samples=5):
    """Show predictions for a few test samples to help debug BLEU score issues"""
    model.eval()
    count = 0

    with torch.no_grad():
        for batch in test_loader:
            if count >= num_samples:
                break

            img_feats, txt_feats, q_ids, q_mask, decoder_inputs, labels = [b.to(device) for b in batch]

            # Get question text
            question = tokenizer.decode(q_ids[0].cpu().tolist(), skip_special_tokens=True)

            # Get ground truth answer
            reference_ids = labels[0].cpu().tolist()
            reference_ids = [id for id in reference_ids if id != 0]  # Remove padding
            reference = tokenizer.decode(reference_ids, skip_special_tokens=True)

            # Generate prediction
            current_input = torch.tensor([[101]], device=device)  # Start with BOS token
            generated_ids = [101]

            # Generate tokens auto-regressively
            max_len = 8
            for _ in range(max_len - 1):
                logits = model(img_feats, txt_feats, current_input)
                next_token_id = logits[0, -1].argmax().item()

                if next_token_id == 102:  # EOS token
                    break

                generated_ids.append(next_token_id)
                current_input = torch.cat([
                    current_input,
                    torch.tensor([[next_token_id]], device=device)
                ], dim=1)

            # Decode prediction
            prediction = tokenizer.decode(generated_ids[1:], skip_special_tokens=True)

            # Convert to tokens for BLEU calculation
            reference_tokens = tokenizer.convert_ids_to_tokens(reference_ids)
            generated_tokens = tokenizer.convert_ids_to_tokens(generated_ids[1:])

            # Remove special tokens
            reference_tokens = [token for token in reference_tokens if token not in ['[CLS]', '[SEP]', '[PAD]']]
            generated_tokens = [token for token in generated_tokens if token not in ['[CLS]', '[SEP]', '[PAD]']]

            # Print debug info
            #print(f"\nSample {count+1}:")
            #print(f"Question: {question}")
            #print(f"Reference: '{reference}' (tokens: {reference_tokens})")
            #print(f"Prediction: '{prediction}' (tokens: {generated_tokens})")

            # Calculate individual BLEU score
            smoothie = SmoothingFunction().method4
            if len(reference_tokens) == 0 or len(generated_tokens) == 0:
                bleu = 0
                reason = "Empty tokens"
            else:
                bleu = sentence_bleu([reference_tokens], generated_tokens, smoothing_function=smoothie)
                reason = "Valid tokens"

            print(f"Individual BLEU: {bleu:.4f} ({reason})")
            count += 1

# Run debugging
debug_predictions(fusion_model, test_loader, tokenizer, device)

# Then run the final evaluation
final_bleu_score = evaluate_model(fusion_model, test_loader, tokenizer, device)
print(f"Final BLEU score on test set: {final_bleu_score:.4f}")

# Optional: Save the final model
torch.save(fusion_model.state_dict(), "final_vqa_model.pth")
print("Final model saved as final_vqa_model.pth")



Epoch 1/50: 100%|██████████| 29/29 [00:01<00:00, 23.94it/s, loss=5.04]


Epoch 1/50, Loss: 8.9853


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 165.85it/s]


Average ROUGE-1 score: 0.4222
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.4222
Yes/No question accuracy: 52.1% (25/48) - for analysis only
BLEU score: 0.3835
ROUGE-1 score: 0.4222
ROUGE-2 score: 0.0000
ROUGE-L score: 0.4222
New best model saved with BLEU score: 0.3835


Epoch 2/50: 100%|██████████| 29/29 [00:01<00:00, 26.28it/s, loss=4.25]


Epoch 2/50, Loss: 5.7906


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 168.57it/s]


Average ROUGE-1 score: 0.4622
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.4622
Yes/No question accuracy: 52.1% (25/48) - for analysis only
BLEU score: 0.4235
ROUGE-1 score: 0.4622
ROUGE-2 score: 0.0000
ROUGE-L score: 0.4622
New best model saved with BLEU score: 0.4235


Epoch 3/50: 100%|██████████| 29/29 [00:01<00:00, 24.39it/s, loss=6.32]


Epoch 3/50, Loss: 5.0484


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 166.51it/s]


Average ROUGE-1 score: 0.5022
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5022
Yes/No question accuracy: 45.8% (22/48) - for analysis only
BLEU score: 0.4635
ROUGE-1 score: 0.5022
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5022
New best model saved with BLEU score: 0.4635


Epoch 4/50: 100%|██████████| 29/29 [00:01<00:00, 24.52it/s, loss=3.82]


Epoch 4/50, Loss: 4.4563


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 164.66it/s]


Average ROUGE-1 score: 0.4489
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.4489
Yes/No question accuracy: 39.6% (19/48) - for analysis only
BLEU score: 0.4102
ROUGE-1 score: 0.4489
ROUGE-2 score: 0.0000
ROUGE-L score: 0.4489


Epoch 5/50: 100%|██████████| 29/29 [00:01<00:00, 26.33it/s, loss=4.16]


Epoch 5/50, Loss: 4.2282


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 120.62it/s]


Average ROUGE-1 score: 0.4489
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.4489
Yes/No question accuracy: 39.6% (19/48) - for analysis only
BLEU score: 0.4102
ROUGE-1 score: 0.4489
ROUGE-2 score: 0.0000
ROUGE-L score: 0.4489


Epoch 6/50: 100%|██████████| 29/29 [00:01<00:00, 24.85it/s, loss=5.75]


Epoch 6/50, Loss: 4.0307


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 122.18it/s]


Average ROUGE-1 score: 0.4489
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.4489
Yes/No question accuracy: 39.6% (19/48) - for analysis only
BLEU score: 0.4102
ROUGE-1 score: 0.4489
ROUGE-2 score: 0.0000
ROUGE-L score: 0.4489


Epoch 7/50: 100%|██████████| 29/29 [00:01<00:00, 25.16it/s, loss=3.66]


Epoch 7/50, Loss: 3.7458


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 164.46it/s]


Average ROUGE-1 score: 0.5689
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5689
Yes/No question accuracy: 56.2% (27/48) - for analysis only
BLEU score: 0.5302
ROUGE-1 score: 0.5689
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5689
New best model saved with BLEU score: 0.5302


Epoch 8/50: 100%|██████████| 29/29 [00:01<00:00, 24.58it/s, loss=3.1]


Epoch 8/50, Loss: 3.5859


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 161.59it/s]


Average ROUGE-1 score: 0.5022
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5022
Yes/No question accuracy: 54.2% (26/48) - for analysis only
BLEU score: 0.4635
ROUGE-1 score: 0.5022
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5022


Epoch 9/50: 100%|██████████| 29/29 [00:01<00:00, 24.58it/s, loss=3.54]


Epoch 9/50, Loss: 3.4690


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 118.84it/s]


Average ROUGE-1 score: 0.5289
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5289
Yes/No question accuracy: 52.1% (25/48) - for analysis only
BLEU score: 0.4902
ROUGE-1 score: 0.5289
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5289


Epoch 10/50: 100%|██████████| 29/29 [00:01<00:00, 24.98it/s, loss=3.36]


Epoch 10/50, Loss: 3.3933


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 142.23it/s]


Average ROUGE-1 score: 0.5689
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5689
Yes/No question accuracy: 58.3% (28/48) - for analysis only
BLEU score: 0.5302
ROUGE-1 score: 0.5689
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5689


Epoch 11/50: 100%|██████████| 29/29 [00:01<00:00, 26.18it/s, loss=3.15]


Epoch 11/50, Loss: 3.1943


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 162.79it/s]


Average ROUGE-1 score: 0.5689
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5689
Yes/No question accuracy: 60.4% (29/48) - for analysis only
BLEU score: 0.5302
ROUGE-1 score: 0.5689
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5689


Epoch 12/50: 100%|██████████| 29/29 [00:01<00:00, 26.02it/s, loss=3.37]


Epoch 12/50, Loss: 3.0995


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 165.63it/s]


Average ROUGE-1 score: 0.5156
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5156
Yes/No question accuracy: 52.1% (25/48) - for analysis only
BLEU score: 0.4768
ROUGE-1 score: 0.5156
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5156


Epoch 13/50: 100%|██████████| 29/29 [00:01<00:00, 26.12it/s, loss=3.16]


Epoch 13/50, Loss: 3.0164


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 161.18it/s]


Average ROUGE-1 score: 0.5556
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5556
Yes/No question accuracy: 54.2% (26/48) - for analysis only
BLEU score: 0.5168
ROUGE-1 score: 0.5556
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5556


Epoch 14/50: 100%|██████████| 29/29 [00:01<00:00, 26.14it/s, loss=3.5]


Epoch 14/50, Loss: 2.9543


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 165.61it/s]


Average ROUGE-1 score: 0.5822
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5822
Yes/No question accuracy: 60.4% (29/48) - for analysis only
BLEU score: 0.5435
ROUGE-1 score: 0.5822
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5822
New best model saved with BLEU score: 0.5435


Epoch 15/50: 100%|██████████| 29/29 [00:01<00:00, 24.22it/s, loss=2.98]


Epoch 15/50, Loss: 2.8345


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 161.88it/s]


Average ROUGE-1 score: 0.6222
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.6222
Yes/No question accuracy: 68.8% (33/48) - for analysis only
BLEU score: 0.5835
ROUGE-1 score: 0.6222
ROUGE-2 score: 0.0000
ROUGE-L score: 0.6222
New best model saved with BLEU score: 0.5835


Epoch 16/50: 100%|██████████| 29/29 [00:01<00:00, 24.44it/s, loss=2.77]


Epoch 16/50, Loss: 2.7541


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 164.04it/s]


Average ROUGE-1 score: 0.5556
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5556
Yes/No question accuracy: 56.2% (27/48) - for analysis only
BLEU score: 0.5168
ROUGE-1 score: 0.5556
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5556


Epoch 17/50: 100%|██████████| 29/29 [00:01<00:00, 26.08it/s, loss=2.99]


Epoch 17/50, Loss: 2.7027


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 160.72it/s]


Average ROUGE-1 score: 0.5689
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5689
Yes/No question accuracy: 60.4% (29/48) - for analysis only
BLEU score: 0.5302
ROUGE-1 score: 0.5689
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5689


Epoch 18/50: 100%|██████████| 29/29 [00:01<00:00, 26.03it/s, loss=2.68]


Epoch 18/50, Loss: 2.6264


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 163.85it/s]


Average ROUGE-1 score: 0.5422
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5422
Yes/No question accuracy: 52.1% (25/48) - for analysis only
BLEU score: 0.5035
ROUGE-1 score: 0.5422
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5422


Epoch 19/50: 100%|██████████| 29/29 [00:01<00:00, 25.98it/s, loss=2.85]


Epoch 19/50, Loss: 2.5849


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 168.33it/s]


Average ROUGE-1 score: 0.5689
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5689
Yes/No question accuracy: 60.4% (29/48) - for analysis only
BLEU score: 0.5302
ROUGE-1 score: 0.5689
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5689


Epoch 20/50: 100%|██████████| 29/29 [00:01<00:00, 26.12it/s, loss=2.33]


Epoch 20/50, Loss: 2.4963


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 165.26it/s]


Average ROUGE-1 score: 0.5156
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5156
Yes/No question accuracy: 54.2% (26/48) - for analysis only
BLEU score: 0.4768
ROUGE-1 score: 0.5156
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5156


Epoch 21/50: 100%|██████████| 29/29 [00:01<00:00, 25.96it/s, loss=2.26]


Epoch 21/50, Loss: 2.4358


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 166.67it/s]


Average ROUGE-1 score: 0.5822
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5822
Yes/No question accuracy: 58.3% (28/48) - for analysis only
BLEU score: 0.5435
ROUGE-1 score: 0.5822
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5822


Epoch 22/50: 100%|██████████| 29/29 [00:01<00:00, 24.95it/s, loss=2.71]


Epoch 22/50, Loss: 2.3979


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 121.10it/s]


Average ROUGE-1 score: 0.5289
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5289
Yes/No question accuracy: 56.2% (27/48) - for analysis only
BLEU score: 0.4902
ROUGE-1 score: 0.5289
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5289


Epoch 23/50: 100%|██████████| 29/29 [00:01<00:00, 25.51it/s, loss=2.36]


Epoch 23/50, Loss: 2.3567


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 113.33it/s]


Average ROUGE-1 score: 0.5956
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5956
Yes/No question accuracy: 66.7% (32/48) - for analysis only
BLEU score: 0.5568
ROUGE-1 score: 0.5956
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5956


Epoch 24/50: 100%|██████████| 29/29 [00:01<00:00, 26.24it/s, loss=2.25]


Epoch 24/50, Loss: 2.3009


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 161.15it/s]


Average ROUGE-1 score: 0.6089
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.6089
Yes/No question accuracy: 62.5% (30/48) - for analysis only
BLEU score: 0.5702
ROUGE-1 score: 0.6089
ROUGE-2 score: 0.0000
ROUGE-L score: 0.6089


Epoch 25/50: 100%|██████████| 29/29 [00:01<00:00, 25.95it/s, loss=2.17]


Epoch 25/50, Loss: 2.2653


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 166.07it/s]


Average ROUGE-1 score: 0.5556
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5556
Yes/No question accuracy: 60.4% (29/48) - for analysis only
BLEU score: 0.5272
ROUGE-1 score: 0.5556
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5556


Epoch 26/50: 100%|██████████| 29/29 [00:01<00:00, 25.93it/s, loss=2.4]


Epoch 26/50, Loss: 2.2139


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 170.52it/s]


Average ROUGE-1 score: 0.5422
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5422
Yes/No question accuracy: 56.2% (27/48) - for analysis only
BLEU score: 0.5035
ROUGE-1 score: 0.5422
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5422


Epoch 27/50: 100%|██████████| 29/29 [00:01<00:00, 26.13it/s, loss=2.11]


Epoch 27/50, Loss: 2.1750


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 163.29it/s]


Average ROUGE-1 score: 0.5956
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5956
Yes/No question accuracy: 64.6% (31/48) - for analysis only
BLEU score: 0.5568
ROUGE-1 score: 0.5956
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5956


Epoch 28/50: 100%|██████████| 29/29 [00:01<00:00, 26.09it/s, loss=2.42]


Epoch 28/50, Loss: 2.1304


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 166.71it/s]


Average ROUGE-1 score: 0.5956
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5956
Yes/No question accuracy: 62.5% (30/48) - for analysis only
BLEU score: 0.5568
ROUGE-1 score: 0.5956
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5956


Epoch 29/50: 100%|██████████| 29/29 [00:01<00:00, 26.10it/s, loss=2]


Epoch 29/50, Loss: 2.1011


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 164.48it/s]


Average ROUGE-1 score: 0.5822
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5822
Yes/No question accuracy: 60.4% (29/48) - for analysis only
BLEU score: 0.5435
ROUGE-1 score: 0.5822
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5822


Epoch 30/50: 100%|██████████| 29/29 [00:01<00:00, 25.70it/s, loss=2.41]


Epoch 30/50, Loss: 2.0864


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 126.21it/s]


Average ROUGE-1 score: 0.5289
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5289
Yes/No question accuracy: 56.2% (27/48) - for analysis only
BLEU score: 0.4902
ROUGE-1 score: 0.5289
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5289


Epoch 31/50: 100%|██████████| 29/29 [00:01<00:00, 24.31it/s, loss=1.89]


Epoch 31/50, Loss: 2.0291


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 109.24it/s]


Average ROUGE-1 score: 0.5289
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5289
Yes/No question accuracy: 54.2% (26/48) - for analysis only
BLEU score: 0.4902
ROUGE-1 score: 0.5289
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5289


Epoch 32/50: 100%|██████████| 29/29 [00:01<00:00, 25.68it/s, loss=2.59]


Epoch 32/50, Loss: 2.0163


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 165.01it/s]


Average ROUGE-1 score: 0.5689
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5689
Yes/No question accuracy: 60.4% (29/48) - for analysis only
BLEU score: 0.5349
ROUGE-1 score: 0.5689
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5689


Epoch 33/50: 100%|██████████| 29/29 [00:01<00:00, 25.63it/s, loss=1.95]


Epoch 33/50, Loss: 1.9487


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 166.35it/s]


Average ROUGE-1 score: 0.5822
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5822
Yes/No question accuracy: 58.3% (28/48) - for analysis only
BLEU score: 0.5378
ROUGE-1 score: 0.5822
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5822


Epoch 34/50: 100%|██████████| 29/29 [00:01<00:00, 26.01it/s, loss=1.93]


Epoch 34/50, Loss: 1.9449


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 159.10it/s]


Average ROUGE-1 score: 0.5422
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5422
Yes/No question accuracy: 54.2% (26/48) - for analysis only
BLEU score: 0.5139
ROUGE-1 score: 0.5422
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5422


Epoch 35/50: 100%|██████████| 29/29 [00:01<00:00, 26.01it/s, loss=1.82]


Epoch 35/50, Loss: 1.9132


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 161.57it/s]


Average ROUGE-1 score: 0.5556
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5556
Yes/No question accuracy: 58.3% (28/48) - for analysis only
BLEU score: 0.5206
ROUGE-1 score: 0.5556
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5556


Epoch 36/50: 100%|██████████| 29/29 [00:01<00:00, 25.89it/s, loss=1.75]


Epoch 36/50, Loss: 1.8876


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 156.68it/s]


Average ROUGE-1 score: 0.5822
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5822
Yes/No question accuracy: 58.3% (28/48) - for analysis only
BLEU score: 0.5378
ROUGE-1 score: 0.5822
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5822


Epoch 37/50: 100%|██████████| 29/29 [00:01<00:00, 25.76it/s, loss=1.74]


Epoch 37/50, Loss: 1.8526


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 159.65it/s]


Average ROUGE-1 score: 0.5822
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5822
Yes/No question accuracy: 62.5% (30/48) - for analysis only
BLEU score: 0.5539
ROUGE-1 score: 0.5822
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5822


Epoch 38/50: 100%|██████████| 29/29 [00:01<00:00, 24.56it/s, loss=1.76]


Epoch 38/50, Loss: 1.8529


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 123.73it/s]


Average ROUGE-1 score: 0.5822
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5822
Yes/No question accuracy: 64.6% (31/48) - for analysis only
BLEU score: 0.5539
ROUGE-1 score: 0.5822
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5822


Epoch 39/50: 100%|██████████| 29/29 [00:01<00:00, 25.24it/s, loss=1.76]


Epoch 39/50, Loss: 1.8225


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 114.37it/s]


Average ROUGE-1 score: 0.5022
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5022
Yes/No question accuracy: 54.2% (26/48) - for analysis only
BLEU score: 0.4739
ROUGE-1 score: 0.5022
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5022


Epoch 40/50: 100%|██████████| 29/29 [00:01<00:00, 25.37it/s, loss=1.65]


Epoch 40/50, Loss: 1.8003


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 163.52it/s]


Average ROUGE-1 score: 0.5689
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5689
Yes/No question accuracy: 60.4% (29/48) - for analysis only
BLEU score: 0.5349
ROUGE-1 score: 0.5689
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5689


Epoch 41/50: 100%|██████████| 29/29 [00:01<00:00, 25.98it/s, loss=1.79]


Epoch 41/50, Loss: 1.8141


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 158.83it/s]


Average ROUGE-1 score: 0.5822
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5822
Yes/No question accuracy: 60.4% (29/48) - for analysis only
BLEU score: 0.5435
ROUGE-1 score: 0.5822
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5822


Epoch 42/50: 100%|██████████| 29/29 [00:01<00:00, 25.82it/s, loss=1.79]


Epoch 42/50, Loss: 1.7722


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 164.40it/s]


Average ROUGE-1 score: 0.6089
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.6089
Yes/No question accuracy: 62.5% (30/48) - for analysis only
BLEU score: 0.5645
ROUGE-1 score: 0.6089
ROUGE-2 score: 0.0000
ROUGE-L score: 0.6089


Epoch 43/50: 100%|██████████| 29/29 [00:01<00:00, 25.76it/s, loss=2.01]


Epoch 43/50, Loss: 1.8020


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 165.59it/s]


Average ROUGE-1 score: 0.5422
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5422
Yes/No question accuracy: 54.2% (26/48) - for analysis only
BLEU score: 0.5082
ROUGE-1 score: 0.5422
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5422


Epoch 44/50: 100%|██████████| 29/29 [00:01<00:00, 25.79it/s, loss=1.69]


Epoch 44/50, Loss: 1.7548


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 167.60it/s]


Average ROUGE-1 score: 0.6356
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.6356
Yes/No question accuracy: 64.6% (31/48) - for analysis only
BLEU score: 0.5912
ROUGE-1 score: 0.6356
ROUGE-2 score: 0.0000
ROUGE-L score: 0.6356
New best model saved with BLEU score: 0.5912


Epoch 45/50: 100%|██████████| 29/29 [00:01<00:00, 24.34it/s, loss=1.82]


Epoch 45/50, Loss: 1.7337


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 158.16it/s]


Average ROUGE-1 score: 0.5422
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5422
Yes/No question accuracy: 54.2% (26/48) - for analysis only
BLEU score: 0.5082
ROUGE-1 score: 0.5422
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5422


Epoch 46/50: 100%|██████████| 29/29 [00:01<00:00, 25.76it/s, loss=1.65]


Epoch 46/50, Loss: 1.7165


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 168.63it/s]


Average ROUGE-1 score: 0.5689
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5689
Yes/No question accuracy: 62.5% (30/48) - for analysis only
BLEU score: 0.5509
ROUGE-1 score: 0.5689
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5689


Epoch 47/50: 100%|██████████| 29/29 [00:01<00:00, 25.60it/s, loss=1.75]


Epoch 47/50, Loss: 1.7127


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 153.05it/s]


Average ROUGE-1 score: 0.4889
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.4889
Yes/No question accuracy: 47.9% (23/48) - for analysis only
BLEU score: 0.4549
ROUGE-1 score: 0.4889
ROUGE-2 score: 0.0000
ROUGE-L score: 0.4889


Epoch 48/50: 100%|██████████| 29/29 [00:01<00:00, 25.84it/s, loss=1.64]


Epoch 48/50, Loss: 1.6984


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 164.66it/s]


Average ROUGE-1 score: 0.6089
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.6089
Yes/No question accuracy: 62.5% (30/48) - for analysis only
BLEU score: 0.5645
ROUGE-1 score: 0.6089
ROUGE-2 score: 0.0000
ROUGE-L score: 0.6089


Epoch 49/50: 100%|██████████| 29/29 [00:01<00:00, 25.71it/s, loss=1.71]


Epoch 49/50, Loss: 1.6909


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 160.06it/s]


Average ROUGE-1 score: 0.5956
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5956
Yes/No question accuracy: 58.3% (28/48) - for analysis only
BLEU score: 0.5615
ROUGE-1 score: 0.5956
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5956


Epoch 50/50: 100%|██████████| 29/29 [00:01<00:00, 25.68it/s, loss=1.64]


Epoch 50/50, Loss: 1.6869


Evaluating: 100%|██████████| 75/75 [00:00<00:00, 158.28it/s]

Average ROUGE-1 score: 0.5822
Average ROUGE-2 score: 0.0000
Average ROUGE-L score: 0.5822
Yes/No question accuracy: 62.5% (30/48) - for analysis only
BLEU score: 0.5482
ROUGE-1 score: 0.5822
ROUGE-2 score: 0.0000
ROUGE-L score: 0.5822
Restored best model with BLEU score: 0.5912





ValueError: not enough values to unpack (expected 5, got 2)