In [1]:
!pip install icecream
!pip uninstall transformers -y
!pip install transformers==4.37.2
!pip install flash_attn

Collecting icecream
  Downloading icecream-2.1.4-py3-none-any.whl.metadata (1.3 kB)
Collecting executing>=2.1.0 (from icecream)
  Downloading executing-2.2.0-py2.py3-none-any.whl.metadata (8.9 kB)
Downloading icecream-2.1.4-py3-none-any.whl (14 kB)
Downloading executing-2.2.0-py2.py3-none-any.whl (26 kB)
Installing collected packages: executing, icecream
Successfully installed executing-2.2.0 icecream-2.1.4
Found existing installation: transformers 4.51.3
Uninstalling transformers-4.51.3:
  Successfully uninstalled transformers-4.51.3
Collecting transformers==4.37.2
  Downloading transformers-4.37.2-py3-none-any.whl.metadata (129 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.4/129.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers==4.37.2)
  Downloading tokenizers-0.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.37.2-py3-none-any.whl 

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoProcessor
from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
from PIL import Image
import pandas as pd
from tqdm import tqdm
import os
import string

2025-05-17 15:55:06.832917: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747497307.257899      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747497307.367658      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:

# === Configuration ===
# FIX THESE PATHS
MODEL_PATH = 'mPLUG/mPLUG-Owl3-2B-241014'
# Use your cleaned validation CSV if you have one, otherwise use the training CSV
EVAL_CSV_PATH = "/kaggle/input/vr-csv-cleaned/kaggle/working/cleaned_csvs/vqa_val_cleaned.csv"

OUTPUT_RESULTS_PATH = "/kaggle/working/evaluation_results.csv" # Where to save detailed results (optional)

CORRECT_DELIMITER = ',' # The delimiter for your CSV file

# Inference batch size (can be larger than training batch size as no gradients are stored)
EVAL_BATCH_SIZE = 8 # Adjust based on your GPU memory

# Generation parameters for evaluation (typically greedy decoding for VQA)
MAX_NEW_TOKENS = 10 # Set small for one-word answers + potential punctuation/padding
DO_SAMPLE = False # Use greedy decoding
NUM_BEAMS = 1   # Use greedy decoding

In [4]:
class VQAEvalDataset(Dataset):
    def __init__(self, csv_path, processor, delimiter=','):
        self.df = pd.read_csv(csv_path, sep=delimiter)
        self.df = self.df.head(10000).copy()
        if not all(col in self.df.columns for col in ["image_path", "question", "answer"]):
             raise ValueError(f"CSV must contain 'image_path', 'question', and 'answer' columns. Found: {self.df.columns.tolist()}")
        
        self.df.dropna(subset=["image_path", "question", "answer"], inplace=True)
        print(f"Loaded {len(self.df)} evaluation samples from {csv_path} after dropping NaNs.")

        self.processor = processor

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        image_path = row["image_path"]
        question = str(row["question"]).strip()
        ground_truth_answer = str(row["answer"]).strip()

        try:
            image = Image.open(image_path).convert("RGB")
        except Exception as e:
            print(f"Error loading image for eval at index {idx}: {image_path}. Skipping.")
            return None # Return None for samples that fail to load

        return image, question, ground_truth_answer # Return individual sample data

# This function now just filters out None and returns the list of valid samples
def eval_collate_fn_single(batch):
    # Filter out any samples that might have failed to load (e.g., returned None)
    batch = [item for item in batch if item is not None]
    if not batch:
        return None # Return None if batch is empty after filtering
    return batch # Return the list of valid sample tuples

# === Load Model and Processor ===
print(f"Loading model and processor: {MODEL_PATH}")
config = AutoConfig.from_pretrained(MODEL_PATH, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, config=config, attn_implementation='sdpa', torch_dtype=torch.half, trust_remote_code=True)
model.eval().cuda()
print("Model loaded and moved to CUDA eval mode.")

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
processor = model.init_processor(tokenizer)

# === Dataset and DataLoader ===
print(f"Loading evaluation dataset from {EVAL_CSV_PATH}")
eval_dataset = VQAEvalDataset(EVAL_CSV_PATH, processor, delimiter=CORRECT_DELIMITER)

# Create DataLoader for evaluation - Use batch_size=1 and the simplified collate_fn
eval_loader = DataLoader(
    eval_dataset,
    batch_size=1, # Set batch size to 1 for single-sample processing
    shuffle=False,
    num_workers=2, # Can still use workers for data loading
    pin_memory=True,
    collate_fn=eval_collate_fn_single, # Use the simplified collate function
    persistent_workers=False
)
print(f"Prepared dataloader with {len(eval_dataset)} samples ({len(eval_loader)} batches).")


# === Evaluation Loop (Processing samples individually) ===
print("\nStarting evaluation...")
correct_predictions = 0
total_samples_evaluated = 0 # Renamed for clarity
results_list = []

# Use torch.no_grad() for inference
with torch.no_grad():
    # Iterate through batches (each batch now contains a single sample)
    for batch_data in tqdm(eval_loader, total=len(eval_loader), desc="Evaluating"):

        # batch_data is now a list containing a single tuple: [(image, question, ground_truth_answer)]
        if batch_data is None or not batch_data:
             # Warning already printed by collate_fn or handle empty batch
             continue

        # Unpack the single sample from the batch list
        image, question, ground_truth_answer = batch_data[0]

        # === Process the single sample using the processor ===
        messages_for_sample = [
            {"role": "user", "content": f"<|image|> {question} (one word answer)"},
            {"role": "assistant", "content": ""}, # Include if this is part of the expected chat template
        ]

        try:
            # Call the processor for this *single* sample
            inputs = processor(
                messages=messages_for_sample, # Pass the single sample's message list
                images=[image],              # Pass a list containing the single sample's image
                videos=None,
                return_tensors="pt",         # Get PyTorch tensors (batch size 1)
                padding="longest",           # Pad text sequence for this single sample
            )
        except Exception as e:
            print(f"\nWarning: Error processing single sample (question: '{question}') in batch: {e}. Skipping this sample.")
            # Count this sample as evaluated but incorrect due to processing error
            total_samples_evaluated += 1
            results_list.append({
                 "GroundTruth": ground_truth_answer, "Prediction": "PROCESSING_ERROR", "Correct": False,
                 "CleanedPrediction": "PROCESSING_ERROR", "CleanedGroundTruth": ground_truth_answer.strip().lower(),
             })
            continue # Skip to the next sample


        # === Device Placement for Inputs ===
        device = next(model.parameters()).device
        # Move inputs to the correct device (should already be batch size 1)
        inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
        # ==================================

        # === Model Generation ===
        generated_ids = None # Initialize to None
        generated_text = "" # Initialize as empty string
        try:
            generate_kwargs = {
                **inputs,
                'tokenizer': tokenizer, # Pass tokenizer if required by model.generate
                'max_new_tokens': MAX_NEW_TOKENS, # Use a reasonable max length (e.g., 10 or 15)
                'num_beams': NUM_BEAMS, # Use beam search or greedy (1)
                'do_sample': DO_SAMPLE, # Keep False for deterministic evaluation
            }

            generated_ids = model.generate(**generate_kwargs)

            # Decode generated IDs to text (batch size 1 output)
            if generated_ids is not None and generated_ids.shape[0] > 0:
                # batch_decode returns a list, get the first element for batch_size 1
                generated_text = tokenizer.batch_decode(generated_ids.cpu(), skip_special_tokens=True)[0]
            else:
                 generated_text = "" # Ensure it's an empty string if generation failed

        except Exception as e:
            print(f"\nError during generation for sample (question: '{question}'): {e}. Skipping this sample.")
            total_samples_evaluated += 1
            results_list.append({
                 "GroundTruth": ground_truth_answer, "Prediction": "GENERATION_ERROR", "Correct": False,
                 "CleanedPrediction": "GENERATION_ERROR", "CleanedGroundTruth": ground_truth_answer.strip().lower(),
             })
            continue # Skip to the next sample


        # === Compare Prediction to Ground Truth ===
        prediction = generated_text.strip() # Get raw prediction and strip whitespace

        # --- Clean Prediction for Comparison ---
        # Remove punctuation and convert to lowercase
        cleaned_prediction = prediction.lower()
        cleaned_prediction = cleaned_prediction.translate(str.maketrans('', '', string.punctuation))
        cleaned_prediction = cleaned_prediction.strip()

        # Clean Ground Truth for Comparison (lowercase and strip punctuation)
        cleaned_ground_truth = ground_truth_answer.strip().lower()

        # --- Exact Match Comparison ---
        is_correct = (cleaned_prediction == cleaned_ground_truth)

        if is_correct:
            correct_predictions += 1

        # Always increment total samples evaluated for each successfully processed sample
        total_samples_evaluated += 1

        # Optional: Store results for detailed analysis later
        results_list.append({
            "GroundTruth": ground_truth_answer, # Original GT
            "Prediction": prediction, # Original Prediction
            "CleanedPrediction": cleaned_prediction,
            "CleanedGroundTruth": cleaned_ground_truth, # Cleaned GT
            "Correct": is_correct,
        })
        # # === Print for Debugging/Logging ===
        # print(f"  - Ground Truth        : {ground_truth_answer}")
        # print(f"  - Generated Prediction: {prediction}")
        # print(f"  - Cleaned Prediction  : {cleaned_prediction}")
        # print(f"  - Cleaned Ground Truth: {cleaned_ground_truth}")
        # print(f"  - Match?              : {is_correct}")


# === Calculate and Report Final Metrics ===
print("\n--- Evaluation Complete ---")

accuracy = (correct_predictions / total_samples_evaluated) if total_samples_evaluated > 0 else 0.0

print(f"Total Samples Evaluated: {total_samples_evaluated}")
print(f"Correct Predictions: {correct_predictions}")
print(f"Accuracy: {accuracy:.4f}")

if results_list and OUTPUT_RESULTS_PATH:
    try:
        results_df = pd.DataFrame(results_list)
        results_df.to_csv(OUTPUT_RESULTS_PATH, index=False)
        print(f"Detailed results saved to {OUTPUT_RESULTS_PATH}")
    except Exception as e:
        print(f"Error saving detailed results to {OUTPUT_RESULTS_PATH}: {e}")

print("\nEvaluation script finished.")

Loading model and processor: mPLUG/mPLUG-Owl3-2B-241014




config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

configuration_mplugowl3.py:   0%|          | 0.00/1.49k [00:00<?, ?B/s]

configuration_hyper_qwen2.py:   0%|          | 0.00/5.99k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/mPLUG/mPLUG-Owl3-2B-241014:
- configuration_hyper_qwen2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/mPLUG/mPLUG-Owl3-2B-241014:
- configuration_mplugowl3.py
- configuration_hyper_qwen2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_mplugowl3.py:   0%|          | 0.00/7.71k [00:00<?, ?B/s]



processing_mplugowl3.py:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

image_processing_mplugowl3.py:   0%|          | 0.00/16.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/mPLUG/mPLUG-Owl3-2B-241014:
- image_processing_mplugowl3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/mPLUG/mPLUG-Owl3-2B-241014:
- processing_mplugowl3.py
- image_processing_mplugowl3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_hyper_qwen2.py:   0%|          | 0.00/70.3k [00:00<?, ?B/s]

x_sdpa.py:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/mPLUG/mPLUG-Owl3-2B-241014:
- x_sdpa.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/mPLUG/mPLUG-Owl3-2B-241014:
- modeling_hyper_qwen2.py
- x_sdpa.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/mPLUG/mPLUG-Owl3-2B-241014:
- modeling_mplugowl3.py
- processing_mplugowl3.py
- modeling_hyper_qwen2.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


use flash_attn rotary




model.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Model loaded and moved to CUDA eval mode.


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading evaluation dataset from /kaggle/input/vr-csv-cleaned/kaggle/working/cleaned_csvs/vqa_val_cleaned.csv
Loaded 10000 evaluation samples from /kaggle/input/vr-csv-cleaned/kaggle/working/cleaned_csvs/vqa_val_cleaned.csv after dropping NaNs.
Prepared dataloader with 10000 samples (10000 batches).

Starting evaluation...


Evaluating: 100%|██████████| 10000/10000 [1:43:48<00:00,  1.61it/s]



--- Evaluation Complete ---
Total Samples Evaluated: 10000
Correct Predictions: 6099
Accuracy: 0.6099
Detailed results saved to /kaggle/working/evaluation_results.csv

Evaluation script finished.
