## MME Bench Mark on LoRa Fine-tuned InstructBlip- Vicuana7B Model

In [1]:
# Install required packages
!pip install torch torchvision transformers pillow tqdm scikit-learn datasets huggingface_hub accelerate safetensors


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting 

In [3]:
# Create directory for MME evaluation
!mkdir -p mme_benchmark
%cd mme_benchmark

# Download the evaluation tools
!wget https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/raw/Evaluation/tools/eval_tool.zip
!unzip -o eval_tool.zip

# Check the contents
!ls -la eval_tool/


/content/mme_benchmark
--2025-04-12 18:41:37--  https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/raw/Evaluation/tools/eval_tool.zip
Resolving github.com (github.com)... 140.82.114.3
Connecting to github.com (github.com)|140.82.114.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/BradyFU/Awesome-Multimodal-Large-Language-Models/Evaluation/tools/eval_tool.zip [following]
--2025-04-12 18:41:37--  https://raw.githubusercontent.com/BradyFU/Awesome-Multimodal-Large-Language-Models/Evaluation/tools/eval_tool.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 107320 (105K) [application/zip]
Saving to: ‘eval_tool.zip.2’


2025-04-12 18:41:38 (4.49 MB/s) - ‘eval_tool.zip.2’ saved [10

In [4]:
from datasets import load_dataset

# Load the MME dataset
dataset = load_dataset("lmms-lab/MME")
test_data = dataset["test"]

# Display information about the dataset
print(f"Dataset size: {len(test_data)}")
print(f"Example keys: {test_data[0].keys()}")

# Display a sample question
print(f"Sample question: {test_data[0]['question']}")
print(f"Sample answer: {test_data[0]['answer']}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Dataset size: 2374
Example keys: dict_keys(['question_id', 'image', 'question', 'answer', 'category'])
Sample question: Is a python code shown in the picture? Please answer yes or no.
Sample answer: Yes


In [9]:
!unzip -q /content/instructblip-advanced-with-incresed_LoRa-fintuned.zip -d /content/instructblip-advanced-with-incresed_LoRa-fintuned

In [5]:
import torch
from PIL import Image
from typing import Union, Any

class CustomModelAdapter:
    """
    Adapter class for custom vision-language models to interface with MME benchmark.

    """

    def __init__(self, model_path: str, device: str = "cuda"):
        """Initialize custom model."""
        self.device = device
        self.model_path = model_path


        from transformers import AutoProcessor, AutoModelForCausalLM
        # Load the fine-tuned model


        self.model = InstructBlipForConditionalGeneration.from_pretrained("/content/instructblip-advanced-original-with-LoRa-fintuned")
        self.processor = InstructBlipProcessor.from_pretrained("/content/instructblip-advanced-original-with-LoRa-fintuned", use_fast=True)

        # IMPORTANT: Move the entire model to the specified device
        self.model = self.model.to(device)  # This ensures all model parameters are on the GPU

        print(f"Initialized custom model from {model_path} on {device}")

    def generate_response(self, image: Union[str, Image.Image], question: str) -> str:
        """Generate a response for a given image and question using the model."""
        # Load image if path is provided
        if isinstance(image, str):
            image = Image.open(image).convert('RGB')


        # Encode inputs for the model
        encoded_inputs = self.processor(
            images=image,
            text=question,
            return_tensors="pt",
            truncation=True,
            max_length=512
        ).to(self.device)

        # Generate model response
        outputs = self.model.generate(
            **encoded_inputs,
            do_sample=False,
            num_beams=3,
            max_length=150,
            min_length=10,
            length_penalty=2.0,
            repetition_penalty=2.0
        )

        # Decode the generated text
        generated_text = self.processor.decode(outputs[0], skip_special_tokens=True)

        # Extract the response (remove the question if it's included in the output)
        response = generated_text
        if question in generated_text:
            response = generated_text.split(question)[-1].strip()

        return response

    def postprocess_response(self, response: str, question: str) -> str:
        """Postprocess the model's response to match MME benchmark format requirements."""
        # Remove the question if it's included in the response
        if question in response:
            response = response.split(question)[-1].strip()

        # For yes/no questions, ensure the response starts with yes or no
        if "Please answer yes or no" in question:
            response_lower = response.lower()
            if "yes" in response_lower[:20]:
                response = "yes" + response[response_lower.find("yes")+3:]
            elif "no" in response_lower[:20]:
                response = "no" + response[response_lower.find("no")+2:]

        # Ensure response doesn't contain newlines (MME requirement)
        response = response.replace("\n", " ")

        return response


In [6]:
import os
import shutil

def prepare_task_files(eval_tool_dir, results_dir):
    """Prepare empty result files for each task."""
    # Define the task categories
    perception_tasks = ["existence", "count", "position", "color", "posters",
                       "celebrity", "scene", "landmark", "artwork", "OCR"]
    cognition_tasks = ["commonsense_reasoning", "numerical_calculation",
                      "text_translation", "code_reasoning"]
    all_tasks = perception_tasks + cognition_tasks

    # Copy template files from LaVIN directory to Your_Results
    lavin_dir = os.path.join(eval_tool_dir, "LaVIN")

    task_files = {}
    for task in all_tasks:
        src_file = os.path.join(lavin_dir, f"{task}.txt")
        dst_file = os.path.join(results_dir, f"{task}.txt")

        # Copy the file structure but not the content
        with open(src_file, 'r') as src:
            lines = src.readlines()

        # Create empty result files with the same structure
        task_files[task] = open(dst_file, 'w')

        # Write the template lines (without the model responses)
        for line in lines:
            parts = line.strip().split('\t')
            if len(parts) >= 3:
                # Format: Image_Name + "\t" + Question + "\t" + Ground_Truth_Answer
                template_line = f"{parts[0]}\t{parts[1]}\t{parts[2]}\n"
                task_files[task].write(template_line)

    return task_files, all_tasks

# Set up directories
eval_tool_dir = "./eval_tool"
results_dir = os.path.join(eval_tool_dir, "Your_Results")
os.makedirs(results_dir, exist_ok=True)

# Prepare task files
task_files, all_tasks = prepare_task_files(eval_tool_dir, results_dir)

# Close all task files
for file in task_files.values():
    file.close()


In [7]:
from tqdm.notebook import tqdm
from transformers import InstructBlipForConditionalGeneration, InstructBlipProcessor

def evaluate_model(model_adapter, test_data, all_tasks, results_dir):
    """Evaluate the model on the MME benchmark."""
    print("Evaluating model on MME benchmark...")

    # Create a dictionary to store task files
    task_files = {task: os.path.join(results_dir, f"{task}.txt") for task in all_tasks}

    # Process each example in the dataset
    for example in tqdm(test_data):
        question_id = example["question_id"]
        question = example["question"]
        ground_truth = example["answer"]
        image = example["image"]

        # Determine which task this example belongs to
        task = None
        for t in all_tasks:
            if t in question_id:
                task = t
                break

        if task is None:
            print(f"Warning: Could not determine task for question_id {question_id}")
            continue

        # Generate response using the model adapter
        response = model_adapter.generate_response(image, question)
        processed_response = model_adapter.postprocess_response(response, question)


        # Format the result
        # Format: Image_Name + "\t" + Question + "\t" + Ground_Truth_Answer + "\t" + Your_Response + "\n"
        image_name = question_id.split("/")[-1]
        result_line = f"{image_name}\t{question}\t{ground_truth}\t{processed_response}\n"

        # Find the corresponding file and replace the template line with the result
        with open(task_files[task], 'r') as f:
            lines = f.readlines()

        with open(task_files[task], 'w') as f:
            for line in lines:
                if line.startswith(f"{image_name}\t{question}\t{ground_truth}"):
                    f.write(result_line)
                else:
                    f.write(line)

# Initialize your model adapter
model_path = "/content/instructblip-advanced-original-with-LoRa-fintuned"  # Replace with your model path
model_adapter = CustomModelAdapter(model_path=model_path, device="cuda")

# Evaluate your model
evaluate_model(model_adapter, test_data, all_tasks, results_dir)


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Initialized custom model from /content/instructblip-advanced-original-with-LoRa-fintuned on cuda
Evaluating model on MME benchmark...


  0%|          | 0/2374 [00:00<?, ?it/s]

In [8]:
import subprocess

def calculate_results(eval_tool_dir, results_dir):
    """Run the calculation script to compute evaluation metrics."""
    print("Calculating evaluation results...")
    calculation_script = os.path.join(eval_tool_dir, "calculation.py")

    # Run the calculation script
    cmd = ["python", calculation_script, "--results_dir", results_dir]
    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
        print(result.stdout)

        # Save the results to a file
        with open(os.path.join(results_dir, "evaluation_results.txt"), 'w') as f:
            f.write(result.stdout)

        print(f"Results saved to {os.path.join(results_dir, 'evaluation_results.txt')}")

        # Optionally save to Google Drive
        #drive_results_path = "/content/drive/MyDrive/mme_results/evaluation_results.txt"
        #!cp {os.path.join(results_dir, "evaluation_results.txt")} {drive_results_path}
        #print(f"Results also saved to Google Drive at {drive_results_path}")

    except subprocess.CalledProcessError as e:
        print(f"Error running calculation script: {e}")
        print(f"Error output: {e.stderr}")

calculate_results(eval_tool_dir, results_dir)


Calculating evaluation results...
total score: 1051.983693477391 

	 existence  score: 185.0
	 count  score: 55.00000000000001
	 position  score: 50.0
	 color  score: 110.0
	 posters  score: 131.2925170068027
	 celebrity  score: 62.94117647058823
	 scene  score: 156.0
	 landmark  score: 159.25
	 artwork  score: 85.0
	 OCR  score: 57.5


total score: 242.85714285714286 

	 commonsense_reasoning  score: 97.85714285714286
	 numerical_calculation  score: 45.0
	 text_translation  score: 50.0
	 code_reasoning  score: 50.0



Results saved to ./eval_tool/Your_Results/evaluation_results.txt


## second model

In [11]:
import torch
from PIL import Image
from typing import Union, Any

class CustomModelAdapter:
    """
    Adapter class for custom vision-language models to interface with MME benchmark.

    """

    def __init__(self, model_path: str, device: str = "cuda"):
        """Initialize custom model."""
        self.device = device
        self.model_path = model_path


        from transformers import AutoProcessor, AutoModelForCausalLM
        # Load the fine-tuned model


        self.model = InstructBlipForConditionalGeneration.from_pretrained("/content/instructblip-advanced-with-incresed_LoRa-fintuned")
        self.processor = InstructBlipProcessor.from_pretrained("/content/instructblip-advanced-with-incresed_LoRa-fintuned", use_fast=True)

        # IMPORTANT: Move the entire model to the specified device
        self.model = self.model.to(device)  # This ensures all model parameters are on the GPU

        print(f"Initialized custom model from {model_path} on {device}")

    def generate_response(self, image: Union[str, Image.Image], question: str) -> str:
        """Generate a response for a given image and question using the model."""
        # Load image if path is provided
        if isinstance(image, str):
            image = Image.open(image).convert('RGB')


        # Encode inputs for the model
        encoded_inputs = self.processor(
            images=image,
            text=question,
            return_tensors="pt",
            truncation=True,
            max_length=512
        ).to(self.device)

        # Generate model response
        outputs = self.model.generate(
            **encoded_inputs,
            do_sample=False,
            num_beams=3,
            max_length=150,
            min_length=10,
            length_penalty=2.0,
            temperature=1.0,
            top_k=50,
            top_p=0.95,
            repetition_penalty=2.0
        )

        # Decode the generated text
        generated_text = self.processor.decode(outputs[0], skip_special_tokens=True)

        # Extract the response (remove the question if it's included in the output)
        response = generated_text
        if question in generated_text:
            response = generated_text.split(question)[-1].strip()

        return response

    def postprocess_response(self, response: str, question: str) -> str:
        """Postprocess the model's response to match MME benchmark format requirements."""
        # Remove the question if it's included in the response
        if question in response:
            response = response.split(question)[-1].strip()

        # For yes/no questions, ensure the response starts with yes or no
        if "Please answer yes or no" in question:
            response_lower = response.lower()
            if "yes" in response_lower[:20]:
                response = "yes" + response[response_lower.find("yes")+3:]
            elif "no" in response_lower[:20]:
                response = "no" + response[response_lower.find("no")+2:]

        # Ensure response doesn't contain newlines (MME requirement)
        response = response.replace("\n", " ")

        return response


In [12]:
import os
import shutil

def prepare_task_files(eval_tool_dir, results_dir):
    """Prepare empty result files for each task."""
    # Define the task categories
    perception_tasks = ["existence", "count", "position", "color", "posters",
                       "celebrity", "scene", "landmark", "artwork", "OCR"]
    cognition_tasks = ["commonsense_reasoning", "numerical_calculation",
                      "text_translation", "code_reasoning"]
    all_tasks = perception_tasks + cognition_tasks

    # Copy template files from LaVIN directory to Your_Results
    lavin_dir = os.path.join(eval_tool_dir, "LaVIN")

    task_files = {}
    for task in all_tasks:
        src_file = os.path.join(lavin_dir, f"{task}.txt")
        dst_file = os.path.join(results_dir, f"{task}.txt")

        # Copy the file structure but not the content
        with open(src_file, 'r') as src:
            lines = src.readlines()

        # Create empty result files with the same structure
        task_files[task] = open(dst_file, 'w')

        # Write the template lines (without the model responses)
        for line in lines:
            parts = line.strip().split('\t')
            if len(parts) >= 3:
                # Format: Image_Name + "\t" + Question + "\t" + Ground_Truth_Answer
                template_line = f"{parts[0]}\t{parts[1]}\t{parts[2]}\n"
                task_files[task].write(template_line)

    return task_files, all_tasks

# Set up directories
eval_tool_dir = "./eval_tool"
results_dir = os.path.join(eval_tool_dir, "Your_Results")
os.makedirs(results_dir, exist_ok=True)

# Prepare task files
task_files, all_tasks = prepare_task_files(eval_tool_dir, results_dir)

# Close all task files
for file in task_files.values():
    file.close()


In [14]:
from tqdm.notebook import tqdm
from transformers import InstructBlipForConditionalGeneration, InstructBlipProcessor

def evaluate_model(model_adapter, test_data, all_tasks, results_dir):
    """Evaluate the model on the MME benchmark."""
    print("Evaluating model on MME benchmark...")

    # Create a dictionary to store task files
    task_files = {task: os.path.join(results_dir, f"{task}.txt") for task in all_tasks}

    # Process each example in the dataset
    for example in tqdm(test_data):
        question_id = example["question_id"]
        question = example["question"]
        ground_truth = example["answer"]
        image = example["image"]

        # Determine which task this example belongs to
        task = None
        for t in all_tasks:
            if t in question_id:
                task = t
                break

        if task is None:
            print(f"Warning: Could not determine task for question_id {question_id}")
            continue

        # Generate response using the model adapter
        response = model_adapter.generate_response(image, question)
        processed_response = model_adapter.postprocess_response(response, question)


        # Format the result
        # Format: Image_Name + "\t" + Question + "\t" + Ground_Truth_Answer + "\t" + Your_Response + "\n"
        image_name = question_id.split("/")[-1]
        result_line = f"{image_name}\t{question}\t{ground_truth}\t{processed_response}\n"

        # Find the corresponding file and replace the template line with the result
        with open(task_files[task], 'r') as f:
            lines = f.readlines()

        with open(task_files[task], 'w') as f:
            for line in lines:
                if line.startswith(f"{image_name}\t{question}\t{ground_truth}"):
                    f.write(result_line)
                else:
                    f.write(line)

# Initialize your model adapter
model_path = "/content/instructblip-advanced-with-incresed_LoRa-fintuned"  # Replace with your model path
model_adapter = CustomModelAdapter(model_path=model_path, device="cuda")

# Evaluate your model
evaluate_model(model_adapter, test_data, all_tasks, results_dir)


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Initialized custom model from /content/instructblip-advanced-with-incresed_LoRa-fintuned on cuda
Evaluating model on MME benchmark...


  0%|          | 0/2374 [00:00<?, ?it/s]



In [15]:
import subprocess

def calculate_results(eval_tool_dir, results_dir):
    """Run the calculation script to compute evaluation metrics."""
    print("Calculating evaluation results...")
    calculation_script = os.path.join(eval_tool_dir, "calculation.py")

    # Run the calculation script
    cmd = ["python", calculation_script, "--results_dir", results_dir]
    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
        print(result.stdout)

        # Save the results to a file
        with open(os.path.join(results_dir, "evaluation_results.txt"), 'w') as f:
            f.write(result.stdout)

        print(f"Results saved to {os.path.join(results_dir, 'evaluation_results.txt')}")

        # Optionally save to Google Drive
        #drive_results_path = "/content/drive/MyDrive/mme_results/evaluation_results.txt"
        #!cp {os.path.join(results_dir, "evaluation_results.txt")} {drive_results_path}
        #print(f"Results also saved to Google Drive at {drive_results_path}")

    except subprocess.CalledProcessError as e:
        print(f"Error running calculation script: {e}")
        print(f"Error output: {e.stderr}")

calculate_results(eval_tool_dir, results_dir)


Calculating evaluation results...
total score: 1028.1624649859946 

	 existence  score: 180.0
	 count  score: 55.00000000000001
	 position  score: 50.0
	 color  score: 110.0
	 posters  score: 123.80952380952382
	 celebrity  score: 62.35294117647059
	 scene  score: 155.5
	 landmark  score: 156.25
	 artwork  score: 85.25
	 OCR  score: 50.0


total score: 234.64285714285714 

	 commonsense_reasoning  score: 87.14285714285714
	 numerical_calculation  score: 47.5
	 text_translation  score: 50.0
	 code_reasoning  score: 50.0



Results saved to ./eval_tool/Your_Results/evaluation_results.txt
