In [1]:
from utils.benchmark_evaluator import SelfPreferenceBiasEvaluator
from utils.load_cultural_dataset import CulturalBiasDataset
import pickle
import os
from tqdm import tqdm
import pandas as pd
from datasets import load_dataset
from colpali_engine.models import ColPaliProcessor, ColQwen2Processor, ColQwen2_5_Processor

  from .autonotebook import tqdm as notebook_tqdm


## Setup paths and constants

In [None]:
# Dataset identifier handed to CulturalBiasDataset / load_dataset further down
DATASET_PATH = 'Chula-AI/association_bias_benchmark'
# Earlier local-file alternatives, kept for reference only:
# BENCHMARK_PATH = os.path.join(current_path, 'benchmarks', 'xcm-bench.csv')
# DATASET_PATH =  os.path.join(current_path, "datasets", "tierone003_deduplicated_and_renamed")

# Local directories are resolved against the notebook's working directory
current_path = os.getcwd()
EMBEDDING_DIR_PATH = os.path.join(current_path, 'embeddings')

def find_model_type(model_name):
    """Map a model name to the model-type label used by the evaluator.

    The name is lower-cased and checked against an ordered list of
    substrings; the first substring found decides the type.

    Args:
        model_name: Model name string (matching is case-insensitive).

    Returns:
        One of 'clip', 'siglip2', 'gme', 'colqwen2.5', 'colqwen2', 'colpali'.

    Raises:
        ValueError: If none of the known substrings occurs in the name.
    """
    model_name = model_name.lower()

    # Order matters: 'colqwen2.5' must be tested before its prefix 'colqwen2'.
    substring_to_type = (
        ('clip', 'clip'),
        ('xlm', 'clip'),
        ('jina', 'clip'),
        ('siglip2', 'siglip2'),
        ('gme', 'gme'),
        ('colqwen2.5', 'colqwen2.5'),
        ('colqwen2', 'colqwen2'),
        ('colpali', 'colpali'),
    )
    for needle, type_label in substring_to_type:
        if needle in model_name:
            return type_label

    raise ValueError(f"Unknown model type for model name: {model_name}")

# Get list of embedding files.
# os.listdir returns entries in arbitrary order, so sort (case-insensitively)
# to make the evaluation order reproducible across runs and platforms.
embedding_files = sorted(os.listdir(EMBEDDING_DIR_PATH), key=str.lower)
embeddings_list = []

# Expected file layout: image_text_embeddings_<model>_<start>_<end>.pkl
# (assumption drawn from the existing files, e.g. "..._0_11759.pkl").
embedding_prefix = 'image_text_embeddings_'

# Process embedding files
for file_name in embedding_files:
    # Skip .gitkeep files
    if file_name == '.gitkeep':
        continue

    stem, extension = os.path.splitext(file_name)
    # Drop the prefix, then peel off the two trailing "_"-separated index fields.
    # This replaces fixed-width slicing, which silently truncated the model
    # name whenever the index range had a different number of digits.
    name_parts = stem[len(embedding_prefix):].rsplit('_', 2)
    if (
        extension != '.pkl'
        or not stem.startswith(embedding_prefix)
        or len(name_parts) != 3
        or not name_parts[0]
    ):
        print(f"Skipping unrecognized file in embedding directory: {file_name}")
        continue

    model_name = name_parts[0].replace('_', '-')  # Extract model name from filename
    model_type = find_model_type(model_name)
    embeddings_list.append({
        'model_name': model_name,
        'model_type': model_type,
        'embedding_file_name': file_name
    })

## Evaluate all embeddings

In [3]:
# Load dataset and benchmark
dataset = CulturalBiasDataset(DATASET_PATH)
benchmark  = load_dataset(DATASET_PATH, name="benchmark", split='train')

# Late-interaction model types need a processor; map type -> (class, checkpoint).
# NOTE(review): the checkpoint is fixed per model type, not per model name
# (e.g. every 'colqwen2.5' model uses the v0.2 processor) — confirm this is intended.
late_interaction_processors = {
    'colpali': (ColPaliProcessor, "vidore/colpali-v1.3-merged"),
    'colqwen2': (ColQwen2Processor, "vidore/colqwen2-v1.0"),
    'colqwen2.5': (ColQwen2_5_Processor, "vidore/colqwen2.5-v0.2"),
}

# Bound up front so they can be released at the top of every iteration.
evaluator = None
embeddings = None

# Process each embedding file
for embedding_info in tqdm(embeddings_list, desc="Processing embeddings"):
    model_name = embedding_info['model_name']
    model_type = embedding_info['model_type']
    embedding_file_name = embedding_info['embedding_file_name']

    print(f"\nEvaluating model: {model_name}")
    print(f"Model type: {model_type}")
    print(f"Using embedding file: {embedding_file_name}")

    # Clear previous embeddings to free up memory. The previous evaluator was
    # constructed with the old embeddings, so its binding (and the loop-level
    # `embeddings` name) must be dropped BEFORE loading the next file;
    # otherwise two embedding sets can be alive at the same time.
    evaluator = None
    embeddings = None
    if hasattr(dataset, 'embeddings'):
        del dataset.embeddings

    # Load embeddings into dataset
    dataset.import_local_embedding(EMBEDDING_DIR_PATH, embedding_file_name)
    embeddings = dataset.embeddings

    # Initialize processor for late interaction models (None for all other types)
    processor = None
    if model_type in late_interaction_processors:
        processor_cls, checkpoint = late_interaction_processors[model_type]
        processor = processor_cls.from_pretrained(checkpoint)

    # Create evaluator instance
    evaluator = SelfPreferenceBiasEvaluator(
        dataset=dataset,
        embeddings=embeddings,
        model_name=model_name,
        model_type=model_type,
        processor=processor
    )

    # Evaluate both text-to-image and image-to-image experiments
    print("\nEvaluating text-to-image...")
    evaluator.evaluate_and_save(benchmark, experiment_type="text-to-image")

    print("\nEvaluating image-to-image...")
    evaluator.evaluate_and_save(benchmark, experiment_type="image-to-image")

    print(f"\nCompleted evaluation for model: {model_name}")

Using the latest cached version of the dataset since Chula-AI/association_bias_benchmark couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'image_metadata' at C:\Users\peera\.cache\huggingface\datasets\Chula-AI___association_bias_benchmark\image_metadata\0.0.0\c08ef2c5953051f6b3dac3fd6cd014cd13275f5b (last modified on Fri Oct 17 01:51:19 2025).
Using the latest cached version of the dataset since Chula-AI/association_bias_benchmark couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'benchmark' at C:\Users\peera\.cache\huggingface\datasets\Chula-AI___association_bias_benchmark\benchmark\0.0.0\c08ef2c5953051f6b3dac3fd6cd014cd13275f5b (last modified on Fri Oct 17 02:10:35 2025).
Processing embeddings:   0%|          | 0/7 [00:00<?, ?it/s]


Evaluating model: clip-vit-large-patch14
Model type: clip
Using embedding file: image_text_embeddings_clip-vit-large-patch14_0_11759.pkl
file path: c:\Users\peera\project\submission_arr202510\software\rq2_eval\embeddings\image_text_embeddings_clip-vit-large-patch14_0_11759.pkl
Converted embeddings to dictionary of lists.

Evaluating text-to-image...
len new bench mark: 11724


100%|██████████| 11724/11724 [00:05<00:00, 2225.58it/s]



Evaluating image-to-image...
len new bench mark: 11724


100%|██████████| 11724/11724 [00:04<00:00, 2526.53it/s]
Processing embeddings:  14%|█▍        | 1/7 [00:28<02:50, 28.35s/it]


Completed evaluation for model: clip-vit-large-patch14

Evaluating model: ColQwen2.5-3b-multilingual-v1.0
Model type: colqwen2.5
Using embedding file: image_text_embeddings_ColQwen2.5-3b-multilingual-v1.0_0_11759.pkl
file path: c:\Users\peera\project\submission_arr202510\software\rq2_eval\embeddings\image_text_embeddings_ColQwen2.5-3b-multilingual-v1.0_0_11759.pkl
Converted embeddings to dictionary of lists.


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.



Evaluating text-to-image...
len new bench mark: 11724


100%|██████████| 11724/11724 [02:59<00:00, 65.48it/s] 



Evaluating image-to-image...
len new bench mark: 11724


100%|██████████| 11724/11724 [10:44<00:00, 18.18it/s]
Processing embeddings:  29%|██▊       | 2/7 [15:56<46:28, 557.66s/it]


Completed evaluation for model: ColQwen2.5-3b-multilingual-v1.0

Evaluating model: colqwen2.5-v0.2
Model type: colqwen2.5
Using embedding file: image_text_embeddings_colqwen2.5-v0.2_0_11759.pkl
file path: c:\Users\peera\project\submission_arr202510\software\rq2_eval\embeddings\image_text_embeddings_colqwen2.5-v0.2_0_11759.pkl
Converted embeddings to dictionary of lists.

Evaluating text-to-image...
len new bench mark: 11724


100%|██████████| 11724/11724 [00:25<00:00, 461.53it/s]



Evaluating image-to-image...
len new bench mark: 11724


100%|██████████| 11724/11724 [00:52<00:00, 224.36it/s]
Processing embeddings:  43%|████▎     | 3/7 [17:57<23:53, 358.26s/it]


Completed evaluation for model: colqwen2.5-v0.2

Evaluating model: gme-Qwen2-VL-2B-Instruct
Model type: gme
Using embedding file: image_text_embeddings_gme-Qwen2-VL-2B-Instruct_0_11759.pkl
file path: c:\Users\peera\project\submission_arr202510\software\rq2_eval\embeddings\image_text_embeddings_gme-Qwen2-VL-2B-Instruct_0_11759.pkl
Converted embeddings to dictionary of lists.

Evaluating text-to-image...
len new bench mark: 11724


100%|██████████| 11724/11724 [00:06<00:00, 1750.74it/s]



Evaluating image-to-image...
len new bench mark: 11724


100%|██████████| 11724/11724 [00:05<00:00, 1955.23it/s]
Processing embeddings:  57%|█████▋    | 4/7 [18:35<11:35, 231.89s/it]


Completed evaluation for model: gme-Qwen2-VL-2B-Instruct

Evaluating model: jina-embeddings-v4
Model type: clip
Using embedding file: image_text_embeddings_jina-embeddings-v4_0_11759.pkl
file path: c:\Users\peera\project\submission_arr202510\software\rq2_eval\embeddings\image_text_embeddings_jina-embeddings-v4_0_11759.pkl
Converted embeddings to dictionary of lists.

Evaluating text-to-image...
len new bench mark: 11724


100%|██████████| 11724/11724 [00:06<00:00, 1711.27it/s]



Evaluating image-to-image...
len new bench mark: 11724


100%|██████████| 11724/11724 [00:05<00:00, 2156.10it/s]
Processing embeddings:  71%|███████▏  | 5/7 [19:12<05:22, 161.47s/it]


Completed evaluation for model: jina-embeddings-v4

Evaluating model: XLM-Roberta-Large-Vit-B-16Plus
Model type: clip
Using embedding file: image_text_embeddings_XLM-Roberta-Large-Vit-B-16Plus_0_11759.pkl
file path: c:\Users\peera\project\submission_arr202510\software\rq2_eval\embeddings\image_text_embeddings_XLM-Roberta-Large-Vit-B-16Plus_0_11759.pkl
Converted embeddings to dictionary of lists.

Evaluating text-to-image...
len new bench mark: 11724


100%|██████████| 11724/11724 [00:06<00:00, 1923.35it/s]



Evaluating image-to-image...
len new bench mark: 11724


100%|██████████| 11724/11724 [00:05<00:00, 2057.16it/s]
Processing embeddings:  86%|████████▌ | 6/7 [19:46<01:58, 118.25s/it]


Completed evaluation for model: XLM-Roberta-Large-Vit-B-16Plus

Evaluating model: XLM-Roberta-Large-Vit-L-14
Model type: clip
Using embedding file: image_text_embeddings_XLM-Roberta-Large-Vit-L-14_0_11759.pkl
file path: c:\Users\peera\project\submission_arr202510\software\rq2_eval\embeddings\image_text_embeddings_XLM-Roberta-Large-Vit-L-14_0_11759.pkl
Converted embeddings to dictionary of lists.

Evaluating text-to-image...
len new bench mark: 11724


100%|██████████| 11724/11724 [00:06<00:00, 1799.05it/s]



Evaluating image-to-image...
len new bench mark: 11724


100%|██████████| 11724/11724 [00:07<00:00, 1662.98it/s]
Processing embeddings: 100%|██████████| 7/7 [20:23<00:00, 174.79s/it]


Completed evaluation for model: XLM-Roberta-Large-Vit-L-14





In [None]:
# Report only whether the token is configured. A bare `os.getenv("HF_TOKEN")`
# would echo the secret into the cell output, which is saved with the notebook.
hf_token_configured = bool(os.getenv("HF_TOKEN"))
print(f"HF_TOKEN configured: {hf_token_configured}")

In [None]:
from datasets import load_dataset

# Fetch the two dataset configurations (train split), metadata first and
# benchmark second.
image_metadata, benchmark_data = [
    load_dataset('Chula-AI/association_bias_benchmark', name=config_name, split='train')
    for config_name in ("image_metadata", "benchmark")
]

## Evaluate Single Embedding

In [None]:
# Choose which embedding file the single-model evaluation should use
embedding_file_name = r"image_text_embeddings_XLM-Roberta-Large-Vit-B-16Plus_0_11759.pkl"

# Model name = file name minus its first 22 and last 12 characters,
# with underscores turned into hyphens
model_name = embedding_file_name[22:-12].replace('_', '-')
model_type = find_model_type(model_name)

# Echo the selection, one item per line
print(
    f"Selected model: {model_name}",
    f"Model type: {model_type}",
    f"Embedding file: {embedding_file_name}",
    sep="\n",
)

In [None]:
# Initialize dataset and load benchmark.
# BENCHMARK_PATH is not defined anywhere (its assignment is commented out in
# the setup cell), so reading a CSV from it raised NameError on a fresh kernel.
# Load the benchmark exactly as the "evaluate all embeddings" cell does.
dataset = CulturalBiasDataset(DATASET_PATH)
benchmark = load_dataset(DATASET_PATH, name="benchmark", split='train')


# Load embedding into dataset
dataset.import_local_embedding(EMBEDDING_DIR_PATH, embedding_file_name)
embeddings = dataset.embeddings

# Initialize processor if needed for late interaction models
processor = None
if model_type == 'colpali':
    processor = ColPaliProcessor.from_pretrained("vidore/colpali-v1.3-merged")
elif model_type == 'colqwen2':
    processor = ColQwen2Processor.from_pretrained("vidore/colqwen2-v1.0")
elif model_type == 'colqwen2.5':
    processor = ColQwen2_5_Processor.from_pretrained("vidore/colqwen2.5-v0.2")

# Create evaluator instance
evaluator = SelfPreferenceBiasEvaluator(
    dataset=dataset,
    embeddings=embeddings,
    model_name=model_name,
    model_type=model_type,
    processor=processor
)

# Evaluate both experiments
print("\nEvaluating text-to-image...")
text_to_image_results = evaluator.evaluate_and_save(benchmark, experiment_type="text-to-image")

print("\nEvaluating image-to-image...")
image_to_image_results = evaluator.evaluate_and_save(benchmark, experiment_type="image-to-image")

print(f"\nEvaluation completed for model: {model_name}")

# Display summary results
# NOTE(review): assumes evaluate_and_save returns a mapping that contains
# "overall_one_hot" — confirm against utils.benchmark_evaluator.
print("\nText-to-Image Results:")
print("Overall wins:", text_to_image_results["overall_one_hot"])

print("\nImage-to-Image Results:")
print("Overall wins:", image_to_image_results["overall_one_hot"])

In [None]:
# Sanity check: show the shape of the first entry stored under 'image_embedding'
dataset.embeddings['image_embedding'][0].shape

In [None]:
# NOTE(review): identical to the preceding cell (shape of the first
# 'image_embedding' entry) — this duplicate cell can probably be removed.
dataset.embeddings['image_embedding'][0].shape

## Aggregate Results

In [None]:
# Root folder for evaluation outputs, with one sub-folder per experiment type
EVALUATION_DIR_PATH = os.path.join(current_path, "evaluation_results")
EVALUATION_DIR_TEXT2IMG_PATH, EVALUATION_DIR_IMG2IMG_PATH = [
    os.path.join(EVALUATION_DIR_PATH, experiment_dir)
    for experiment_dir in ('text-to-image', 'image-to-image')
]

def extract_model_from_file_name(file_name):
    """Return the model-name part of a summary file name.

    Everything from the last '.' onward is dropped (when a '.' is present),
    and the text after the final underscore of what remains is returned.

    Args:
        file_name: Summary file name, e.g. 'overall_summary_<model>.csv'.

    Returns:
        The substring following the last '_' of the extension-less name.
    """
    stem, dot, _ = file_name.rpartition('.')
    if not dot:
        # No '.' anywhere: the whole name is the stem
        stem = file_name
    return stem.rsplit('_', 1)[-1]

def aggregate_evaluation_results(start_with):
    """Aggregate evaluation results from all models.

    Reads every CSV whose file name starts with ``start_with`` from the
    image-to-image and text-to-image result directories, tags each frame with
    'model' and 'experiment' columns, and stacks them into one frame.

    Args:
        start_with: File-name prefix selecting the summary kind
            (e.g. 'overall_summary').

    Returns:
        A DataFrame with all matching rows (image-to-image rows first, then
        text-to-image), or an empty DataFrame when no file matches.
    """
    frames = []

    experiment_dirs = (
        ('image-to-image', EVALUATION_DIR_IMG2IMG_PATH),
        ('text-to-image', EVALUATION_DIR_TEXT2IMG_PATH),
    )
    for experiment, directory in experiment_dirs:
        # os.listdir order is arbitrary; sorting keeps the aggregated row order
        # stable so the "unchanged content" comparison on save is meaningful.
        for file in sorted(os.listdir(directory)):
            if not file.startswith(start_with):
                continue
            df = pd.read_csv(os.path.join(directory, file))
            df['model'] = extract_model_from_file_name(file)
            df['experiment'] = experiment
            frames.append(df)

    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop (quadratic,
    # and concatenating onto an empty frame is deprecated in recent pandas).
    return pd.concat(frames, ignore_index=True)

# Build one aggregated frame per summary kind (concept, country, overall, language)
print("Aggregating results...")
concept_df, country_df, overall_df, language_df = [
    aggregate_evaluation_results(summary_prefix)
    for summary_prefix in (
        'concept_summary',
        'country_summary',
        'overall_summary',
        'language_summary',
    )
]

# Make sure the output folder for the aggregated CSVs exists
aggregated_dir = os.path.join(EVALUATION_DIR_PATH, "aggregated")
os.makedirs(aggregated_dir, exist_ok=True)

def save_if_different(df, filepath):
    """Save DataFrame only if it's different from existing file or file doesn't exist.

    Args:
        df: DataFrame to persist as CSV (written without the index).
        filepath: Destination CSV path.

    Returns:
        None. Prints whether the file was created, updated, or skipped.
    """
    file_label = os.path.basename(filepath)

    if not os.path.exists(filepath):
        print(f"Creating new file {file_label}")
        df.to_csv(filepath, index=False)
        return

    try:
        existing_df = pd.read_csv(filepath)
    except pd.errors.EmptyDataError:
        # An empty frame is written as a CSV with no columns, which read_csv
        # refuses to parse; treat such a file as an empty frame, not a crash.
        existing_df = pd.DataFrame()

    if existing_df.equals(df):
        print(f"Skipping {file_label} - content unchanged")
        return

    print(f"Updating {file_label} - content has changed")
    df.to_csv(filepath, index=False)

# Write each aggregated frame to its CSV inside the aggregated directory
print("\nSaving aggregated results...")
for frame, csv_name in (
    (concept_df, "concept_aggregated.csv"),
    (country_df, "country_aggregated.csv"),
    (overall_df, "overall_aggregated.csv"),
    (language_df, "language_aggregated.csv"),
):
    save_if_different(frame, os.path.join(aggregated_dir, csv_name))

print("\nAggregation complete!")