In [None]:
import torch
from PIL import Image
import numpy as np
from tqdm import tqdm
import pickle
import os
import json
import pandas as pd
import ipywidgets as widgets
from IPython.display import display

# Setup

In [None]:
!pip install transformers datasets tqdm pandas colpali-engine clip multilingual-clip

In [None]:
# Fill these in before running the cells below; all three are empty placeholders.
XM_IMAGE_PATH = ""  # Crossmodal-3600 image folder; images are looked up as "<image/key>.jpg" inside it
XM_JSON_PATH = ""   # Crossmodal-3600 captions file (captions.jsonl), read as one JSON object per line
EMBEDDING_SAVE_PATH = ""  # Output directory (created if missing) where the pickled embeddings are written

# Embeddings

    Expected schema (one dict per image entry)
    - image_key
    - image_embedding
    - text_embeddings
        - caption_embedding_[lang]

In [None]:
# Helper function to load local dataset
def load_local_crossmodal_data(image_path, json_path):
    """
    Load Crossmodal-3600 dataset from local paths.

    Every non-blank line of ``json_path`` is parsed as one JSON object. Each
    parsed entry then gets an ``'image'`` key holding the RGB-converted PIL
    image found at ``<image_path>/<image/key>.jpg``, or ``None`` (with a
    printed warning) when that file does not exist.

    Args:
        image_path: Path to the folder containing images
        json_path: Path to the captions.jsonl file

    Returns:
        List of data entries (dicts) with image and captions, in file order.

    Raises:
        KeyError: If an entry has no ``'image/key'`` field.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []

    # Load captions from JSON file
    with open(json_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank or trailing lines are common in .jsonl files; skip them
                # instead of letting json.loads fail on an empty string.
                continue
            data.append(json.loads(line))

    # Load images and match with captions
    for entry in data:
        image_key = entry['image/key']
        image_file = os.path.join(image_path, f"{image_key}.jpg")

        if os.path.exists(image_file):
            # Image.open is lazy and keeps the file handle open; convert() reads the
            # pixels into a new in-memory image, so the handle can be closed right away.
            with Image.open(image_file) as img:
                entry['image'] = img.convert('RGB')
        else:
            print(f"Warning: Image not found: {image_file}")
            entry['image'] = None

    return data

print("Helper function loaded. Ready to process local data.")

## gme

In [None]:
# GME imports
from transformers import AutoModel

In [None]:
# --- Configuration ---
model_name = "Alibaba-NLP/gme-Qwen2-VL-7B-Instruct"
model_real_name = model_name.rsplit("/", 1)[-1]  # repo id without the organisation prefix

# Load the GME checkpoint in half precision on the GPU
gme = AutoModel.from_pretrained(model_name, torch_dtype=torch.float16, device_map='cuda', trust_remote_code=True)

# Read captions and images from the configured local paths
print(f"Loading local dataset from: {XM_IMAGE_PATH} and {XM_JSON_PATH}")
dataset = load_local_crossmodal_data(XM_IMAGE_PATH, XM_JSON_PATH)

# np.inf means "no cap": the whole dataset is evaluated
eval_size = min(np.inf, len(dataset))
eval_dataset = dataset[:eval_size]

# Index window of entries to embed (also encoded in the output file name)
start_index, end_index = 0, eval_size

embeddings_list = []

# Output location
os.makedirs(EMBEDDING_SAVE_PATH, exist_ok=True)
save_path = os.path.join(
    EMBEDDING_SAVE_PATH,
    f"image_text_embeddings_{model_real_name}_{start_index}_{end_index}.pkl",
)

print(f"Processing entries from index {start_index} to {end_index}...")
print(f"Total dataset size: {len(eval_dataset)}")

# Embed every entry in the window; a failing entry is recorded with an 'error' field
# rather than aborting the run.
for i in tqdm(range(start_index, min(end_index, len(eval_dataset))), desc="Processing entries"):
    current_entry = eval_dataset[i]

    lang_vectors = {}
    entry_embeddings = {
        'image_key': current_entry.get('image/key', None),
        'image_embedding': None,
        'text_embeddings': lang_vectors,
    }

    try:
        with torch.no_grad():
            # 1. Image embedding (skipped with a warning when the image is missing)
            image = current_entry.get('image')
            if image is None:
                print(f"Warning: No image found for entry index {i}")
            else:
                image_embeddings = gme.get_image_embeddings(images=[image])
                entry_embeddings['image_embedding'] = image_embeddings.cpu().numpy()

            # 2. One embedding array per language, covering all of its captions
            captions = current_entry.get('captions')
            if isinstance(captions, dict):
                for lang_code, captions_list in captions.items():
                    if not (isinstance(captions_list, list) and captions_list):
                        continue
                    lang_text_embeddings = gme.get_text_embeddings(texts=captions_list)
                    lang_vectors[f'caption_embedding_{lang_code}'] = lang_text_embeddings.cpu().numpy()

            if not lang_vectors:
                print(f"Warning: No valid text embeddings generated for entry index {i}")

    except Exception as e:
        print(f"Error processing entry {i}: {str(e)}")
        entry_embeddings['error'] = str(e)

    # Appended on both the success and the error path
    embeddings_list.append(entry_embeddings)

# Final cleanup
torch.cuda.empty_cache()

# Persist everything collected so far
print(f"Saving embeddings to: {save_path}")
with open(save_path, 'wb') as f:
    pickle.dump(embeddings_list, f)

print(f"Processing complete. Embeddings saved to: {save_path}")
print(f"Total entries processed: {len(embeddings_list)}")

# Summary statistics: how many entries got an image embedding / at least one text embedding
successful_image_embeddings = sum(
    entry.get('image_embedding') is not None for entry in embeddings_list
)
successful_text_embeddings = sum(
    bool(entry.get('text_embeddings')) for entry in embeddings_list
)

print(f"Successful image embeddings: {successful_image_embeddings}/{len(embeddings_list)}")
print(f"Successful text embeddings: {successful_text_embeddings}/{len(embeddings_list)}")

# Show the structure of the first saved entry
if embeddings_list:
    print("\nSample entry structure:")
    sample_entry = embeddings_list[0]
    for key, value in sample_entry.items():
        if key == 'text_embeddings' and isinstance(value, dict):
            summary = list(value.keys())
        elif isinstance(value, np.ndarray):
            summary = f"numpy array shape {value.shape}"
        else:
            summary = type(value)
        print(f"  {key}: {summary}")

## colqwen

In [None]:
# ColQwen imports
from colpali_engine.models import ColQwen2_5, ColQwen2_5_Processor, ColQwen2, ColQwen2Processor

In [None]:
# Selectable ColQwen checkpoints as (label shown in the widget, Hugging Face repo id)
model_options = [
    ("ColQwen2.5-7b-multilingual-v1.0", "Metric-AI/ColQwen2.5-7b-multilingual-v1.0"),
    ("colqwen2.5-v0.2", "vidore/colqwen2.5-v0.2"),
    ("colqwen2-v1.0", "vidore/colqwen2-v1.0"),
]

model_dropdown = widgets.Dropdown(
    description='Model:',
    options=model_options,
    value="vidore/colqwen2-v1.0",  # default selection
    style={'description_width': 'initial'},
)


def on_model_change(change):
    """Update the global model id and its short name when the dropdown value changes."""
    global model_name, model_name_part
    model_name = change['new']
    model_name_part = model_name.rsplit("/", 1)[-1]
    print(f"Selected model: {model_name}")
    print(f"Model name part: {model_name_part}")


# Only react to changes of the widget's 'value' trait
model_dropdown.observe(on_model_change, names='value')

# Seed the globals from the default selection so later cells work without any interaction
model_name = model_dropdown.value
model_name_part = model_name.rsplit("/", 1)[-1]

display(model_dropdown)
print(f"Initial model: {model_name}")
print(f"Initial model name part: {model_name_part}")

Dropdown(description='Model:', index=2, options=(('ColQwen2.5-7b-multilingual-v1.0', 'Metric-AI/ColQwen2.5-7b-…

Initial model: vidore/colqwen2-v1.0
Initial model name part: colqwen2-v1.0


In [None]:
# Pick the model/processor classes that match the selected checkpoint family.
# The comparison is case-insensitive because the repo ids offered by the dropdown mix cases
# ("Metric-AI/ColQwen2.5-7b-multilingual-v1.0" vs "vidore/colqwen2.5-v0.2"); a case-sensitive
# check would send the multilingual 2.5 checkpoint to the ColQwen2 classes.
if "colqwen2.5" in model_name.lower():
    processor = ColQwen2_5_Processor.from_pretrained(model_name)
    model = ColQwen2_5.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
else:
    processor = ColQwen2Processor.from_pretrained(model_name)
    model = ColQwen2.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

In [None]:
# Read captions and images from the configured local paths
print(f"Loading local dataset from: {XM_IMAGE_PATH} and {XM_JSON_PATH}")
dataset = load_local_crossmodal_data(XM_IMAGE_PATH, XM_JSON_PATH)

# np.inf means "no cap": the whole dataset is evaluated
eval_size = min(np.inf, len(dataset))
eval_dataset = dataset[:eval_size]

# Index window of entries to embed (also encoded in the output file name)
start_index, end_index = 0, eval_size

embeddings_list = []

# Output location
os.makedirs(EMBEDDING_SAVE_PATH, exist_ok=True)
save_path = os.path.join(
    EMBEDDING_SAVE_PATH,
    f"image_text_embeddings_{model_name_part}_{start_index}_{end_index}.pkl",
)

print(f"Processing entries from index {start_index} to {end_index}...")

# Embed every entry in the window
for i in tqdm(range(start_index, min(end_index, len(eval_dataset))), desc="Processing entries"):
    current_entry = eval_dataset[i]

    entry_embeddings = {
        'image_key': current_entry.get('image/key', None),
        'image_embedding': None,
        'text_embeddings': {},
    }

    with torch.no_grad():
        # 1. Image embedding (left as None when the entry has no image)
        image = current_entry.get('image')
        if image is not None:
            batch_images = processor.process_images([image]).to(model.device)
            entry_embeddings['image_embedding'] = model(**batch_images).float().cpu().numpy()

        # 2. Text embeddings for ALL languages; each caption is run through the model
        #    on its own, so every language maps to a list of arrays.
        captions = current_entry.get('captions')
        if not isinstance(captions, dict):
            print(f"Warning: 'captions' field not found or not a dictionary for entry index {i}")
        else:
            for lang_code, captions_list in captions.items():
                if not isinstance(captions_list, list) or not captions_list:
                    print(f"Warning: No valid captions found for language '{lang_code}' in entry index {i}")
                    continue

                entry_embeddings['text_embeddings'][f'caption_embedding_{lang_code}'] = [
                    model(**processor.process_queries([caption]).to(model.device)).float().cpu().numpy()
                    for caption in captions_list
                ]

        embeddings_list.append(entry_embeddings)

    # Release cached GPU memory after each entry
    torch.cuda.empty_cache()

# Persist everything collected
with open(save_path, 'wb') as f:
    pickle.dump(embeddings_list, f)

print(f"Processing complete. Embeddings saved to: {save_path}")
print(f"Total entries processed: {len(embeddings_list)}")

## jina

In [None]:
# Jina imports
from transformers import AutoModel

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

def concatenate_tensors(tensor_list):
    """
    Combine a list of tensors into one tensor with a new leading dimension.

    Args:
        tensor_list: A list of PyTorch tensors.

    Returns:
        When the list holds exactly one tensor, that same tensor object is
        returned untouched (no leading dimension is added). Otherwise every
        tensor gains a leading dimension of size 1 and the results are joined
        along dimension 0.
    """
    if len(tensor_list) == 1:
        return tensor_list[0]
    return torch.cat([tensor.unsqueeze(0) for tensor in tensor_list], dim=0)

# --- Configuration ---
model_name = 'jinaai/jina-embeddings-v4'
model_real_name = model_name.split("/")[-1]  # repo id without the organisation prefix

# Load Model
model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16)
# Place the model on the `device` selected at the top of this cell rather than a
# hard-coded "cuda", so the placement matches the CUDA-availability check.
model.to(device)

# Load local dataset
print(f"Loading local dataset from: {XM_IMAGE_PATH} and {XM_JSON_PATH}")
dataset = load_local_crossmodal_data(XM_IMAGE_PATH, XM_JSON_PATH)

# np.inf means "no cap": the whole dataset is evaluated
eval_size = min(np.inf, len(dataset))
eval_dataset = dataset[:eval_size]

# Define the start and end indices for processing (also encoded in the output file name)
start_index = 0
end_index = eval_size

embeddings_list = []

# Setup save path
os.makedirs(EMBEDDING_SAVE_PATH, exist_ok=True)
save_path = os.path.join(EMBEDDING_SAVE_PATH, f"image_text_embeddings_{model_real_name}_{start_index}_{end_index}.pkl")

print(f"Processing entries from index {start_index} to {end_index}...")

# Process entries within the specified range
for i in tqdm(range(start_index, min(end_index, len(eval_dataset))), desc="Processing entries"):
    current_entry = eval_dataset[i]

    # Initialize entry with expected schema
    entry_embeddings = {
        'image_key': current_entry.get('image/key', None),
        'image_embedding': None,
        'text_embeddings': {}
    }

    with torch.no_grad():
        # 1. Get image embedding (left as None when the entry has no image).
        #    A leading dimension is added to the result before it is stored.
        if current_entry.get('image') is not None:
            image_embeddings = model.encode_image(
                images=current_entry['image'],
                task="retrieval",
            ).unsqueeze(0)
            entry_embeddings['image_embedding'] = image_embeddings.cpu().numpy()

        # 2. Get text embeddings for ALL languages
        if 'captions' in current_entry and isinstance(current_entry['captions'], dict):
            for lang_code, captions_list in current_entry['captions'].items():
                if isinstance(captions_list, list) and captions_list:
                    lang_text_embeddings = model.encode_text(
                        texts=captions_list,
                        task="retrieval",
                        prompt_name="query",
                    )
                    # Merge the per-caption results into a single tensor before converting to numpy
                    lang_text_embeddings = concatenate_tensors(lang_text_embeddings)
                    entry_embeddings['text_embeddings'][f'caption_embedding_{lang_code}'] = lang_text_embeddings.cpu().numpy()
                else:
                    print(f"Warning: No valid captions found for language '{lang_code}' in entry index {i}")
        else:
            print(f"Warning: 'captions' field not found or not a dictionary for entry index {i}")

        embeddings_list.append(entry_embeddings)

    # Clean up GPU memory
    torch.cuda.empty_cache()

# Save all collected embeddings
with open(save_path, 'wb') as f:
    pickle.dump(embeddings_list, f)

print(f"Processing complete. Embeddings saved to: {save_path}")
print(f"Total entries processed: {len(embeddings_list)}")

## mclip

In [None]:
# M-CLIP imports
from transformers import AutoModel
from multilingual_clip import pt_multilingual_clip
import transformers
import clip

device = "cuda" if torch.cuda.is_available() else "cpu"

# --- Configuration ---
model_name = 'M-CLIP/XLM-Roberta-Large-Vit-L-14'
model_real_name = model_name.split("/")[-1]  # repo id without the organisation prefix

# Load Model & Tokenizer
# The text model goes to the same `device` as the vision model below, instead of an
# unconditional .cuda(), so both encoders follow the CUDA-availability check.
model = pt_multilingual_clip.MultilingualCLIP.from_pretrained(model_name).to(device)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

# Image encoder and its preprocessing transform
vision_model, vision_preprocess = clip.load("ViT-L/14", device=device)

# Load local dataset
print(f"Loading local dataset from: {XM_IMAGE_PATH} and {XM_JSON_PATH}")
dataset = load_local_crossmodal_data(XM_IMAGE_PATH, XM_JSON_PATH)

# np.inf means "no cap": the whole dataset is evaluated
eval_size = min(np.inf, len(dataset))
eval_dataset = dataset[:eval_size]

# Define the start and end indices for processing (also encoded in the output file name)
start_index = 0
end_index = eval_size

embeddings_list = []

# Setup save path
os.makedirs(EMBEDDING_SAVE_PATH, exist_ok=True)
save_path = os.path.join(EMBEDDING_SAVE_PATH, f"image_text_embeddings_{model_real_name}_{start_index}_{end_index}.pkl")

print(f"Processing entries from index {start_index} to {end_index}...")

# Process entries within the specified range
for i in tqdm(range(start_index, min(end_index, len(eval_dataset))), desc="Processing entries"):
    current_entry = eval_dataset[i]

    # Initialize entry with expected schema
    entry_embeddings = {
        'image_key': current_entry.get('image/key', None),
        'image_embedding': None,
        'text_embeddings': {}
    }

    with torch.no_grad():
        # 1. Get image embedding (left as None when the entry has no image)
        if current_entry.get('image') is not None:
            image = vision_preprocess(current_entry['image']).unsqueeze(0).to(device)
            image_embeddings = vision_model.encode_image(image)
            entry_embeddings['image_embedding'] = image_embeddings.cpu().numpy()

        # 2. Get text embeddings for ALL languages
        if 'captions' in current_entry and isinstance(current_entry['captions'], dict):
            for lang_code, captions_list in current_entry['captions'].items():
                if isinstance(captions_list, list) and captions_list:
                    # NOTE(review): forward() tokenizes internally; confirm that the installed
                    # multilingual-clip version moves the tokens to the model's device.
                    lang_text_embeddings = model.forward(captions_list, tokenizer)
                    entry_embeddings['text_embeddings'][f'caption_embedding_{lang_code}'] = lang_text_embeddings.cpu().numpy()
                else:
                    print(f"Warning: No valid captions found for language '{lang_code}' in entry index {i}")
        else:
            print(f"Warning: 'captions' field not found or not a dictionary for entry index {i}")

        embeddings_list.append(entry_embeddings)

    # Clean up GPU memory
    torch.cuda.empty_cache()

# Save all collected embeddings
with open(save_path, 'wb') as f:
    pickle.dump(embeddings_list, f)

print(f"Processing complete. Embeddings saved to: {save_path}")
print(f"Total entries processed: {len(embeddings_list)}")