### Function to compute UTS
Utility function to compute the signatures for any input array. You might have to apply additional normalization if needed.

In [1]:
from config.metrics import METRICS_CONFIG

def compute_metrics(vectors):
    results_data = []
    computed_metrics = []
    for metric in METRICS_CONFIG["metrics"]:
        compute_func = METRICS_CONFIG["metrics"][metric]["compute_func"]
        max_sample_size = METRICS_CONFIG["metrics"][metric]["max_sample_size"]
        is_local = METRICS_CONFIG["metrics"][metric].get("is_local", False)
        kwargs = METRICS_CONFIG["metrics"][metric]["kwargs"]
        requires_distance = METRICS_CONFIG["metrics"][metric].get("requires_distance", False)
        distance_metrics = METRICS_CONFIG.get("distance_metrics", [])

        if is_local:
            continue

        for distance_metric in distance_metrics:
            if requires_distance:
                kwargs["distance_metric"] = distance_metric
                computed_metrics.append(f"{metric}_{distance_metric}")
            else:
                computed_metrics.append(metric)
            
            if vectors.shape[0] > max_sample_size:
                # Sample from vectors
                sample_indices = np.random.choice(vectors.shape[0], size=max_sample_size, replace=False)
                vectors_capped = vectors[sample_indices]
            else:
                vectors_capped = vectors.copy()

            result = compute_func(vectors_capped)
            results_data.append(result)

            if not requires_distance:
                # Just one iteration needed
                break
            
    results = {k: v for k, v in zip(computed_metrics, results_data)}
    return results

### Function to check processing status

Here you can check how many embeddings have been computed and if the MTEB results are found.

In [3]:
import os
import json
import time
import argparse
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from mteb.models.cache_wrapper import TextVectorMap
from config.eval import RETRIEVAL_TASKS
load_dotenv()

# Put here the models you want to check
MODELS = ["google/gemini-embedding-001"]

results = []
for task in RETRIEVAL_TASKS:
    for model in MODELS:
        embeddings = False
        shape = 0
        size = 0
        try:
            # Load embeddings memmap
            cache_path = os.environ.get("CACHE_PATH") + f"cache_{model.replace('/', '_')}" + f"/{task}"
            data = TextVectorMap(cache_path)
            data.load(name=task)

            last_non_zero_row = data.vectors.shape[0] - 1
            while last_non_zero_row >= 0 and np.all(data.vectors[last_non_zero_row] == 0):
                last_non_zero_row -= 1

            truncated_data = data.vectors[:last_non_zero_row]

            # Remove padding vectors
            vectors = np.asarray(truncated_data)
            shape = vectors.shape
            size = vectors.nbytes
            embeddings = True
        except:
            pass

        # Look for mteb results
        mteb = False
        try:
            org_model = model.split("/")
            org = org_model[0]

            if len(model) > 2:
                model_dir = "_".join(org_model[1:])
            else:
                model_dir = org_model[1]
            if org == "dunzhang":
                model_path = f"NovaSearch__{model_dir}"
            else:
                model_path = f"{org}__{model_dir}"
            path = f"results/{org}/{model_dir}/{model_path}"
            print("Looking for files at: ", path)
            if list(os.walk(path)) == []:
                print("ERROR: No revision found.")
            print("Revisions: ", list(os.walk(path))[0][1])
            assert len(list(os.walk(path))[0][1]) == 1, "There should be exactly one revision folder."
            revision = list(os.walk(path))[0][1][0]
            result_path = f"{path}/{revision}/"
            path = result_path + f"/{task}.json"
            with open(path, 'r') as file:
                mteb_results = json.load(file)
            mteb = True
        except:
            pass

        results.append({
            "task": task,
            "model": model,
            "mteb_results": mteb,
            "embeddings": embeddings,
            "shape": shape,
            "size": size,
        })

pd.DataFrame(results)

Looking for files at:  results/google/gemini-embedding-001/google__gemini-embedding-001
ERROR: No revision found.
Looking for files at:  results/google/gemini-embedding-001/google__gemini-embedding-001
ERROR: No revision found.


Unnamed: 0,task,model,mteb_results,embeddings,shape,size
0,ClimateFEVER,google/gemini-embedding-001,False,True,"(5418127, 3072)",66577944576
1,MSMARCO,google/gemini-embedding-001,False,True,"(8848640, 3072)",108732088320
