In [None]:
# Quietly install the third-party packages imported below: sentence-transformers
# (SentenceTransformer), scikit-learn (sklearn.*), datasets (load_dataset) and
# faiss-cpu (faiss).
!pip install -q sentence-transformers scikit-learn datasets faiss-cpu

In [None]:
import json
import time, numpy as np, torch
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, normalized_mutual_info_score, adjusted_rand_score
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from datasets import load_dataset
import faiss
from google.colab import userdata

# Base Interface

In [None]:
class EmbeddingModel:
    """Minimal interface shared by every embedding backend in the benchmark."""

    def encode(self, texts):
        """Embed ``texts``; concrete backends must override this method.

        Raises:
            NotImplementedError: always, naming the class that lacks an override.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement encode(texts)"
        )


class _SentenceTransformerModel(EmbeddingModel):
    """Shared ``encode`` for backends that keep a SentenceTransformer in ``self.model``."""

    def encode(self, texts):
        """Embed a single string or a sequence of strings.

        Args:
            texts: one string, or a sequence of strings. A lone string is
                wrapped in a one-element list so the result is still batched.

        Returns:
            The wrapped model's embeddings as a float32 numpy array
            (one row per input text).
        """
        if isinstance(texts, str):
            texts = [texts]
        return np.array(self.model.encode(texts, convert_to_numpy=True), dtype="float32")


# MiniLM baseline
class MiniLMModel(_SentenceTransformerModel):
    """Baseline backend: sentence-transformers/all-MiniLM-L6-v2."""

    def __init__(self):
        self.model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


# EmbeddingGemma
class EmbeddingGemmaModel(_SentenceTransformerModel):
    """Backend for google/embeddinggemma-300m."""

    def __init__(self, token=None):
        # `token` is forwarded unchanged to the SentenceTransformer loader.
        self.model = SentenceTransformer("google/embeddinggemma-300m", token=token)


# Jina Code Embeddings
class JinaCodeEmbeddingsModel(_SentenceTransformerModel):
    """Backend for jinaai/jina-code-embeddings-1.5b."""

    def __init__(self, token=None):
        # trust_remote_code=True lets the loader run code shipped with the checkpoint.
        self.model = SentenceTransformer(
            "jinaai/jina-code-embeddings-1.5b",
            trust_remote_code=True, token=token
        )


# Evaluation Functions

In [None]:
# Similarity (STS)
def eval_similarity(model, sentences1=None, sentences2=None, scores=None):
    """Pearson correlation between embedding cosine similarity and gold scores.

    Args:
        model: object whose ``encode(list_of_str)`` returns a 2-D array with
            one embedding row per text.
        sentences1, sentences2: optional parallel sequences of sentences. When
            ``sentences1`` is None the first 200 test rows of
            ``mteb/stsbenchmark-sts`` are loaded instead.
        scores: gold similarity scores aligned with the sentence pairs
            (divided by 5.0 before correlating, as the benchmark rows are).

    Returns:
        The Pearson correlation coefficient between the per-pair cosine
        similarities and the scaled gold scores.

    Raises:
        ValueError: if ``sentences1`` is given without ``sentences2``/``scores``.
    """
    if sentences1 is None:
        dataset = load_dataset("mteb/stsbenchmark-sts", split="test[:200]")
        sentences1 = dataset["sentence1"]
        sentences2 = dataset["sentence2"]
        scores = dataset["score"]
    elif sentences2 is None or scores is None:
        raise ValueError("sentences2 and scores must be provided together with sentences1")

    # Two batched encode calls instead of two single-sentence calls per row.
    emb1 = np.asarray(model.encode(list(sentences1)))
    emb2 = np.asarray(model.encode(list(sentences2)))

    dots = np.sum(emb1 * emb2, axis=1, dtype="float64")
    norms = np.linalg.norm(emb1, axis=1) * np.linalg.norm(emb2, axis=1)
    # A zero-norm embedding gets similarity 0 rather than NaN, which would
    # otherwise turn the whole correlation into NaN.
    sims = np.divide(dots, norms, out=np.zeros(len(dots), dtype="float64"), where=norms > 0)

    labels = np.asarray(list(scores), dtype="float64") / 5.0
    return np.corrcoef(sims, labels)[0, 1]

# Classification
def eval_classification(model, texts=None, labels=None, test_size=0.2, random_state=42):
    """Linear-probe classification quality of the embeddings on held-out rows.

    Args:
        model: object whose ``encode(texts)`` returns one embedding row per text.
        texts: optional sequence of texts. When None, the first 1000 training
            rows of ``ag_news`` are loaded together with their labels.
        labels: class labels aligned with ``texts`` (required when ``texts``
            is given).
        test_size: share (or count) of rows held out for scoring, passed to
            ``train_test_split``.
        random_state: seed for the split so repeated runs are comparable.

    Returns:
        ``(accuracy, macro_f1)`` measured on the held-out rows.

    Raises:
        ValueError: if ``texts`` is given without ``labels``.
    """
    if texts is None:
        dataset = load_dataset("ag_news", split="train[:1000]")
        texts = dataset["text"]
        labels = dataset["label"]
    elif labels is None:
        raise ValueError("labels must be provided together with texts")

    X = model.encode(texts)
    y = np.asarray(list(labels))

    # Score on rows the classifier never saw; predicting the rows it was fit
    # on would report training accuracy rather than generalisation.
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=y
        )
    except ValueError:
        # Stratification is impossible for very small or very rare classes;
        # fall back to a plain shuffled split instead of failing.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

    clf = LogisticRegression(max_iter=200).fit(X_train, y_train)
    preds = clf.predict(X_test)
    return accuracy_score(y_test, preds), f1_score(y_test, preds, average="macro")

# Clustering
def eval_clustering(model, texts=None, labels=None, random_state=42):
    """Cluster the embeddings with k-means and compare against the true labels.

    Args:
        model: object whose ``encode(texts)`` returns one embedding row per text.
        texts: optional sequence of texts. When None, the first 500 test rows
            of ``ag_news`` are loaded together with their labels.
        labels: class labels aligned with ``texts`` (required when ``texts``
            is given); the number of distinct labels sets the cluster count.
        random_state: seed for k-means initialisation so the reported scores
            are reproducible across runs.

    Returns:
        ``(nmi, ari)``: normalized mutual information and adjusted Rand index
        between the true labels and the k-means assignments.

    Raises:
        ValueError: if ``texts`` is given without ``labels``.
    """
    if texts is None:
        dataset = load_dataset("ag_news", split="test[:500]")
        texts = dataset["text"]
        labels = np.array(dataset["label"])
    elif labels is None:
        raise ValueError("labels must be provided together with texts")
    else:
        labels = np.array(labels)

    X = model.encode(texts)
    n_clusters = len(np.unique(labels))
    # Without a fixed seed the random centroid initialisation makes NMI/ARI
    # differ from run to run.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state).fit(X)
    assignments = kmeans.labels_
    return normalized_mutual_info_score(labels, assignments), adjusted_rand_score(labels, assignments)

# Retrieval
def eval_retrieval(model, queries=None, documents=None, relevant=None, k=1):
    """Hit rate at ``k`` for dense retrieval over (query, document, relevant) rows.

    Each input row is one judged pair. A query counts as a hit when at least
    one of its ``k`` most similar documents (cosine similarity, ranked against
    every distinct document) is labelled relevant *for that query*. Documents
    without a positive label for a query are treated as non-relevant to it.

    Args:
        model: object whose ``encode(list_of_str)`` returns one embedding row
            per text.
        queries, documents, relevant: optional parallel sequences describing
            the judged pairs. When ``queries`` is None, the first 1000 training
            rows of ``bansalaman18/msmarco-reranking-1m`` are loaded.
        k: number of top-ranked documents inspected per query (default 1).

    Returns:
        Fraction of queries having at least one relevant document that score
        a hit, as a float; NaN when no query has a relevant document.

    Raises:
        ValueError: on missing or misaligned inputs, or ``k < 1``.
    """
    if queries is None:
        dataset = load_dataset("bansalaman18/msmarco-reranking-1m", split="train[:1000]")
        queries = dataset["query"]
        documents = dataset["document"]
        relevant = dataset["relevant"]
    elif documents is None or relevant is None:
        raise ValueError("documents and relevant must be provided together with queries")

    queries, documents, relevant = list(queries), list(documents), list(relevant)
    if not (len(queries) == len(documents) == len(relevant)):
        raise ValueError("queries, documents and relevant must have the same length")
    if k < 1:
        raise ValueError("k must be >= 1")

    def _is_relevant(flag):
        # Flags may arrive as the strings "True"/"False" or as real booleans.
        if isinstance(flag, str):
            return flag.strip().lower() == "true"
        return bool(flag)

    def _unit_rows(matrix):
        # L2-normalise rows so a plain dot product is the cosine similarity;
        # all-zero rows are left as zeros instead of dividing by zero.
        matrix = np.asarray(matrix, dtype="float32")
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        return matrix / np.where(norms > 0, norms, 1.0)

    # Rows may repeat a query or a document; embed and rank each distinct text once.
    unique_queries = list(dict.fromkeys(queries))
    unique_docs = list(dict.fromkeys(documents))
    query_pos = {text: i for i, text in enumerate(unique_queries)}
    doc_pos = {text: j for j, text in enumerate(unique_docs)}

    # relevance[i, j] is True when document j is labelled relevant for query i.
    relevance = np.zeros((len(unique_queries), len(unique_docs)), dtype=bool)
    for query, document, flag in zip(queries, documents, relevant):
        if _is_relevant(flag):
            relevance[query_pos[query], doc_pos[document]] = True

    scored = relevance.any(axis=1)
    if not scored.any():
        return float("nan")

    sims = _unit_rows(model.encode(unique_queries)) @ _unit_rows(model.encode(unique_docs)).T

    k = min(k, len(unique_docs))
    top_k = np.argsort(-sims, axis=1, kind="stable")[:, :k]
    row_index = np.arange(len(unique_queries))[:, None]
    hits = relevance[row_index, top_k].any(axis=1)
    return float(hits[scored].mean())

# Efficiency
def eval_efficiency(model, num_texts=256):
    """Measure embedding width and encoding throughput on a fixed sentence.

    Args:
        model: object whose ``encode(list_of_str)`` returns a 2-D array.
        num_texts: how many copies of the probe sentence to encode in the
            timed call (default 256).

    Returns:
        ``(dim, texts_per_second)``: the second axis of the returned
        embeddings and the number of texts divided by the elapsed seconds
        (``inf`` if the clock reports no elapsed time).
    """
    texts = ["The quick brown fox jumps over the lazy dog."] * num_texts
    # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
    start = time.perf_counter()
    emb = model.encode(texts)
    dur = time.perf_counter() - start
    throughput = len(texts) / dur if dur > 0 else float("inf")
    return emb.shape[1], throughput



# Load MeTTa JSON Dataset

In [None]:
def load_metta_json(path="metta_dataset.json"):
    """Load labelled MeTTa samples from a JSON file.

    The file is expected to hold a sequence of objects, each with ``code``,
    ``comment`` and ``label`` keys.

    Args:
        path: location of the JSON file.

    Returns:
        ``(texts, labels)`` where each text is ``"<code> # <comment>"`` and
        ``labels`` holds the matching ``label`` values, in file order.

    Raises:
        KeyError: if an item lacks one of the three keys.
    """
    # Read as UTF-8 explicitly; the platform default encoding can fail on
    # non-ASCII characters in code or comments.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    texts = [f"{item['code']} # {item['comment']}" for item in data]
    labels = [item['label'] for item in data]
    return texts, labels

# Run Benchmarks

In [None]:
# Token stored under the Colab secret "embeddingGemma"; it is forwarded to the
# model wrappers that accept a `token` argument.
token = userdata.get("embeddingGemma")

# Every backend is instantiated up front. Insertion order is the order in which
# the benchmark loop visits the models.
MODELS = dict(
    MiniLM=MiniLMModel(),
    EmbeddingGemma=EmbeddingGemmaModel(token=token),
    JinaCode=JinaCodeEmbeddingsModel(token=token),
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Run every evaluator against every model and collect one summary row per model.
results = []
for name, model in MODELS.items():
    print(f"\n🔹 Evaluating {name}...")

    sim = eval_similarity(model)
    acc, f1 = eval_classification(model)
    nmi, ari = eval_clustering(model)
    recall = eval_retrieval(model)
    dim, speed = eval_efficiency(model)

    results.append({
        "Model": name,
        "Similarity": round(sim,3),
        "Cls Acc": round(acc,3),
        "Cls F1": round(f1,3),
        "Cluster NMI": round(nmi,3),
        "Cluster ARI": round(ari,3),
        # eval_retrieval scores only the single top-ranked document, so the
        # column is labelled R@1 (it was previously mislabelled R@10).
        "Retrieval R@1": round(recall,3),
        "Dim": dim,
        "Throughput (txt/s)": round(speed,2)
    })

# Build the table once from the collected rows.
df = pd.DataFrame(results)
print("\n===== Benchmark Results =====\n")
print(df)


🔹 Evaluating MiniLM...

🔹 Evaluating EmbeddingGemma...

🔹 Evaluating JinaCode...

===== Benchmark Results =====

            Model  Similarity  Cls Acc  Cls F1  Cluster NMI  Cluster ARI  \
0          MiniLM       0.923    0.899   0.886        0.415        0.312   
1  EmbeddingGemma       0.650    0.875   0.859        0.520        0.538   
2        JinaCode       0.820    0.896   0.882        0.568        0.588   

   Retrieval R@10   Dim  Throughput (txt/s)  
0           0.475   384             2650.60  
1           0.025   768              266.40  
2           0.275  1536              117.36  
