In [3]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m 

In [5]:
pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... done
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=c5d20ecb242f0e0ae51b951191627bd3e4ba71389eef398b59822ec56dbf3e12
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [6]:
import torch
import numpy as np
import evaluate
from sklearn.metrics.pairwise import cosine_similarity

# Hugging Face `evaluate` metrics, loaded once up front.
# evaluate_custom_encoder_decoder reads these four module-level names.
bleu, meteor, rouge, exact = (
    evaluate.load(metric_name)
    for metric_name in ("bleu", "meteor", "rouge", "exact_match")
)

def _as_row_vector(vector):
    """Return ``vector`` as a ``(1, D)`` NumPy array suitable for scikit-learn.

    Accepts a torch tensor (any device, with or without grad history) or
    anything ``np.asarray`` understands (ndarray, list, ...).
    """
    if isinstance(vector, torch.Tensor):
        vector = vector.detach().cpu()
        if vector.dtype == torch.bfloat16:
            # NumPy has no bfloat16 dtype, so Tensor.numpy() would raise here.
            vector = vector.float()
        vector = vector.numpy()
    return np.asarray(vector).reshape(1, -1)


def evaluate_custom_encoder_decoder(
    df,
    encoder,            # your encoder model: text → embedding
    decoder,            # your decoder model: embedding → text
    llama_vectorizer,   # precomputed LLaMA emoji vectors (for ground truth)
    decoder_labels,     # ground truth decoded texts (e.g., emoji captions)
    use_tokenizer=False,
    tokenizer=None
):
    """Score an encoder/decoder pipeline on text metrics and embedding similarity.

    For every row of ``df`` the input text is encoded, decoded back to text,
    and the encoder output is compared (cosine similarity) with the matching
    target vector. The decoded texts are then scored against the reference
    texts with the module-level ``bleu``, ``meteor``, ``rouge`` and ``exact``
    metric objects. The results are printed and returned.

    Args:
        df: DataFrame with a ``"text"`` column.
        encoder: Callable taking the input text. If it returns a tuple, the
            first element is used as the embedding.
        decoder: When ``use_tokenizer`` is false, a callable taking the
            embedding and returning the decoded text. Otherwise an object with
            a ``generate(**inputs)`` method.
        llama_vectorizer: Indexable container of target vectors.
        decoder_labels: Indexable container of reference texts.
        use_tokenizer: If true (and ``tokenizer`` is given), decode with
            ``decoder.generate`` on the *tokenized input text*; the embedding
            is then only used to pick the device and for the cosine score.
        tokenizer: Tokenizer used for the ``use_tokenizer`` path.

    Returns:
        dict with float values under the keys ``"BLEU"``, ``"METEOR"``,
        ``"ROUGE-L"``, ``"Exact Match"`` and ``"Cosine Similarity (Encoder)"``
        (the mean of the per-row cosine similarities).

    Raises:
        ValueError: If ``df`` has no rows.

    Note:
        ``llama_vectorizer`` and ``decoder_labels`` are indexed with the
        DataFrame *index label* produced by ``df.iterrows()``, not with the
        row position. For a frame that was filtered or shuffled, make sure
        those labels address the intended entries (or call
        ``df.reset_index(drop=True)`` first when the containers are
        positional).
    """
    if len(df) == 0:
        # The corpus-level metrics and the mean below are undefined without
        # at least one example; fail early with a clear message.
        raise ValueError("evaluate_custom_encoder_decoder: `df` has no rows to evaluate")

    predictions = []
    references = []
    cosine_scores = []

    for idx, row in df.iterrows():
        input_text = row["text"]
        target_vector = llama_vectorizer[idx]           # from LLaMA
        target_text = decoder_labels[idx]               # demojized text version

        # Pure inference: keep both the encoder and the decoder forward pass
        # out of autograd so no graph is built or retained per row.
        with torch.no_grad():
            # Encode: text → embedding
            embedding = encoder(input_text)
            if isinstance(embedding, tuple):  # if encoder returns (hidden, pooled)
                embedding = embedding[0]

            # Decode: embedding → text
            # `is not None` rather than truthiness: truthiness of a tokenizer
            # object depends on its __len__, not on whether one was passed.
            if use_tokenizer and tokenizer is not None:
                # decoder is a LM like T5
                input_tokens = tokenizer(input_text, return_tensors="pt").to(embedding.device)
                decoded_ids = decoder.generate(**input_tokens)
                decoded_text = tokenizer.decode(decoded_ids[0], skip_special_tokens=True)
            else:
                decoded_text = decoder(embedding)

        predictions.append(decoded_text)
        references.append(target_text)

        # Cosine similarity between encoder output and LLaMA vector
        sim = cosine_similarity(
            _as_row_vector(embedding),
            _as_row_vector(target_vector)
        )[0][0]
        cosine_scores.append(sim)

    # HF Metrics
    results = {
        "BLEU": bleu.compute(predictions=predictions, references=[[r] for r in references])["bleu"],
        "METEOR": meteor.compute(predictions=predictions, references=references)["meteor"],
        "ROUGE-L": rouge.compute(predictions=predictions, references=references)["rougeL"],
        "Exact Match": exact.compute(predictions=predictions, references=references)["exact_match"],
        "Cosine Similarity (Encoder)": float(np.mean(cosine_scores))
    }

    print("\n🧪 Evaluation Results (Custom Pipeline):")
    for k, v in results.items():
        print(f"{k}: {v:.4f}")

    return results


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Downloading builder script:   0%|          | 0.00/5.67k [00:00<?, ?B/s]

In [None]:
# Run the evaluation on the custom pipeline. `use_tokenizer` / `tokenizer` are
# omitted and therefore keep their defaults (False / None): the decoder is not
# an HF model.
results = evaluate_custom_encoder_decoder(
    df=my_data_df,                           # must contain "text" column
    encoder=my_encoder,                      # encoder wrapper
    decoder=my_decoder,                      # decoder wrapper
    decoder_labels=emoji_text_descriptions,  # ground truth captions
    llama_vectorizer=llama_vectors,          # (N, D) embeddings from LLaMA
)
