In [2]:
from sacrebleu.metrics import BLEU
from collections import Counter
from nltk import ngrams, word_tokenize
import numpy as np

In [3]:
# Toy query set used below to try out distinct2 and self_bleu.
# It contains exact duplicates: "cardio workouts and heart health benefits"
# appears three times.
toy = [
    "effects of caffeine on sleep",
    "how does caffeine influence sleep quality",
    "benefits of regular exercise for heart health",
    "cardio workouts and heart health benefits",
    "sleep deprivation caffeine relationship",
    "cardio workouts and heart health benefits",
    "cardio workouts and heart health benefits",
]

# Module-level sacrebleu scorer; self_bleu calls its sentence_score method.
# NOTE(review): effective_order=True is presumably chosen because scoring is
# sentence-level (sacrebleu's docs recommend it for that) — confirm against the
# installed sacrebleu version.
bleu = BLEU(effective_order=True)

def tokenize(text):
    """Lowercase ``text`` and split it into tokens with ``word_tokenize``."""
    lowered = text.lower()
    return word_tokenize(lowered)

def distinct2(queries) -> float:
    """Return the Distinct-2 score of ``queries``.

    Every query is tokenized with ``tokenize`` and its bigrams are pooled
    with those of all other queries (a bigram never spans two queries). The
    score is the number of unique bigrams divided by the total number of
    bigrams.

    Returns 0.0 when no bigram is produced at all, e.g. for an empty list or
    when every query has fewer than two tokens.
    """
    unique_bigrams = set()
    n_bigrams = 0
    for query in queries:
        query_bigrams = list(ngrams(tokenize(query), 2))
        unique_bigrams.update(query_bigrams)
        n_bigrams += len(query_bigrams)

    return len(unique_bigrams) / n_bigrams if n_bigrams else 0.0

def self_bleu(queries) -> float:
    """Return the mean leave-one-out sentence BLEU of ``queries`` on a 0-1 scale.

    Each query in turn is scored as the hypothesis against all the other
    queries as references, using the module-level ``bleu`` scorer. The
    per-query scores are then averaged.

    Returns 0.0 when there are fewer than two queries.
    """
    if len(queries) < 2:
        return 0.0

    per_query = []
    for idx, hypothesis in enumerate(queries):
        # Exclude by position, not by value: an identical query elsewhere in
        # the list still counts as a reference.
        references = [ref for j, ref in enumerate(queries) if j != idx]
        result = bleu.sentence_score(hypothesis, references)
        per_query.append(result.score / 100.0)  # sacrebleu returns scores in 0–100
    return float(np.mean(per_query))

# Run both diversity metrics on the toy queries and report them to four decimals.
d2, s2 = distinct2(toy), self_bleu(toy)
print(f"Distinct-2: {d2:.4f}")
print(f"Self-BLEU: {s2:.4f}")



Distinct-2: 0.6667
Self-BLEU: 0.5059


In [14]:
from tqdm import tqdm
import json

def calc_metrics(gen_qs):
    """Average Distinct-2 and Self-BLEU over a collection of generation records.

    Args:
        gen_qs: Iterable of dict-like records. Each record's
            ``"predicted_queries"`` value is handed unchanged to ``distinct2``
            and ``self_bleu``, so it must already be a sequence of query
            strings (not one newline-joined string).

    Returns:
        ``(avg_d2, avg_self_bleu)``: the unweighted means of the per-record
        scores, as built-in floats. Both are ``nan`` when ``gen_qs`` is empty.

    Raises:
        KeyError: If a record has no ``"predicted_queries"`` entry.
    """
    d2_scores = []
    self_bleu_scores = []
    for record in tqdm(gen_qs, desc="Calculating metrics"):
        queries = record["predicted_queries"]
        d2_scores.append(distinct2(queries))
        self_bleu_scores.append(self_bleu(queries))

    if not d2_scores:
        # np.mean([]) would also give nan, but it emits a "Mean of empty slice"
        # RuntimeWarning; return the same value explicitly and quietly.
        return float("nan"), float("nan")

    # Cast to built-in floats so the return type matches self_bleu's.
    avg_d2 = float(np.mean(d2_scores))
    avg_self_bleu = float(np.mean(self_bleu_scores))

    return avg_d2, avg_self_bleu

# Example usage: generated-query files to compare, keyed by the label printed
# in the report. The shared directory is kept in one constant so the notebook
# can be pointed at another checkout or machine by editing a single line.
GEN_DIR = "/home/guest/r12922050/GitHub/d2qplus/gen/nfcorpus"
GEN_Q_FILES = {
    "grpo": "Llama-3.2-1B-Instruct-GRPO-separate-reward.jsonl",
    "promptagator": "Llama-3.2-1B-Instruct-promptagator.jsonl",
    "t5": "t5_10q.jsonl",
    "with-topic-prompt": "with_topic_llama_1b.jsonl",
    "without-topic-prompt": "without_topic_llama_1b.jsonl",
}
# Same keys, same order, same full path strings as before the refactor.
GEN_Q_PATHS = {name: f"{GEN_DIR}/{filename}" for name, filename in GEN_Q_FILES.items()}

# Score every generated-query file and print its averaged metrics.
for name, path in GEN_Q_PATHS.items():
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding, and skip blank lines (json.loads would raise on them).
    with open(path, "r", encoding="utf-8") as f:
        gen_qs = [json.loads(line) for line in f if line.strip()]

    # Some files store "predicted_queries" as one newline-separated string
    # rather than a list of queries. Detect that from the value itself instead
    # of a hard-coded list of system names, so new files need no special-casing.
    normalized = []
    for q in gen_qs:
        queries = q["predicted_queries"]
        if isinstance(queries, str):
            # One query per non-empty line, surrounding whitespace removed.
            queries = [line.strip() for line in queries.split("\n") if line.strip()]
            q = {"predicted_queries": queries}
        normalized.append(q)
    gen_qs = normalized

    avg_d2, avg_self_bleu = calc_metrics(gen_qs)
    print(f"Metrics for {name}:")
    print(f"  Average Distinct-2: {avg_d2:.4f}")
    print(f"  Average Self-BLEU: {avg_self_bleu:.4f}\n")

Calculating metrics: 100%|██████████| 3633/3633 [00:15<00:00, 231.19it/s]


Metrics for grpo:
  Average Distinct-2: 0.7870
  Average Self-BLEU: 0.2512



Calculating metrics: 100%|██████████| 3633/3633 [00:20<00:00, 175.29it/s]


Metrics for promptagator:
  Average Distinct-2: 0.5459
  Average Self-BLEU: 0.4904



Calculating metrics: 100%|██████████| 3633/3633 [00:07<00:00, 498.52it/s]


Metrics for t5:
  Average Distinct-2: 0.7613
  Average Self-BLEU: 0.3386



Calculating metrics: 100%|██████████| 3633/3633 [00:18<00:00, 195.78it/s]


Metrics for with-topic-prompt:
  Average Distinct-2: 0.6836
  Average Self-BLEU: 0.3893



Calculating metrics: 100%|██████████| 3633/3633 [00:18<00:00, 195.75it/s]

Metrics for without-topic-prompt:
  Average Distinct-2: 0.6798
  Average Self-BLEU: 0.3928






In [None]:
# NOTE(review): unfinished draft cell. It repeats the GEN_Q_PATHS mapping from
# the metrics cell earlier in the notebook, and its loop body was never written.
# Finish it or delete the cell.
GEN_Q_PATHS = {
    "grpo": "/home/guest/r12922050/GitHub/d2qplus/gen/nfcorpus/Llama-3.2-1B-Instruct-GRPO-separate-reward.jsonl",
    "promptagator": "/home/guest/r12922050/GitHub/d2qplus/gen/nfcorpus/Llama-3.2-1B-Instruct-promptagator.jsonl",
    "t5": "/home/guest/r12922050/GitHub/d2qplus/gen/nfcorpus/t5_10q.jsonl",
    "with-topic-prompt": "/home/guest/r12922050/GitHub/d2qplus/gen/nfcorpus/with_topic_llama_1b.jsonl",
    "without-topic-prompt": "/home/guest/r12922050/GitHub/d2qplus/gen/nfcorpus/without_topic_llama_1b.jsonl"
}

for name, path in GEN_Q_PATHS.items():
    if name in ["grpo", "with-topic-prompt", "without-topic-prompt"]:
        # TODO: the cell originally stopped here, leaving an `if` with no body
        # (a SyntaxError). `pass` keeps the cell runnable until it is completed.
        pass

In [13]:
# Load one generated-query file and display its first record.
# NOTE(review): the list is named `promptagator_qs`, but the file opened here is
# with_topic_llama_1b.jsonl — confirm which one was intended.
with open("/home/guest/r12922050/GitHub/d2qplus/gen/nfcorpus/with_topic_llama_1b.jsonl", "r") as f:
    promptagator_qs = list(map(json.loads, f))
promptagator_qs[0]

{'id': 'MED-10',
 'title': 'Statin Use and Breast Cancer Survival: A Nationwide Cohort Study from Finland',
 'text': 'Recent studies have suggested that statins, an established drug group in the prevention of cardiovascular mortality, could delay or prevent breast cancer recurrence but the effect on disease-specific mortality remains unclear. We evaluated risk of breast cancer death among statin users in a population-based cohort of breast cancer patients. The study cohort included all newly diagnosed breast cancer patients in Finland during 1995–2003 (31,236 cases), identified from the Finnish Cancer Registry. Information on statin use before and after the diagnosis was obtained from a national prescription database. We used the Cox proportional hazards regression method to estimate mortality among statin users with statin use as time-dependent variable. A total of 4,151 participants had used statins. During the median follow-up of 3.25 years after the diagnosis (range 0.08–9.0 years)