In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/domain/sleep_stress_dataset.json


In [2]:
pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=ed6ceec22619d2902424161f9f3189b970c590268328362f8b03016f13bad575
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Note: you may need to restart the kernel to use updated packages.


In [None]:
import json
import torch
import gc
import torch.nn.functional as F
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModel,
    pipeline
)
from rouge_score import rouge_scorer

from huggingface_hub import login
login("your_hf_token") 
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

2025-10-03 13:46:24.868287: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759499185.231915      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759499185.334612      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
import torch

# Disable TorchDynamo (fixes RecompileLimitExceeded)
if hasattr(torch, "_dynamo"):
    torch._dynamo.config.cache_size_limit = 64  # increase limit (optional)
    torch._dynamo.config.suppress_errors = True
    torch._dynamo.disable()


In [5]:
from transformers.utils import logging
logging.set_verbosity_error()

def cleanup():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

cleanup()

In [6]:
EMBED_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
embed_tokenizer = AutoTokenizer.from_pretrained(EMBED_MODEL_NAME)
embed_model = AutoModel.from_pretrained(EMBED_MODEL_NAME).to(DEVICE)
embed_model.eval()

def embed(texts):
    inputs = embed_tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = embed_model(**inputs, return_dict=True)
        embeddings = outputs.last_hidden_state.mean(dim=1)
        embeddings = F.normalize(embeddings, p=2, dim=1)
    return embeddings
    
def cosine_similarity(text1, text2):
    emb1 = embed([text1])
    emb2 = embed([text2])
    return F.cosine_similarity(emb1, emb2).item()

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [8]:
JUDGE_MODELS = {
    "Judge A: MedGemma - 4B": "google/medgemma-4b-it", 
    "Judge B: Meditron-7B": "epfl-llm/meditron-7b",
}
MODELS = [
    "microsoft/phi-3-mini-4k-instruct",     
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",   
    "google/gemma-2b",                       
]

In [9]:
def load_judge(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=torch.float16, device_map="auto"
    )
    judge_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)
    return judge_pipeline, model, tokenizer


In [10]:
def judge_consistency(judge_pipeline, question, reference, response):
    prompt = f"""
    You are an expert fact-checking evaluator.

    Question:
    {question}

    Reference Answer (ground truth):
    {reference}

    Model Answer:
    {response}

    Task:
    Evaluate if the Model Answer is factually consistent with the Reference Answer.
    Respond with ONLY one of the following labels:
    - "consistent"
    - "hallucinated"
    """
    result = judge_pipeline(prompt, max_new_tokens=50, do_sample=False, temperature=0.0)
    verdict = result[0]["generated_text"].lower()

    if "consistent" in verdict:
        return "consistent"
    elif "hallucinated" in verdict:
        return "hallucinated"
    else:
        return "unknown"

In [12]:
def benchmark_model_long_with_judge(model_name, data, judge_pipeline, max_samples=None):
    print(f"\n=== Benchmarking {model_name} with LLM Judge ===")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=torch.float16, device_map="auto"
    )
    gen_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

    rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    results = []

    for i, (qid, entry) in enumerate(data.items()):
        if max_samples is not None and i >= max_samples:
            break

        question = entry["question"].strip()
        ref_answer = entry["reference"].strip()
        if not ref_answer:
            continue

        prompt = f"Question: {question}\nAnswer in detail:"

        response = gen_pipeline(prompt, max_new_tokens=200, do_sample=False)
        model_answer = response[0]["generated_text"].replace(prompt, "").strip()

        sim = cosine_similarity(ref_answer, model_answer)
        rouge_score = rouge.score(ref_answer, model_answer)["rougeL"].fmeasure
        verdict = judge_consistency(judge_pipeline, question, ref_answer, model_answer)

        results.append({
            "id": qid,
            "similarity": sim,
            "rougeL": rouge_score,
            "verdict": verdict,
        })

        if (i + 1) % 10 == 0:
            print(f"Processed {i+1} datapoints...")

    avg_sim = sum(r["similarity"] for r in results) / len(results)
    avg_rouge = sum(r["rougeL"] for r in results) / len(results)
    verdict_counts = {
        v: sum(r["verdict"] == v for r in results)
        for v in ["consistent", "hallucinated", "unknown"]
    }

    print(f"Samples Evaluated: {len(results)}")
    print(f"Avg Semantic Similarity: {avg_sim:.3f}")
    print(f"Avg ROUGE-L: {avg_rouge:.3f}")
    print(f"Judge Verdicts: {verdict_counts}")

    del model, tokenizer, gen_pipeline
    cleanup()

    return {
        "model": model_name,
        "avg_similarity": round(avg_sim, 3),
        "avg_rouge": round(avg_rouge, 3),
        "samples": len(results),
        "verdicts": verdict_counts
    }


In [13]:
with open("/kaggle/input/domain/sleep_stress_dataset.json", "r", encoding="utf-8") as f:
    raw_data = json.load(f)

data = {
    str(i): {
        "question": entry["instruction"] + ("\n" + entry["input"] if entry["input"] else ""),
        "reference": entry["output"]
    }
    for i, entry in enumerate(raw_data)
}


In [12]:
leaderboards = {}

judge_label = "Judge A: MedGemma - 4B"
judge_model_id = "google/medgemma-4b-it"
print(f"\n\n########## Using {judge_label} ({judge_model_id}) ##########\n")
judge_pipeline, judge_model, judge_tokenizer = load_judge(judge_model_id)

leaderboard = []
for model_id in MODELS:
    summary = benchmark_model_long_with_judge(
        model_id, data, judge_pipeline, max_samples=200
    )
    leaderboard.append(summary)

leaderboards[judge_label] = leaderboard

del judge_model, judge_tokenizer, judge_pipeline
cleanup()




########## Using Judge A: MedGemma - 4B (google/medgemma-4b-it) ##########



tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/90.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]


=== Benchmarking microsoft/phi-3-mini-4k-instruct with LLM Judge ===


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

W1003 11:28:33.153000 36 torch/_inductor/utils.py:1137] [0/0] Not enough SMs to use max_autotune_gemm mode
W1003 11:28:33.176000 36 torch/_inductor/utils.py:1137] [0/0] Not enough SMs to use max_autotune_gemm mode
skipping cudagraphs due to skipping cudagraphs due to multiple devices: device(type='cuda', index=0), device(type='cuda', index=1)
skipping cudagraphs due to skipping cudagraphs due to multiple devices: device(type='cuda', index=0), device(type='cuda', index=1)
skipping cudagraphs due to skipping cudagraphs due to multiple devices: device(type='cuda', index=0), device(type='cuda', index=1)
skipping cudagraphs due to skipping cudagraphs due to multiple devices: device(type='cuda', index=0), device(type='cuda', index=1)


Processed 10 datapoints...
Processed 20 datapoints...


skipping cudagraphs due to skipping cudagraphs due to multiple devices: device(type='cuda', index=0), device(type='cuda', index=1)
skipping cudagraphs due to skipping cudagraphs due to multiple devices: device(type='cuda', index=0), device(type='cuda', index=1)


Processed 30 datapoints...


skipping cudagraphs due to skipping cudagraphs due to multiple devices: device(type='cuda', index=0), device(type='cuda', index=1)


Processed 40 datapoints...
Processed 50 datapoints...
Processed 60 datapoints...
Processed 70 datapoints...
Processed 80 datapoints...
Processed 90 datapoints...
Processed 100 datapoints...
Processed 110 datapoints...
Processed 120 datapoints...
Processed 130 datapoints...
Processed 140 datapoints...
Processed 150 datapoints...
Processed 160 datapoints...
Processed 170 datapoints...
Processed 180 datapoints...
Processed 190 datapoints...
Processed 200 datapoints...
Samples Evaluated: 200
Avg Semantic Similarity: 0.721
Avg ROUGE-L: 0.113
Judge Verdicts: {'consistent': 200, 'hallucinated': 0, 'unknown': 0}

=== Benchmarking TinyLlama/TinyLlama-1.1B-Chat-v1.0 with LLM Judge ===


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Processed 10 datapoints...
Processed 20 datapoints...
Processed 30 datapoints...
Processed 40 datapoints...


skipping cudagraphs due to skipping cudagraphs due to multiple devices: device(type='cuda', index=0), device(type='cuda', index=1)


Processed 50 datapoints...
Processed 60 datapoints...
Processed 70 datapoints...
Processed 80 datapoints...
Processed 90 datapoints...
Processed 100 datapoints...
Processed 110 datapoints...
Processed 120 datapoints...
Processed 130 datapoints...
Processed 140 datapoints...
Processed 150 datapoints...
Processed 160 datapoints...
Processed 170 datapoints...
Processed 180 datapoints...
Processed 190 datapoints...
Processed 200 datapoints...
Samples Evaluated: 200
Avg Semantic Similarity: 0.704
Avg ROUGE-L: 0.131
Judge Verdicts: {'consistent': 200, 'hallucinated': 0, 'unknown': 0}

=== Benchmarking google/gemma-2b with LLM Judge ===


tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Processed 10 datapoints...
Processed 20 datapoints...
Processed 30 datapoints...
Processed 40 datapoints...


skipping cudagraphs due to skipping cudagraphs due to multiple devices: device(type='cuda', index=0), device(type='cuda', index=1)


Processed 50 datapoints...
Processed 60 datapoints...
Processed 70 datapoints...
Processed 80 datapoints...
Processed 90 datapoints...
Processed 100 datapoints...
Processed 110 datapoints...
Processed 120 datapoints...
Processed 130 datapoints...
Processed 140 datapoints...
Processed 150 datapoints...
Processed 160 datapoints...
Processed 170 datapoints...
Processed 180 datapoints...
Processed 190 datapoints...
Processed 200 datapoints...
Samples Evaluated: 200
Avg Semantic Similarity: 0.642
Avg ROUGE-L: 0.081
Judge Verdicts: {'consistent': 200, 'hallucinated': 0, 'unknown': 0}


In [24]:
# Clean Hugging Face cache and temp files
!rm -rf /root/.cache/huggingface/hub/*
!rm -rf /kaggle/temp/*

# Check available disk
!df -h


Filesystem                                                              Size  Used Avail Use% Mounted on
overlay                                                                 7.9T  6.3T  1.7T  80% /
tmpfs                                                                    64M     0   64M   0% /dev
shm                                                                      14G   24K   14G   1% /dev/shm
/dev/loop1                                                               20G   76K   20G   1% /kaggle/lib
192.168.5.2:/data/kagglesdsdata/datasets/8391833/13243919/dd8191qiwlg1   73T   54T   20T  74% /kaggle/input/domain
/dev/sda1                                                               122G   90G   33G  74% /opt/bin
/dev/mapper/snap                                                        7.9T  6.3T  1.7T  80% /etc/hosts
tmpfs                                                                    16G     0   16G   0% /proc/acpi
tmpfs                                                          

In [20]:
import json
with open("/kaggle/working/leaderboards_partial.json", "w") as f:
    json.dump(leaderboards, f, indent=2)


In [20]:
with open("/kaggle/input/leaderboard/leaderboards_partial.json", "r") as f:
    leaderboards = json.load(f)

In [21]:
print(leaderboards)

{'Judge A: MedGemma - 4B': [{'model': 'microsoft/phi-3-mini-4k-instruct', 'avg_similarity': 0.721, 'avg_rouge': 0.113, 'samples': 200, 'verdicts': {'consistent': 200, 'hallucinated': 0, 'unknown': 0}}, {'model': 'TinyLlama/TinyLlama-1.1B-Chat-v1.0', 'avg_similarity': 0.704, 'avg_rouge': 0.131, 'samples': 200, 'verdicts': {'consistent': 200, 'hallucinated': 0, 'unknown': 0}}, {'model': 'google/gemma-2b', 'avg_similarity': 0.642, 'avg_rouge': 0.081, 'samples': 200, 'verdicts': {'consistent': 200, 'hallucinated': 0, 'unknown': 0}}]}


In [22]:
judge_label = "Judge B: Meditron-7B"
judge_model_id = "epfl-llm/meditron-7b"
print(f"\n\n########## Using {judge_label} ({judge_model_id}) ##########\n")
judge_pipeline, judge_model, judge_tokenizer = load_judge(judge_model_id)

leaderboard = []
for model_id in MODELS:
    summary = benchmark_model_long_with_judge(
        model_id, data, judge_pipeline, max_samples=200
    )
    leaderboard.append(summary)

leaderboards[judge_label] = leaderboard

del judge_model, judge_tokenizer, judge_pipeline
cleanup()



########## Using Judge B: Meditron-7B (epfl-llm/meditron-7b) ##########



tokenizer_config.json:   0%|          | 0.00/4.08k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.85M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/344 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/736 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/610 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.84G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.90G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.92G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.84G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.91G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/262M [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.90G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]


=== Benchmarking microsoft/phi-3-mini-4k-instruct with LLM Judge ===


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

Processed 10 datapoints...
Processed 20 datapoints...
Processed 30 datapoints...
Processed 40 datapoints...
Processed 50 datapoints...
Processed 60 datapoints...
Processed 70 datapoints...
Processed 80 datapoints...
Processed 90 datapoints...
Processed 100 datapoints...
Processed 110 datapoints...
Processed 120 datapoints...
Processed 130 datapoints...
Processed 140 datapoints...
Processed 150 datapoints...
Processed 160 datapoints...
Processed 170 datapoints...
Processed 180 datapoints...
Processed 190 datapoints...
Processed 200 datapoints...
Samples Evaluated: 200
Avg Semantic Similarity: 0.721
Avg ROUGE-L: 0.113
Judge Verdicts: {'consistent': 200, 'hallucinated': 0, 'unknown': 0}

=== Benchmarking TinyLlama/TinyLlama-1.1B-Chat-v1.0 with LLM Judge ===


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Processed 10 datapoints...
Processed 20 datapoints...
Processed 30 datapoints...
Processed 40 datapoints...
Processed 50 datapoints...
Processed 60 datapoints...
Processed 70 datapoints...
Processed 80 datapoints...
Processed 90 datapoints...
Processed 100 datapoints...
Processed 110 datapoints...
Processed 120 datapoints...
Processed 130 datapoints...
Processed 140 datapoints...
Processed 150 datapoints...
Processed 160 datapoints...
Processed 170 datapoints...
Processed 180 datapoints...
Processed 190 datapoints...
Processed 200 datapoints...
Samples Evaluated: 200
Avg Semantic Similarity: 0.704
Avg ROUGE-L: 0.131
Judge Verdicts: {'consistent': 200, 'hallucinated': 0, 'unknown': 0}

=== Benchmarking google/gemma-2b with LLM Judge ===


tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Processed 10 datapoints...
Processed 20 datapoints...
Processed 30 datapoints...
Processed 40 datapoints...
Processed 50 datapoints...
Processed 60 datapoints...
Processed 70 datapoints...
Processed 80 datapoints...
Processed 90 datapoints...
Processed 100 datapoints...
Processed 110 datapoints...
Processed 120 datapoints...
Processed 130 datapoints...
Processed 140 datapoints...
Processed 150 datapoints...
Processed 160 datapoints...
Processed 170 datapoints...
Processed 180 datapoints...
Processed 190 datapoints...
Processed 200 datapoints...
Samples Evaluated: 200
Avg Semantic Similarity: 0.642
Avg ROUGE-L: 0.081
Judge Verdicts: {'consistent': 200, 'hallucinated': 0, 'unknown': 0}


In [23]:
for judge_label, leaderboard in leaderboards.items():
    print(f"\n=== Leaderboard ({judge_label}) ===")
    print(f"{'Model':40s} | {'Sim':>6s} | {'ROUGE-L':>8s} | {'Samples':>8s} | {'Consistent':>10s} | {'Halluc':>8s} | {'Unknown':>8s}")
    print("-" * 100)
    for row in leaderboard:
        print(f"{row['model']:40s} | {row['avg_similarity']:6.3f} | {row['avg_rouge']:8.3f} | {row['samples']:8d} | "
              f"{row['verdicts']['consistent']:10d} | {row['verdicts']['hallucinated']:8d} | {row['verdicts']['unknown']:8d}")


=== Leaderboard (Judge A: MedGemma - 4B) ===
Model                                    |    Sim |  ROUGE-L |  Samples | Consistent |   Halluc |  Unknown
----------------------------------------------------------------------------------------------------
microsoft/phi-3-mini-4k-instruct         |  0.721 |    0.113 |      200 |        200 |        0 |        0
TinyLlama/TinyLlama-1.1B-Chat-v1.0       |  0.704 |    0.131 |      200 |        200 |        0 |        0
google/gemma-2b                          |  0.642 |    0.081 |      200 |        200 |        0 |        0

=== Leaderboard (Judge B: Meditron-7B) ===
Model                                    |    Sim |  ROUGE-L |  Samples | Consistent |   Halluc |  Unknown
----------------------------------------------------------------------------------------------------
microsoft/phi-3-mini-4k-instruct         |  0.721 |    0.113 |      200 |        200 |        0 |        0
TinyLlama/TinyLlama-1.1B-Chat-v1.0       |  0.704 |    0.131 |    

In [26]:
import json
with open("/kaggle/working/leaderboards_full.json", "w") as f:
    json.dump(leaderboards, f, indent=2)