### Post processing text output of ID retrievals (gpt, llama)

In [166]:
'''
>>> Script for post processing gpt and llama ID retrievals...
It takes text files and converts them to json files for evaluation.

'''

import re
import json
from pathlib import Path

results_txt = Path("retrievals/txt/llama4.scout_id_retrieval_nl.txt") # change name of the file [llama3.3.70b_id_retrieval_{lang}.txt], [gpt4.1.mini_pw_retrievals_nl.txt], [qwen3-235B_id_retrieval_nl.txt]
output_json = Path("retrievals/json/llama4.scout_id_retrieval_nl.json") # output

pattern_query = re.compile(r"^query id:\s*(\d+)")
pattern_relevant = re.compile(r"^relevant articles:\s*(.*)")

results_dict = {}

with open(results_txt, encoding="utf-8") as f:
    lines = [line.strip() for line in f if line.strip()]

i = 0
while i < len(lines) - 1:
    m_query = pattern_query.match(lines[i])
    m_relevant = pattern_relevant.match(lines[i+1])
    if m_query and m_relevant:
        qid = m_query.group(1)
        relevant_articles = [x.strip() for x in m_relevant.group(1).split(",") if x.strip()]
        results_dict[qid] = relevant_articles
        i += 2
    else:
        i += 1

with open(output_json, "w", encoding="utf-8") as out:
    json.dump(results_dict, out, indent=2, ensure_ascii=False)

print(f"results_nl.json written to: {output_json}")

results_nl.json written to: retrievals/json/llama4.scout_id_retrieval_nl.json


### Post processing gpt binary-classification retrievals

In [220]:
import json

input_path = "retrievals/llama4.scout_bin_full_class_retrievals_nl.jsonl"
output_path = "retrievals/llama4.scout_bin_class_retrieval_nl_fixed.jsonl"

buffer = ""
results = []

with open(input_path, "r", encoding="utf-8") as f:
    for line in f:
        buffer += line
        if line.strip().endswith("}"):  # crude heuristic: object may be complete
            try:
                obj = json.loads(buffer)

                # --- Normalize relevance field ---
                if "relevance" in obj and isinstance(obj["relevance"], dict):
                    normalized_relevance = {}
                    for k, v in obj["relevance"].items():
                        # Convert both keys and values to strings
                        str_k = str(k)
                        str_v = str(v) if v in [0, 1, "0", "1"] else "?"
                        normalized_relevance[str_k] = str_v
                    obj["relevance"] = normalized_relevance

                results.append(obj)
                buffer = ""

            except json.JSONDecodeError:
                continue  # wait for more lines

# Write clean JSONL
with open(output_path, "w", encoding="utf-8") as out_f:
    for obj in results:
        json_line = json.dumps(obj, ensure_ascii=False)
        out_f.write(json_line + "\n")

print(f"✅ Rewritten to proper JSONL with string-normalized relevance. Total entries: {len(results)}")
print(f"📄 Saved to: {output_path}")

✅ Rewritten to proper JSONL with string-normalized relevance. Total entries: 204
📄 Saved to: retrievals/llama4.scout_bin_class_retrieval_nl_fixed.jsonl


In [221]:
'''
>>> Script for checking the LLM output. It checks if LLM changes or added or removed any article ids --> comparing to all hard negatives.
It also checks the values if they are only 0 or 1. No other characters or empty values. FOR GPT output...

'''

import json

gpt_output_path = "retrievals/llama4.scout_bin_class_retrieval_nl_fixed.jsonl"
hard_negatives_path = "../sampling_hard_negatives/hard_negatives/hard_negatives_nl.jsonl"

# load hard negatives
hard_negatives = {}
with open(hard_negatives_path, encoding="utf-8") as f:
    for line in f:
        entry = json.loads(line)
        hard_negatives[entry["query_id"]] = set(entry["candidate_docs"])

invalid_values_queries = []
missing_ids_queries = {}
extra_ids_queries = {}
valid_queries = []

with open(gpt_output_path, encoding="utf-8") as f:
    for idx, line in enumerate(f, 1):
        line = line.strip()
        if not line:
            continue

        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            print(f"Line {idx}: INVALID JSON")
            continue

        query_id = obj.get("query_id")
        relevance = obj.get("relevance", {})

        # check relevance values
        invalid_values = [v for v in relevance.values() if v not in ("0", "1")]
        if invalid_values:
            invalid_values_queries.append(query_id)

        # check candidate IDs
        expected_ids = hard_negatives.get(query_id)
        if not expected_ids:
            continue

        actual_ids = set(relevance.keys())

        missing = expected_ids - actual_ids
        extra = actual_ids - expected_ids

        if missing:
            missing_ids_queries[query_id] = missing
        if extra:
            extra_ids_queries[query_id] = extra
        if not invalid_values and not missing and not extra:
            valid_queries.append(query_id)

print("\n=== SUMMARY ===")
print(f"Total queries processed: {len(valid_queries) + len(invalid_values_queries) + len(missing_ids_queries) + len(extra_ids_queries)}")
print(f"Fully correct queries: {len(valid_queries)}")

if invalid_values_queries:
    print(f"\nQueries with invalid relevance values: {invalid_values_queries}")
if missing_ids_queries:
    print("\nQueries with missing article IDs:")
    for qid, ids in missing_ids_queries.items():
        print(f"  Query {qid}: Missing IDs: {ids}")
if extra_ids_queries:
    print("\nQueries with extra article IDs:")
    for qid, ids in extra_ids_queries.items():
        print(f"  Query {qid}: Extra IDs: {ids}")


=== SUMMARY ===
Total queries processed: 204
Fully correct queries: 204


In [222]:
'''
>>> Script for post ptocessing the binary classification (0/1) outputs and convert them to json retrievals. Only getting the 1 values --> relevant articles. 

'''

import json

input_path = "retrievals/llama4.scout_bin_class_retrieval_nl_fixed.jsonl"
output_path = "retrievals/json/llama4.scout_bin_class_retrieval_nl.json"

result = {}

with open(input_path, encoding="utf-8") as fin:
    for line in fin:
        line = line.strip()
        if not line:
            continue
        obj = json.loads(line)
        query_id = obj["query_id"]
        relevance = obj["relevance"]

        relevant_articles = [aid for aid, val in relevance.items() if val == "1"]
        result[query_id] = relevant_articles

with open(output_path, "w", encoding="utf-8") as fout:
    json.dump(result, fout, indent=2, ensure_ascii=False)


### checking all retrievals, if all queries are present.

In [175]:
import json
from pathlib import Path

# Paths — adjust if needed
#predictions_json = Path("retrievals/json/gemini_2.5_flash_id_retrieval_nl.json")
#predictions_json = Path("retrievals/json/gpt4.1.mini_id_retrievals_nl.json")
#predictions_json = Path("retrievals/json/gpt4o.mini_id_retrievals_nl.json")
#predictions_json = Path("retrievals/json/gpt4.1.mini_bin_class_retrievals_nl.json")
#predictions_json = Path("retrievals/json/gpt4o.mini_bin_class_retrievals_nl.json")
#predictions_json = Path("retrievals/json/gemini_2.5_flash_pro_id_retrieval_nl.json")
#predictions_json = Path("retrievals/json/gemini_2.5.flash_bin_class_retrieval_nl.json")
#predictions_json = Path("retrievals/json/gemini_2.5.pro_id_retrieval_nl.json")
#predictions_json = Path("retrievals/json/llama3.3.70b_id_retrieval_nl.json")
#predictions_json = Path("retrievals/json/llama3.3.70b_bin_class_retrieval_nl.json")
#predictions_json = Path("retrievals/json/qwen3.235b_bin_class_retrieval_nl.json")
predictions_json = Path("retrievals/json/llama4.scout_bin_class_retrieval_nl.json")

gold_json = Path("gold_data/gold_standard_nl.json")

# Load predictions
with open(predictions_json, encoding="utf-8") as f:
    predictions = json.load(f)

# Load gold
with open(gold_json, encoding="utf-8") as f:
    gold = json.load(f)

pred_ids = set(predictions.keys())
gold_ids = set(gold.keys())

missing_in_preds = gold_ids - pred_ids
extra_in_preds = pred_ids - gold_ids

print(f"Total gold queries: {len(gold_ids)}")
print(f"Total predicted queries: {len(pred_ids)}\n")

if missing_in_preds:
    print(f"Missing in predictions ({len(missing_in_preds)}):")
    for qid in sorted(missing_in_preds):
        print(f"  {qid}")
else:
    print("All gold query IDs are present in predictions.")

if extra_in_preds:
    print(f"\nExtra query IDs in predictions ({len(extra_in_preds)}):")
    for qid in sorted(extra_in_preds):
        print(f"  {qid}")
else:
    print("No extra query IDs in predictions.")

if not missing_in_preds and not extra_in_preds:
    print("\nPredictions file matches gold file perfectly.")
else:
    print("\nPlease fix mismatches before evaluating.")

Total gold queries: 203
Total predicted queries: 203

All gold query IDs are present in predictions.
No extra query IDs in predictions.

Predictions file matches gold file perfectly.


## Checking the Output of Sorted Ranked Lists

In [305]:
import json
import re
from pathlib import Path
from collections import Counter

# -------- CONFIG --------
output_file_path = Path("rankings/gpt4o-mini_sorted_ranking_nl.txt")
hard_negatives_path = Path("../sampling_hard_negatives/hard_negatives/hard_negatives_nl.jsonl")

# -------- LOAD HARD NEGATIVE SET --------
with open(hard_negatives_path, "r", encoding="utf-8") as f:
    hard_data = [json.loads(line) for line in f]

query_to_candidates = {
    str(entry["query_id"]).strip(): set(str(doc) for doc in entry["candidate_docs"])
    for entry in hard_data
}

# -------- TRY JSONL / JSON ARRAY PARSE --------
def try_json_parse(path):
    def get_field(obj, *keys):
        """Return the first available key value."""
        for k in keys:
            if k in obj:
                return obj[k]
        return None

    try:
        # Try JSON array
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        if isinstance(data, dict):
            data = [data]
        return {
            str(get_field(obj, "query_id", "query id")).strip():
                re.findall(r"\d+", str(get_field(obj, "ranked_articles", "ranked articles")))
            for obj in data
        }
    except json.JSONDecodeError:
        pass

    # Try JSONL
    parsed = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
                qid = get_field(obj, "query_id", "query id")
                ranks = get_field(obj, "ranked_articles", "ranked articles")
                if qid is not None:
                    parsed[str(qid).strip()] = re.findall(r"\d+", str(ranks))
            except json.JSONDecodeError:
                return None
    return parsed if parsed else None

queries = try_json_parse(output_file_path)

# -------- FALLBACK: MESSY GPT-STYLE TXT --------
if queries is None:
    print("Falling back to regex messy-text parser...")
    queries = {}
    current_qid = None
    current_ids = []

    with open(output_file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            # Detect query id (underscore or space)
            qid_match = re.search(r'"?query[_ ]id"?\s*:\s*"?(\d+)"?', line, re.IGNORECASE)
            if qid_match:
                if current_qid is not None:
                    queries[current_qid] = current_ids
                current_qid = qid_match.group(1)
                current_ids = []
                continue

            # Detect ranked articles line
            if "ranked articles" in line.lower():
                ids_in_line = re.findall(r"\d+", line)
                current_ids.extend(ids_in_line)
                continue

            # Detect standalone article IDs
            ids_in_line = re.findall(r"\d+", line)
            if ids_in_line:
                current_ids.extend(ids_in_line)

        # Save last query
        if current_qid is not None:
            queries[current_qid] = current_ids

print(f"Parsed {len(queries)} queries from {output_file_path}")



# -------- OVERALL VALIDATION --------
total_missing = 0
total_extra = 0
total_duplicates = 0
queries_with_issues = 0

for qid, candidates in query_to_candidates.items():
    predicted = queries.get(qid, [])
    pred_set = set(predicted)

    missing_ids = candidates - pred_set
    extra_ids = pred_set - candidates
    dupes = [doc for doc, count in Counter(predicted).items() if count > 1]

    total_missing += len(missing_ids)
    total_extra += len(extra_ids)
    total_duplicates += len(dupes)

    if len(predicted) != 100 or missing_ids or extra_ids or dupes:
        queries_with_issues += 1

# -------- REPORT --------
total_expected_articles = len(query_to_candidates) * 100

print("\n===== OVERALL REPORT =====")
print(f"Total queries in hard negatives: {len(query_to_candidates)}")
print(f"Total queries parsed: {len(queries)}")
print(f"Queries with any issues: {queries_with_issues} ({queries_with_issues / len(query_to_candidates) * 100:.2f}%)")
print(f"Missing article IDs: {total_missing} ({total_missing / total_expected_articles * 100:.2f}%)")
print(f"Extra (hallucinated) article IDs: {total_extra} ({total_extra / total_expected_articles * 100:.2f}%)")
print(f"Duplicate article IDs: {total_duplicates} ({total_duplicates / total_expected_articles * 100:.2f}%)")

Falling back to regex messy-text parser...
Parsed 203 queries from rankings/gpt4o-mini_sorted_ranking_nl.txt

===== OVERALL REPORT =====
Total queries in hard negatives: 203
Total queries parsed: 203
Queries with any issues: 202 (99.51%)
Missing article IDs: 10371 (51.09%)
Extra (hallucinated) article IDs: 6715 (33.08%)
Duplicate article IDs: 2606 (12.84%)


#### Step 2: Converting .txt Files to jsonl Files for Evaluation

In [306]:
import json
import re
from pathlib import Path

# -------- CONFIG --------
input_txt_path = Path("rankings/gpt4o-mini_sorted_ranking_nl.txt")
output_jsonl_path = Path("rankings/gpt4o-mini_sorted_ranking_nl.jsonl")

def extract_ids_any_format(ranked_field):
    """Extract article IDs from either a list, number string, or mixed format."""
    if isinstance(ranked_field, list):
        blob = " ".join(str(x) for x in ranked_field)
    else:
        blob = str(ranked_field)
    return re.findall(r"\d+", blob)

def get_field(obj, *possible_keys):
    for k in possible_keys:
        if k in obj:
            return obj[k]
    return None

jsonl_entries = []

with open(input_txt_path, "r", encoding="utf-8") as f:
    raw_text = f.read().strip()

parsed_qids = set()

# -------- TRY FULL JSON ARRAY PARSE --------
try:
    cleaned = re.sub(r"}\s*{", "},\n{", raw_text)
    cleaned = re.sub(r",\s*]", "]", cleaned)
    cleaned = re.sub(r",\s*}", "}", cleaned)
    if not cleaned.strip().startswith("["):
        cleaned = "[" + cleaned + "]"
    data = json.loads(cleaned)
    for obj in data:
        qid_raw = get_field(obj, "query_id", "query id")
        ranks_raw = get_field(obj, "ranked_articles", "ranked articles")
        if qid_raw is None:
            continue
        query_id = str(qid_raw).strip().strip('"').strip("'")
        ranks_list = extract_ids_any_format(ranks_raw)
        jsonl_entries.append({"query_id": query_id, "ranks": ranks_list})
        parsed_qids.add(query_id)
    print("Parsed using full JSON array method.")
except json.JSONDecodeError:
    print("Full JSON array parse failed, continuing...")

# -------- TRY JSON OBJECT REGEX EXTRACTION --------
json_objects = re.findall(r"\{.*?\}", raw_text, flags=re.DOTALL)
for obj_str in json_objects:
    try:
        obj = json.loads(obj_str)
    except json.JSONDecodeError:
        continue
    qid_raw = get_field(obj, "query_id", "query id")
    ranks_raw = get_field(obj, "ranked_articles", "ranked articles")
    if qid_raw is None:
        continue
    query_id = str(qid_raw).strip().strip('"').strip("'")
    if query_id in parsed_qids:
        continue
    ranks_list = extract_ids_any_format(ranks_raw)
    jsonl_entries.append({"query_id": query_id, "ranks": ranks_list})
    parsed_qids.add(query_id)
print("Parsed using JSON object extraction method (where possible).")

# -------- FALLBACK: RAW TEXT SCAN FOR ANY REMAINING QIDS --------
current_qid = None
current_ids = []
for line in raw_text.splitlines():
    line = line.strip()
    if not line:
        continue
    qid_match = re.search(r'"?query[_ ]id"?\s*:\s*"?(\d+)"?', line, re.IGNORECASE)
    if qid_match:
        if current_qid and current_qid not in parsed_qids:
            jsonl_entries.append({"query_id": current_qid, "ranks": current_ids})
            parsed_qids.add(current_qid)
        current_qid = qid_match.group(1)
        current_ids = []
        continue
    ids_in_line = re.findall(r"\d+", line)
    if ids_in_line:
        current_ids.extend(ids_in_line)
# flush last
if current_qid and current_qid not in parsed_qids:
    jsonl_entries.append({"query_id": current_qid, "ranks": current_ids})
    parsed_qids.add(current_qid)

# -------- SAVE JSONL --------
with open(output_jsonl_path, "w", encoding="utf-8") as f_out:
    for entry in jsonl_entries:
        f_out.write(json.dumps(entry, ensure_ascii=False) + "\n")

print(f"Converted {len(jsonl_entries)} queries to {output_jsonl_path}")

Full JSON array parse failed, continuing...
Parsed using JSON object extraction method (where possible).
Converted 203 queries to rankings/gpt4o-mini_sorted_ranking_nl.jsonl


#### Step 3: Validating the jsonl Files

In [307]:
import json
from pathlib import Path
from collections import Counter

# -------- CONFIG --------
rankings_jsonl_path = Path("rankings/gpt4o-mini_sorted_ranking_nl.jsonl")
hard_negatives_path = Path("../sampling_hard_negatives/hard_negatives/hard_negatives_nl.jsonl")
normalized_jsonl_path = Path("rankings/gpt4o-mini_sorted_ranking_nl_fixed.jsonl")  # output

PAD_TOKEN = "PAD"

# -------- LOAD RANKINGS --------
rankings = {}
with open(rankings_jsonl_path, "r", encoding="utf-8") as f:
    for line in f:
        entry = json.loads(line)
        rankings[str(entry["query_id"]).strip()] = [str(x).strip() for x in entry["ranks"]]

# -------- LOAD HARD NEGATIVES --------
hard_negatives = {}
with open(hard_negatives_path, "r", encoding="utf-8") as f:
    for line in f:
        entry = json.loads(line)
        hard_negatives[str(entry["query_id"]).strip()] = set(str(doc) for doc in entry["candidate_docs"])

# -------- STATS + NORMALIZATION --------
total_missing = 0
total_extra = 0
total_duplicates = 0
queries_with_issues = 0
queries_lt_100 = 0
queries_gt_100 = 0

missing_queries = set(hard_negatives.keys()) - set(rankings.keys())
extra_queries = set(rankings.keys()) - set(hard_negatives.keys())

normalized_entries = []

for qid, candidates in hard_negatives.items():
    predicted = rankings.get(qid, [])
    n_pred = len(predicted)

    # Count <100 and >100 BEFORE normalization
    if n_pred < 100:
        queries_lt_100 += 1
    elif n_pred > 100:
        queries_gt_100 += 1

    # Validation ignores PAD
    non_pad = [p for p in predicted if p != PAD_TOKEN]
    pred_set = set(non_pad)

    missing_ids = candidates - pred_set
    extra_ids = pred_set - candidates

    # Count duplicates (ignoring PAD)
    dupes = [doc for doc, cnt in Counter(non_pad).items() if cnt > 1]

    total_missing += len(missing_ids)
    total_extra += len(extra_ids)
    total_duplicates += len(dupes)

    if n_pred != 100 or missing_ids or extra_ids or dupes:
        queries_with_issues += 1

    # ---- Duplicate removal before normalization ----
    seen = set()
    deduped = []
    for doc_id in predicted:
        if doc_id in seen:
            deduped.append(PAD_TOKEN)  # replace duplicate with PAD
        else:
            deduped.append(doc_id)
            seen.add(doc_id)

    # ---- Normalize to exactly 100 ----
    if len(deduped) > 100:
        normalized = deduped[:100]
    elif len(deduped) < 100:
        normalized = deduped + [PAD_TOKEN] * (100 - len(deduped))
    else:
        normalized = deduped

    normalized_entries.append({"query_id": qid, "ranks": normalized})

# Also include any extra queries present only in rankings
for qid in sorted(extra_queries):
    pred = rankings[qid]

    # Remove duplicates first
    seen = set()
    deduped = []
    for doc_id in pred:
        if doc_id in seen:
            deduped.append(PAD_TOKEN)
        else:
            deduped.append(doc_id)
            seen.add(doc_id)

    if len(deduped) > 100:
        norm = deduped[:100]
    else:
        norm = deduped + [PAD_TOKEN] * (100 - len(deduped))
    normalized_entries.append({"query_id": qid, "ranks": norm})

# -------- WRITE NORMALIZED JSONL --------
with open(normalized_jsonl_path, "w", encoding="utf-8") as f_out:
    for e in normalized_entries:
        f_out.write(json.dumps(e, ensure_ascii=False) + "\n")

# -------- REPORT --------
total_expected_articles = len(hard_negatives) * 100
pct = lambda x, d: (x / d * 100) if d else 0.0

print("\n===== OVERALL REPORT =====")
print(f"Total queries in hard negatives: {len(hard_negatives)}")
print(f"Total queries in rankings file: {len(rankings)}")
print(f"Missing queries: {len(missing_queries)}")
print(f"Extra queries: {len(extra_queries)}")
print(f"Queries with <100 IDs: {queries_lt_100} ({pct(queries_lt_100, len(hard_negatives)):.2f}%)")
print(f"Queries with >100 IDs: {queries_gt_100} ({pct(queries_gt_100, len(hard_negatives)):.2f}%)")
print(f"Queries with any issues: {queries_with_issues} ({pct(queries_with_issues, len(hard_negatives)):.2f}%)")
print(f"Missing article IDs: {total_missing} ({pct(total_missing, total_expected_articles):.2f}%)")
print(f"Extra (hallucinated) article IDs: {total_extra} ({pct(total_extra, total_expected_articles):.2f}%)")
print(f"Duplicate article IDs: {total_duplicates} ({pct(total_duplicates, total_expected_articles):.2f}%)")
print(f"\nNormalized rankings written to: {normalized_jsonl_path}")


===== OVERALL REPORT =====
Total queries in hard negatives: 203
Total queries in rankings file: 203
Missing queries: 0
Extra queries: 0
Queries with <100 IDs: 33 (16.26%)
Queries with >100 IDs: 168 (82.76%)
Queries with any issues: 202 (99.51%)
Missing article IDs: 10371 (51.09%)
Extra (hallucinated) article IDs: 6715 (33.08%)
Duplicate article IDs: 2606 (12.84%)

Normalized rankings written to: rankings/gpt4o-mini_sorted_ranking_nl_fixed.jsonl


### Step 4: Evaluation

In [316]:
import json
from pathlib import Path
import math
from tqdm import tqdm
from statistics import mean

'''
>>> scored_ranking Files
'''
#predictions_json = Path("rankings/scored/json/gpt4.1.mini.ranks.nl.jsonl") 
#predictions_json = Path("rankings/scored/json/gpt4o.mini.ranks.nl.jsonl") 

#predictions_json = Path("rankings/scored/json/gemini2.5.flash.ranks.nl.jsonl") 

#predictions_json = Path("rankings/scored/json/qwen3.235b.ranks.nl.jsonl") 
#predictions_json = Path("rankings/scored/json/llama3.3.70b.ranks.nl.jsonl") 
#predictions_json = Path("rankings/scored/json/llama4.scout.ranks.nl.jsonl") 


'''
>>> Sorted_ranking Files
'''
#predictions_json = Path("rankings/sorted/json/gemini2.5.flash_sorted_ranks_nl.jsonl") 
#predictions_json = Path("rankings/sorted/json/gpt4.1.mini_sorted_ranks_nl.jsonl") 
#predictions_json = Path("rankings/sorted/json/gpt4o-mini_sorted_ranks_nl.jsonl") 
#predictions_json = Path("rankings/sorted/json/qwen3.235b_sorted_ranks_nl.jsonl") 
#predictions_json = Path("rankings/sorted/json/llama4_scout_sorted_ranks_nl.jsonl") 
#predictions_json = Path("rankings/sorted/json/llama3.3_70b_sorted_ranks_nl.jsonl") 


'''
>>> Baselines
'''
#predictions_json = Path("rankings/sorted/json/me5_top100_ranks_nl.jsonl")
predictions_json = Path("rankings/sorted/json/jina_ranks_nl.jsonl")


gold_json = Path("gold_data/gold_standard_nl.json")
output_dir = Path("evaluation")
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "eval_sorted_ranking_jina.txt" 

ks = [1, 5, 10, 20, 50, 100]

# load predictions from JSONL
predictions = {}
with open(predictions_json, encoding="utf-8") as f:
    for line in f:
        item = json.loads(line)
        predictions[item["query_id"]] = item["ranks"]

# load gold
with open(gold_json, encoding="utf-8") as f:
    gold = json.load(f)

print(f"Loaded predictions for {len(predictions)} queries")
print(f"Loaded gold standard for {len(gold)} queries")

metrics = {f"R@{k}": [] for k in ks}
metrics.update({f"MRR@{k}": [] for k in ks})
metrics.update({f"MAP@{k}": [] for k in ks})
metrics.update({f"nDCG@{k}": [] for k in ks})

def compute_hits(relevant, predicted, k):
    hits = [1 if doc in relevant else 0 for doc in predicted[:k]]
    return hits

for qid in tqdm(predictions.keys(), desc="Evaluating"):
    pred = predictions[qid]
    gold_set = set(gold[qid])

    for k in ks:
        hits = compute_hits(gold_set, pred, k)

        recall = sum(hits) / len(gold_set) if gold_set else 0.0
        metrics[f"R@{k}"].append(recall)

        rr = 0.0
        for rank, h in enumerate(hits):
            if h:
                rr = 1.0 / (rank + 1)
                break
        metrics[f"MRR@{k}"].append(rr)

        ap = 0.0
        hit_count = 0
        for rank, h in enumerate(hits):
            if h:
                hit_count += 1
                ap += hit_count / (rank + 1)
        ap /= len(gold_set) if gold_set else 1
        metrics[f"MAP@{k}"].append(ap)

        dcg = 0.0
        for i, h in enumerate(hits):
            if h:
                dcg += 1.0 / (math.log2(i + 2))  # +2 because log2(rank+1), and rank = 0-based

        ideal_hits = [1] * min(len(gold_set), k)
        idcg = sum(1.0 / math.log2(i + 2) for i in range(len(ideal_hits)))
        ndcg = dcg / idcg if idcg > 0 else 0.0
        metrics[f"nDCG@{k}"].append(ndcg)

print()
with open(output_file, "w", encoding="utf-8") as out:
    for m, vals in metrics.items():
        line = f"{m}: {mean(vals):.4f}"
        print(line)
        out.write(line + "\n")

print(f"\nEvaluation results saved to: {output_file}")

Loaded predictions for 203 queries
Loaded gold standard for 203 queries


Evaluating: 100%|██████████| 203/203 [00:00<00:00, 19534.35it/s]


R@1: 0.1572
R@5: 0.4061
R@10: 0.5237
R@20: 0.6701
R@50: 0.8529
R@100: 1.0066
MRR@1: 0.3005
MRR@5: 0.4089
MRR@10: 0.4214
MRR@20: 0.4299
MRR@50: 0.4335
MRR@100: 0.4346
MAP@1: 0.1572
MAP@5: 0.2725
MAP@10: 0.3016
MAP@20: 0.3208
MAP@50: 0.3334
MAP@100: 0.3393
nDCG@1: 0.3005
nDCG@5: 0.3463
nDCG@10: 0.3905
nDCG@20: 0.4376
nDCG@50: 0.4858
nDCG@100: 0.5209

Evaluation results saved to: evaluation/eval_sorted_ranking_jina.txt





In [318]:
import json
import math
from pathlib import Path
from tqdm import tqdm
from statistics import mean
import pandas as pd

# --- CONFIG ---
prediction_files = [
    "rankings/scored/json/gpt4.1.mini.ranks.nl.jsonl",
    "rankings/scored/json/gpt4o.mini.ranks.nl.jsonl",
    "rankings/scored/json/gemini2.5.flash.ranks.nl.jsonl",
    "rankings/scored/json/qwen3.235b.ranks.nl.jsonl",
    "rankings/scored/json/llama3.3.70b.ranks.nl.jsonl",
    "rankings/scored/json/llama4.scout.ranks.nl.jsonl",
    "rankings/sorted/json/gemini2.5.flash_sorted_ranks_nl.jsonl",
    "rankings/sorted/json/gpt4.1.mini_sorted_ranks_nl.jsonl",
    "rankings/sorted/json/gpt4o-mini_sorted_ranks_nl.jsonl",
    "rankings/sorted/json/qwen3.235b_sorted_ranks_nl.jsonl",
    "rankings/sorted/json/llama4_scout_sorted_ranks_nl.jsonl",
    "rankings/sorted/json/llama3.3_70b_sorted_ranks_nl.jsonl",
    "rankings/sorted/json/me5_top100_ranks_nl.jsonl",
    "rankings/sorted/json/jina_ranks_nl.jsonl"
]

gold_json = Path("gold_data/gold_standard_nl.json")
output_dir = Path("evaluation")
output_dir.mkdir(parents=True, exist_ok=True)

output_txt = output_dir / "eval_all_models.txt"
output_csv = output_dir / "eval_all_models.csv"

ks = [1, 5, 10, 20, 50, 100]

# Load gold
with open(gold_json, encoding="utf-8") as f:
    gold = json.load(f)

def compute_hits(relevant, predicted, k):
    return [1 if doc in relevant else 0 for doc in predicted[:k]]

results_table = []

for file_path in prediction_files:
    file_path = Path(file_path)
    model_name = file_path.stem  # e.g. "gemini2.5.flash_sorted_ranks_nl"

    # Load predictions
    predictions = {}
    with open(file_path, encoding="utf-8") as f:
        for line in f:
            item = json.loads(line)
            predictions[item["query_id"]] = item["ranks"]

    print(f"Evaluating {model_name}: {len(predictions)} queries loaded.")

    metrics = {f"R@{k}": [] for k in ks}
    metrics.update({f"MRR@{k}": [] for k in ks})
    metrics.update({f"MAP@{k}": [] for k in ks})
    metrics.update({f"nDCG@{k}": [] for k in ks})

    for qid in tqdm(predictions.keys(), desc=f"Evaluating {model_name}"):
        pred = predictions[qid]
        gold_set = set(gold[qid])

        for k in ks:
            hits = compute_hits(gold_set, pred, k)

            recall = sum(hits) / len(gold_set) if gold_set else 0.0
            metrics[f"R@{k}"].append(recall)

            rr = 0.0
            for rank, h in enumerate(hits):
                if h:
                    rr = 1.0 / (rank + 1)
                    break
            metrics[f"MRR@{k}"].append(rr)

            ap = 0.0
            hit_count = 0
            for rank, h in enumerate(hits):
                if h:
                    hit_count += 1
                    ap += hit_count / (rank + 1)
            ap /= len(gold_set) if gold_set else 1
            metrics[f"MAP@{k}"].append(ap)

            dcg = 0.0
            for i, h in enumerate(hits):
                if h:
                    dcg += 1.0 / (math.log2(i + 2))
            ideal_hits = [1] * min(len(gold_set), k)
            idcg = sum(1.0 / math.log2(i + 2) for i in range(len(ideal_hits)))
            ndcg = dcg / idcg if idcg > 0 else 0.0
            metrics[f"nDCG@{k}"].append(ndcg)

    # Add to results table
    results_table.append({
        "Model": model_name,
        "R@1": mean(metrics["R@1"]),
        "R@5": mean(metrics["R@5"]),
        "R@10": mean(metrics["R@10"]),
        "MRR@10": mean(metrics["MRR@10"]),
        "MAP@10": mean(metrics["MAP@10"]),
        "nDCG@1": mean(metrics["nDCG@1"]),
        "nDCG@10": mean(metrics["nDCG@10"]),
        "nDCG@100": mean(metrics["nDCG@100"]),
    })

# Save TXT
with open(output_txt, "w", encoding="utf-8") as f:
    header = "Model\tR@1\tR@5\tR@10\tMRR@10\tMAP@10\tnDCG@1\tnDCG@10\tnDCG@100\n"
    f.write(header)
    for row in results_table:
        f.write(
            f"{row['Model']}\t"
            f"{row['R@1']:.4f}\t{row['R@5']:.4f}\t{row['R@10']:.4f}\t"
            f"{row['MRR@10']:.4f}\t{row['MAP@10']:.4f}\t"
            f"{row['nDCG@1']:.4f}\t{row['nDCG@10']:.4f}\t{row['nDCG@100']:.4f}\n"
        )

# Save CSV
df = pd.DataFrame(results_table)
df.to_csv(output_csv, index=False)

print(f"\n✅ Evaluation done for {len(prediction_files)} models.")
print(f"TXT table saved to: {output_txt}")
print(f"CSV table saved to: {output_csv}")

Evaluating gpt4.1.mini.ranks.nl: 203 queries loaded.


Evaluating gpt4.1.mini.ranks.nl: 100%|██████████| 203/203 [00:00<00:00, 30799.19it/s]


Evaluating gpt4o.mini.ranks.nl: 203 queries loaded.


Evaluating gpt4o.mini.ranks.nl: 100%|██████████| 203/203 [00:00<00:00, 26982.85it/s]


Evaluating gemini2.5.flash.ranks.nl: 203 queries loaded.


Evaluating gemini2.5.flash.ranks.nl: 100%|██████████| 203/203 [00:00<00:00, 29739.56it/s]


Evaluating qwen3.235b.ranks.nl: 203 queries loaded.


Evaluating qwen3.235b.ranks.nl: 100%|██████████| 203/203 [00:00<00:00, 35445.81it/s]


Evaluating llama3.3.70b.ranks.nl: 203 queries loaded.


Evaluating llama3.3.70b.ranks.nl: 100%|██████████| 203/203 [00:00<00:00, 35670.03it/s]


Evaluating llama4.scout.ranks.nl: 203 queries loaded.


Evaluating llama4.scout.ranks.nl: 100%|██████████| 203/203 [00:00<00:00, 33498.99it/s]


Evaluating gemini2.5.flash_sorted_ranks_nl: 203 queries loaded.


Evaluating gemini2.5.flash_sorted_ranks_nl: 100%|██████████| 203/203 [00:00<00:00, 36411.38it/s]


Evaluating gpt4.1.mini_sorted_ranks_nl: 203 queries loaded.


Evaluating gpt4.1.mini_sorted_ranks_nl: 100%|██████████| 203/203 [00:00<00:00, 28027.38it/s]


Evaluating gpt4o-mini_sorted_ranks_nl: 203 queries loaded.


Evaluating gpt4o-mini_sorted_ranks_nl: 100%|██████████| 203/203 [00:00<00:00, 37592.99it/s]


Evaluating qwen3.235b_sorted_ranks_nl: 203 queries loaded.


Evaluating qwen3.235b_sorted_ranks_nl: 100%|██████████| 203/203 [00:00<00:00, 35910.74it/s]


Evaluating llama4_scout_sorted_ranks_nl: 203 queries loaded.


Evaluating llama4_scout_sorted_ranks_nl: 100%|██████████| 203/203 [00:00<00:00, 29945.62it/s]


Evaluating llama3.3_70b_sorted_ranks_nl: 203 queries loaded.


Evaluating llama3.3_70b_sorted_ranks_nl: 100%|██████████| 203/203 [00:00<00:00, 35809.55it/s]


Evaluating me5_top100_ranks_nl: 203 queries loaded.


Evaluating me5_top100_ranks_nl: 100%|██████████| 203/203 [00:00<00:00, 34988.44it/s]


Evaluating jina_ranks_nl: 203 queries loaded.


Evaluating jina_ranks_nl: 100%|██████████| 203/203 [00:00<00:00, 36006.42it/s]


✅ Evaluation done for 14 models.
TXT table saved to: evaluation/eval_all_models.txt
CSV table saved to: evaluation/eval_all_models.csv





## chacking and post-processing the scored_ranking results

In [236]:
import os
import json
from pathlib import Path
from tqdm import tqdm

# === CONFIGURATION ===
output_path = Path(f"rankings/scored/llama4.scout_score_ranking_nl.jsonl")
hard_negatives_path = Path(f"../sampling_hard_negatives/hard_negatives/hard_negatives_nl.jsonl")
fixed_output_path = Path(f"rankings/scored/llama4.scout_score_ranking_nl_sorted.jsonl")


# === LOAD HARD NEGATIVES (GROUND TRUTH) ===
with open(hard_negatives_path, "r", encoding="utf-8") as f:
    hard_negatives = [json.loads(line) for line in f]

expected_query_ids = set()
expected_doc_ids_per_query = {}

for entry in hard_negatives:
    qid = str(entry["query_id"])
    expected_query_ids.add(qid)
    expected_doc_ids_per_query[qid] = set(entry["candidate_docs"])


# === READ AND FIX MALFORMED JSONL ===
fixed_entries = []
current_entry_lines = []

with open(output_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line.startswith('{') and current_entry_lines:
            try:
                entry = json.loads(' '.join(current_entry_lines))
                fixed_entries.append(entry)
            except json.JSONDecodeError:
                print("Skipping malformed entry.")
            current_entry_lines = [line]
        else:
            current_entry_lines.append(line)

    # Process last entry
    if current_entry_lines:
        try:
            entry = json.loads(' '.join(current_entry_lines))
            fixed_entries.append(entry)
        except json.JSONDecodeError:
            print("Skipping final malformed entry.")


# === SORT AND VALIDATE ===
valid_entries = []

for entry in tqdm(fixed_entries, desc="Validating and sorting"):
    qid = entry.get("query_id")
    scores = entry.get("relevance_scores", {})
    if not qid or qid not in expected_query_ids:
        print(f"Skipping unknown or missing query ID: {qid}")
        continue

    if set(scores.keys()) != expected_doc_ids_per_query[qid]:
        print(f"Skipping query {qid}: mismatched article IDs.")
        continue

    # Sort by score descending
        # Convert all scores to integers (handle str/int mix)
    try:
        int_scores = {doc_id: int(score) for doc_id, score in scores.items()}
    except ValueError:
        print(f"Skipping query {qid}: contains non-integer-convertible score.")
        continue

    # Sort by descending score
    sorted_scores = dict(sorted(int_scores.items(), key=lambda x: -x[1]))
    valid_entries.append({
        "query_id": qid,
        "relevance_scores": sorted_scores
    })


# === WRITE FIXED JSONL ===
fixed_output_path.parent.mkdir(parents=True, exist_ok=True)
with open(fixed_output_path, "w", encoding="utf-8") as f_out:
    for entry in valid_entries:
        f_out.write(json.dumps(entry) + "\n")

print(f"\nFixed and validated output written to: {fixed_output_path}")

# === WRITE SECOND OUTPUT (ONLY RANKED DOC IDS, NO SCORES) ===
ranks_only_output_path = Path("rankings/scored/llama4.scout.ranks.nl.jsonl")

with open(ranks_only_output_path, "w", encoding="utf-8") as f_out:
    for entry in valid_entries:
        qid = entry["query_id"]
        ranked_ids = list(entry["relevance_scores"].keys())  # Already sorted
        f_out.write(json.dumps({
            "query_id": qid,
            "ranks": ranked_ids
        }) + "\n")

print(f"Ranks-only output written to: {ranks_only_output_path}")

# === REPORT MISSING QUERIES ===
fixed_query_ids = {entry["query_id"] for entry in valid_entries}
missing_query_ids = sorted(expected_query_ids - fixed_query_ids)

if missing_query_ids:
    print(f"\n⚠ Missing query IDs ({len(missing_query_ids)}):")
    print(missing_query_ids)
else:
    print("\nAll query IDs accounted for.")

FileNotFoundError: [Errno 2] No such file or directory: 'rankings/scored/llama4.scout_score_ranking_nl.jsonl'

In [224]:
import os
import json
import pandas as pd

# Path to your evaluation directory
evaluation_folder = "retrievals/evaluation"

# List all .txt files in the folder
files = [f for f in os.listdir(evaluation_folder) if f.endswith(".txt")]

# Prepare lists to collect macro and micro data
macro_data = []
micro_data = []

# Function to clean and round values
def parse_score(value):
    num = float(value)
    if num > 1.0 and num <= 100:
        num = num / 1000  # e.g., 322 -> 0.322
    return round(num, 2)  # keep 2 digits

for filename in files:
    filepath = os.path.join(evaluation_folder, filename)
    
    with open(filepath, "r") as f:
        # Strip and remove empty lines
        lines = [line.strip() for line in f if line.strip()]
    
    # Find the relevant sections
    macro_index = lines.index("Macro-averaged metrics:")
    micro_index = lines.index("Micro-averaged metrics:")
    
    macro_precision = parse_score(lines[macro_index + 1].split(":")[1].strip())
    macro_recall = parse_score(lines[macro_index + 2].split(":")[1].strip())
    macro_f1 = parse_score(lines[macro_index + 3].split(":")[1].strip())
    
    micro_precision = parse_score(lines[micro_index + 1].split(":")[1].strip())
    micro_recall = parse_score(lines[micro_index + 2].split(":")[1].strip())
    micro_f1 = parse_score(lines[micro_index + 3].split(":")[1].strip())

    # Extract model name and method
    name = filename.replace(".txt", "")
    if "id_retr_" in name:
        method = "ID-Retrieval"
        model = name.split("id_retr_")[-1]
    elif "bin_retr_" in name:
        method = "Relevance-Classification"
        model = name.split("bin_retr_")[-1]
    else:
        method = "Unknown"
        model = name

    # Append to data lists
    macro_data.append({
        "Model": model,
        "Method": method,
        "Precision": macro_precision,
        "Recall": macro_recall,
        "F1-Score": macro_f1
    })

    micro_data.append({
        "Model": model,
        "Method": method,
        "Precision": micro_precision,
        "Recall": micro_recall,
        "F1-Score": micro_f1
    })

# Save to CSV
df_macro = pd.DataFrame(macro_data)
df_micro = pd.DataFrame(micro_data)

df_macro.to_csv(os.path.join(evaluation_folder, "macro_scores.csv"), index=False, float_format="%.2f")
df_micro.to_csv(os.path.join(evaluation_folder, "micro_scores.csv"), index=False, float_format="%.2f")

print("✅ CSV files saved:")
print(" - macro_scores.csv")
print(" - micro_scores.csv")

✅ CSV files saved:
 - macro_scores.csv
 - micro_scores.csv
