### Post processing gpt pair-wise retrievals

In [None]:
'''
>>> Script for post-processing GPT pair-wise retrievals.
It reads the text file with the model output and converts it to a JSON file.

'''

import re
import json
from pathlib import Path

# Raw model transcript (input) and the JSON mapping derived from it (output).
results_txt = Path("retrievals/gpt4.1.mini_pw_retrievals_nl.txt") # change name of the file
output_json = Path("retrievals/gpt4.1.mini_pw_retrievals_nl.json") # output

# A record is a "query id: <digits>" line directly followed by a
# "relevant articles: <comma-separated ids>" line.
pattern_query = re.compile(r"^query id:\s*(\d+)")
pattern_relevant = re.compile(r"^relevant articles:\s*(.*)")

# Read the transcript, trimming surrounding whitespace and dropping blank lines.
lines = []
with open(results_txt, encoding="utf-8") as f:
    for raw_line in f:
        trimmed = raw_line.strip()
        if trimmed:
            lines.append(trimmed)

results_dict = {}

# Inspect every pair of adjacent lines. A "relevant articles:" line can never
# match the query pattern, so looking at all adjacent pairs yields exactly the
# same records as jumping two lines ahead after each successful match.
for header_line, articles_line in zip(lines, lines[1:]):
    m_query = pattern_query.match(header_line)
    m_relevant = pattern_relevant.match(articles_line)
    if m_query is None or m_relevant is None:
        continue

    article_ids = []
    for piece in m_relevant.group(1).split(","):
        piece = piece.strip()
        if piece:
            article_ids.append(piece)

    # A later record with the same query id replaces the earlier one.
    results_dict[m_query.group(1)] = article_ids

# Serialise the mapping {query id -> list of relevant article ids}.
serialized = json.dumps(results_dict, indent=2, ensure_ascii=False)
with open(output_json, "w", encoding="utf-8") as out:
    out.write(serialized)

print(f"results_nl.json written to: {output_json}")

### Post processing gpt binary-classification retrievals

In [127]:
'''
>>> Script for checking the LLM output. It checks whether the LLM changed, added, or removed any article IDs compared to the full set of hard negatives.
It also checks that the values are only 0 or 1, with no other characters or empty values. For GPT output.

'''

import json

gpt_output_path = "retrievals/gemini_2.5_flash_bin_class_retrievals_nl.jsonl"
hard_negatives_path = "../sampling_hard_negatives/hard_negatives/hard_negatives_nl.jsonl"

# Load the expected candidate IDs per query: {query_id -> set of candidate docs}.
hard_negatives = {}
with open(hard_negatives_path, encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline) in the JSONL file.
            continue
        entry = json.loads(line)
        hard_negatives[entry["query_id"]] = set(entry["candidate_docs"])

invalid_values_queries = []   # queries with a relevance value other than "0"/"1"
missing_ids_queries = {}      # query_id -> candidate IDs absent from the output
extra_ids_queries = {}        # query_id -> IDs in the output that are not candidates
valid_queries = []            # queries that passed every check
unknown_queries = []          # queries with no entry in the hard-negatives file
invalid_json_lines = []       # 1-based line numbers that failed to parse
total_processed = 0           # every successfully parsed record, counted once

with open(gpt_output_path, encoding="utf-8") as f:
    for idx, line in enumerate(f, 1):
        line = line.strip()
        if not line:
            continue

        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            print(f"Line {idx}: INVALID JSON")
            invalid_json_lines.append(idx)
            continue

        # Count each record exactly once. Summing the per-category containers
        # would double-count a query that has several problems at once and
        # would miss queries that are skipped below.
        total_processed += 1

        query_id = obj.get("query_id")
        relevance = obj.get("relevance", {})

        # check relevance values
        if isinstance(relevance, dict):
            invalid_values = [v for v in relevance.values() if v not in ("0", "1")]
        else:
            # A non-mapping "relevance" (e.g. null) cannot hold valid labels.
            invalid_values = [relevance]
            relevance = {}
        if invalid_values:
            invalid_values_queries.append(query_id)

        # check candidate IDs
        expected_ids = hard_negatives.get(query_id)
        if expected_ids is None:
            # Surface the mismatch instead of dropping the query silently.
            unknown_queries.append(query_id)
            continue

        actual_ids = set(relevance.keys())

        missing = expected_ids - actual_ids
        extra = actual_ids - expected_ids

        if missing:
            missing_ids_queries[query_id] = missing
        if extra:
            extra_ids_queries[query_id] = extra
        if not invalid_values and not missing and not extra:
            valid_queries.append(query_id)

print("\n=== SUMMARY ===")
print(f"Total queries processed: {total_processed}")
print(f"Fully correct queries: {len(valid_queries)}")

if invalid_json_lines:
    print(f"\nLines with invalid JSON: {invalid_json_lines}")
if unknown_queries:
    print(f"\nQueries not found in hard negatives: {unknown_queries}")
if invalid_values_queries:
    print(f"\nQueries with invalid relevance values: {invalid_values_queries}")
if missing_ids_queries:
    print("\nQueries with missing article IDs:")
    for qid, ids in missing_ids_queries.items():
        print(f"  Query {qid}: Missing IDs: {ids}")
if extra_ids_queries:
    print("\nQueries with extra article IDs:")
    for qid, ids in extra_ids_queries.items():
        print(f"  Query {qid}: Extra IDs: {ids}")


=== SUMMARY ===
Total queries processed: 203
Fully correct queries: 203


In [129]:
'''
>>> Script for post-processing the binary classification (0/1) outputs and converting them to JSON retrievals. Only the articles with value 1 (the relevant articles) are kept.

'''

import json

# JSONL with one {"query_id": ..., "relevance": {article_id: "0"/"1"}} record
# per line (input) and the JSON retrieval file to produce (output).
input_path = "retrievals/xtra/gemini_2.5_flash_bin_class_retrievals_nl.jsonl"
output_path = "retrievals/json/gemini_2.5.flash_bin_class_retrieval_nl.json"

result = {}

with open(input_path, encoding="utf-8") as fin:
    # Strip every line and ignore the ones that end up empty.
    for payload in map(str.strip, fin):
        if payload == "":
            continue
        record = json.loads(payload)
        query_key = record["query_id"]
        labels = record["relevance"]

        # Keep only the article ids labelled "1" (relevant), in file order.
        kept = []
        for article_id in labels:
            if labels[article_id] == "1":
                kept.append(article_id)
        result[query_key] = kept

# Write the mapping {query_id -> list of relevant article ids}.
serialized = json.dumps(result, indent=2, ensure_ascii=False)
with open(output_path, "w", encoding="utf-8") as fout:
    fout.write(serialized)


### Checking all retrievals: are all queries present?

In [2]:
import json
from pathlib import Path

# Paths — adjust if needed. Uncomment exactly one predictions file to check.
#predictions_json = Path("retrievals/json/gemini_2.5_flash_id_retrieval_nl.json")
#predictions_json = Path("retrievals/json/gpt4.1.mini_id_retrievals_nl.json")
#predictions_json = Path("retrievals/json/gpt4o.mini_id_retrievals_nl.json")
#predictions_json = Path("retrievals/json/gpt4.1.mini_bin_class_retrievals_nl.json")
#predictions_json = Path("retrievals/json/gpt4o.mini_bin_class_retrievals_nl.json")
#predictions_json = Path("retrievals/json/gemini_2.5_flash_pro_id_retrieval_nl.json")
#predictions_json = Path("retrievals/json/gemini_2.5.flash_bin_class_retrieval_nl.json")
predictions_json = Path("retrievals/json/gemini_2.5.flash_lite_id_retrieval_nl.json")

gold_json = Path("gold_data/gold_standard_nl.json")

# Both files are JSON objects keyed by query id; predictions are read first.
predictions = json.loads(predictions_json.read_text(encoding="utf-8"))
gold = json.loads(gold_json.read_text(encoding="utf-8"))

# Compare the two key sets in both directions.
pred_ids = set(predictions)
gold_ids = set(gold)

missing_in_preds = gold_ids.difference(pred_ids)
extra_in_preds = pred_ids.difference(gold_ids)

print(f"Total gold queries: {len(gold_ids)}")
print(f"Total predicted queries: {len(pred_ids)}\n")

# Gold queries that have no prediction.
if not missing_in_preds:
    print("All gold query IDs are present in predictions.")
else:
    print(f"Missing in predictions ({len(missing_in_preds)}):")
    print("\n".join(f"  {qid}" for qid in sorted(missing_in_preds)))

# Predicted queries that do not exist in the gold file.
if not extra_in_preds:
    print("No extra query IDs in predictions.")
else:
    print(f"\nExtra query IDs in predictions ({len(extra_in_preds)}):")
    print("\n".join(f"  {qid}" for qid in sorted(extra_in_preds)))

# Final verdict.
if missing_in_preds or extra_in_preds:
    print("\nPlease fix mismatches before evaluating.")
else:
    print("\nPredictions file matches gold file perfectly.")

Total gold queries: 203
Total predicted queries: 182

Missing in predictions (21):
  310
  316
  400
  412
  417
  420
  493
  512
  524
  585
  633
  634
  637
  7
  761
  822
  864
  867
  875
  934
  971
No extra query IDs in predictions.

Please fix mismatches before evaluating.


### Checking the output of rankings

In [69]:
import os
import json
from pathlib import Path
from tqdm import tqdm

# -------- CONFIG --------
lang = "nl"  # or 'fr'
output_file_path = Path(f"retrievals/txt/gemini2.0.flash_sorted_ranking_{lang}.txt") # gemini2.5.flash_sorted_ranking_{lang}.txt # gpt4o.mini_sorted_ranking_{lang}
hard_negatives_path = Path(f"../sampling_hard_negatives/hard_negatives/hard_negatives_{lang}.jsonl")

# -------- LOAD HARD NEGATIVE SET --------
with open(hard_negatives_path, "r", encoding="utf-8") as f:
    # Skip blank lines (e.g. a trailing newline) so json.loads does not fail.
    hard_data = [json.loads(line) for line in f if line.strip()]

# IDs parsed from the ranking text file below are always strings, so the
# reference IDs are compared as strings too. Without this, integer IDs in the
# JSONL file would make every predicted ID look hallucinated.
query_to_candidates = {
    str(entry["query_id"]): {str(doc_id) for doc_id in entry["candidate_docs"]}
    for entry in hard_data
}

# -------- PARSE MODEL OUTPUT --------
with open(output_file_path, "r", encoding="utf-8") as f:
    # Drop blank lines up front so they cannot shift the pairing below.
    lines = [line.strip() for line in f if line.strip()]

report = []
i = 0
# Scan for a "query id:" line directly followed by a "ranked articles:" line.
# On a mismatch advance by one line only, so a single stray line does not
# misalign (and silently drop) every record after it.
while i < len(lines) - 1:
    qid_line = lines[i]
    result_line = lines[i + 1]

    if not (qid_line.startswith("query id:") and result_line.startswith("ranked articles:")):
        i += 1
        continue
    i += 2

    qid = qid_line.split(":", 1)[1].strip()
    ids_str = result_line.split(":", 1)[1].strip()
    predicted_ids = [x.strip() for x in ids_str.split(",") if x.strip()]
    predicted_set = set(predicted_ids)

    # Unknown query ids fall back to an empty candidate set, so all of their
    # predicted IDs are reported as hallucinated.
    candidate_set = query_to_candidates.get(qid, set())

    hallucinated_ids = sorted(predicted_set - candidate_set)
    missing_ids = sorted(candidate_set - predicted_set)

    report.append({
        "query_id": qid,
        "n_predicted": len(predicted_ids),
        "n_candidates": len(candidate_set),
        "n_hallucinated": len(hallucinated_ids),
        "hallucinated_ids": hallucinated_ids,
        "n_missing": len(missing_ids),
        "missing_ids": missing_ids
    })

# -------- PRINT REPORT --------
# Only queries with at least one hallucinated ID are printed.
for entry in report:
    if entry["n_hallucinated"] > 0:
        print(f"Query {entry['query_id']}:")
        print(f"  Predicted {entry['n_predicted']} articles.")
        print(f"  Hallucinated IDs: {entry['hallucinated_ids']}")
        print(f"  Missing IDs: {entry['missing_ids']}")
        print()