### Post-processing GPT pair-wise retrievals

In [None]:
'''
>>> Post-processing of the GPT pair-wise retrievals.
Reads the plain-text results file and writes a JSON file that maps every
query id to the list of article ids reported as relevant for it.

'''

import re
import json
from pathlib import Path

results_txt = Path("retrievals/gpt4.1.mini_pw_retrievals_nl.txt") # change name of the file
output_json = Path("retrievals/gpt4.1.mini_pw_retrievals_nl.json") # output

# A record is a "query id: <digits>" line directly followed (blank lines are
# dropped beforehand) by a "relevant articles: a, b, c" line.
pattern_query = re.compile(r"^query id:\s*(\d+)")
pattern_relevant = re.compile(r"^relevant articles:\s*(.*)")

results_dict = {}

with open(results_txt, encoding="utf-8") as f:
    lines = [text for text in (raw.strip() for raw in f) if text]

# Walk over adjacent line pairs; the final line can only ever be the second
# half of a pair, hence the "- 1".
pos = 0
last_pair_start = len(lines) - 1
while pos < last_pair_start:
    query_match = pattern_query.match(lines[pos])
    articles_match = pattern_relevant.match(lines[pos + 1]) if query_match else None

    if not (query_match and articles_match):
        # Not the start of a complete record: slide forward by one line.
        pos += 1
        continue

    qid = query_match.group(1)
    # Comma-separated ids; empty fragments (e.g. a trailing comma) are dropped.
    pieces = (piece.strip() for piece in articles_match.group(1).split(","))
    relevant_articles = [piece for piece in pieces if piece]
    results_dict[qid] = relevant_articles
    pos += 2  # both lines of the record are consumed

with open(output_json, "w", encoding="utf-8") as out:
    json.dump(results_dict, out, indent=2, ensure_ascii=False)

print(f"results_nl.json written to: {output_json}")

### Post processing gpt binary-classification retrievals

In [None]:
'''
>>> Script for checking the LLM binary-relevance output (JSONL, one object per
line with a "query_id" and a "relevance" mapping of article id -> label).

For every query it checks that
  * the LLM did not change, add or remove any article ids compared to the
    candidate documents listed in the hard negatives file, and
  * every relevance value is exactly the string "0" or "1" (no other
    characters, no empty values).

'''

import json

gpt_output_path = "retrievals/gemini_2.5_pro_task2_bin_relevance_nl.jsonl"
hard_negatives_path = "../sampling_hard_negatives/hard_negatives/hard_negatives_nl.jsonl"

# load hard negatives
hard_negatives = {}
with open(hard_negatives_path, encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank (e.g. trailing) line is not a record; json.loads would raise on it.
            continue
        entry = json.loads(line)
        # Key by str so that int and str query ids from the two files compare equal.
        # NOTE(review): candidate ids are compared as-is against the JSON object
        # keys of the model output (always strings), so they are presumably
        # stored as strings in this file — confirm.
        hard_negatives[str(entry["query_id"])] = set(entry["candidate_docs"])

invalid_values_queries = []
missing_ids_queries = {}
extra_ids_queries = {}
valid_queries = []
unknown_queries = []  # query ids with no entry in the hard negatives file
total_processed = 0   # every output line that carries a query id, counted once

with open(gpt_output_path, encoding="utf-8") as f:
    for idx, line in enumerate(f, 1):
        line = line.strip()
        if not line:
            continue

        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            print(f"Line {idx}: INVALID JSON")
            continue

        if not isinstance(obj, dict) or obj.get("query_id") is None:
            # Without a query id the line cannot be checked; report it instead
            # of skipping it silently.
            print(f"Line {idx}: MISSING query_id")
            continue

        total_processed += 1
        query_id = str(obj["query_id"])
        relevance = obj.get("relevance", {})

        # A non-object "relevance" is flagged as invalid and treated as empty,
        # so all expected ids are additionally reported as missing below.
        malformed_relevance = not isinstance(relevance, dict)
        if malformed_relevance:
            relevance = {}

        # check relevance values
        invalid_values = malformed_relevance or any(
            v not in ("0", "1") for v in relevance.values()
        )
        if invalid_values:
            invalid_values_queries.append(query_id)

        # check candidate IDs
        expected_ids = hard_negatives.get(query_id)
        if expected_ids is None:
            unknown_queries.append(query_id)
            continue

        actual_ids = set(relevance.keys())

        missing = expected_ids - actual_ids
        extra = actual_ids - expected_ids

        if missing:
            missing_ids_queries[query_id] = missing
        if extra:
            extra_ids_queries[query_id] = extra
        if not invalid_values and not missing and not extra:
            valid_queries.append(query_id)

print("\n=== SUMMARY ===")
# Use the explicit counter: the problem collections overlap (one query can be
# invalid, missing and extra at once), so summing their sizes miscounts.
print(f"Total queries processed: {total_processed}")
print(f"Fully correct queries: {len(valid_queries)}")

if unknown_queries:
    print(f"\nQueries not found in hard negatives (IDs not checked): {unknown_queries}")
if invalid_values_queries:
    print(f"\nQueries with invalid relevance values: {invalid_values_queries}")
if missing_ids_queries:
    print("\nQueries with missing article IDs:")
    for qid, ids in missing_ids_queries.items():
        # key=str keeps the sort well-defined even if id types are mixed.
        print(f"  Query {qid}: Missing IDs: {sorted(ids, key=str)}")
if extra_ids_queries:
    print("\nQueries with extra article IDs:")
    for qid, ids in extra_ids_queries.items():
        print(f"  Query {qid}: Extra IDs: {sorted(ids, key=str)}")


=== SUMMARY ===
Total queries processed: 0
Fully correct queries: 0


In [None]:
# Validate a Gemini JSONL output file (each line is an object mapping
# query id -> {article id: "0"/"1"}) against the hard negatives: every query
# must contain exactly the candidate article ids and only "0"/"1" labels.
import json

# === CONFIGURATION ===
gemini_output_path = "retrievals/1.jsonl"
hard_negatives_path = "../sampling_hard_negatives/hard_negatives/hard_negatives_nl.jsonl"
max_queries_to_check = 150  # Set to None to process all

# === Load hard negatives ===
hard_negatives = {}
with open(hard_negatives_path, encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank (e.g. trailing) line is not a record; json.loads would raise on it.
            continue
        entry = json.loads(line)
        # JSON object keys in the model output are always strings, so key by str.
        qid = str(entry["query_id"])
        hard_negatives[qid] = set(entry["candidate_docs"])

# === Load Gemini output from JSONL ===
gemini_output = {}
with open(gemini_output_path, encoding="utf-8") as f:
    for idx, line in enumerate(f, 1):
        line = line.strip()
        if not line:
            continue
        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            print(f"Line {idx}: INVALID JSON")
            continue

        if not isinstance(obj, dict):
            # Valid JSON but not a {query_id: relevance} object.
            print(f"Line {idx}: NOT A JSON OBJECT")
            continue

        # A query id repeated on a later line overwrites the earlier entry.
        gemini_output.update(obj)

# === Validation ===
invalid_values_queries = []
missing_ids_queries = {}
extra_ids_queries = {}
valid_queries = []
unknown_queries = []  # query ids with no entry in the hard negatives file

query_ids = list(gemini_output.keys())
if max_queries_to_check is not None:
    query_ids = query_ids[:max_queries_to_check]

for query_id in query_ids:
    relevance = gemini_output[query_id]

    # A non-object relevance payload is flagged as invalid and treated as
    # empty, so all expected ids are additionally reported as missing below.
    malformed_relevance = not isinstance(relevance, dict)
    if malformed_relevance:
        relevance = {}

    # Check values
    invalid = malformed_relevance or any(
        v not in ("0", "1") for v in relevance.values()
    )
    if invalid:
        invalid_values_queries.append(query_id)

    expected_ids = hard_negatives.get(query_id)
    if expected_ids is None:
        # Record the query instead of dropping it silently from the report.
        unknown_queries.append(query_id)
        continue

    actual_ids = set(relevance.keys())
    missing = expected_ids - actual_ids
    extra = actual_ids - expected_ids

    if missing:
        missing_ids_queries[query_id] = missing
    if extra:
        extra_ids_queries[query_id] = extra

    if not invalid and not missing and not extra:
        valid_queries.append(query_id)

# === Report ===
print("\n=== SUMMARY ===")
# Count the queries actually iterated: the problem collections overlap (one
# query can be invalid, missing and extra at once), so summing them miscounts.
total = len(query_ids)
print(f"Total queries processed: {total}")
print(f"Fully correct queries: {len(valid_queries)}")

if unknown_queries:
    print(f"\nQueries not found in hard negatives (IDs not checked): {unknown_queries}")
if invalid_values_queries:
    print(f"\nQueries with invalid relevance values: {invalid_values_queries}")
if missing_ids_queries:
    print("\nQueries with missing article IDs:")
    for qid, ids in missing_ids_queries.items():
        print(f"  Query {qid}: Missing IDs: {sorted(ids)}")
if extra_ids_queries:
    print("\nQueries with extra article IDs:")
    for qid, ids in extra_ids_queries.items():
        print(f"  Query {qid}: Extra IDs: {sorted(ids)}")


=== SUMMARY ===
Total queries processed: 5
Fully correct queries: 5


In [42]:
'''
>>> Script for post-processing the binary classification (0/1) outputs and
converting them to JSON retrievals. Only the article ids labelled "1"
(relevant) are kept for each query.

'''

import json

input_path = "retrievals/xtra/gpt4.1.mini_bin_class_retrievals_nl.jsonl"
output_path = "retrievals/json/gpt4.1.mini_bin_class_retrievals_nl.json"

with open(input_path, encoding="utf-8") as fin:
    # Lazily strip and parse the file line by line; blank lines are skipped.
    stripped_lines = (raw.strip() for raw in fin)
    records = (json.loads(text) for text in stripped_lines if text)

    # Each record must provide "query_id" and "relevance"; a query id that
    # appears again on a later line replaces the earlier entry.
    result = {
        record["query_id"]: [
            aid for aid, val in record["relevance"].items() if val == "1"
        ]
        for record in records
    }

with open(output_path, "w", encoding="utf-8") as fout:
    json.dump(result, fout, indent=2, ensure_ascii=False)


### Checking that all queries are present in each retrievals file

In [53]:
# Compare the query ids of one predictions file against the gold standard and
# report ids that are missing from, or extra in, the predictions.
import json
from pathlib import Path

# Paths — adjust if needed
#predictions_json = Path("retrievals/json/gemini_2.5_flash_id_retrieval_nl.json")
#predictions_json = Path("retrievals/json/gpt4.1.mini_id_retrievals_nl.json")
#predictions_json = Path("retrievals/json/gpt4o.mini_id_retrievals_nl.json")
#predictions_json = Path("retrievals/json/gpt4.1.mini_bin_class_retrievals_nl.json")
#predictions_json = Path("retrievals/json/gpt4o.mini_bin_class_retrievals_nl.json")
predictions_json = Path("retrievals/json/gemini_2.5_flash_pro_id_retrieval_nl.json")
gold_json = Path("gold_data/gold_standard_nl.json")

# Load predictions, then gold; only the top-level keys (query ids) are used.
predictions = json.loads(predictions_json.read_text(encoding="utf-8"))
gold = json.loads(gold_json.read_text(encoding="utf-8"))

pred_ids, gold_ids = set(predictions.keys()), set(gold.keys())

missing_in_preds = gold_ids - pred_ids
extra_in_preds = pred_ids - gold_ids

print(f"Total gold queries: {len(gold_ids)}")
print(f"Total predicted queries: {len(pred_ids)}\n")

# One row per check: (offending ids, heading when non-empty, message when empty).
checks = (
    (
        missing_in_preds,
        f"Missing in predictions ({len(missing_in_preds)}):",
        "All gold query IDs are present in predictions.",
    ),
    (
        extra_in_preds,
        f"\nExtra query IDs in predictions ({len(extra_in_preds)}):",
        "No extra query IDs in predictions.",
    ),
)
for offending_ids, heading, all_clear in checks:
    if not offending_ids:
        print(all_clear)
        continue
    print(heading)
    for qid in sorted(offending_ids):
        print(f"  {qid}")

is_perfect_match = not (missing_in_preds or extra_in_preds)
print(
    "\nPredictions file matches gold file perfectly."
    if is_perfect_match
    else "\nPlease fix mismatches before evaluating."
)

Total gold queries: 203
Total predicted queries: 203

All gold query IDs are present in predictions.
No extra query IDs in predictions.

Predictions file matches gold file perfectly.
