### Join tests

Core Idea:
1. Only relevance scores linked to an existing query are needed.
2. Only pairs of `(relevance score, query)` to which a corpus document is linked are needed.
3. Remove all entries where `qid` is `NaN` (the right joins keep rows that have no matching query; these are not needed).


This results in the following two nested join operations (⟖ denotes a right outer join; the join key of each side is given in brackets):
- ( (relevance-scoring [topic_id] ⟖ queries [qid]) [doc_id]) ⟖ corpus [docno]


In [27]:
import json
import pandas as pd
import sys
from pathlib import Path
from IPython.display import display

from slt_positional_bias.dataset import generate_data_frame

# Relative paths of the three raw inputs used for the join tests.
CORPUS_PATH = "data/interim/inputs/corpus.jsonl"
QUERIES_PATH = "data/interim/inputs/queries.jsonl"
QRELS_PATH = "data/interim/inputs/qrels.rag24.test-umbrela-all.txt"

# Load each input into its own data frame (corpus first, then queries, then qrels).
corpus_df = generate_data_frame(CORPUS_PATH)
queries_df = generate_data_frame(QUERIES_PATH)
relevance_df = generate_data_frame(QRELS_PATH)

# Give the relevance frame descriptive column names; `topic_id` and `doc_id`
# are the keys used by the joins further down.
relevance_df.columns = ["topic_id", "q0", "doc_id", "scoring"]

# Show the corpus frame as the cell output.
corpus_df

[32m2025-07-03 12:18:18.589[0m | [1mINFO    [0m | [36mslt_positional_bias.dataset[0m:[36mgenerate_data_frame[0m:[36m20[0m - [1mfile path exists[0m
[32m2025-07-03 12:18:29.631[0m | [1mINFO    [0m | [36mslt_positional_bias.dataset[0m:[36mgenerate_data_frame[0m:[36m20[0m - [1mfile path exists[0m
[32m2025-07-03 12:18:29.639[0m | [1mINFO    [0m | [36mslt_positional_bias.dataset[0m:[36mgenerate_data_frame[0m:[36m20[0m - [1mfile path exists[0m


Unnamed: 0,docno,text,original_document
0,msmarco_v2.1_doc_52_1400719578#0_2842698387,"Taylor Swift's ""Invisible String"" Lyrics Meani...",{'doc_id': 'msmarco_v2.1_doc_52_1400719578#0_2...
1,msmarco_v2.1_doc_05_1495231781#3_2850145650,History of Ancient Rome for Kids: The Roman Em...,{'doc_id': 'msmarco_v2.1_doc_05_1495231781#3_2...
2,msmarco_v2.1_doc_12_1016316941#1_1992316811,Apiphobia and Spheksophobia - Fear Of Wasps An...,{'doc_id': 'msmarco_v2.1_doc_12_1016316941#1_1...
3,msmarco_v2.1_doc_34_1545449353#0_3251041439,Reies Tijerina | American activist | Britannic...,{'doc_id': 'msmarco_v2.1_doc_34_1545449353#0_3...
4,msmarco_v2.1_doc_23_1497031463#10_3313663249,"Salome: Was the ""dancing"" daughter of Herodias...",{'doc_id': 'msmarco_v2.1_doc_23_1497031463#10_...
...,...,...,...
116689,msmarco_v2.1_doc_56_94836053#11_205606001,TSHA | DeWitt's Colony DeWitt's Colony\nSuppor...,{'doc_id': 'msmarco_v2.1_doc_56_94836053#11_20...
116690,msmarco_v2.1_doc_44_596645538#4_1407144514,SAFE Investment Company (China) Definition SAF...,{'doc_id': 'msmarco_v2.1_doc_44_596645538#4_14...
116691,msmarco_v2.1_doc_49_1086472702#22_2229413969,Money is Everything - Physician on FIRE Money ...,{'doc_id': 'msmarco_v2.1_doc_49_1086472702#22_...
116692,msmarco_v2.1_doc_34_1540555152#9_3240635353,"Maximilien Robespierre | Biography, French Rev...",{'doc_id': 'msmarco_v2.1_doc_34_1540555152#9_3...


In [28]:
# Inspect the queries frame; its `qid` column is the key for the join with the relevance judgements.
queries_df

Unnamed: 0,qid,query,original_query
0,2024-145979,what is vicarious trauma and how can it be cop...,"{'query_id': '2024-145979', 'text': 'what is v..."
1,2024-216592,why disability insurance is a smart investment,"{'query_id': '2024-216592', 'text': 'why disab..."
2,2024-32912,how bad did the vietnam war devastate the econ...,"{'query_id': '2024-32912', 'text': 'how bad di..."
3,2024-153051,what target stors's policies for shoplifting,"{'query_id': '2024-153051', 'text': 'what targ..."
4,2024-128784,what can a career in civil engineering be like?,"{'query_id': '2024-128784', 'text': 'what can ..."
...,...,...,...
84,2024-219563,why free things have less percieved value,"{'query_id': '2024-219563', 'text': 'why free ..."
85,2024-121840,should teachers notify parents about state tes...,"{'query_id': '2024-121840', 'text': 'should te..."
86,2024-58819,how many latino have been misinformed about ed...,"{'query_id': '2024-58819', 'text': 'how many l..."
87,2024-42014,how does religion show in public school,"{'query_id': '2024-42014', 'text': 'how does r..."


In [29]:
# Inspect the relevance judgements with the renamed columns (topic_id, q0, doc_id, scoring).
relevance_df

Unnamed: 0,topic_id,q0,doc_id,scoring
0,2024-145979,Q0,msmarco_v2.1_doc_01_523681915#0_449763684,2
1,2024-145979,Q0,msmarco_v2.1_doc_13_1647729865#0_3617397938,3
2,2024-145979,Q0,msmarco_v2.1_doc_25_1148700328#0_2179512504,2
3,2024-145979,Q0,msmarco_v2.1_doc_25_1148700328#1_2179514079,2
4,2024-145979,Q0,msmarco_v2.1_doc_44_1043805224#2_2182641075,2
...,...,...,...,...
108473,2024-21669,Q0,msmarco_v2.1_doc_08_298503251#1_568833050,0
108474,2024-21669,Q0,msmarco_v2.1_doc_27_877180220#2_1765802089,1
108475,2024-21669,Q0,msmarco_v2.1_doc_01_1752638978#7_2584141250,1
108476,2024-21669,Q0,msmarco_v2.1_doc_09_914336366#11_1554208777,0


In [30]:
# Sanity check: `qid` must be unique, otherwise merging on it would duplicate relevance rows.
queries_df["qid"].is_unique

True

In [31]:
# First join: relevance judgements [topic_id] right-joined with queries [qid].
# The right join keeps every query; relevance rows whose topic_id has no
# matching query are dropped.
relevance_queries_merged_df = pd.merge(
    relevance_df,
    queries_df,
    how="right",
    left_on="topic_id",
    right_on="qid",
)

# Sanity check for the next join: `docno` must identify corpus documents uniquely.
corpus_df["docno"].is_unique

True

In [32]:
# Second join: (relevance, query) pairs [doc_id] right-joined with the corpus [docno].
# The right join keeps every corpus document; documents without a matching
# pair get NaN in the columns coming from the left-hand frame.
relevance_queries_corpus_merged = pd.merge(
    relevance_queries_merged_df,
    corpus_df,
    how="right",
    left_on="doc_id",
    right_on="docno",
)

# Check whether the join left any rows without a query.
relevance_queries_corpus_merged["qid"].isnull().any()

np.True_

In [33]:
# Keep only rows that are linked to a query, i.e. drop every row whose `qid` is NaN.
unified_df = relevance_queries_corpus_merged.dropna(subset=["qid"])

# Show the unified (relevance, query, corpus) frame as the cell output.
unified_df

Unnamed: 0,topic_id,q0,doc_id,scoring,qid,query,original_query,docno,text,original_document
3,2024-36302,Q0,msmarco_v2.1_doc_34_1545449353#0_3251041439,0.0,2024-36302,how did old riaño residents protest relocation?,"{'query_id': '2024-36302', 'text': 'how did ol...",msmarco_v2.1_doc_34_1545449353#0_3251041439,Reies Tijerina | American activist | Britannic...,{'doc_id': 'msmarco_v2.1_doc_34_1545449353#0_3...
8,2024-224279,Q0,msmarco_v2.1_doc_53_1158657184#4_2578103489,3.0,2024-224279,why should teachers always be reviewing their ...,"{'query_id': '2024-224279', 'text': 'why shoul...",msmarco_v2.1_doc_53_1158657184#4_2578103489,4 Benefits of Monitoring Student Progress in t...,{'doc_id': 'msmarco_v2.1_doc_53_1158657184#4_2...
13,2024-41918,Q0,msmarco_v2.1_doc_25_386750115#1_757017149,1.0,2024-41918,how does plutonium help us improve technology?,"{'query_id': '2024-41918', 'text': 'how does p...",msmarco_v2.1_doc_25_386750115#1_757017149,What Is Plutonium? Characteristics of This Nuc...,{'doc_id': 'msmarco_v2.1_doc_25_386750115#1_75...
14,2024-43983,Q0,msmarco_v2.1_doc_47_847662737#3_1828675438,0.0,2024-43983,how has inclusivity made vogue magazine more p...,"{'query_id': '2024-43983', 'text': 'how has in...",msmarco_v2.1_doc_47_847662737#3_1828675438,"Despite Featuring Plus-Size Models, Victoria’s...",{'doc_id': 'msmarco_v2.1_doc_47_847662737#3_18...
18,2024-158677,Q0,msmarco_v2.1_doc_17_535302055#4_580294086,1.0,2024-158677,what was entertainment like in the 1990s in th...,"{'query_id': '2024-158677', 'text': 'what was ...",msmarco_v2.1_doc_17_535302055#4_580294086,Entertainment technology - Wikipedia Entertain...,{'doc_id': 'msmarco_v2.1_doc_17_535302055#4_58...
...,...,...,...,...,...,...,...,...,...,...
116789,2024-96359,Q0,msmarco_v2.1_doc_02_897225352#7_1560248853,1.0,2024-96359,how were the non aligned countries affected by...,"{'query_id': '2024-96359', 'text': 'how were t...",msmarco_v2.1_doc_02_897225352#7_1560248853,Boris Yeltsin's Legacy BORIS YELTSIN'S FOREIGN...,{'doc_id': 'msmarco_v2.1_doc_02_897225352#7_15...
116798,2024-42497,Q0,msmarco_v2.1_doc_55_291506052#15_666138048,2.0,2024-42497,how does the informal recycling sector add to ...,"{'query_id': '2024-42497', 'text': 'how does t...",msmarco_v2.1_doc_55_291506052#15_666138048,Essay on Air Pollution: 9 Selected Essays on A...,{'doc_id': 'msmarco_v2.1_doc_55_291506052#15_6...
116800,2024-29182,Q0,msmarco_v2.1_doc_38_1238509282#5_2504074815,1.0,2024-29182,does your mood have an effect on buying things,"{'query_id': '2024-29182', 'text': 'does your ...",msmarco_v2.1_doc_38_1238509282#5_2504074815,"Top 6 Reasons to Buy Her Flowers, and What Eve...",{'doc_id': 'msmarco_v2.1_doc_38_1238509282#5_2...
116802,2024-35284,Q0,msmarco_v2.1_doc_33_579672422#1_1236816866,0.0,2024-35284,how did andrew carnegie impact modern medicine,"{'query_id': '2024-35284', 'text': 'how did an...",msmarco_v2.1_doc_33_579672422#1_1236816866,Paracelsus — Toxipedia \n He felt that sicknes...,{'doc_id': 'msmarco_v2.1_doc_33_579672422#1_12...
