### Inner Join Operations

Core Idea:
1. The left join did not resolve the problem of multiple NaN values in the data frame, so experiment with an inner join instead.
2. Remove entries whose doc_id is duplicated.
3. Remove rows that contain NaN values.

In [1]:
from loguru import logger


from slt_positional_bias.dataset import generate_data_frame, generate_merged_data_frame

# Load the three inputs: the document corpus, the queries (topics) and the
# relevance judgments (qrels).
df_docs = generate_data_frame("data/interim/inputs/corpus.jsonl")
df_topics = generate_data_frame("data/interim/inputs/queries.jsonl")
df_qrels = generate_data_frame("data/interim/inputs/qrels.rag24.test-umbrela-all.txt")

# Give the qrels frame explicit column names.
# NOTE(review): assumes the loader returns exactly four columns in this order
# -- confirm against generate_data_frame.
df_qrels.columns = ["topic_id", "q0", "doc_id", "rel_scoring"]

# Align the join-key names so all frames share "topic_id" / "doc_id".
df_topics = df_topics.rename(columns={"qid": "topic_id", "query": "topic"})
df_docs = df_docs.rename(columns={"docno": "doc_id", "text": "doc"})

# Report whether each join key is unique in its frame. The checks are driven
# from a table so the message is written once, and the f-string no longer
# nests double quotes inside the replacement field (a SyntaxError before
# Python 3.12). The emitted messages are unchanged.
uniqueness_checks = [
    ("df_qrels", df_qrels, "topic_id"),
    ("df_topics", df_topics, "topic_id"),
    ("df_qrels", df_qrels, "doc_id"),
    ("df_docs", df_docs, "doc_id"),
]
for frame_name, frame, key in uniqueness_checks:
    logger.info(f"{frame_name} column '{key}' is unique: {frame[key].is_unique}")
logger.info("--------------------------------------")

# Inner joins: a judgment row is kept only if both its topic_id and its doc_id
# have a match, so no NaN is introduced by the merge itself.
df_merged = (
    df_qrels
    .loc[:, ["topic_id", "doc_id", "rel_scoring"]]
    .merge(
        df_topics.loc[:, ["topic_id", "topic"]],
        on="topic_id",
        how="inner",
    )
    .merge(
        df_docs.loc[:, ["doc_id", "doc"]],
        on="doc_id",
        how="inner",
    )
)

df_merged

[32m2025-07-22 19:29:50.657[0m | [1mINFO    [0m | [36mslt_positional_bias.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: C:\Users\Albert\Documents\SLT\slt_group_2_positional_bias[0m


[32m2025-07-22 19:29:51.459[0m | [1mINFO    [0m | [36mslt_positional_bias.dataset[0m:[36mgenerate_data_frame[0m:[36m50[0m - [1mfile path exists[0m
[32m2025-07-22 19:29:56.399[0m | [1mINFO    [0m | [36mslt_positional_bias.dataset[0m:[36mgenerate_data_frame[0m:[36m50[0m - [1mfile path exists[0m
[32m2025-07-22 19:29:56.404[0m | [1mINFO    [0m | [36mslt_positional_bias.dataset[0m:[36mgenerate_data_frame[0m:[36m50[0m - [1mfile path exists[0m
[32m2025-07-22 19:29:56.608[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m15[0m - [1mdf_qrels column 'topic_id' is unique: False[0m
[32m2025-07-22 19:29:56.609[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m16[0m - [1mdf_topics column 'topic_id' is unique: True[0m
[32m2025-07-22 19:29:56.632[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m17[0m - [1mdf_qrels column 'doc_id' is unique: False[0m
[32m2025-07-22 19:29:56.664[0m | [1mINFO    [0m | [3

Unnamed: 0,topic_id,doc_id,rel_scoring,topic,doc
0,2024-145979,msmarco_v2.1_doc_01_523681915#0_449763684,2,what is vicarious trauma and how can it be cop...,Vicarious traumatization - Wikipedia Vicarious...
1,2024-145979,msmarco_v2.1_doc_13_1647729865#0_3617397938,3,what is vicarious trauma and how can it be cop...,Vicarious Trauma: What Causes it and Ways to H...
2,2024-145979,msmarco_v2.1_doc_25_1148700328#0_2179512504,2,what is vicarious trauma and how can it be cop...,What is Vicarious Trauma? | What is Vicarious ...
3,2024-145979,msmarco_v2.1_doc_25_1148700328#1_2179514079,2,what is vicarious trauma and how can it be cop...,What is Vicarious Trauma? | What is Vicarious ...
4,2024-145979,msmarco_v2.1_doc_44_1043805224#2_2182641075,2,what is vicarious trauma and how can it be cop...,What is Vicarious Trauma? - Jefferson Center -...
...,...,...,...,...,...
29158,2024-127266,msmarco_v2.1_doc_58_1230815815#5_1795053441,2,what are some key challenges related to the re...,Highest E-Waste Generating Nations In The Worl...
29159,2024-127266,msmarco_v2.1_doc_34_470264225#3_1018882699,0,what are some key challenges related to the re...,The Pros & Cons Of Landfills (Benefits & Disad...
29160,2024-127266,msmarco_v2.1_doc_57_781229369#6_1221950031,1,what are some key challenges related to the re...,The Patron Saint of Electronics? Do you Know t...
29161,2024-127266,msmarco_v2.1_doc_57_781235684#6_1221962946,1,what are some key challenges related to the re...,Destroy VHS Tapes \nDestroy VHS Tapes\nDaliah ...


In [11]:
# Log key uniqueness of the merged frame. The recorded output of this cell
# reports both doc_id and topic_id, so both columns are checked here. The
# column name is interpolated instead of nesting double quotes inside the
# f-string, which is a SyntaxError before Python 3.12.
for key in ("doc_id", "topic_id"):
    logger.info(f"df_merged column '{key}' is unique: {df_merged[key].is_unique}")

[32m2025-07-22 19:40:24.955[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m1[0m - [1mdf_merged column 'doc_id' is unique: False[0m
[32m2025-07-22 19:40:24.958[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mdf_merged column 'topic_id' is unique: False[0m


In [4]:
# Flag every row whose doc_id occurs more than once (keep=False marks all
# occurrences, not just the repeats), then show those rows.
is_repeated_doc = df_merged["doc_id"].duplicated(keep=False)
dupes = df_merged.loc[is_repeated_doc]
dupes

Unnamed: 0,topic_id,doc_id,rel_scoring,topic,doc
638,2024-32912,msmarco_v2.1_doc_21_1227054210#2_2626884929,1,how bad did the vietnam war devastate the econ...,"Considering the Losses, Does it Matter Who Won..."
1513,2024-42497,msmarco_v2.1_doc_34_1636826970#6_3436858544,3,how does the informal recycling sector add to ...,Electronic waste | Britannica Electronic waste...
1514,2024-42497,msmarco_v2.1_doc_08_1276221077#1_2305275296,3,how does the informal recycling sector add to ...,The Human and Environmental Effects of E-Waste...
1516,2024-42497,msmarco_v2.1_doc_17_458289946#6_496645752,1,how does the informal recycling sector add to ...,Electronic waste in China - Wikipedia Electron...
1517,2024-42497,msmarco_v2.1_doc_08_1276221077#0_2305273239,3,how does the informal recycling sector add to ...,The Human and Environmental Effects of E-Waste...
...,...,...,...,...,...
29143,2024-127266,msmarco_v2.1_doc_34_920686972#4_1884834759,1,what are some key challenges related to the re...,How Does Waste Impact The Environment? How Doe...
29147,2024-127266,msmarco_v2.1_doc_17_458289946#6_496645752,2,what are some key challenges related to the re...,Electronic waste in China - Wikipedia Electron...
29148,2024-127266,msmarco_v2.1_doc_25_186084585#1_358397316,1,what are some key challenges related to the re...,How to Start E-waste Recycling Business in Ind...
29149,2024-127266,msmarco_v2.1_doc_51_880750803#8_1852939826,1,what are some key challenges related to the re...,Electronic Recycling: How to Recycle Your Busi...


In [9]:
# Pull out all merged rows for two of the duplicated doc_ids so they can be
# inspected side by side.
dupe_638 = df_merged.loc[
    df_merged["doc_id"].eq("msmarco_v2.1_doc_21_1227054210#2_2626884929")
]
dupe_1513 = df_merged.loc[
    df_merged["doc_id"].eq("msmarco_v2.1_doc_34_1636826970#6_3436858544")
]
dupe_638


Unnamed: 0,topic_id,doc_id,rel_scoring,topic,doc
638,2024-32912,msmarco_v2.1_doc_21_1227054210#2_2626884929,1,how bad did the vietnam war devastate the econ...,"Considering the Losses, Does it Matter Who Won..."
20856,2024-213978,msmarco_v2.1_doc_21_1227054210#2_2626884929,0,why are french farmers protesting,"Considering the Losses, Does it Matter Who Won..."


In [10]:
# Display the rows that share the second inspected duplicated doc_id.
dupe_1513

Unnamed: 0,topic_id,doc_id,rel_scoring,topic,doc
1513,2024-42497,msmarco_v2.1_doc_34_1636826970#6_3436858544,3,how does the informal recycling sector add to ...,Electronic waste | Britannica Electronic waste...
28964,2024-127266,msmarco_v2.1_doc_34_1636826970#6_3436858544,3,what are some key challenges related to the re...,Electronic waste | Britannica Electronic waste...


In [12]:
# Keep only rows whose doc_id appears exactly once in df_merged. With
# keep=False every occurrence of a repeated doc_id is flagged, so all of its
# rows are dropped rather than one being retained.
has_repeated_doc_id = df_merged["doc_id"].duplicated(keep=False)
df_clean = df_merged[~has_repeated_doc_id]

# Confirm the filter worked. The result is bound to a name first so the
# f-string does not nest double quotes (a SyntaxError before Python 3.12).
doc_id_is_unique = df_clean["doc_id"].is_unique
logger.info(f"df_clean column 'doc_id' is unique: {doc_id_is_unique}")
df_clean


[32m2025-07-22 20:06:21.222[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mdf_clean column 'doc_id' is unique: True[0m


Unnamed: 0,topic_id,doc_id,rel_scoring,topic,doc
0,2024-145979,msmarco_v2.1_doc_01_523681915#0_449763684,2,what is vicarious trauma and how can it be cop...,Vicarious traumatization - Wikipedia Vicarious...
1,2024-145979,msmarco_v2.1_doc_13_1647729865#0_3617397938,3,what is vicarious trauma and how can it be cop...,Vicarious Trauma: What Causes it and Ways to H...
2,2024-145979,msmarco_v2.1_doc_25_1148700328#0_2179512504,2,what is vicarious trauma and how can it be cop...,What is Vicarious Trauma? | What is Vicarious ...
3,2024-145979,msmarco_v2.1_doc_25_1148700328#1_2179514079,2,what is vicarious trauma and how can it be cop...,What is Vicarious Trauma? | What is Vicarious ...
4,2024-145979,msmarco_v2.1_doc_44_1043805224#2_2182641075,2,what is vicarious trauma and how can it be cop...,What is Vicarious Trauma? - Jefferson Center -...
...,...,...,...,...,...
29158,2024-127266,msmarco_v2.1_doc_58_1230815815#5_1795053441,2,what are some key challenges related to the re...,Highest E-Waste Generating Nations In The Worl...
29159,2024-127266,msmarco_v2.1_doc_34_470264225#3_1018882699,0,what are some key challenges related to the re...,The Pros & Cons Of Landfills (Benefits & Disad...
29160,2024-127266,msmarco_v2.1_doc_57_781229369#6_1221950031,1,what are some key challenges related to the re...,The Patron Saint of Electronics? Do you Know t...
29161,2024-127266,msmarco_v2.1_doc_57_781235684#6_1221962946,1,what are some key challenges related to the re...,Destroy VHS Tapes \nDestroy VHS Tapes\nDaliah ...


In [13]:
# Discard every row that still has a missing value in any column.
df_clean = df_clean.dropna(axis="index", how="any")
df_clean

Unnamed: 0,topic_id,doc_id,rel_scoring,topic,doc
0,2024-145979,msmarco_v2.1_doc_01_523681915#0_449763684,2,what is vicarious trauma and how can it be cop...,Vicarious traumatization - Wikipedia Vicarious...
1,2024-145979,msmarco_v2.1_doc_13_1647729865#0_3617397938,3,what is vicarious trauma and how can it be cop...,Vicarious Trauma: What Causes it and Ways to H...
2,2024-145979,msmarco_v2.1_doc_25_1148700328#0_2179512504,2,what is vicarious trauma and how can it be cop...,What is Vicarious Trauma? | What is Vicarious ...
3,2024-145979,msmarco_v2.1_doc_25_1148700328#1_2179514079,2,what is vicarious trauma and how can it be cop...,What is Vicarious Trauma? | What is Vicarious ...
4,2024-145979,msmarco_v2.1_doc_44_1043805224#2_2182641075,2,what is vicarious trauma and how can it be cop...,What is Vicarious Trauma? - Jefferson Center -...
...,...,...,...,...,...
29158,2024-127266,msmarco_v2.1_doc_58_1230815815#5_1795053441,2,what are some key challenges related to the re...,Highest E-Waste Generating Nations In The Worl...
29159,2024-127266,msmarco_v2.1_doc_34_470264225#3_1018882699,0,what are some key challenges related to the re...,The Pros & Cons Of Landfills (Benefits & Disad...
29160,2024-127266,msmarco_v2.1_doc_57_781229369#6_1221950031,1,what are some key challenges related to the re...,The Patron Saint of Electronics? Do you Know t...
29161,2024-127266,msmarco_v2.1_doc_57_781235684#6_1221962946,1,what are some key challenges related to the re...,Destroy VHS Tapes \nDestroy VHS Tapes\nDaliah ...
