In [1]:
import csv

import pandas as pd

In [10]:
# Dev-set queries: a headerless, tab-separated file whose two columns are
# labelled (qid, query) on load.
queries = pd.read_table(
    "collectionandqueries/queries.dev.small.tsv",
    names=["qid", "query"],
    header=None,
)

# Preview the first rows, then report how many queries were loaded.
print(queries.head(), len(queries), sep="\n")

       qid                                          query
0  1048585                   what is paula deen's brother
1        2                       Androgen receptor define
2   524332  treating tension headaches without medication
3  1048642                            what is paranoid sc
4   524447            treatment of varicose veins in legs
6980


In [11]:
# Relevance judgments: a headerless, tab-separated file whose four columns
# are labelled (qid, unused, pid, relevance) on load.
qrels = pd.read_table(
    "collectionandqueries/qrels.dev.small.tsv",
    names=["qid", "unused", "pid", "relevance"],
    header=None,
)

# Preview the first rows, then report how many judgments were loaded.
print(qrels.head(), len(qrels), sep="\n")


      qid  unused      pid  relevance
0  300674       0  7067032          1
1  125705       0  7067056          1
2   94798       0  7067181          1
3    9083       0  7067274          1
4  174249       0  7067348          1
7437


In [23]:
# Load the passage collection: a headerless, tab-separated file whose two
# columns are labelled (pid, text) on load.
docs = pd.read_csv(
    "collectionandqueries/collection.tsv",
    sep="\t",
    header=None,
    names=["pid", "text"],
    # Treat quote characters as ordinary text so that a passage containing
    # a stray '"' is not merged with the lines that follow it.
    quoting=csv.QUOTE_NONE,
    # Malformed rows are still dropped, but pandas now reports each one
    # instead of hiding the data loss (a missing pid would otherwise go
    # unnoticed until a later join produces empty text).
    on_bad_lines="warn",
)

# Truncated preview of the frame, followed by the number of passages loaded.
print(docs)
print(len(docs))

             pid                                               text
0              0  The presence of communication amid scientific ...
1              1  The Manhattan Project and its atomic bomb help...
2              2  Essay on The Manhattan Project - The Manhattan...
3              3  The Manhattan Project was the name for a proje...
4              4  versions of each volume as well as complementa...
...          ...                                                ...
8841818  8841818  When metal salts emit short wavelengths of vis...
8841819  8841819  Thousands of people across the United States w...
8841820  8841820  The recipe that creates blue, for example, inc...
8841821  8841821  On Independence Days of yore, old-timey crowds...
8841822  8841822  View full size image. Behind the scenes of the...

[8841823 rows x 2 columns]
8841823


In [25]:
# Step 1: keep only the judgments with a positive relevance label.
qrels_pos = qrels.loc[qrels["relevance"] > 0].copy()

# Step 2: obtain the passages as a (pid, text) DataFrame. A dict is
# interpreted as a pid -> text mapping and converted; a DataFrame is
# narrowed to the two columns needed for the join.
if not isinstance(docs, dict):
    docs_df = docs.loc[:, ["pid", "text"]].copy()
else:
    docs_df = pd.DataFrame(list(docs.items()), columns=["pid", "text"])

# Step 3: attach the passage text to every (qid, pid) pair. The left join
# keeps each judged pair, so a pid that is absent from docs_df ends up with
# a missing text value rather than being dropped.
df_final = pd.merge(
    qrels_pos[["qid", "pid"]],
    docs_df,
    how="left",
    on="pid",
)

df_final


Unnamed: 0,qid,pid,text
0,300674,7067032,http://en.wikipedia.org/wiki/William_Bradford_...
1,125705,7067056,Adjective[edit] preventive â(comparative mor...
2,94798,7067181,Photoshop: color overlay. In my sunlight touch...
3,9083,7067274,TRUE. Hippocrates is considered the father of ...
4,174249,7067348,How to create a PaySlip using a printer: 1 Lo...
...,...,...,...
7432,147073,8008770,Process Manufacturing involves Variable Ingred...
7433,243761,8008787,Abraham Lincoln served as president from March...
7434,162662,8008977,Rosacea is a facial skin condition that can ca...
7435,247194,8009319,1 Bake at 425F degrees for 5 minutes. 2 Reduc...


In [24]:


# --- 1. Make sure the passages are held in a DataFrame ---
# A dict is interpreted as a pid -> text mapping; a DataFrame is copied so
# the dtype cast below does not touch `docs` itself.
docs_df = (
    pd.DataFrame(list(docs.items()), columns=["pid", "text"])
    if isinstance(docs, dict)
    else docs.copy()
)

# --- 2. Cast pid to int on both sides so the membership test below
#        compares values of the same type ---
# NOTE(review): the second assignment writes into the shared `qrels` frame
# in place, so every cell executed after this one sees the cast column.
docs_df["pid"] = docs_df["pid"].astype(int)
qrels["pid"] = qrels["pid"].astype(int)

# --- 3. Keep only the judgments whose pid exists among the passages ---
qrels_filtered = qrels.loc[qrels["pid"].isin(docs_df["pid"])].copy()

# --- 4. Keep only the queries that still have at least one judgment ---
valid_qids = qrels_filtered["qid"].unique()
queries_filtered = queries.loc[queries["qid"].isin(valid_qids)].copy()

# --- 5. Report the resulting sizes and preview both filtered frames ---
print("qrels filtrado:", len(qrels_filtered))
print("queries filtrado:", len(queries_filtered))
print("docs disponíveis:", len(docs_df))

(qrels_filtered.head(), queries_filtered.head())


qrels filtrado: 7437
queries filtrado: 6980
docs disponíveis: 8841823


(      qid  unused      pid  relevance
 0  300674       0  7067032          1
 1  125705       0  7067056          1
 2   94798       0  7067181          1
 3    9083       0  7067274          1
 4  174249       0  7067348          1,
        qid                                          query
 0  1048585                   what is paula deen's brother
 1        2                       Androgen receptor define
 2   524332  treating tension headaches without medication
 3  1048642                            what is paranoid sc
 4   524447            treatment of varicose veins in legs)