In [1]:
from fastbm25 import fastbm25
import pandas as pd
import nltk
nltk.download('punkt')
import ast


# Dataset with the enriched expansions (the ones that are actually used).
full_dataset = pd.read_csv(
    "./queries_train_judged_expanded_enriched.csv",
    sep='\t',
    index_col=0
)

# Dataset with all the expansions.
# NOTE(review): the original comment on this line was cut off mid-sentence
# ("... E AS"), so the rest of the intended description is unknown.
#
# SECURITY: unpickling executes arbitrary code embedded in the file; only
# load samples.pkl when it comes from a trusted source.
samples = pd.DataFrame(pd.read_pickle("../../input_data/samples.pkl"))

# Expected layout of samples.pkl (a list of records):
# [{
#     'idx': 0,
#     'query_idx': 19335,
#     'query': 'anthropological definition of environment',
#     'passage_idx': 1017759,
#     'passage': 'Man and environment'
#     },
# {...}, ...]
#
# The layout is kept as comments rather than as a bare string literal, so the
# cell has no trailing expression to echo and needs no dummy print() after it.




[nltk_data] Downloading package punkt to /home/bec/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Cria o modelo BM25 para rankeamento das consultas

In [2]:
# Collect the passage indices referenced by the `original_passage_scores`
# column. Each cell holds the string form of a dict; the integer after the
# first "_" of every key is taken as the passage index.
passage_indices = {
    int(score_key.split("_")[1])
    for serialized_scores in full_dataset['original_passage_scores']
    for score_key in ast.literal_eval(serialized_scores).keys()
}

# Keep only the sample rows whose passage index was referenced above.
is_referenced = samples['passage_idx'].isin(passage_indices)
filtered_passages = samples[is_referenced]

# Drop repeated (passage_idx, passage) pairs so each pair is indexed once.
unique_passages = filtered_passages[['passage_idx', 'passage']].drop_duplicates()

# Parallel lists: position i of `passage_ids` corresponds to position i of
# `passage_texts`, and therefore to document i of the tokenized corpus below.
passage_ids = unique_passages['passage_idx'].tolist()
passage_texts = unique_passages['passage'].tolist()

# Tokenization
tokenized_passages = [nltk.word_tokenize(passage_text) for passage_text in passage_texts]

# Build the BM25 model over the tokenized passages
model = fastbm25(tokenized_passages)

## Aplica o modelo ao dataset

In [3]:
# Agora que consigo calcular os runs de cada consulta usando o bm25, preciso aplicar isso no full_dataset, em uma coluna adicional

def calculate_bm25_scores(query:str, model:fastbm25, k:int=20, ids=None):
    """Rank the indexed passages for one query and serialize the scores.

    Args:
        query: Query text; it is tokenized with ``nltk.word_tokenize``.
        model: BM25 model queried through ``top_k_sentence``.
        k: Maximum number of hits requested from the model (default 20).
        ids: Sequence that maps a hit's corpus position to its passage id.
            Defaults to the notebook-level ``passage_ids`` list, which is
            expected to be in the same order as the corpus given to ``model``.

    Returns:
        The ``str()`` of a dict mapping ``"passage_<id>"`` to the hit's score.
        The result is ``"{}"`` when ``query`` is not a string or when the
        model returns no hits.
    """
    if ids is None:
        # Resolved at call time so the notebook-level list is still the default.
        ids = passage_ids

    # An empty CSV cell is read back as NaN (a float), which word_tokenize
    # rejects; treat any non-string query as having no results instead of
    # aborting the whole scoring run.
    if not isinstance(query, str):
        return str({})

    tokenized_query = nltk.word_tokenize(query)
    hits = model.top_k_sentence(tokenized_query, k=k)

    # Position 1 of each hit is used as the corpus index and position 2 as the
    # score. If two hits resolve to the same passage id, the later one wins.
    scores_dict = {f"passage_{ids[hit[1]]}": hit[2] for hit in hits}

    return str(scores_dict)
    


# Score every expanded query. Only the `query_expandida` column is needed, so
# map over that column directly instead of building a row Series per record
# with DataFrame.apply(axis=1); the resulting values and index are the same.
full_dataset['bm25_scores'] = full_dataset['query_expandida'].map(
    lambda expanded_query: calculate_bm25_scores(expanded_query, model)
)

In [4]:
# Persist the result as a tab-separated file. This is the same path the
# dataset was loaded from in the first cell, so the input file is overwritten.
full_dataset.to_csv(
    path_or_buf='./queries_train_judged_expanded_enriched.csv',
    sep="\t",
)