# Enrich results
Incorpora informações úteis ao rankeamento.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import pyarrow.parquet as pq
import pickle
import dask.dataframe as dd
import numpy as np
from scipy.stats import spearmanr
import ast
import math
import os
import ir_datasets
from collections import defaultdict, Counter
from tqdm import tqdm
import nltk
nltk.download('punkt_tab')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from gensim.models import KeyedVectors
from ranx import Qrels
import re


# ------- SOURCES --------
# All paths are relative to the directory the notebook is executed from.
vocabulary_file_source = "../../aditional_data/word-vocab-small.csv"  # vocabulary word list (read as tab-separated text)
emb_file_source = "../../aditional_data/wiki-news-300d-1M.vec"  # FastText vectors, loaded in word2vec text format
idf_file = '../../aditional_data/idfnew.norm.tsv'  # IDF table fetched from the MSMARCO baselines repository
original_data = "../../dataset/queries_train_judged_expanded/samples.pkl"  # pickled query samples (read with pd.read_pickle)

In [None]:
# DADOS ÚTEIS

!mkdir -p ../../aditional_data/

#Carrega o modelo
if os.path.isfile(emb_file_source) == False:
    !wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
    !unzip wiki-news-300d-1M.vec.zip
    !rm wiki-news-300d-1M.vec.zip
    !mv ./wiki-news-300d-1M.vec ../../aditional_data/
model = KeyedVectors.load_word2vec_format(emb_file_source, encoding="utf-8", binary=False) #FastText model

#Carrega o vocabulário
if os.path.isfile(vocabulary_file_source) == False:
    !wget https://raw.githubusercontent.com/microsoft/MSMARCO-Passage-Ranking/refs/heads/master/Baselines/data/word-vocab-small.tsv -P ../../aditional_data/
vocabulary_pd = pd.read_csv(vocabulary_file_source, names=["word"], sep="\t")
vocabulary = list(set([str(word).lower() for word in vocabulary_pd["word"].to_list()]))

# IDF das palavras - o arquivo contém apenas termos de consultas em conjuntos de treinamento/dev/eval do MSMARCO.
# Fonte: https://github.com/microsoft/MSMARCO-Passage-Ranking/tree/master/Baselines
if os.path.isfile(idf_file) == False:
    !wget https://raw.githubusercontent.com/microsoft/MSMARCO-Passage-Ranking/master/Baselines/data/idfnew.norm.tsv -P ../../aditional_data/


# Extract qrels (relevance judgments) from the MS MARCO dev judged dataset
#dataset = ir_datasets.load("msmarco-passage/train/judged")
#qrels = []
#for qrel in dataset.qrels_iter():
#    qrel # namedtuple<query_id, doc_id, relevance, iteration>
#    qrels.append({
#        'query_idx': qrel.query_id,
#        'passage_idx': qrel.doc_id,
#        'relevance': qrel.relevance
#    })

# Flatten the nested mapping {query_id: {passage_id: relevance}} into one
# (query_id, passage_id, relevance) tuple per judgment.
qrels_dict = Qrels.from_ir_datasets("msmarco-passage/train/judged").to_dict()
data = []
for query_id, passages in qrels_dict.items():
    for passage_id, relevance in passages.items():
        data.append((query_id, passage_id, relevance))
qrels_df = pd.DataFrame(data, columns=["query_idx", "passage_idx", "relevance"])

# Group the judgments by judged query id to count the judgments of each query
judgments_per_query = qrels_df.groupby('query_idx').size()
relevant_counts = judgments_per_query.reset_index(name="relevant_count").astype('int64')

'''
    |query_idx   |relevant_count    |
    |3           |1                 |
'''
## Leva até 3min

In [None]:
# Keep only the vocabulary words the embedding model knows, and stack their
# vectors so that row i of `word_vectors` belongs to `vocabulary[i]`.
new_vocabulary = [word for word in vocabulary if word in model]
word_vectors = np.array(
    [np.asarray(model[word], dtype=np.float32) for word in new_vocabulary]
)

vocabulary = new_vocabulary.copy()

word_vectors.shape
## Each word in the FastText embedding has 300 dimensions
## All that is left is computing the cosine distance between two embeddings

In [None]:
#### FUNÇÕES ÚTEIS

def fix_text(text):
    """Tidy up the spacing of ``text``.

    Surrounding whitespace is stripped, runs of spaces are collapsed into a
    single space, and the space left before "'s", "," and "?" (and around
    "/") is removed.
    """
    cleaned = re.sub(' +', ' ', text.strip())
    # Applied in this order: (detached form, attached form)
    for detached, attached in ((" 's", "'s"), (" ,", ","), (" / ", "/"), (" ?", "?")):
        cleaned = cleaned.replace(detached, attached)
    return cleaned

def calc_spearman(dict1: str, dict2: str, K: int) -> float:
    """
    Spearman correlation between two score mappings over their top-K shared keys.

    Both mappings are given as the string form of a dict literal and are
    parsed with ``ast.literal_eval``. Example inputs:
    dict1 = "{'passage1': 1, 'passage2': 2, 'passage3': 3}"
    dict2 = "{'passage1': 3, 'passage2': 2, 'passage3': 1}"

    Args:
        dict1: Serialized reference mapping; its values choose the top-K keys.
        dict2: Serialized mapping compared against ``dict1``.
        K: Number of shared keys (highest ``dict1`` values first) to compare.

    Returns:
        The Spearman correlation of the selected values, or NaN when fewer
        than two keys are selected (the correlation is undefined then).
    """

    scores1 = ast.literal_eval(dict1)
    scores2 = ast.literal_eval(dict2)

    # Shared keys, walked in dict1's insertion order. Going through a set here
    # would make the outcome of ties at the K boundary depend on string hash
    # randomisation, i.e. differ from one interpreter run to the next.
    common_keys = [key for key in scores1 if key in scores2]

    # Highest dict1 values first; the sort is stable, so ties keep dict1's order.
    top_k_keys = sorted(common_keys, key=scores1.get, reverse=True)[:K]

    if len(top_k_keys) < 2:
        return float("nan")

    # Values of both mappings for the selected keys, in the same key order
    ranks1 = [scores1[key] for key in top_k_keys]
    ranks2 = [scores2[key] for key in top_k_keys]

    spearman_corr, _ = spearmanr(ranks1, ranks2)

    return spearman_corr


def calc_avg_precision(dict: str, K: int) -> float:
    """
    Mean of the K highest values of a serialized score mapping.

    ``dict`` is the string form of a dict literal, for example
    "{'passage1': 1, 'passage2': 2, 'passage3': 3}"; it is parsed with
    ``ast.literal_eval``. Returns 0.0 when no value is selected.
    """
    scores = ast.literal_eval(dict)

    # Highest values first, truncated to the first K
    top_scores = sorted(scores.values(), reverse=True)[:K]
    if not top_scores:
        return 0.0

    return sum(top_scores) / len(top_scores)

def similarity(originalPhrase: str, expandedPhrase: str, word_vectors):
    '''
    Cosine similarity between the words that differ in two phrases.

    The phrases are split on whitespace and compared position by position.
    For every position whose words differ, the embeddings of both words are
    looked up (row ``vocabulary.index(word)`` of ``word_vectors``) and their
    cosine similarity is computed. If more than one word differs, the result
    is the mean of those similarities.

    Args:
        originalPhrase: Text of the original query.
        expandedPhrase: Text of the expanded query.
        word_vectors: Embedding matrix indexed like the module-level
            ``vocabulary`` list.

    Returns:
        A one-element array holding the mean similarity (callers read
        element 0). The element is NaN when no word differs.

    Raises:
        ValueError: If a differing word is not in ``vocabulary``.
        IndexError: If ``expandedPhrase`` has fewer words than
            ``originalPhrase``.
    '''
    similarities = []
    originalPhraseWords = originalPhrase.split()
    expandedPhraseWords = expandedPhrase.split()

    for wordIndex, originalWord in enumerate(originalPhraseWords):
        expandedWord = expandedPhraseWords[wordIndex]

        if originalWord == expandedWord:
            continue

        emb1 = word_vectors[vocabulary.index(originalWord)]
        emb2 = word_vectors[vocabulary.index(expandedWord)]

        # cosine_similarity returns a 1x1 matrix; extend() stores its single row
        similarities.extend(cosine_similarity([emb1], [emb2]))

    if not similarities:
        # Nothing to compare: avoid dividing by zero and keep the return shape
        return np.array([np.nan])

    return sum(similarities) / len(similarities)

def idf(originalPhrase: str, expandedPhrase: str, idfListOriginal: list, idfListExpanded: list):
    '''
    IDF indicators for the words that differ between two phrases.

    The phrases are split on whitespace and compared position by position.
    For every position whose words differ, the IDF of the expanded word and
    the difference (original IDF - expanded IDF) are collected. If more than
    one word differs, the means of both collections are returned.

    Args:
        originalPhrase: Text of the original query.
        expandedPhrase: Text of the expanded query.
        idfListOriginal: One IDF per word of ``originalPhrase`` (None when unknown).
        idfListExpanded: One IDF per word of ``expandedPhrase`` (None when unknown).

    Returns:
        Tuple ``(mean expanded IDF, mean IDF difference)``. Both are NaN when
        the IDF of a differing word is None or when no word differs.

    Raises:
        ValueError: If an IDF list does not have one entry per word of its phrase.
        IndexError: If ``expandedPhrase`` has fewer words than ``originalPhrase``.
    '''
    originalPhraseWords = originalPhrase.split()
    expandedPhraseWords = expandedPhrase.split()

    if len(idfListOriginal) != len(originalPhraseWords) or len(idfListExpanded) != len(expandedPhraseWords):
        raise ValueError("idfListOriginal and idfListExpanded must be the same length as originalPhrase and expandedPhrase")

    expanded_idfs = []
    difference_idfs = []

    for wordIndex, originalWord in enumerate(originalPhraseWords):
        if originalWord == expandedPhraseWords[wordIndex]:
            continue

        original_idf = idfListOriginal[wordIndex]
        expanded_idf = idfListExpanded[wordIndex]

        # A differing word without a known IDF invalidates the whole pair
        if original_idf is None or expanded_idf is None:
            return np.nan, np.nan

        expanded_idfs.append(expanded_idf)
        difference_idfs.append(original_idf - expanded_idf)

    if not expanded_idfs:
        # No differing word: the means are undefined (avoid dividing by zero)
        return np.nan, np.nan

    mean_expanded_idfs = sum(expanded_idfs) / len(expanded_idfs)
    mean_of_diference_idfs = sum(difference_idfs) / len(difference_idfs)

    return mean_expanded_idfs, mean_of_diference_idfs

# Document tokenization helper
def tokenize(text):
    """Return the tokens that NLTK's ``word_tokenize`` produces for ``text``."""
    return word_tokenize(text)

# TF-IDF vector of a sentence
def get_sentence_tfidf(sentence, idf):
    """Map each distinct token of ``sentence`` to its TF-IDF weight.

    The term frequency is the token count divided by the number of tokens in
    the sentence; ``idf`` is a mapping from term to IDF, and terms missing
    from it get an IDF of 0.
    """
    sentence_tf = Counter(tokenize(sentence))
    # Computed once instead of once per token; an empty sentence yields {}
    # without ever dividing.
    total_terms = sum(sentence_tf.values())
    return {
        term: (count / total_terms) * idf.get(term, 0)
        for term, count in sentence_tf.items()
    }

# IDF of every term of a sentence
def get_sentence_idf(sentence, idf):
    """Map each distinct token of ``sentence`` to its IDF.

    ``idf`` is a mapping from term to IDF; terms missing from it get 0.
    """
    sentence_idf = {}
    for term in tokenize(sentence):
        sentence_idf[term] = idf.get(term, 0)
    return sentence_idf


#### Recupera as informações textuais e identificadores originais das queries

In [None]:
# NOTE: unpickling can execute arbitrary code - only load sample files that
# come from a trusted source.
expanded_queries_info_pd = pd.DataFrame(pd.read_pickle(original_data))

# Split 'query_exp_id' on "_" once; the first field is stored as the query id
# and the second one as the expansion number, both as integers.
exp_id_parts = expanded_queries_info_pd['query_exp_id'].map(lambda exp_id: exp_id.split('_'))
expanded_queries_info_pd['query_idx'] = exp_id_parts.map(lambda parts: parts[0]).astype('int64')
expanded_queries_info_pd['exp_num'] = exp_id_parts.map(lambda parts: parts[1]).astype('int64')

#expanded_queries_info_pd = expanded_queries_info_pd[['query_idx', 'exp_num', 'query', 'passage_idx', 'passage']]
expanded_queries_info_pd

#### Recupera o rankeamento das queries expandidas

In [None]:
# Pick the fold to process
fold = 0

# Ranking file of the chosen fold (tab-separated)
ranking_file = f"../../ranking/RetrieverBERT_queries_train_judged_expanded/RetrieverBERT_queries_train_judged_expanded_{fold}.rnk.csv"
expanded_queries_rank_pd = pd.read_csv(ranking_file, sep="\t")

# Order by the query label, then drop fully duplicated rows
expanded_queries_rank_pd = expanded_queries_rank_pd.sort_values(by=['Query'])
expanded_queries_rank_pd = expanded_queries_rank_pd.drop_duplicates()

print(f"Foram obtidos {len(expanded_queries_rank_pd)} resgistros dos folds. Cada um referente a um par (expansão-rank).")
expanded_queries_rank_pd.head(10)

In [None]:
# Extract the number of the pair from the query label (query_3 -> 3)
run_labels = expanded_queries_rank_pd['Query']
expanded_queries_rank_pd['query_run_idx'] = run_labels.map(lambda label: label.split('_')[1]).astype('int64')

# Only the run number and the passage scores are needed from here on
expanded_queries_docs_rank_pd = expanded_queries_rank_pd[["query_run_idx", "Passage_Scores"]]

expanded_queries_docs_rank_pd

In [None]:
# Recover the original identifiers of the queries: each ranking row is joined
# to the info row whose 'idx' equals its run number, and only the columns used
# downstream are kept.
expanded_queries_docs_rank_info_pd = pd.merge(
    expanded_queries_docs_rank_pd,
    expanded_queries_info_pd,
    left_on='query_run_idx',
    right_on='idx',
    suffixes=('_x', ''),
)[['query_idx', 'exp_num', 'Passage_Scores', 'query_exp_id']]

print(f"São {expanded_queries_docs_rank_info_pd['query_idx'].nunique()} queries originais em jogo")
expanded_queries_docs_rank_info_pd.head(10)

### Identifica as queries que deram origem às expansões (aquelas com 'exp_num' igual a 1)


In [None]:
pd.options.display.max_rows = 50
pd.options.display.max_columns = 500

print(f"Entraram {len(expanded_queries_docs_rank_info_pd)} queries expandidas.")

# Keep the rows whose 'exp_num' equals 1. A boolean mask replaces the former
# per-group Python loop + concat (which failed with "No objects to
# concatenate" on empty input). The stable sort by 'query_idx' followed by the
# index reset reproduces the row order and index that the group-by-group
# concatenation produced.
is_first_expansion = expanded_queries_docs_rank_info_pd['exp_num'] == 1
one_expansions = (
    expanded_queries_docs_rank_info_pd[is_first_expansion]
    .sort_values('query_idx', kind='stable')
    .reset_index(drop=True)
    .drop_duplicates()
)

#print(f"{df_first_rows['query_idx'].nunique()} queries originais")
print(len(one_expansions))
one_expansions.head(15)
    

In [None]:
# Attach the per-query judgment count (obtained from MSMARCO) and name the
# score column after its role.
original_queries = (
    one_expansions
    .merge(relevant_counts, on="query_idx")
    .rename(columns={"Passage_Scores":"original_passage_scores"})
)

original_queries

### Identifica as queries que são realmente expansões (aquelas com 'exp_num' diferente de 1)

In [None]:
# Every row whose 'exp_num' differs from 1, with the score column named
# after its role.
is_expansion = expanded_queries_docs_rank_info_pd["exp_num"].ne(1)
expanded_queries = (
    expanded_queries_docs_rank_info_pd
    .loc[is_expansion]
    .rename(columns={"Passage_Scores":"expansion_passage_scores"})
)

expanded_queries

### Unifica as queries originais com as expansões
E agrega o número de documentos relevantes julgados na MsMarco

In [None]:
# Pair every expansion with the row(s) of its original query. Columns present
# on both sides other than the join key get pandas' default suffixes: "_x" for
# the expansion side and "_y" for the original side.
expanded_queries_ranks_pd = expanded_queries.merge(original_queries, on='query_idx')

print(expanded_queries_ranks_pd['original_passage_scores'].nunique())
expanded_queries_ranks_pd

## Calcula o Spearman entre cada registro original e suas expansões

In [None]:
def _spearman_for_row(row):
    """Spearman between the original and the expansion scores of one row,
    limited to the query's judgment count."""
    return calc_spearman(
        row['original_passage_scores'],
        row['expansion_passage_scores'],
        row['relevant_count'],
    )


spearman_pd = expanded_queries_ranks_pd.copy()
spearman_pd['spearman'] = spearman_pd.apply(_spearman_for_row, axis=1)

#master_info_dd.compute()
print(f"queries originais mantidas até aqui: {spearman_pd['query_idx'].nunique()}")
print(spearman_pd.head(5))
print(len(spearman_pd))

## Calcula a precisão média da query original e das expansões

In [None]:
# (target column, score column) pairs: the same row-wise computation is
# applied first to the original query scores and then to the expansion scores.
for target_column, scores_column in (
    ('avg_precision_query_original', 'original_passage_scores'),
    ('avg_precision_query_expansao', 'expansion_passage_scores'),
):
    spearman_pd[target_column] = spearman_pd.apply(
        # scores_column is bound as a default so each pass reads its own column
        lambda row, scores_column=scores_column: calc_avg_precision(row[scores_column], row['relevant_count']),
        axis=1,
    )

print(f"queries originais mantidas até aqui: {spearman_pd['query_idx'].nunique()}")
print(spearman_pd.head(5))
print(len(spearman_pd))


In [None]:
import matplotlib.pyplot as plt

# Histogram of the average precision obtained by the original queries
plt.hist(spearman_pd['avg_precision_query_original'], bins=15, color='blue', edgecolor='black')

# Log-scaled y-axis, so that sparsely populated bins remain visible
plt.yscale('log')

# Axis labels and title (y-axis label spelling fixed to match the title)
plt.xlabel('Precisão')
plt.ylabel('Frequência')
plt.title('Frequência de faixas de precisão para a consulta original')

# Show the plot
plt.show()

#### Obtém os textos das queries original e expandida

In [None]:

# Query texts are looked up in dictionaries built in a single pass over
# expanded_queries_info_pd, instead of scanning that whole frame once per row.
# setdefault keeps the FIRST row seen for a key, which is the row the former
# `.values[0]` selection returned.
# NOTE(review): "query_original" is therefore the text of the first info row
# of the query - presumably its exp_num == 1 entry; confirm the row order.
first_text_by_query = {}
text_by_expansion = {}
for info_query_idx, info_exp_num, info_text in zip(
    expanded_queries_info_pd["query_idx"],
    expanded_queries_info_pd["exp_num"],
    expanded_queries_info_pd["query"],
):
    first_text_by_query.setdefault(info_query_idx, info_text)
    text_by_expansion.setdefault((info_query_idx, info_exp_num), info_text)

# A missing key raises KeyError (the row-wise scan raised IndexError there).
spearman_pd["query_original"] = [
    first_text_by_query[query_key] for query_key in spearman_pd["query_idx"]
]

# "exp_num_x" is the expansion number coming from the expansion side of the merge
spearman_pd["query_expandida"] = [
    text_by_expansion[expansion_key]
    for expansion_key in zip(spearman_pd["query_idx"], spearman_pd["exp_num_x"])
]


print(f"queries originais mantidas até aqui: {spearman_pd['query_idx'].nunique()}")
print(len(spearman_pd))
# A single print: the nested print(print(...)) also emitted a stray "None"
print(spearman_pd.head(3))

# FILTRA queries sem julgamento suficiente


In [None]:
# Não é necessário, já que essa filtragem foi feita antes de realizar a expansão

# #filtered_df = spearman_pd[spearman_pd['relevant_count'] >= 5]
#filtered_df.count()
#len(filtered_df)

# Agrega IDF das palavras da query original

In [None]:
# Read the IDF table from the path declared with the other sources (idf_file),
# i.e. the same file the download step checks for.
# keep_default_na=False: words such as "null" or "nan" are real terms and must
# not be parsed as missing values; only an empty "idf" cell counts as missing.
IDF_pd = pd.read_csv(
    idf_file,
    sep="\t",
    header=None,
    names=["word", "idf"],
    keep_default_na=False,
    na_values={"idf": [""]},
)

# Create a dictionary from IDF_pd for quick lookup
idf_dict = dict(zip(IDF_pd['word'], IDF_pd['idf']))

# Map the words of a query to their IDF values
def map_idf(query):
    """Return one IDF per whitespace-separated word of ``query``.

    Values come from the module-level ``idf_dict``; a word that is not in it
    yields None.
    """
    idf_values = []
    for word in query.split():
        idf_values.append(idf_dict.get(word))
    return idf_values

# One list of IDF values per query text, for the original and the expanded query
for idf_column, text_column in (
    ('idf_original_values', 'query_original'),
    ('idf_expanded_values', 'query_expandida'),
):
    spearman_pd[idf_column] = spearman_pd[text_column].map(map_idf)

print(f"queries originais mantidas até aqui: {spearman_pd['query_idx'].nunique()}")
spearman_pd

## Cria o indicador de similaridade

In [None]:
def _words_similarity(row):
    """Element 0 of the similarity result for the row's original/expanded pair."""
    return similarity(row["query_original"], row["query_expandida"], word_vectors)[0]


spearman_pd["words_similarity"] = spearman_pd.apply(_words_similarity, axis=1)
print(f"queries originais mantidas até aqui: {spearman_pd['query_idx'].nunique()}")

# Takes about 2 minutes to run

## Cria os indicadores de idf (idf do(s) termo(s) expandido(s) e diferença entre idfs)

In [None]:
def _idf_indicators(row):
    """(expanded-term IDF, IDF difference) for the row's original/expanded pair."""
    return idf(
        row["query_original"],
        row["query_expandida"],
        row["idf_original_values"],
        row["idf_expanded_values"],
    )


# result_type="expand" spreads the returned pair over the two target columns
idf_indicator_columns = ["expansion_idf", "expansion_idf_difference"]
spearman_pd[idf_indicator_columns] = spearman_pd.apply(_idf_indicators, axis=1, result_type="expand")

print(f"queries originais mantidas até aqui: {spearman_pd['query_idx'].nunique()}")
spearman_pd

## Cria os labels com base na precisão média

In [None]:
# label = 1 when avg_precision_query_expansao >= avg_precision_query_original,
# otherwise 0 (a comparison involving NaN is False, hence 0)
expansion_not_worse = spearman_pd["avg_precision_query_expansao"].ge(
    spearman_pd["avg_precision_query_original"]
)
spearman_pd["label"] = expansion_not_worse.astype('int64')

print(f"queries originais mantidas até aqui: {spearman_pd['query_idx'].nunique()}")
spearman_pd

# Salva resultado

In [None]:
# Give the expansion number its plain name back, then write the enriched table
# as tab-separated text (the index is written as well).
enriched_output_file = './queries_train_judged_expanded_enriched.csv'
spearman_pd = spearman_pd.rename(columns={'exp_num_x': 'exp_num'})
spearman_pd.to_csv(enriched_output_file, sep="\t")