# Laboratório Elastic Search teste com entidades

-----------------------
# Código

### Imports

In [8]:
import zipfile
import requests
import os
import pandas as pd
from datetime import datetime
import re
import warnings
import subprocess
warnings.filterwarnings("ignore")
from collections import OrderedDict

from elasticsearch import Elasticsearch

from sentence_transformers import SentenceTransformer

import nltk
from nltk import word_tokenize
from nltk.stem import PorterStemmer
import sklearn

# Download localmente
nltk.data.path.append('./nltk_data')
nltk.download('punkt_tab', download_dir='./nltk_data')
nltk.download('stopwords',download_dir='./nltk_data')

import spacy
import unidecode

[nltk_data] Downloading package punkt_tab to ./nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to ./nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Run the following blocks on the terminal

In [7]:
!uv run python -m spacy download pt_core_news_sm

/bin/python: No module named spacy


In [None]:
!sudo systemctl start docker.service && docker compose up -d

### Passo 1: Baixar dados do Lupa

In [9]:
# Base de dados de notícias da Lupa
url = "https://docs.google.com/uc?export=download&confirm=t&id=1W067Md2EbvVzW1ufzFg17Hf7Y9cCZxxr"
filename = "articles_lupa_lab_elasticsearch.zip"
data_path = "data"
zip_file_path = f"{data_path}/{filename}"

os.makedirs(data_path, exist_ok=True)

# Baixa o zip
with open(zip_file_path, "wb") as f:
    f.write(requests.get(url, allow_redirects=True).content)

# Extrai o csv do zip
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(data_path)
    
output_file = f"{data_path}/articles_lupa.csv"
assert os.path.exists(output_file)
print("ok")

ok


### Passo 2: Pré-processar os dados e gerar embeddings

In [10]:
# Implementações de pré-processamentos de texto. Modifiquem, adicionem, removam conforme necessário.
class Preprocessors:
    STOPWORDS = set(nltk.corpus.stopwords.words('portuguese'))
    
    def __init__(self):
        self.stemmer = PorterStemmer()
        self.spacy_nlp = spacy.load("pt_core_news_sm") # Utiliza para lematização
        
    # Remove stopwords do português
    def remove_stopwords(self, text):
        # Tokeniza as palavras
        tokens = word_tokenize(text)
        # Remove as stop words
        tokens = [word for word in tokens if word not in self.STOPWORDS]

        return ' '.join(tokens)
    
    # Realiza a lematização
    def lemma(self, text):
        return " ".join([token.lemma_ for token in self.spacy_nlp(text)])
    
    # Realiza a stemização
    def porter_stemmer(self, text):
        # Tokeniza as palavras
        tokens = word_tokenize(text)

        for index in range(len(tokens)):
            # Realiza a stemização
            stem_word = self.stemmer.stem(tokens[index])
            tokens[index] = stem_word

        return ' '.join(tokens)

    # Transforma o texto em lower case
    def lower_case(self, str):
        return str.lower()

    # Remove urls com regex
    def remove_urls(self, text):
        url_pattern = r'https?://\S+|www\.\S+'
        without_urls = re.sub(pattern=url_pattern, repl=' ', string=text)
        return without_urls

    # Remove números com regex
    def remove_numbers(self, text):
        number_pattern = r'\d+'
        without_number = re.sub(pattern=number_pattern,
    repl=" ", string=text)
        return without_number

    # Converte caracteres acentuados para sua versão ASCII
    def accented_to_ascii(self, text):
        text = unidecode.unidecode(text)
        return text

In [11]:
# Lista de palavras que o modelo Small confunde com entidades (Blacklist)
nlp = spacy.load("pt_core_news_sm")

def extrair_features(texto):
    """Extrai entidades limpando ruídos do modelo Small"""
    doc = nlp(str(texto))
    features = {"PER": [], "ORG": [], "LOC": [], "NOUNS": [], "PROPN": []}
    
    for ent in doc.ents:
        t = ent.text.strip().lower() # Normaliza para comparar
        t_original = ent.text.strip()
        
        # Filtros de Qualidade para o Small
        if len(t) < 3: continue # Remove "A", "O"
        if t.replace("/", "").replace("-", "").isdigit(): continue # Remove datas puras
        
        if ent.label_ in features:
            features[ent.label_].append(t_original)
    
    for token in doc:
        # Ignora pontuação e palavras curtas
        if token.is_punct or len(token.text) < 3: continue
        
        palavra = token.text.strip().lower()
        
        # Se for SUBSTANTIVO (Ex: vacina, autismo, cobrança)
        if token.pos_ == "NOUN":
            features["NOUNS"].append(palavra)
            
        # Se for NOME PRÓPRIO (Ex: Israel, Pix, Covid)
        # O Spacy small adora classificar coisas que não conhece como PROPN
        elif token.pos_ == "PROPN":
            features["PROPN"].append(palavra)

            
    # Retorna listas únicas
    return {k: list(set(v)) for k, v in features.items()}

In [12]:
# Carregar o modelo gerador de embeddings
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Caminho para salvar o dataframe de notícias
data_df_path = "data/data_df.pkl"

# Selecione diferentes pré-processamentos
# Exemplo:

preprocessor = Preprocessors()
preprocessing_steps = [
    preprocessor.remove_urls,
    preprocessor.remove_stopwords,
    preprocessor.remove_numbers,
    preprocessor.lemma,
    preprocessor.accented_to_ascii,
    preprocessor.lower_case,
    #preprocessor.porter_stemmer
]



RECREATE_DF = True

# Cria o data frame se ele já existir ou se a variável RECREATE_INDEX for verdadeira
# Ou (exclusivo) carrega o dataframe salvo
if not os.path.exists(data_df_path) or RECREATE_DF:    
    df = pd.read_csv(output_file, sep=";")[["Título", "Texto", "Data de Publicação"]]
    df["Data de Publicação"] = df["Data de Publicação"].apply(lambda str_date: datetime.strptime(str_date.split(" - ")[0], "%d.%m.%Y"))
    df.sort_values("Data de Publicação", inplace=True, ascending=False)

    df = df.sample(frac=0.5, random_state=42).reset_index(drop=True)

    df["Embeddings"] = [None] * len(df)
    df["doc_id"] = df.reset_index(drop=True).index
    df["entidades_per"] = [[] for _ in range(len(df))]
    df["entidades_org"] = [[] for _ in range(len(df))]
    df["entidades_loc"] = [[] for _ in range(len(df))]
    df["entidades_noun"] = [[] for _ in range(len(df))]
    df["entidades_propn"] = [[] for _ in range(len(df))]
    
    print("Gerando Embeddings e Extraindo Entidades... (Isso pode demorar um pouco)")

    for i, row in df.iterrows():
        texto_completo = row["Texto"].strip() + "\n" + row["Título"].strip()

        ents = extrair_features(texto_completo)
        df.at[i, "entidades_per"] = ents["PER"]
        df.at[i, "entidades_org"] = ents["ORG"]
        df.at[i, "entidades_loc"] = ents["LOC"]
        df.at[i, "entidades_noun"] = ents["NOUNS"]
        df.at[i, "entidades_propn"] = ents["PROPN"]
        
        df.at[i, "Texto completo"] = texto_completo
        texto_processado = texto_completo
        for preprocessing_step in preprocessing_steps:
            texto_processado = preprocessing_step(texto_processado)
        
        df.at[i, "Texto processado"] = texto_processado
        embeddings = model.encode(texto_completo).tolist()
        df.at[i, "Embeddings"] = embeddings
        
    print("Geração de embeddings finalizada.")
    
    with open(data_df_path, "wb") as f:
        df.to_pickle(f)
else:
    with open(data_df_path, "rb") as f:
        df = pd.read_pickle(f)
    print("Dataframe carregado de arquivo.")

Gerando Embeddings e Extraindo Entidades... (Isso pode demorar um pouco)
Geração de embeddings finalizada.


### Passo 3: Indexar dados no ElasticSearch (Lembrem-se de reindexar os dados se os pré-processamentos mudarem)

In [13]:
es = Elasticsearch(
    hosts = [{'host': "localhost", 'port': 9200, "scheme": "https"}],
    basic_auth=("elastic","elastic"),
    verify_certs = False,
)

In [14]:
RECREATE_INDEX = True

index_name = "verificacoes_lupa"

# Se a flag for True e se o índice existir, ele é deletado
if RECREATE_INDEX and es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)
    print(f"Índice '{index_name}' deletado.")

# Cria o índice e popula com os dados
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, mappings={
        "properties": {
            "doc_id": {"type": "integer"},
            "full_text": {"type": "text"},
            "processed_text": {"type": "text"},
            "embeddings": {"type": "dense_vector", "dims": 384},
            "entidades_per": {"type": "keyword"},
            "entidades_org": {"type": "keyword"},
            "entidades_loc": {"type": "keyword"},
            "entidades_noun": {"type": "keyword"},
            "entidades_propn": {"type": "keyword"},
        }
    })
    print(f"Índice '{index_name}' criado.")
    
    for i, row in df.iterrows():
        es.index(index=index_name, id=row["doc_id"], body={
            "doc_id": row["doc_id"],
            "full_text": row["Texto completo"],
            "processed_text": row["Texto processado"],
            "embeddings": row["Embeddings"],
            "entidades_per": row["entidades_per"],
            "entidades_org": row["entidades_org"],
            "entidades_loc": row["entidades_loc"],
            "entidades_noun": row["entidades_noun"],
            "entidades_propn": row["entidades_propn"],
        })
    print("Índice preenchido.")

print("Indexação finalizada.")

Índice 'verificacoes_lupa' criado.
Índice preenchido.
Indexação finalizada.


In [17]:
# Estas serão as queries QF1 e QF2
with open("data/queries_fixadas.txt", "r") as f:
    queries_fixadas = [line.strip() for line in f.readlines()]
    assert len(queries_fixadas) == 2
    QF1 = queries_fixadas[0]
    QF2 = queries_fixadas[1]
    
# Preencha aqui as queries do grupo
QP1 = "vacina causa autismo"
QP2 = "massacre em Israel"

queries = OrderedDict()
queries["QF1"] = QF1
queries["QF2"] = QF2
queries["QP1"] = QP1
queries["QP2"] = QP2

#### Busca Léxica

In [18]:
# Implementação de busca esparsa (léxica) com BM25
def lexical_search(queries: dict[str, str]):
    lexical_results = {}
    for query_id, query in queries.items():
        
        # Pré-processa os dados
        for preprocessing_step in preprocessing_steps:
            query = preprocessing_step(query)
        
        search_query = {
            "query": {
                "match": {
                    "processed_text": query
                }
            }
        }

        # Realiza a busca
        response = es.search(index=index_name, body=search_query)
        
        hits_results = []
        # Recupera os resultados
        for hit in response["hits"]["hits"]:
            hits_results.append((hit["_source"]["doc_id"], hit["_score"]))
        lexical_results[query_id] = hits_results
        
    return lexical_results

#### Busca Semântica

In [19]:
# Realiza busca semântica (densa) com KNN exato
def semantic_search(queries: dict[str, str]):
    semantic_results = {}
    
    for query_id, query in queries.items():
        # Aplica todos os pré-processamentos aos dados
        for preprocessing_step in preprocessing_steps:
            query = preprocessing_step(query)
            
        query_vector = model.encode(query).tolist()
        
        
        search_query = {
            "query": {
                "script_score": {
                    "query": {"match_all": {}},
                    "script": {
                        "source": "cosineSimilarity(params.query_vector, 'embeddings') + 1.0",
                        "params": {"query_vector": query_vector}
                    }
                }
            }
        }

        # Realiza a busca
        response = es.search(index=index_name, body=search_query)
        
        hits_results = []
        # Recupera top 10 resultados
        for hit in response["hits"]["hits"]:
            hits_results.append((hit["_source"]["doc_id"], hit["_score"]))
            
        semantic_results[query_id] = hits_results

    return semantic_results

#### Busca Híbrida

In [20]:
# Busca híbrida ou RRF. Implemente sua solução aqui. Você pode realizar as duas buscas anteriores (léxica e semântica) como base para formar a busca híbrida.

def hybrid_search_v1(queries: dict[str, str]):
    lexical_results = lexical_search(queries)
    semantic_results = semantic_search(queries)

    print(lexical_results)
    print(semantic_results)

    alpha = 0.8
    beta = 0.2
    k_rrf = 60

    relevance_documents = {}

    for query in queries.keys():

        lexical_rank = {}
        semantic_rank = {}

        for i, (doc_id, _) in enumerate(lexical_results[query]):
            lexical_rank[doc_id] = i + 1

        for i, (doc_id, _) in enumerate(semantic_results[query]):
            semantic_rank[doc_id] = i + 1

        all_docs = set(lexical_rank).union(set(semantic_rank))

        relevance_documents[query] = []

        for doc_id in all_docs:
            rrf_lex = 0
            rrf_sem = 0

            if doc_id in lexical_rank:
                rrf_lex = 1 / (k_rrf + lexical_rank[doc_id])

            if doc_id in semantic_rank:
                rrf_sem = 1 / (k_rrf + semantic_rank[doc_id])

            score = rrf_lex +  rrf_sem
            relevance_documents[query].append((doc_id, score))

        relevance_documents[query].sort(key=lambda x: x[1], reverse=True)
        relevance_documents[query] = relevance_documents[query][:10]

    for doc_id, score in relevance_documents["QF1"]:
        print(f"DOC_ID={doc_id} SCORE={score}")
   
    return relevance_documents

In [36]:
# Busca híbrida ou RRF. Implemente sua solução aqui. Você pode realizar as duas buscas anteriores (léxica e semântica) como base para formar a busca híbrida.
def hybrid_search_v2(queries: dict[str, str]):
    top_k = 10
    lexical_results  = lexical_search(queries)
    semantic_results = semantic_search(queries)

    # print(lexical_results)
    # print(semantic_results)

    alpha = 0.55
    beta  = 0.35
    k_rrf = 60
    w_rrf = 0.25 

    relevance_documents = {}

    for qid in queries.keys():
        lex_list = lexical_results.get(qid, [])
        sem_list = semantic_results.get(qid, [])

        lexical_rank   = {doc_id: rank + 1 for rank, (doc_id, _) in enumerate(lex_list)}
        semantic_rank  = {doc_id: rank + 1 for rank, (doc_id, _) in enumerate(sem_list)}
        lexical_score  = {doc_id: score for (doc_id, score) in lex_list}
        semantic_score = {doc_id: score for (doc_id, score) in sem_list}

        # Normalizando os scores [0,1]
        def normalize_score_map(score_map):
            if not score_map: return {}
            values = list(score_map.values())
            mn, mx = min(values), max(values)
            if mx == mn:
                return {doc: 1.0 for doc in score_map}
            return {doc: (s - mn) / (mx - mn) for doc, s in score_map.items()}

        norm_lex = normalize_score_map(lexical_score)
        norm_sem = normalize_score_map(semantic_score)

        candidate_docs = set(lexical_rank) | set(semantic_rank)

        scored = []
        for doc_id in candidate_docs:
            # RRF baseado nos ranks
            rrf_lex = 1.0 / (k_rrf + lexical_rank[doc_id]) if doc_id in lexical_rank else 0.0
            rrf_sem = 1.0 / (k_rrf + semantic_rank[doc_id]) if doc_id in semantic_rank else 0.0
            rrf_sum = rrf_lex + rrf_sem

            nlex = norm_lex.get(doc_id, 0.0)
            nsem = norm_sem.get(doc_id, 0.0)

            # Juntanto os scores --> scores normalizados + RRF
            combined = (alpha * nlex) + (beta * nsem) + (w_rrf * rrf_sum)

            scored.append((doc_id, combined))

        scored.sort(key=lambda x: x[1], reverse=True)
        # relevance_documents[qid] = scored[:top_k]
        relevance_documents[qid] = scored[:10]


    for doc_id, score in relevance_documents["QF1"]:
        print(f"DOC_ID={doc_id} SCORE={score}")
   
    return relevance_documents

#### Busca Criativa

In [37]:
# Implemente sua própria estratégia de busca, podendo ela ser esparsa, densa ou híbrida. Implemente algo como "more_like_this", "BM35", "fuzzy" etc.
def creative_search_v1(queries: dict[str, str]):
    # 1. Extrai entidades da PERGUNTA do usuário
    resultados_queries = {}

    for query_id, query in queries.items():
        ents_query = extrair_features(query)
        print(f"Entidades da {query_id}, {query}", ents_query)
        q_per = ents_query["PER"]
        q_org = ents_query["ORG"]
        q_loc = ents_query["LOC"]
        q_noun = ents_query["NOUNS"]
        q_propn = ents_query["PROPN"]
        for processor in preprocessing_steps:
            query = processor(query)

        
        
        
        # Gera vetor da query
        query_vector = model.encode(query).tolist()
        
        # 2. Monta a Query com Boost de Entidade
        body = {
            "size": 10,
            "query": {
                "bool": {
                    # A. O "Must" garante que o assunto seja relevante (Híbrida Simples)
                    "must": [
                        {
                            "bool": {
                                "should": [
                                    {"match": {"processed_text": query}}, # BM25
                                    {"knn": {"field": "embeddings", "query_vector": query_vector, "k": 10, "boost": 0.5}} # Vetorial
                                ]
                            }
                        }
                    ],
                    
                    # B. O "Should" é o bônus da Busca Criativa
                    # Se bater a entidade, a nota sobe.
                    "should": [
                        {"terms": {"entidades_per": q_per, "boost": 5.0}},
                        {"terms": {"entidades_org": q_org, "boost": 5.0}},
                        {"terms": {"entidades_loc": q_loc, "boost": 4.0}},

                        {"terms": {"entidades_noun": q_noun, "boost": 4.0}},
                        {"terms": {"entidades_propn": q_propn, "boost": 3.5}},

                    ]
                }
            }
        }
        
        # Executa a busca
        res = es.search(index=index_name, body=body)
        
        # Formata retorno apenas com doc_id (conforme pedido na competição)
        resultados = []
        for hit in res['hits']['hits']:
            resultados.append((hit['_source']['doc_id'], hit["_score"]))
        
        resultados_queries[query_id] = resultados
        
    return resultados_queries

In [29]:
# Implemente sua própria estratégia de busca, podendo ela ser esparsa, densa ou híbrida. Implemente algo como "more_like_this", "BM35", "fuzzy" etc.
def creative_search_v2(queries: dict[str, str]):
    size = 10

    resultados_queries = {}

    for query_id, query in queries.items():
        processed_query = query
        for processor in preprocessing_steps:
            processed_query = processor(processed_query)

        ents = extrair_features(query)
        q_per   = ents.get("PER", []) or []
        q_org   = ents.get("ORG", []) or []
        q_loc   = ents.get("LOC", []) or []
        q_noun  = ents.get("NOUNS", []) or []
        q_propn = ents.get("PROPN", []) or []

        should_clauses = []

        # 1) "more_like_this" em "full_text" p/ ajudar a encontrar documentos semanticamente semelhantes
        mlt_clause = {
            "more_like_this": {
                "fields": ["full_text"],
                "like": [query],
                "min_term_freq": 1,
                "min_doc_freq": 1,
                "max_query_terms": 25
            }
        }
        should_clauses.append({"bool": {"must": mlt_clause}})

        # 2) "multi_match" em "processed_text" com fuzziness e cross_fields
        multi_match_clause = {
            "multi_match": {
                "query": processed_query,
                "fields": ["processed_text"],
                "type": "best_fields",
                "operator": "and",     # prefira documentos que correspondam a todos os termos da pesquisa
                "fuzziness": "AUTO",
                "prefix_length": 2
            }
        }
        should_clauses.append(multi_match_clause)

        # 3) Boosted entity com matches exatas: usando termo/match nos entity fields
        #    Documentos que contenham named entities da query serao favorecidos
        entity_boosts = [
            ("entidades_per", q_per, 6.0),
            ("entidades_org", q_org, 5.0),
            ("entidades_loc", q_loc, 4.0),
            ("entidades_propn", q_propn, 3.5),
            ("entidades_noun", q_noun, 2.5),
        ]

        for field, values, boost in entity_boosts:
            if values:
                for v in values:
                    v_proc = v.strip()
                    if not v_proc: continue
                    should_clauses.append({
                        "match": {
                            field: {
                                "query": v_proc,
                                "boost": float(boost),
                                "operator": "and"
                            }
                        }
                    })

        # 4) Uma correspondência de frases simplificada em "processed_text" para priorizar match de frases (maior precisão)
        phrase_clause = {
            "match_phrase": {
                "processed_text": {
                    "query": processed_query,
                    "slop": 2,
                    "boost": 1.8
                }
            }
        }
        should_clauses.append(phrase_clause)

        # Cria a query final
        body = {
            "size": size,
            "query": {
                "bool": {
                        # Pelo menos uma das classes vai dar match, mas prefira usar multiplas matches
                        "should": should_clauses,
                        "minimum_should_match": 1,
                }
            }
        }


        # Executa a busca
        res = es.search(index=index_name, body=body)

        resultados = []
        for hit in res.get("hits", {}).get("hits", []):
            doc_id = hit.get("_source", {}).get("doc_id", hit.get("_id"))
            score  = hit.get("_score", 0.0)
            resultados.append((doc_id, score))

        resultados_queries[query_id] = resultados

    return resultados_queries

#### Execução das buscas

In [38]:
search_functions = [
    ("lexical", lexical_search),
    ("semantic", semantic_search),
    ("hybrid_v1", hybrid_search_v1),
    ("hybrid_v2", hybrid_search_v2),
    ("creative_v1", creative_search_v1),
    ("creative_v2", creative_search_v2)
]

def run_all_searches(queries: dict[str, str]):
    all_search_results = {}
    for search_name, search_function in search_functions:
        results = search_function(queries)
        all_search_results[search_name] = results
    return all_search_results

run_all_searches(queries)

{'QF1': [(1429, 16.823084), (299, 16.192177), (140, 16.191858), (1689, 15.490684), (2328, 14.387486), (1398, 14.052032), (1735, 13.307976), (410, 12.483434), (1887, 12.423523), (2021, 12.145069)], 'QF2': [(1594, 12.996189), (1237, 12.85702), (1621, 12.774794), (316, 12.742256), (2208, 12.732671), (15, 12.723009), (1067, 12.273449), (510, 12.072852), (122, 11.790132), (1793, 11.518058)], 'QP1': [(2314, 11.017654), (303, 10.456556), (960, 9.499834), (1394, 9.259333), (2519, 8.568039), (2436, 8.433064), (1892, 8.372154), (1552, 8.238545), (1296, 7.72898), (294, 7.6265297)], 'QP2': [(814, 15.349197), (1388, 13.525953), (180, 12.836807), (1992, 11.048485), (1133, 8.7437725), (1578, 8.4807625), (2484, 7.2963786), (1235, 7.2920065), (2550, 7.282001), (2492, 7.243937)]}
{'QF1': [(1398, 1.6246119), (2188, 1.5283167), (1429, 1.5064515), (2328, 1.505476), (2555, 1.4939659), (333, 1.4828167), (177, 1.4721811), (410, 1.47214), (2410, 1.4565318), (1125, 1.4546205)], 'QF2': [(1237, 1.8076564), (1722,

{'lexical': {'QF1': [(1429, 16.823084),
   (299, 16.192177),
   (140, 16.191858),
   (1689, 15.490684),
   (2328, 14.387486),
   (1398, 14.052032),
   (1735, 13.307976),
   (410, 12.483434),
   (1887, 12.423523),
   (2021, 12.145069)],
  'QF2': [(1594, 12.996189),
   (1237, 12.85702),
   (1621, 12.774794),
   (316, 12.742256),
   (2208, 12.732671),
   (15, 12.723009),
   (1067, 12.273449),
   (510, 12.072852),
   (122, 11.790132),
   (1793, 11.518058)],
  'QP1': [(2314, 11.017654),
   (303, 10.456556),
   (960, 9.499834),
   (1394, 9.259333),
   (2519, 8.568039),
   (2436, 8.433064),
   (1892, 8.372154),
   (1552, 8.238545),
   (1296, 7.72898),
   (294, 7.6265297)],
  'QP2': [(814, 15.349197),
   (1388, 13.525953),
   (180, 12.836807),
   (1992, 11.048485),
   (1133, 8.7437725),
   (1578, 8.4807625),
   (2484, 7.2963786),
   (1235, 7.2920065),
   (2550, 7.282001),
   (2492, 7.243937)]},
 'semantic': {'QF1': [(1398, 1.6246119),
   (2188, 1.5283167),
   (1429, 1.5064515),
   (2328, 1.505

#### Analise os resultados da busca e aprimore a busca!

In [39]:
all_search_results = run_all_searches(queries)
search_results_df = pd.DataFrame(all_search_results)
search_results_df

{'QF1': [(1429, 16.823084), (299, 16.192177), (140, 16.191858), (1689, 15.490684), (2328, 14.387486), (1398, 14.052032), (1735, 13.307976), (410, 12.483434), (1887, 12.423523), (2021, 12.145069)], 'QF2': [(1594, 12.996189), (1237, 12.85702), (1621, 12.774794), (316, 12.742256), (2208, 12.732671), (15, 12.723009), (1067, 12.273449), (510, 12.072852), (122, 11.790132), (1793, 11.518058)], 'QP1': [(2314, 11.017654), (303, 10.456556), (960, 9.499834), (1394, 9.259333), (2519, 8.568039), (2436, 8.433064), (1892, 8.372154), (1552, 8.238545), (1296, 7.72898), (294, 7.6265297)], 'QP2': [(814, 15.349197), (1388, 13.525953), (180, 12.836807), (1992, 11.048485), (1133, 8.7437725), (1578, 8.4807625), (2484, 7.2963786), (1235, 7.2920065), (2550, 7.282001), (2492, 7.243937)]}
{'QF1': [(1398, 1.6246119), (2188, 1.5283167), (1429, 1.5064515), (2328, 1.505476), (2555, 1.4939659), (333, 1.4828167), (177, 1.4721811), (410, 1.47214), (2410, 1.4565318), (1125, 1.4546205)], 'QF2': [(1237, 1.8076564), (1722,

Unnamed: 0,lexical,semantic,hybrid_v1,hybrid_v2,creative_v1,creative_v2
QF1,"[(1429, 16.823084), (299, 16.192177), (140, 16...","[(1398, 1.6246119), (2188, 1.5283167), (1429, ...","[(1429, 0.032266458495966696), (1398, 0.031544...","[(1429, 0.6647828955652628), (1398, 0.58209023...","[(1429, 21.199839), (299, 20.192177), (1689, 1...","[(299, 49.295006), (1887, 44.67344), (1561, 42..."
QF2,"[(1594, 12.996189), (1237, 12.85702), (1621, 1...","[(1237, 1.8076564), (1722, 1.7621573), (1989, ...","[(1237, 0.03252247488101534), (1621, 0.0314980...","[(1237, 0.856347014966595), (1621, 0.614106656...","[(1594, 17.421227), (1237, 17.3088), (1621, 17...","[(1621, 59.45197), (1237, 57.21261), (861, 53...."
QP1,"[(2314, 11.017654), (303, 10.456556), (960, 9....","[(1237, 1.5580077), (359, 1.5565056), (2519, 1...","[(2519, 0.03125763125763126), (2314, 0.0163934...","[(2314, 0.5540983606557377), (303, 0.463028840...","[(2314, 15.017654), (303, 14.456556), (960, 13...","[(960, 46.01397), (303, 36.968796), (595, 35.6..."
QP2,"[(814, 15.349197), (1388, 13.525953), (180, 12...","[(814, 1.7237178), (1693, 1.6455503), (1023, 1...","[(814, 0.03278688524590164), (1388, 0.03128054...","[(814, 0.9081967213114754), (1388, 0.535171806...","[(814, 27.281872), (1992, 22.548485), (1388, 2...","[(814, 111.52178), (1992, 88.82996), (180, 73...."


In [42]:
# def generate_exploded_df(search_results_df):
#     exploded_search_results_df = pd.concat([search_results_df[col].explode().reset_index() for col in search_results_df.columns], axis=1)
#     exploded_search_results_df = exploded_search_results_df.apply(lambda l: [doc_id for doc_id, _ in l])
#     return exploded_search_results_df

def generate_exploded_df(search_results_df):

    df_doc_ids = search_results_df.applymap(
        lambda lst: [doc_id for doc_id, _ in lst] if isinstance(lst, list) else []
    )

    exploded_df = pd.DataFrame({
        col: df_doc_ids[col].explode().reset_index(drop=True)
        for col in df_doc_ids.columns
    })

    return exploded_df

def generate_found_docs_text_df(exploded_search_results_df, all_docs_df):
    # Recupera os ids únicos dos documentos
    documents_ids = set(exploded_search_results_df.to_numpy().flatten().tolist())

    # Salva os textos e os ids dos documetnos que foram encontrados ems usas buscas
    documents_df = all_docs_df[all_docs_df["doc_id"].isin(documents_ids)][["Texto processado", "doc_id"]]
    return documents_df

exploded_df = generate_exploded_df(search_results_df)
found_docs_text_df = generate_found_docs_text_df(exploded_df, all_docs_df=df)

def save_results_to_file(exploded_df: pd.DataFrame,
                         found_docs_text_df: pd.DataFrame,
                         exploded_df_save_filepath: str = "data/search_results.csv",
                         found_docs_text_save_filepath: str = "data/documents.csv"):
    exploded_df.to_csv(exploded_df_save_filepath)
    found_docs_text_df.to_csv(found_docs_text_save_filepath)
    print("Resultados das buscas salvos em 'data/search_results.csv'.")
    print("Documentos de interesse salvos em 'data/documents.csv'.")
    
save_results_to_file(exploded_df, found_docs_text_df)
exploded_df

Resultados das buscas salvos em 'data/search_results.csv'.
Documentos de interesse salvos em 'data/documents.csv'.


Unnamed: 0,lexical,semantic,hybrid_v1,hybrid_v2,creative_v1,creative_v2
0,1429,1398,1429,1429,1429,299
1,299,2188,1398,1398,299,1887
2,140,1429,2328,299,1689,1561
3,1689,2328,410,140,2328,1295
4,2328,2555,2188,1689,1735,773
5,1398,333,299,2328,1887,2365
6,1735,177,140,2188,140,1429
7,410,410,1689,1735,1834,1342
8,1887,2410,2555,2555,2477,1865
9,2021,1125,333,410,2205,1999


In [43]:
def generate_id_map(all_docs_df, output_csv_path):
    """
    Exports a stable mapping of ALL documents used in Elasticsearch.

    The doc_id values will exactly match those returned by search results.
    """

    required_cols = {"doc_id", "Título", "Texto processado"}
    missing = required_cols - set(all_docs_df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    id_map_df = (
        all_docs_df[["doc_id", "Título", "Texto processado"]]
        .rename(columns={
            "Título": "title",
            "Texto processado": "content"
        })
        .sort_values("doc_id")   # optional, but good for reproducibility
        .reset_index(drop=True)
    )

    id_map_df.to_csv(output_csv_path, sep=";", index=False)

    return id_map_df

generate_id_map(df, "data/id_map.csv")

Unnamed: 0,doc_id,title,content
0,0,É falso que houve megaprotesto na Alemanha con...,circular rede social foto mostrar dezena carro...
1,1,Estudo não aponta que carga viral de vacinados...,circular rede social estudo publicar revista c...
2,2,É antiga foto de manifestante em cima de carro...,"circular rede social foto homem mascarar , seg..."
3,3,Diretor da Anvisa não pediu demissão e critico...,"circular rede social diretor anvisa , agencia ..."
4,4,Jogadoras da seleção de futebol não posaram in...,circular rede imagem dois jogadora selecao bra...
...,...,...,...
2565,2565,Es falso que farmacias de Italia estén distrib...,circula en las rede sociales las farmacias en ...
2566,2566,Foto viral que mostra Faixa de Gaza após bomba...,circular rede social foto mostrar cidade ceu c...
2567,2567,É falso que STF afastou Bolsonaro do controle ...,circular rede social supremo tribunal federal ...
2568,2568,É falso que Magazine Luiza 'financia' fome dos...,em o comissao parlamentar inquerito ( cpi ) on...
