Realizando imports e download de dependências

In [None]:
!pip install gensim nltk

import string

import nltk
import numpy as np
from gensim import corpora, models, similarities
from nltk.corpus import stopwords

# Download the NLTK 'stopwords' resource that `stopwords.words('english')`
# reads during pre-processing.
nltk.download('stopwords')

Carregando corpus

Pré-processamento

In [None]:
_STOPWORDS_EN = None  # lazily-filled cache of the English stopword set


def _stopwords_ingles():
    """Return the English stopwords as a frozenset, loading them from NLTK only once."""
    global _STOPWORDS_EN
    if _STOPWORDS_EN is None:
        _STOPWORDS_EN = frozenset(stopwords.words('english'))
    return _STOPWORDS_EN


def preprocessar(texto):
    """Normalize a text into a list of content tokens.

    The text is lower-cased, every character in ``string.punctuation`` is
    removed, the result is split on whitespace and English stopwords are
    dropped.

    Args:
        texto: The raw string to tokenize.

    Returns:
        A list of lower-case tokens, in their original order, with
        punctuation and English stopwords removed.
    """
    tokens = texto.lower().translate(str.maketrans('', '', string.punctuation)).split()
    if not tokens:
        # Nothing to filter; avoid touching the stopword corpus at all.
        return tokens
    # The stopword list is loaded once and kept as a set: calling
    # stopwords.words() per token re-reads the corpus and does a linear
    # scan of a list for every membership test.
    stop = _stopwords_ingles()
    return [t for t in tokens if t not in stop]

# Tokenize every document with the shared pre-processing routine.
tokenized_docs = list(map(preprocessar, documentos))

Criando modelo TF-IDF

In [None]:
def construir_modelo_tfidf(perguntas):
    """Build the TF-IDF retrieval structures for a collection of questions.

    Each question is tokenized with ``preprocessar``, a gensim dictionary is
    built from the tokens, the questions are converted to bag-of-words, a
    TF-IDF model is fitted on them and a sparse similarity index is created
    over the TF-IDF-weighted corpus.

    Args:
        perguntas: Iterable of raw question strings.

    Returns:
        A 4-tuple ``(dicionario, modelo_tfidf, index, textos)`` holding the
        dictionary, the TF-IDF model, the similarity index and the list of
        tokenized questions.
    """
    tokens_por_pergunta = list(map(preprocessar, perguntas))
    vocabulario = corpora.Dictionary(tokens_por_pergunta)
    bolsas = list(map(vocabulario.doc2bow, tokens_por_pergunta))
    tfidf = models.TfidfModel(bolsas)
    # The index needs the vocabulary size to know the width of each vector.
    indice = similarities.SparseMatrixSimilarity(
        tfidf[bolsas],
        num_features=len(vocabulario),
    )
    return vocabulario, tfidf, indice, tokens_por_pergunta

# Build the dictionary, TF-IDF model and similarity index from `perguntas`
# (defined outside this cell) and report the vocabulary size.
dicionario, modelo_tfidf, index, textos = construir_modelo_tfidf(perguntas)
print(f"Vocabulário com {len(dicionario)} termos.")

Função da consulta

In [None]:
def responder_consulta(consulta, dicionario, modelo_tfidf, index, perguntas, categorias, top_k=5):
    """Print the questions most similar to a free-text query.

    The query is tokenized with ``preprocessar``, converted to bag-of-words
    with ``dicionario``, weighted by ``modelo_tfidf`` and scored against
    ``index``. Results are printed in descending score order.

    Args:
        consulta: Raw query string.
        dicionario: Object providing ``doc2bow(tokens)``.
        modelo_tfidf: Model indexed with a bag-of-words to obtain its
            TF-IDF representation.
        index: Similarity index; indexing it with the query vector yields
            one score per indexed question.
        perguntas: Sequence of questions, looked up by result position.
        categorias: Sequence of class labels, looked up by result position.
        top_k: Number of results to print (used as a slice bound).

    Returns:
        None. Output is written to stdout.
    """
    vetor_consulta = modelo_tfidf[dicionario.doc2bow(preprocessar(consulta))]

    # Pair each score with its position, then order from best to worst.
    pontuados = sorted(enumerate(index[vetor_consulta]), key=lambda par: -par[1])

    print(f"\nConsulta: {consulta}\n")
    print("Top resultados:")
    for posicao, pontuacao in pontuados[:top_k]:
        print(f"[{pontuacao:.2f}] {perguntas[posicao]} (Classe: {categorias[posicao]})")

Testes de consulta

In [None]:
# Smoke test: run one example query against the index built above.
responder_consulta("Who leads Brazil?", dicionario, modelo_tfidf, index, perguntas, categorias)

Avaliação

In [None]:
def calcular_precision_recall_f1(ranking, relevantes):
    """Compute precision, recall and F1 at every cut-off of a ranking.

    Args:
        ranking: Ordered iterable of retrieved document ids (best first).
        relevantes: Sized container of relevant document ids; must support
            ``len()`` and the ``in`` operator.

    Returns:
        A tuple ``(precisions, recalls, f1s)`` of three lists, each with one
        entry per position of ``ranking``. Entry ``k-1`` holds the metric
        computed over the first ``k`` results. Recall is 0 when there are no
        relevant documents, and F1 is 0 when precision + recall is 0.
    """
    n_relevantes = len(relevantes)
    acertos = 0
    curva_p, curva_r, curva_f1 = [], [], []

    for posicao, doc_id in enumerate(ranking, start=1):
        acertos += int(doc_id in relevantes)

        p_k = acertos / posicao
        if n_relevantes > 0:
            r_k = acertos / n_relevantes
        else:
            r_k = 0

        soma = p_k + r_k
        if soma > 0:
            f1_k = 2 * p_k * r_k / soma
        else:
            f1_k = 0

        curva_p.append(p_k)
        curva_r.append(r_k)
        curva_f1.append(f1_k)

    return curva_p, curva_r, curva_f1

def precision_11_points(precisions, recalls):
    """Compute the 11-point interpolated precision curve.

    For each standard recall level 0.0, 0.1, ..., 1.0 the interpolated
    precision is the maximum precision observed at any cut-off whose recall
    is greater than or equal to that level, or 0.0 if no cut-off reaches it.

    Args:
        precisions: Precision at each cut-off of a ranking.
        recalls: Recall at each cut-off, aligned with ``precisions``.

    Returns:
        A tuple ``(recall_levels, precision_at_recall)``: a NumPy array with
        the 11 recall levels and a list with the interpolated precision at
        each level.
    """
    # Build the levels as i / 10 rather than np.linspace(0, 1.0, 11):
    # linspace yields 0.30000000000000004, 0.6000000000000001 and
    # 0.7000000000000001, so a recall of exactly 3/10, 6/10 or 7/10 would
    # fail the `rc >= r` test below and be dropped from its own level.
    recall_levels = np.arange(11) / 10.0
    precision_at_recall = []

    for r in recall_levels:
        reached = [p for p, rc in zip(precisions, recalls) if rc >= r]
        # Interpolated precision: best precision at this recall or beyond.
        precision_at_recall.append(max(reached, default=0.0))

    return recall_levels, precision_at_recall

In [None]:
# Score every evaluation query: rank with TF-IDF, then compute the
# per-cut-off precision / recall / F1 curves against its ground truth.
tfidf_results = dict()

for query in consultas:
    ranking, relevantes = ranquear_tfidf(query), ground_truth[query]
    p, r, f1 = calcular_precision_recall_f1(ranking, relevantes)
    tfidf_results.update({query: (p, r, f1)})

# Plot one 11-point interpolated precision/recall curve per query.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 6))

for i, query in enumerate(consultas):
    precisions, recalls, _ = tfidf_results[query]
    recall_levels, prec_11 = precision_11_points(precisions, recalls)
    ax.plot(recall_levels, prec_11, label=f"TF-IDF: Consulta {i+1}")

ax.set_xlabel("Recall")
ax.set_ylabel("Precision (Interpolado)")
ax.set_title("Curva Precision x Recall (TF-IDF)")
ax.grid(True)
ax.legend()
plt.show()