In [72]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("zynicide/wine-reviews")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'wine-reviews' dataset.
Path to dataset files: /kaggle/input/wine-reviews


## Importar Librerias

In [73]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk
nltk.download("punkt")
nltk.download("stopwords")
nltk.download('punkt_tab')
import re
from nltk.corpus import stopwords
from collections import defaultdict


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/wine-reviews/winemag-data_first150k.csv
/kaggle/input/wine-reviews/winemag-data-130k-v2.json
/kaggle/input/wine-reviews/winemag-data-130k-v2.csv


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Construccion del ındice
* Leer un corpus de documentos en texto plano.
* Procesamiento basico: tokenizacion, normalizacion y remocion de stopwords.
* Construccion de un ındice invertido que almacene, para cada termino, los documentos en los que aparece y su frecuencia.

In [74]:
#Ruta del Corpus
path = "/kaggle/input/wine-reviews/winemag-data-130k-v2.csv"

df = pd.read_csv(path)

print("Documentos cargados:", len(df))
display(df.head())

Documentos cargados: 129971


Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


## Preprocesamiento


In [75]:
#Limpiar columnas del dataframe
df = df.drop(['country', 'designation', 'points', 'price', 'province', 'region_1', 'region_2', 'taster_name', 'taster_twitter_handle', 'title', 'variety', 'winery' ], axis=1)
display(df.head())

Unnamed: 0.1,Unnamed: 0,description
0,0,"Aromas include tropical fruit, broom, brimston..."
1,1,"This is ripe and fruity, a wine that is smooth..."
2,2,"Tart and snappy, the flavors of lime flesh and..."
3,3,"Pineapple rind, lemon pith and orange blossom ..."
4,4,"Much like the regular bottling from 2012, this..."


In [76]:
df = df.rename(columns={'Unnamed: 0': 'wine_id'})
display(df.head())

Unnamed: 0,wine_id,description
0,0,"Aromas include tropical fruit, broom, brimston..."
1,1,"This is ripe and fruity, a wine that is smooth..."
2,2,"Tart and snappy, the flavors of lime flesh and..."
3,3,"Pineapple rind, lemon pith and orange blossom ..."
4,4,"Much like the regular bottling from 2012, this..."


In [77]:
#Funcion para preprocesar el dataframe
def clean_and_tokenize(text):
    #Convertir a minúsculas
    text = text.lower()

    #Eliminar caracteres no alfabéticos
    text = re.sub(r"[^a-záéíóúñü\s]", " ", text)

    #Tokenización
    tokens = nltk.word_tokenize(text, language="spanish")

    #Stopwords en español
    stop_es = set(stopwords.words("spanish"))

    #Remover tokens muy cortos y stopwords
    tokens = [t for t in tokens if len(t) > 2 and t not in stop_es]

    return tokens


In [78]:
#Aplicar preprocesamiento
df['tokens'] = df['description'].apply(clean_and_tokenize)
display(df.head())

Unnamed: 0,wine_id,description,tokens
0,0,"Aromas include tropical fruit, broom, brimston...","[aromas, include, tropical, fruit, broom, brim..."
1,1,"This is ripe and fruity, a wine that is smooth...","[this, ripe, and, fruity, wine, that, smooth, ..."
2,2,"Tart and snappy, the flavors of lime flesh and...","[tart, and, snappy, the, flavors, lime, flesh,..."
3,3,"Pineapple rind, lemon pith and orange blossom ...","[pineapple, rind, lemon, pith, and, orange, bl..."
4,4,"Much like the regular bottling from 2012, this...","[much, like, the, regular, bottling, from, thi..."


## Generar indice invertido

In [79]:
#Función para construir el índice invertido
from collections import defaultdict, Counter

def construir_indice_invertido(df):
    indice_invertido = defaultdict(dict)

    for index, row in df.iterrows():
        id_documento = index
        tokens = row['tokens']   # ESTO YA ES UNA LISTA

        # Contar frecuencia de cada token
        frecuencias = Counter(tokens)

        for palabra, freq in frecuencias.items():
            indice_invertido[palabra][id_documento] = freq

    return indice_invertido


In [80]:
#Construir el índice invertido
indice = construir_indice_invertido(df)

## Modelo de recuperacion
* Implementar recuperacion basada en similitud Jaccard utilizando vectores binarios
* Implementar recuperacion basada en similitud de coseno utilizando TF-IDF
* Implementar recuperacion con BM25.
* Permitir la ejecucion de consultas de texto libre.
* Mostrar un ranking de documentos ordenados por relevancia.

## Similitud Jaccard

In [81]:
#Funcion de similitud jaccard
def similitud_jaccard(lista1, lista2):
    set1 = set(lista1)
    set2 = set(lista2)

    interseccion = len(set1.intersection(set2))
    union = len(set1.union(set2))

    if union == 0:
        return 0.0

    return interseccion / union

In [82]:
#Funcion para recuperacion de datos
def recuperar_por_jaccard(df, query_tokens, top_k=10):
    resultados = []

    for index, row in df.iterrows():
        tokens_doc = row['tokens']
        score = similitud_jaccard(query_tokens, tokens_doc)
        resultados.append((index, score))

    # Ordenar por score de mayor a menor
    resultados = sorted(resultados, key=lambda x: x[1], reverse=True)

    return resultados[:top_k]


In [83]:
#Funcionalidad
consulta = ["fruity", "aroma", "wine"]

resultados = recuperar_por_jaccard(df, consulta, top_k=5)

#Visualizar resultados
for doc_id, score in resultados:
    print(f"Doc: {doc_id}, Similitud: {score}")
    print(df.loc[doc_id, 'description'])
    print("-----")

Doc: 6486, Similitud: 0.2
This wine has a geranium aroma and a strangely perfumed character. It is light and fruity although not likely to develop much.
-----
Doc: 48045, Similitud: 0.2
This is a straightforward, clean, fruity wine, with a green apple flavor.
-----
Doc: 25921, Similitud: 0.1875
Very ripe in aroma, almost thick in texture and syrupy in flavor, this wine is full bodied, super fruity and smooth.
-----
Doc: 108778, Similitud: 0.1875
With an aroma like cherry cola, and fruity, almost sweet flavors, this medium-bodied wine is easy to drink.
-----
Doc: 56946, Similitud: 0.18181818181818182
Light and fruity, the wine is dilute and gently textured. It is ready to drink.
-----


## Similitud de Coseno utilizando TF-ID

In [84]:
#Matriz TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Convertir lista de tokens a texto
df['texto'] = df['tokens'].apply(lambda x: " ".join(x))

# Crear el vectorizador TF-IDF
vectorizador = TfidfVectorizer()

# Ajustar y transformar documentos
tfidf_matrix = vectorizador.fit_transform(df['texto'])

In [85]:
#Calcular Similitud Coseno
from sklearn.metrics.pairwise import cosine_similarity

def recuperar_por_coseno(query_tokens, tfidf_matrix, vectorizador, top_k=10):
    # Convertir lista de tokens a texto igual que los docs
    query_text = " ".join(query_tokens)

    # Vectorizar la consulta con el mismo vocabulario
    query_vec = vectorizador.transform([query_text])

    # Calcular similitud del coseno entre la consulta y todos los documentos
    similitudes = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # Obtener los documentos con mayor similitud
    top_docs = similitudes.argsort()[::-1][:top_k]

    # Retornar (doc_id, score)
    resultados = [(doc_id, similitudes[doc_id]) for doc_id in top_docs]

    return resultados

In [86]:
#Funcionalidad
consulta = ["fruity", "aroma", "wine"]

resultados = recuperar_por_coseno(consulta, tfidf_matrix, vectorizador, top_k=5)

#Visualizacion
for doc, score in resultados:
    print(f"Doc {doc} → Score: {score:.4f}")
    print(df.loc[doc, 'description'])
    print("-----")

Doc 108778 → Score: 0.5110
With an aroma like cherry cola, and fruity, almost sweet flavors, this medium-bodied wine is easy to drink.
-----
Doc 25921 → Score: 0.4186
Very ripe in aroma, almost thick in texture and syrupy in flavor, this wine is full bodied, super fruity and smooth.
-----
Doc 14546 → Score: 0.4124
This is a light, soft and creamy wine, with an apricot aroma and flavor, fresh acidity and an attractive fruity aftertaste. It is crisp in green apple and grapefruit notes on the finish.
-----
Doc 120116 → Score: 0.3998
A vanilla aroma is followed by old oak flavors and sweetly textured fruits. Not a wine for aging, this is already soft and fruity.
-----
Doc 84146 → Score: 0.3992
In the house style of this producer this wine is soft and fruity. Its strawberry aroma is followed by a gentle, ripe wine with red berries and balanced acidity. The wine is ready to drink.
-----


## Recuperacion BM25

In [87]:
#Clase BM25
import math
from collections import Counter, defaultdict

class BM25:
    def __init__(self, documentos, k1=1.2, b=0.75):
        self.documentos = documentos
        self.N = len(documentos)
        self.k1 = k1
        self.b = b

        # Longitud de cada documento
        self.doc_len = [len(doc) for doc in documentos]
        self.avgdl = sum(self.doc_len) / self.N

        # Frecuencias de término por documento
        self.term_freqs = []
        for doc in documentos:
            self.term_freqs.append(Counter(doc))

        # Frecuencia en cuántos documentos aparece cada término
        self.df = defaultdict(int)
        for tf in self.term_freqs:
            for term in tf.keys():
                self.df[term] += 1

        # IDF de cada término
        self.idf = {}
        for term, df in self.df.items():
            self.idf[term] = math.log(1 + (self.N - df + 0.5) / (df + 0.5))

    def score(self, query_tokens, doc_id):
        score = 0.0
        doc_tf = self.term_freqs[doc_id]
        dl = self.doc_len[doc_id]

        for term in query_tokens:
            if term not in doc_tf:
                continue

            f = doc_tf[term]
            idf = self.idf.get(term, 0)

            s = idf * (f * (self.k1 + 1)) / (f + self.k1 * (1 - self.b + self.b * dl / self.avgdl))
            score += s

        return score

    def search(self, query_tokens, top_k=10):
        scores = []
        for doc_id in range(self.N):
            s = self.score(query_tokens, doc_id)
            scores.append((doc_id, s))

        # Ordenar de mayor a menor score
        scores = sorted(scores, key=lambda x: x[1], reverse=True)
        return scores[:top_k]

    def get_scores(self, query):
        return [self.score(query, i) for i in range(self.N)]


In [88]:
#Inicializar BM25 en df
bm25 = BM25(df['tokens'].tolist())

In [89]:
#Funcionalidad
consulta = ["fruity", "aroma", "wine"]

resultados = bm25.search(consulta, top_k=5)

#Visualizacion
for doc_id, score in resultados:
    print(f"Doc: {doc_id}  →  Score: {score:.4f}")
    print(df.loc[doc_id, 'description'])
    print("-----")

Doc: 6486  →  Score: 10.0350
This wine has a geranium aroma and a strangely perfumed character. It is light and fruity although not likely to develop much.
-----
Doc: 108778  →  Score: 10.0350
With an aroma like cherry cola, and fruity, almost sweet flavors, this medium-bodied wine is easy to drink.
-----
Doc: 25921  →  Score: 9.8838
Very ripe in aroma, almost thick in texture and syrupy in flavor, this wine is full bodied, super fruity and smooth.
-----
Doc: 52725  →  Score: 9.8838
A caramel aroma is followed by sweet strawberry fruit from the Touriga Nacional, lending a wine that is freshly fruity and soft.
-----
Doc: 52014  →  Score: 9.7371
This fruity wine has an aroma of tobacco and a spicy black-currant taste. It is full bodied, and it should be kept for another year.
-----


## Consultas de texto libre

In [112]:
#Funcion para procesar una Query entrante
def preprocesar_consulta(query):
    if isinstance(query, list):
        query = " ".join(query)

    query = query.lower()
    query = re.sub(r'[^a-zñáéíóúü\s]', ' ', query)
    query = re.sub(r'\s+', ' ', query).strip()

    tokens = query.split()
    return tokens

In [91]:
#Consultar con BM25
def buscar_bm25_texto_libre(query, bm25, df, top_k=10):
    tokens_query = preprocesar_consulta(query)
    resultados = bm25.search(tokens_query, top_k=top_k)

    return resultados

In [92]:
#Consultar con TF-IDF + Coseno
def buscar_coseno_texto_libre(query, vectorizador, tfidf_matrix, df, top_k=10):
    tokens = preprocesar_consulta(query)
    query_text = " ".join(tokens)

    query_vec = vectorizador.transform([query_text])

    from sklearn.metrics.pairwise import cosine_similarity
    similitudes = cosine_similarity(query_vec, tfidf_matrix).flatten()

    top_docs = similitudes.argsort()[::-1][:top_k]
    resultados = [(doc, similitudes[doc]) for doc in top_docs]

    return resultados

In [93]:
def buscar_jaccard_texto_libre(query, df, top_k=10):
    tokens = preprocesar_consulta(query)

    resultados = []

    for index, row in df.iterrows():
        score = similitud_jaccard(tokens, row['tokens'])
        resultados.append((index, score))

    resultados = sorted(resultados, key=lambda x: x[1], reverse=True)
    return resultados[:top_k]

## Ingreso de consulta de texto libre

In [113]:
#Ingreso de la Query
query_usuario = input("Ingrese su consulta de texto libre: ")

Ingrese su consulta de texto libre: acid wine fruit


In [114]:
#Resultados query_usuario BM25
resultados = buscar_bm25_texto_libre(
    query_usuario,
    bm25,
    df,
    top_k=5
)

for doc_id, score in resultados:
    print(f"Doc {doc_id} → Score {score:.4f}")
    print(df.loc[doc_id, 'description'])
    print("-----")


Doc 40561 → Score 8.0931
This wine is a blend of Sangiovese (65%) and Barbera. Cranberry, raspberry and baking spice aromas are followed by generous tart fruit flavors with an electric jolt of acid. The acid is quite bracing to have on its own. Pair it with pasta with a red sauce.
-----
Doc 57248 → Score 8.0931
This wine is a blend of Sangiovese (65%) and Barbera. Cranberry, raspberry and baking spice aromas are followed by generous tart fruit flavors with an electric jolt of acid. The acid is quite bracing to have on its own. Pair it with pasta with a red sauce.
-----
Doc 64633 → Score 7.9681
A tight, freshly acid wine, with stalkiness but also good juicy fruit. There is a green pepper element, leaving a wine that is light, simple.
-----
Doc 123140 → Score 7.9681
A tight, freshly acid wine, with stalkiness but also good juicy fruit. There is a green pepper element, leaving a wine that is light, simple.
-----
Doc 3671 → Score 7.9516
A fresh, high-acid wine, with flavors of raspberry an

In [96]:
#Resultados query_usuario TF-IDF Coseno
resultados = buscar_coseno_texto_libre(
    query_usuario,
    vectorizador,
    tfidf_matrix,
    df
)

#Visualizacion
for doc, score in resultados:
    print(f"Doc {doc} → Score: {score:.4f}")
    print(df.loc[doc, 'description'])
    print("-----")

Doc 17104 → Score: 0.5153
This is a rich, smooth wine that has ripe fruit and plenty of smooth plum flavors. It has some spice, balanced acidity and a fine dense texture. The wine, with its fresh aftertaste is ready to drink.
-----
Doc 19432 → Score: 0.4351
With its apricot and ripe pear aromas, this is a full and rounded wine. It has a good balance between acidity and ripe fruit. It is floral, smooth and ready to drink.
-----
Doc 22468 → Score: 0.4351
With its apricot and ripe pear aromas, this is a full and rounded wine. It has a good balance between acidity and ripe fruit. It is floral, smooth and ready to drink.
-----
Doc 57769 → Score: 0.4253
This soft wine has a smooth texture and and a ripe strawberry flavor. It is rounded, easy and approachable.
-----
Doc 42926 → Score: 0.4253
This soft wine has a smooth texture and and a ripe strawberry flavor. It is rounded, easy and approachable.
-----
Doc 55131 → Score: 0.4208
The wine is smooth, soft and juicy. It has a fine balance, ripe 

In [97]:
#Funcionalidad
resultados = buscar_jaccard_texto_libre(query_usuario, df, top_k=5)

#Visualizar resultados
for doc_id, score in resultados:
    print(f"Doc: {doc_id}, Similitud: {score}")
    print(df.loc[doc_id, 'description'])
    print("-----")

Doc: 42926, Similitud: 0.3333333333333333
This soft wine has a smooth texture and and a ripe strawberry flavor. It is rounded, easy and approachable.
-----
Doc: 57769, Similitud: 0.3333333333333333
This soft wine has a smooth texture and and a ripe strawberry flavor. It is rounded, easy and approachable.
-----
Doc: 35853, Similitud: 0.3076923076923077
91-93 Smooth, rich strong wine, with delicious acidity, pushed by ripe fruit and gorgeous tannins.
-----
Doc: 90932, Similitud: 0.2727272727272727
This very ripe wine tastes of supermature grapes; it' s soft and formless.
-----
Doc: 16073, Similitud: 0.26666666666666666
Smooth, rounded wine, with vanilla, ripe pear and a bitter herbal finish. The wine feels fat, unfocused.
-----


## Ranking de documentos ordenados por relevancia.

In [98]:
from sklearn.metrics.pairwise import cosine_similarity

In [99]:
#Ingresar la Query
query = input("Ingresa tu consulta de texto: ")

# Preprocesar igual que tus documentos
query_tokens = preprocesar_consulta(query)

print("\nQuery procesada:", query_tokens)

Ingresa tu consulta de texto: ripe and smooth wine

Query procesada: ['ripe', 'and', 'smooth', 'wine']


In [100]:
# JACCARD
def similitud_jaccard(query_tokens, df):
    resultados = []

    for doc_id, row in df.iterrows():
        doc_tokens = set(row["tokens"])
        query_set = set(query_tokens)

        inter = len(doc_tokens & query_set)
        union = len(doc_tokens | query_set)

        score = inter / union if union > 0 else 0
        resultados.append((doc_id, score))

    return sorted(resultados, key=lambda x: x[1], reverse=True)


ranking_jaccard = similitud_jaccard(query_tokens, df)

In [101]:
#COSENO TF-IDF
def similitud_coseno_tfidf(query):
    query_vec = vectorizador.transform([query])
    scores = cosine_similarity(query_vec, tfidf_matrix)[0]

    ranking = list(enumerate(scores))
    return sorted(ranking, key=lambda x: x[1], reverse=True)


ranking_coseno = similitud_coseno_tfidf(query)

In [102]:
!pip install rank_bm25




In [103]:
#BM25
def buscar_bm25(query_tokens):
    scores = bm25.get_scores(query_tokens)
    ranking = list(enumerate(scores))  # (doc_id, score)
    return sorted(ranking, key=lambda x: x[1], reverse=True)



ranking_bm25 = buscar_bm25(query_tokens)

In [104]:
#MOSTRAR RANKINGS
def mostrar_ranking(nombre, ranking, top=10):
    print(f"\n========== RANKING: {nombre} ==========")
    for doc_id, score in ranking[:top]:
        print(f"Doc {doc_id}  →  Score: {score:.4f}")

print(f"Query_usuario:  {query}")

mostrar_ranking("Jaccard", ranking_jaccard)
mostrar_ranking("Coseno TF-IDF", ranking_coseno)
mostrar_ranking("BM25", ranking_bm25)


Query_usuario:  ripe and smooth wine

Doc 42926  →  Score: 0.3333
Doc 57769  →  Score: 0.3333
Doc 35853  →  Score: 0.3077
Doc 90932  →  Score: 0.2727
Doc 16073  →  Score: 0.2667
Doc 25921  →  Score: 0.2500
Doc 29931  →  Score: 0.2500
Doc 35766  →  Score: 0.2500
Doc 53889  →  Score: 0.2500
Doc 61155  →  Score: 0.2500

Doc 17104  →  Score: 0.5153
Doc 19432  →  Score: 0.4351
Doc 22468  →  Score: 0.4351
Doc 42926  →  Score: 0.4253
Doc 57769  →  Score: 0.4253
Doc 55131  →  Score: 0.4208
Doc 39012  →  Score: 0.4174
Doc 129208  →  Score: 0.4152
Doc 25587  →  Score: 0.4147
Doc 96207  →  Score: 0.4116

Doc 75613  →  Score: 7.0665
Doc 35853  →  Score: 7.0196
Doc 17104  →  Score: 6.9938
Doc 16073  →  Score: 6.9338
Doc 42926  →  Score: 6.9190
Doc 57769  →  Score: 6.9190
Doc 87889  →  Score: 6.8893
Doc 54207  →  Score: 6.8814
Doc 10808  →  Score: 6.8089
Doc 31553  →  Score: 6.8077


## Interfaz Basica

In [105]:
import argparse

In [106]:

def cli():
    print("\n======================================")
    print("     MOTOR DE BÚSQUEDA     ")
    print("======================================")
    print("Métodos disponibles:")
    print("  1) Jaccard")
    print("  2) Coseno (TF-IDF)")
    print("  3) BM25")
    print("  4) Todos")
    print("  5) Salir")

    while True:
        opcion = input("\nSeleccione una opción (1-5): ")

        if opcion == "5":
            print("Saliendo del sistema...")
            break

        query = input("\nEscribe tu consulta de texto: ")

        # Preprocesar como tus documentos
        query_tokens = preprocesar_consulta(query)
        query_proc = " ".join(query_tokens)

        if opcion == "1":
            ranking = similitud_jaccard(query_tokens, df)
            mostrar_ranking("Jaccard", ranking)

        elif opcion == "2":
            ranking = similitud_coseno_tfidf(query_proc)
            mostrar_ranking("Coseno TF-IDF", ranking)

        elif opcion == "3":
            ranking = buscar_bm25(query_tokens)
            mostrar_ranking("BM25", ranking)

        elif opcion == "4":
            mostrar_ranking("Jaccard", similitud_jaccard(query_tokens, df))
            mostrar_ranking("Coseno TF-IDF", similitud_coseno_tfidf(query_proc))
            mostrar_ranking("BM25", buscar_bm25(query_tokens))

        else:
            print("Opción no válida. Intenta nuevamente.")


# Ejecuta la CLI
cli()


     MOTOR DE BÚSQUEDA     
Métodos disponibles:
  1) Jaccard
  2) Coseno (TF-IDF)
  3) BM25
  4) Todos
  5) Salir

Seleccione una opción (1-5): 5
Saliendo del sistema...


## Evaluacion de resultados
* Usar un conjunto de consultas de prueba y documentos relevantes (qrels).
* Calcular para cada consulta:
Precision
Recall
* Calcular para todo el sistema:
MAP

In [127]:
#Queries de consulta
consultas = {
    "q1": "tropical fruit",
    "q2": "ripe and smooth wine",
    "q3": "pineaple orange lemon"
}



In [163]:
#Qrels
qrels = {
    "q1": {0, 41893, 71237, 31131 ,45231, 31321, 31233,1231, 3212},
    "q2": {1, 17104, 25921, 4232, 1232, 334, 1212, 4566},
    "q3": {3, 18756, 23578, 4323, 8765, 33}
}


In [152]:
#Obtener los tokens de cada querie
def obtener_query_tokens(consultas):
    query_tokens_dict = {}
    for qid, texto in consultas.items():
        query_tokens_dict[qid] = texto
    return query_tokens_dict


In [164]:
query_tokens_dict = obtener_query_tokens(consultas)

In [161]:
def precision(ranking, relevantes):

    ranking = list(ranking)
    relevantes = set(relevantes)

    if len(ranking) == 0:
        return 0

    hits = sum(1 for doc in ranking if doc in relevantes)
    return hits / len(ranking) * 1000000


In [174]:
def recall(ranking, relevantes):
    ranking = list(ranking)
    relevantes = set(relevantes)

    if len(relevantes) == 0:
        return 0.0

    hits = sum(1 for doc in ranking if doc in relevantes)
    recall = hits / len(relevantes)

    return recall * 100


### Evaluacion TF-IDF

In [155]:
ranking = similitud_coseno_tfidf(query_tokens_dict["q1"])
ranking_ids = [doc_id for doc_id, score in ranking]

In [162]:
print("Precisión:", precision(ranking_ids, qrels['q1']))

Precisión: 69.24621646367267


In [175]:
print("Recall:", recall(ranking_ids, qrels['q1']))

Recall: 100.0


### Evaluacion Jaccard

In [166]:
ranking_J = similitud_jaccard(query_tokens_dict["q2"], df)
ranking_ids_J = [doc_id for doc_id, score in ranking_J]

In [167]:
print("Precisión:", precision(ranking_ids_J, qrels['q2']))

Precisión: 61.55219241215348


In [177]:
print("Recall:", recall(ranking_ids_J, qrels['q2']))

Recall: 100.0


### Evaluacion BM25

In [168]:
ranking_BM = buscar_bm25(query_tokens_dict["q3"])
ranking_ids_BM = [doc_id for doc_id, score in ranking_BM]

In [178]:
print("Precisión:", precision(ranking_ids_BM, qrels['q3']))

Precisión: 46.16414430911511


In [179]:
print("Recall:", recall(ranking_ids_BM, qrels['q3']))

Recall: 100.0
