In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("zynicide/wine-reviews")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'wine-reviews' dataset.
Path to dataset files: /kaggle/input/wine-reviews


## Importar Librerias

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk
nltk.download("punkt")
nltk.download("stopwords")
nltk.download('punkt_tab')
import re
from nltk.corpus import stopwords
from collections import defaultdict


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


/kaggle/input/wine-reviews/winemag-data_first150k.csv
/kaggle/input/wine-reviews/winemag-data-130k-v2.json
/kaggle/input/wine-reviews/winemag-data-130k-v2.csv


## Construccion del ındice
* Leer un corpus de documentos en texto plano.
* Procesamiento basico: tokenizacion, normalizacion y remocion de stopwords.
* Construccion de un ındice invertido que almacene, para cada termino, los documentos en los que aparece y su frecuencia.

In [3]:
#Ruta del Corpus
path = "/kaggle/input/wine-reviews/winemag-data-130k-v2.csv"

df = pd.read_csv(path)

print("Documentos cargados:", len(df))
display(df.head())

Documentos cargados: 129971


Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


## Preprocesamiento


In [4]:
#Limpiar columnas del dataframe
df = df.drop(['country', 'designation', 'points', 'price', 'province', 'region_1', 'region_2', 'taster_name', 'taster_twitter_handle', 'title', 'variety', 'winery' ], axis=1)
display(df.head())

Unnamed: 0.1,Unnamed: 0,description
0,0,"Aromas include tropical fruit, broom, brimston..."
1,1,"This is ripe and fruity, a wine that is smooth..."
2,2,"Tart and snappy, the flavors of lime flesh and..."
3,3,"Pineapple rind, lemon pith and orange blossom ..."
4,4,"Much like the regular bottling from 2012, this..."


In [5]:
df = df.rename(columns={'Unnamed: 0': 'wine_id'})
display(df.head())

Unnamed: 0,wine_id,description
0,0,"Aromas include tropical fruit, broom, brimston..."
1,1,"This is ripe and fruity, a wine that is smooth..."
2,2,"Tart and snappy, the flavors of lime flesh and..."
3,3,"Pineapple rind, lemon pith and orange blossom ..."
4,4,"Much like the regular bottling from 2012, this..."


In [6]:
#Funcion para preprocesar el dataframe
def clean_and_tokenize(text):
    #Convertir a minúsculas
    text = text.lower()

    #Eliminar caracteres no alfabéticos
    text = re.sub(r"[^a-záéíóúñü\s]", " ", text)

    #Tokenización
    tokens = nltk.word_tokenize(text, language="spanish")

    #Stopwords en español
    stop_es = set(stopwords.words("spanish"))

    #Remover tokens muy cortos y stopwords
    tokens = [t for t in tokens if len(t) > 2 and t not in stop_es]

    return tokens


In [7]:
#Aplicar preprocesamiento
df['tokens'] = df['description'].apply(clean_and_tokenize)
display(df.head())

Unnamed: 0,wine_id,description,tokens
0,0,"Aromas include tropical fruit, broom, brimston...","[aromas, include, tropical, fruit, broom, brim..."
1,1,"This is ripe and fruity, a wine that is smooth...","[this, ripe, and, fruity, wine, that, smooth, ..."
2,2,"Tart and snappy, the flavors of lime flesh and...","[tart, and, snappy, the, flavors, lime, flesh,..."
3,3,"Pineapple rind, lemon pith and orange blossom ...","[pineapple, rind, lemon, pith, and, orange, bl..."
4,4,"Much like the regular bottling from 2012, this...","[much, like, the, regular, bottling, from, thi..."


## Generar indice invertido

In [8]:
#Función para construir el índice invertido
from collections import defaultdict, Counter

def construir_indice_invertido(df):
    indice_invertido = defaultdict(dict)

    for index, row in df.iterrows():
        id_documento = index
        tokens = row['tokens']   # ESTO YA ES UNA LISTA

        # Contar frecuencia de cada token
        frecuencias = Counter(tokens)

        for palabra, freq in frecuencias.items():
            indice_invertido[palabra][id_documento] = freq

    return indice_invertido


In [9]:
#Construir el índice invertido
indice = construir_indice_invertido(df)

## Modelo de recuperacion
* Implementar recuperacion basada en similitud Jaccard utilizando vectores binarios
* Implementar recuperacion basada en similitud de coseno utilizando TF-IDF
* Implementar recuperacion con BM25.
* Permitir la ejecucion de consultas de texto libre.
* Mostrar un ranking de documentos ordenados por relevancia.

## Similitud Jaccard

In [10]:
#Funcion de similitud jaccard
def similitud_jaccard(lista1, lista2):
    set1 = set(lista1)
    set2 = set(lista2)

    interseccion = len(set1.intersection(set2))
    union = len(set1.union(set2))

    if union == 0:
        return 0.0

    return interseccion / union

In [11]:
#Funcion para recuperacion de datos
def recuperar_por_jaccard(df, query_tokens, top_k=10):
    resultados = []

    for index, row in df.iterrows():
        tokens_doc = row['tokens']
        score = similitud_jaccard(query_tokens, tokens_doc)
        resultados.append((index, score))

    # Ordenar por score de mayor a menor
    resultados = sorted(resultados, key=lambda x: x[1], reverse=True)

    return resultados[:top_k]


In [12]:
#Funcionalidad
consulta = ["fruity", "aroma", "wine"]

resultados = recuperar_por_jaccard(df, consulta, top_k=5)

#Visualizar resultados
for doc_id, score in resultados:
    print(f"Doc: {doc_id}, Similitud: {score}")
    print(df.loc[doc_id, 'description'])
    print("-----")

Doc: 6486, Similitud: 0.2
This wine has a geranium aroma and a strangely perfumed character. It is light and fruity although not likely to develop much.
-----
Doc: 48045, Similitud: 0.2
This is a straightforward, clean, fruity wine, with a green apple flavor.
-----
Doc: 25921, Similitud: 0.1875
Very ripe in aroma, almost thick in texture and syrupy in flavor, this wine is full bodied, super fruity and smooth.
-----
Doc: 108778, Similitud: 0.1875
With an aroma like cherry cola, and fruity, almost sweet flavors, this medium-bodied wine is easy to drink.
-----
Doc: 56946, Similitud: 0.18181818181818182
Light and fruity, the wine is dilute and gently textured. It is ready to drink.
-----


## Similitud de Coseno utilizando TF-ID

In [13]:
#Matriz TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Convertir lista de tokens a texto
df['texto'] = df['tokens'].apply(lambda x: " ".join(x))

# Crear el vectorizador TF-IDF
vectorizador = TfidfVectorizer()

# Ajustar y transformar documentos
tfidf_matrix = vectorizador.fit_transform(df['texto'])

In [14]:
#Calcular Similitud Coseno
from sklearn.metrics.pairwise import cosine_similarity

def recuperar_por_coseno(query_tokens, tfidf_matrix, vectorizador, top_k=10):
    # Convertir lista de tokens a texto igual que los docs
    query_text = " ".join(query_tokens)

    # Vectorizar la consulta con el mismo vocabulario
    query_vec = vectorizador.transform([query_text])

    # Calcular similitud del coseno entre la consulta y todos los documentos
    similitudes = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # Obtener los documentos con mayor similitud
    top_docs = similitudes.argsort()[::-1][:top_k]

    # Retornar (doc_id, score)
    resultados = [(doc_id, similitudes[doc_id]) for doc_id in top_docs]

    return resultados

In [15]:
#Funcionalidad
consulta = ["fruity", "aroma", "wine"]

resultados = recuperar_por_coseno(consulta, tfidf_matrix, vectorizador, top_k=5)

#Visualizacion
for doc, score in resultados:
    print(f"Doc {doc} → Score: {score:.4f}")
    print(df.loc[doc, 'description'])
    print("-----")

Doc 108778 → Score: 0.5110
With an aroma like cherry cola, and fruity, almost sweet flavors, this medium-bodied wine is easy to drink.
-----
Doc 25921 → Score: 0.4186
Very ripe in aroma, almost thick in texture and syrupy in flavor, this wine is full bodied, super fruity and smooth.
-----
Doc 14546 → Score: 0.4124
This is a light, soft and creamy wine, with an apricot aroma and flavor, fresh acidity and an attractive fruity aftertaste. It is crisp in green apple and grapefruit notes on the finish.
-----
Doc 120116 → Score: 0.3998
A vanilla aroma is followed by old oak flavors and sweetly textured fruits. Not a wine for aging, this is already soft and fruity.
-----
Doc 84146 → Score: 0.3992
In the house style of this producer this wine is soft and fruity. Its strawberry aroma is followed by a gentle, ripe wine with red berries and balanced acidity. The wine is ready to drink.
-----


## Recuperacion BM25

In [16]:
#Clase BM25
import math
from collections import Counter, defaultdict

class BM25:
    def __init__(self, documentos, k1=1.2, b=0.75):
        self.documentos = documentos
        self.N = len(documentos)
        self.k1 = k1
        self.b = b

        # Longitud de cada documento
        self.doc_len = [len(doc) for doc in documentos]
        self.avgdl = sum(self.doc_len) / self.N

        # Frecuencias de término por documento
        self.term_freqs = []
        for doc in documentos:
            self.term_freqs.append(Counter(doc))

        # Frecuencia en cuántos documentos aparece cada término
        self.df = defaultdict(int)
        for tf in self.term_freqs:
            for term in tf.keys():
                self.df[term] += 1

        # IDF de cada término
        self.idf = {}
        for term, df in self.df.items():
            self.idf[term] = math.log(1 + (self.N - df + 0.5) / (df + 0.5))

    def score(self, query_tokens, doc_id):
        score = 0.0
        doc_tf = self.term_freqs[doc_id]
        dl = self.doc_len[doc_id]

        for term in query_tokens:
            if term not in doc_tf:
                continue

            f = doc_tf[term]
            idf = self.idf.get(term, 0)

            s = idf * (f * (self.k1 + 1)) / (f + self.k1 * (1 - self.b + self.b * dl / self.avgdl))
            score += s

        return score

    def search(self, query_tokens, top_k=10):
        scores = []
        for doc_id in range(self.N):
            s = self.score(query_tokens, doc_id)
            scores.append((doc_id, s))

        # Ordenar de mayor a menor score
        scores = sorted(scores, key=lambda x: x[1], reverse=True)
        return scores[:top_k]

    def get_scores(self, query):
        return [self.score(query, i) for i in range(self.N)]


In [17]:
#Inicializar BM25 en df
bm25 = BM25(df['tokens'].tolist())

In [18]:
#Funcionalidad
consulta = ["fruity", "aroma", "wine"]

resultados = bm25.search(consulta, top_k=5)

#Visualizacion
for doc_id, score in resultados:
    print(f"Doc: {doc_id}  →  Score: {score:.4f}")
    print(df.loc[doc_id, 'description'])
    print("-----")

Doc: 6486  →  Score: 10.0350
This wine has a geranium aroma and a strangely perfumed character. It is light and fruity although not likely to develop much.
-----
Doc: 108778  →  Score: 10.0350
With an aroma like cherry cola, and fruity, almost sweet flavors, this medium-bodied wine is easy to drink.
-----
Doc: 25921  →  Score: 9.8838
Very ripe in aroma, almost thick in texture and syrupy in flavor, this wine is full bodied, super fruity and smooth.
-----
Doc: 52725  →  Score: 9.8838
A caramel aroma is followed by sweet strawberry fruit from the Touriga Nacional, lending a wine that is freshly fruity and soft.
-----
Doc: 52014  →  Score: 9.7371
This fruity wine has an aroma of tobacco and a spicy black-currant taste. It is full bodied, and it should be kept for another year.
-----


## Consultas de texto libre

In [19]:
#Funcion para procesar una Query entrante
def preprocesar_consulta(query):
    if isinstance(query, list):
        query = " ".join(query)

    query = query.lower()
    query = re.sub(r'[^a-zñáéíóúü\s]', ' ', query)
    query = re.sub(r'\s+', ' ', query).strip()

    tokens = query.split()
    return tokens

In [20]:
#Consultar con BM25
def buscar_bm25_texto_libre(query, bm25, df, top_k=10):
    tokens_query = preprocesar_consulta(query)
    resultados = bm25.search(tokens_query, top_k=top_k)

    return resultados

In [21]:
#Consultar con TF-IDF + Coseno
def buscar_coseno_texto_libre(query, vectorizador, tfidf_matrix, df, top_k=10):
    tokens = preprocesar_consulta(query)
    query_text = " ".join(tokens)

    query_vec = vectorizador.transform([query_text])

    from sklearn.metrics.pairwise import cosine_similarity
    similitudes = cosine_similarity(query_vec, tfidf_matrix).flatten()

    top_docs = similitudes.argsort()[::-1][:top_k]
    resultados = [(doc, similitudes[doc]) for doc in top_docs]

    return resultados

In [22]:
def buscar_jaccard_texto_libre(query, df, top_k=10):
    tokens = preprocesar_consulta(query)

    resultados = []

    for index, row in df.iterrows():
        score = similitud_jaccard(tokens, row['tokens'])
        resultados.append((index, score))

    resultados = sorted(resultados, key=lambda x: x[1], reverse=True)
    return resultados[:top_k]

## Ingreso de consulta de texto libre

In [23]:
#Ingreso de la Query
query_usuario = input("Ingrese su consulta de texto libre: ")

Ingrese su consulta de texto libre: sweet grape wine


In [24]:
#Resultados query_usuario BM25
resultados = buscar_bm25_texto_libre(
    query_usuario,
    bm25,
    df,
    top_k=5
)

for doc_id, score in resultados:
    print(f"Doc {doc_id} → Score {score:.4f}")
    print(df.loc[doc_id, 'description'])
    print("-----")


Doc 9859 → Score 9.1367
Candied peach, melon and grape notes are sweet and simple in this fruity white wine. Made from Diamond, a Concord-grape hybrid, with just a splash of Riesling, it's forward and easy but finishes with a crush of sweet-tart citrus. Drink now.
-----
Doc 18349 → Score 9.0054
Made with undisclosed grape varieties, this rustic, softly sweet wine has apricot, honey and herb flavors.
-----
Doc 117009 → Score 9.0054
Made with undisclosed grape varieties, this rustic, softly sweet wine has apricot, honey and herb flavors.
-----
Doc 23931 → Score 8.7015
With an eye for all the commercial sweet-spots, this is a Rosato sparkling wine made primarily with the fragrant Moscato grape (with 15% Malvasia). The wine is sweet in a saccharine, artificial way.
-----
Doc 108567 → Score 8.5225
This is rosé by virtue of blending a white grape (Pinot Gris) with a red grape (Zinfandel); hence the color. You'll find simple, slightly sweet strawberry and peach flavors.
-----


In [25]:
#Resultados query_usuario TF-IDF Coseno
resultados = buscar_coseno_texto_libre(
    query_usuario,
    vectorizador,
    tfidf_matrix,
    df
)

#Visualizacion
for doc, score in resultados:
    print(f"Doc {doc} → Score: {score:.4f}")
    print(df.loc[doc, 'description'])
    print("-----")

Doc 9859 → Score: 0.4142
Candied peach, melon and grape notes are sweet and simple in this fruity white wine. Made from Diamond, a Concord-grape hybrid, with just a splash of Riesling, it's forward and easy but finishes with a crush of sweet-tart citrus. Drink now.
-----
Doc 108567 → Score: 0.3891
This is rosé by virtue of blending a white grape (Pinot Gris) with a red grape (Zinfandel); hence the color. You'll find simple, slightly sweet strawberry and peach flavors.
-----
Doc 23116 → Score: 0.3660
It's easy to taste the fresh grape flavors in this medium-bodied, structured and appetizing wine. Good acidity and moderate tannins lift up black grape, blackberry and strawberry flavors to create very good balance.
-----
Doc 40854 → Score: 0.3548
Fruity like grape jam and blackberry syrup, this is a rich, almost-sweet wine. It has a very dark color, impressive fruit concentration and a smooth texture.
-----
Doc 114376 → Score: 0.3471
This widely found local grape has produced a crisp, wine

In [26]:
#Funcionalidad
resultados = buscar_jaccard_texto_libre(query_usuario, df, top_k=5)

#Visualizar resultados
for doc_id, score in resultados:
    print(f"Doc: {doc_id}, Similitud: {score}")
    print(df.loc[doc_id, 'description'])
    print("-----")

Doc: 13022, Similitud: 0.25
A flat tasting, slightly sweet, generic white wine.
-----
Doc: 123451, Similitud: 0.2222222222222222
As sweet as a dessert wine, with simple pineapple jam flavors.
-----
Doc: 14735, Similitud: 0.2
As sweet and sugary as a dessert wine, with watery berry flavors.
-----
Doc: 18349, Similitud: 0.2
Made with undisclosed grape varieties, this rustic, softly sweet wine has apricot, honey and herb flavors.
-----
Doc: 117009, Similitud: 0.2
Made with undisclosed grape varieties, this rustic, softly sweet wine has apricot, honey and herb flavors.
-----


## Ranking de documentos ordenados por relevancia.

In [27]:
from sklearn.metrics.pairwise import cosine_similarity

In [28]:
#Ingresar la Query
query = input("Ingresa tu consulta de texto: ")

# Preprocesar igual que tus documentos
query_tokens = preprocesar_consulta(query)

print("\nQuery procesada:", query_tokens)

Ingresa tu consulta de texto: sweet grape wine

Query procesada: ['sweet', 'grape', 'wine']


In [29]:
# JACCARD
def similitud_jaccard(query_tokens, df):
    resultados = []

    for doc_id, row in df.iterrows():
        doc_tokens = set(row["tokens"])
        query_set = set(query_tokens)

        inter = len(doc_tokens & query_set)
        union = len(doc_tokens | query_set)

        score = inter / union if union > 0 else 0
        resultados.append((doc_id, score))

    return sorted(resultados, key=lambda x: x[1], reverse=True)


ranking_jaccard = similitud_jaccard(query_tokens, df)

In [30]:
#COSENO TF-IDF
def similitud_coseno_tfidf(query):
    query_vec = vectorizador.transform([query])
    scores = cosine_similarity(query_vec, tfidf_matrix)[0]

    ranking = list(enumerate(scores))
    return sorted(ranking, key=lambda x: x[1], reverse=True)


ranking_coseno = similitud_coseno_tfidf(query)

In [31]:
!pip install rank_bm25


Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [32]:
#BM25
def buscar_bm25(query_tokens):
    scores = bm25.get_scores(query_tokens)
    ranking = list(enumerate(scores))  # (doc_id, score)
    return sorted(ranking, key=lambda x: x[1], reverse=True)



ranking_bm25 = buscar_bm25(query_tokens)

In [33]:
#MOSTRAR RANKINGS
def mostrar_ranking(nombre, ranking, top=10):
    print(f"\n========== RANKING: {nombre} ==========")
    for doc_id, score in ranking[:top]:
        print(f"Doc {doc_id}  →  Score: {score:.4f}")

print(f"Query_usuario:  {query}")

mostrar_ranking("Jaccard", ranking_jaccard)
mostrar_ranking("Coseno TF-IDF", ranking_coseno)
mostrar_ranking("BM25", ranking_bm25)


Query_usuario:  sweet grape wine

Doc 13022  →  Score: 0.2500
Doc 123451  →  Score: 0.2222
Doc 14735  →  Score: 0.2000
Doc 18349  →  Score: 0.2000
Doc 117009  →  Score: 0.2000
Doc 118050  →  Score: 0.2000
Doc 127176  →  Score: 0.2000
Doc 5898  →  Score: 0.1818
Doc 12687  →  Score: 0.1818
Doc 14721  →  Score: 0.1818

Doc 9859  →  Score: 0.4142
Doc 108567  →  Score: 0.3891
Doc 23116  →  Score: 0.3660
Doc 40854  →  Score: 0.3548
Doc 114376  →  Score: 0.3471
Doc 41567  →  Score: 0.3455
Doc 72063  →  Score: 0.3455
Doc 18349  →  Score: 0.3446
Doc 117009  →  Score: 0.3446
Doc 66288  →  Score: 0.3388

Doc 9859  →  Score: 9.1367
Doc 18349  →  Score: 9.0054
Doc 117009  →  Score: 9.0054
Doc 23931  →  Score: 8.7015
Doc 108567  →  Score: 8.5225
Doc 59360  →  Score: 8.5163
Doc 48967  →  Score: 8.4785
Doc 66288  →  Score: 8.4267
Doc 19520  →  Score: 8.3563
Doc 77218  →  Score: 8.3563


## Interfaz Basica

In [34]:
import argparse

In [35]:

def cli():
    print("\n======================================")
    print("     MOTOR DE BÚSQUEDA     ")
    print("======================================")
    print("Métodos disponibles:")
    print("  1) Jaccard")
    print("  2) Coseno (TF-IDF)")
    print("  3) BM25")
    print("  4) Todos")
    print("  5) Salir")

    while True:
        opcion = input("\nSeleccione una opción (1-5): ")

        if opcion == "5":
            print("Saliendo del sistema...")
            break

        query = input("\nEscribe tu consulta de texto: ")

        # Preprocesar como tus documentos
        query_tokens = preprocesar_consulta(query)
        query_proc = " ".join(query_tokens)

        if opcion == "1":
            ranking = similitud_jaccard(query_tokens, df)
            mostrar_ranking("Jaccard", ranking)

        elif opcion == "2":
            ranking = similitud_coseno_tfidf(query_proc)
            mostrar_ranking("Coseno TF-IDF", ranking)

        elif opcion == "3":
            ranking = buscar_bm25(query_tokens)
            mostrar_ranking("BM25", ranking)

        elif opcion == "4":
            mostrar_ranking("Jaccard", similitud_jaccard(query_tokens, df))
            mostrar_ranking("Coseno TF-IDF", similitud_coseno_tfidf(query_proc))
            mostrar_ranking("BM25", buscar_bm25(query_tokens))

        else:
            print("Opción no válida. Intenta nuevamente.")


# Ejecuta la CLI
cli()


     MOTOR DE BÚSQUEDA     
Métodos disponibles:
  1) Jaccard
  2) Coseno (TF-IDF)
  3) BM25
  4) Todos
  5) Salir

Seleccione una opción (1-5): 4

Escribe tu consulta de texto: sweet grape wine

Doc 13022  →  Score: 0.2500
Doc 123451  →  Score: 0.2222
Doc 14735  →  Score: 0.2000
Doc 18349  →  Score: 0.2000
Doc 117009  →  Score: 0.2000
Doc 118050  →  Score: 0.2000
Doc 127176  →  Score: 0.2000
Doc 5898  →  Score: 0.1818
Doc 12687  →  Score: 0.1818
Doc 14721  →  Score: 0.1818

Doc 9859  →  Score: 0.4142
Doc 108567  →  Score: 0.3891
Doc 23116  →  Score: 0.3660
Doc 40854  →  Score: 0.3548
Doc 114376  →  Score: 0.3471
Doc 41567  →  Score: 0.3455
Doc 72063  →  Score: 0.3455
Doc 18349  →  Score: 0.3446
Doc 117009  →  Score: 0.3446
Doc 66288  →  Score: 0.3388

Doc 9859  →  Score: 9.1367
Doc 18349  →  Score: 9.0054
Doc 117009  →  Score: 9.0054
Doc 23931  →  Score: 8.7015
Doc 108567  →  Score: 8.5225
Doc 59360  →  Score: 8.5163
Doc 48967  →  Score: 8.4785
Doc 66288  →  Score: 8.4267
Doc 19520  

## Evaluacion de resultados
* Usar un conjunto de consultas de prueba y documentos relevantes (qrels).
* Calcular para cada consulta:
Precision
Recall
* Calcular para todo el sistema:
MAP

In [36]:
#Queries de consulta
consultas = {
    "q1": "tropical fruit",
    "q2": "ripe and smooth wine",
    "q3": "pineaple orange lemon"
}



In [37]:
#Qrels
qrels = {
    "q1": {0, 41893, 71237, 31131 ,45231, 31321, 31233,1231, 3212},
    "q2": {1, 17104, 25921, 4232, 1232, 334, 1212, 4566},
    "q3": {3, 18756, 23578, 4323, 8765, 33}
}


In [38]:
#Obtener los tokens de cada querie
def obtener_query_tokens(consultas):
    query_tokens_dict = {}
    for qid, texto in consultas.items():
        query_tokens_dict[qid] = texto
    return query_tokens_dict


In [39]:
query_tokens_dict = obtener_query_tokens(consultas)

In [40]:
def precision(ranking, relevantes):

    ranking = list(ranking)
    relevantes = set(relevantes)

    if len(ranking) == 0:
        return 0

    hits = sum(1 for doc in ranking if doc in relevantes)
    return hits / len(ranking) * 1000000


In [41]:
def recall(ranking, relevantes):
    ranking = list(ranking)
    relevantes = set(relevantes)

    if len(relevantes) == 0:
        return 0.0

    hits = sum(1 for doc in ranking if doc in relevantes)
    recall = hits / len(relevantes)

    return recall * 100


### Evaluacion TF-IDF

In [42]:
ranking = similitud_coseno_tfidf(query_tokens_dict["q1"])
ranking_ids = [doc_id for doc_id, score in ranking]

In [43]:
print("Precisión:", precision(ranking_ids, qrels['q1']))

Precisión: 69.24621646367267


In [44]:
print("Recall:", recall(ranking_ids, qrels['q1']))

Recall: 100.0


### Evaluacion Jaccard

In [45]:
ranking_J = similitud_jaccard(query_tokens_dict["q2"], df)
ranking_ids_J = [doc_id for doc_id, score in ranking_J]

In [46]:
print("Precisión:", precision(ranking_ids_J, qrels['q2']))

Precisión: 61.55219241215348


In [47]:
print("Recall:", recall(ranking_ids_J, qrels['q2']))

Recall: 100.0


### Evaluacion BM25

In [48]:
ranking_BM = buscar_bm25(query_tokens_dict["q3"])
ranking_ids_BM = [doc_id for doc_id, score in ranking_BM]

In [49]:
print("Precisión:", precision(ranking_ids_BM, qrels['q3']))

Precisión: 46.16414430911511


In [50]:
print("Recall:", recall(ranking_ids_BM, qrels['q3']))

Recall: 100.0


### MAP

In [58]:
from typing import Dict, List, Set, Union, Tuple

#Funcion para generar MAP
def _extract_doc_ids(ranking: List[Union[int, Tuple[int, float]]]) -> List[int]:

    if len(ranking) == 0:
        return []
    first = ranking[0]
    if isinstance(first, (list, tuple)):
        return [int(doc) for doc, _ in ranking]
    else:
        return [int(doc) for doc in ranking]

def average_precision(ranking: List[Union[int, Tuple[int, float]]],
                      relevantes: Set[int],
                      k: int = None) -> float:

    doc_ids = _extract_doc_ids(ranking)
    if k is not None:
        doc_ids = doc_ids[:k]

    if not relevantes:
        # No hay relevantes: convención -> return 0.0 (puedes elegir omitir luego)
        return 0.0

    hits = 0
    acum_prec = 0.0

    for i, doc in enumerate(doc_ids, start=1):
        if doc in relevantes:
            hits += 1
            acum_prec += hits / i

    if hits == 0:
        return 0.0

    # Promediar sobre el número total de relevantes (no sobre hits)
    return acum_prec / len(relevantes)


def mean_average_precision(rankings: Dict[str, List[Union[int, Tuple[int, float]]]],
                           qrels: Dict[str, Set[int]],
                           k: int = None,
                           ignore_empty_qrels: bool = True) -> float:

    ap_list = []
    for qid, ranking in rankings.items():
        relevantes = set(qrels.get(qid, set()))
        if len(relevantes) == 0 and ignore_empty_qrels:
            # Omitir consulta sin relevantes
            continue
        ap = average_precision(ranking, relevantes, k=k)
        ap_list.append(ap)

    if len(ap_list) == 0:
        return 0.0

    return sum(ap_list) / len(ap_list)


In [56]:
#Rankings del sistema
rankings_sistema = {
    "q1": ranking,
    "q2": ranking_J,
    "q3": ranking_BM
}

In [57]:
#Funcionalidad MAP
map_score = mean_average_precision(rankings_sistema, qrels)
print(f"MAP: {map_score:.4f}  (equiv. {map_score*100:.2f}%)")


MAP: 0.0680  (equiv. 6.80%)
