In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("zynicide/wine-reviews")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'wine-reviews' dataset.
Path to dataset files: /kaggle/input/wine-reviews


## Importar Librerias

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import nltk
nltk.download("punkt")
nltk.download("stopwords")
import re
from nltk.corpus import stopwords
from collections import defaultdict


import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


/kaggle/input/wine-reviews/winemag-data_first150k.csv
/kaggle/input/wine-reviews/winemag-data-130k-v2.json
/kaggle/input/wine-reviews/winemag-data-130k-v2.csv


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Construccion del ındice
* Leer un corpus de documentos en texto plano.
* Procesamiento basico: tokenizacion, normalizacion y remocion de stopwords.
* Construccion de un ındice invertido que almacene, para cada termino, los documentos en los que aparece y su frecuencia.

In [None]:
#Ruta del Corpus
path = "/kaggle/input/wine-reviews/winemag-data-130k-v2.csv"

df = pd.read_csv(path)

print("Documentos cargados:", len(df))
display(df.head())

Documentos cargados: 129971


Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


## Preprocesamiento


In [None]:
#Limpiar columnas del dataframe
df = df.drop(['country', 'designation', 'points', 'price', 'province', 'region_1', 'region_2', 'taster_name', 'taster_twitter_handle', 'title', 'variety', 'winery' ], axis=1)
display(df.head())

Unnamed: 0.1,Unnamed: 0,description
0,0,"Aromas include tropical fruit, broom, brimston..."
1,1,"This is ripe and fruity, a wine that is smooth..."
2,2,"Tart and snappy, the flavors of lime flesh and..."
3,3,"Pineapple rind, lemon pith and orange blossom ..."
4,4,"Much like the regular bottling from 2012, this..."


In [None]:
df = df.rename(columns={'Unnamed: 0': 'wine_id'})
display(df.head())

Unnamed: 0,wine_id,description
0,0,"Aromas include tropical fruit, broom, brimston..."
1,1,"This is ripe and fruity, a wine that is smooth..."
2,2,"Tart and snappy, the flavors of lime flesh and..."
3,3,"Pineapple rind, lemon pith and orange blossom ..."
4,4,"Much like the regular bottling from 2012, this..."


In [None]:
#Funcion para preprocesar el dataframe
def clean_and_tokenize(text):
    #Convertir a minúsculas
    text = text.lower()

    #Eliminar caracteres no alfabéticos
    text = re.sub(r"[^a-záéíóúñü\s]", " ", text)

    #Tokenización
    tokens = nltk.word_tokenize(text, language="spanish")

    #Stopwords en español
    stop_es = set(stopwords.words("spanish"))

    #Remover tokens muy cortos y stopwords
    tokens = [t for t in tokens if len(t) > 2 and t not in stop_es]

    return tokens


In [None]:
#Aplicar preprocesamiento
df['tokens'] = df['description'].apply(clean_and_tokenize)
display(df.head())

Unnamed: 0,wine_id,description,tokens
0,0,"Aromas include tropical fruit, broom, brimston...","[aromas, include, tropical, fruit, broom, brim..."
1,1,"This is ripe and fruity, a wine that is smooth...","[this, ripe, and, fruity, wine, that, smooth, ..."
2,2,"Tart and snappy, the flavors of lime flesh and...","[tart, and, snappy, the, flavors, lime, flesh,..."
3,3,"Pineapple rind, lemon pith and orange blossom ...","[pineapple, rind, lemon, pith, and, orange, bl..."
4,4,"Much like the regular bottling from 2012, this...","[much, like, the, regular, bottling, from, thi..."


## Generar indice invertido

In [None]:
#Función para construir el índice invertido
from collections import defaultdict, Counter

def construir_indice_invertido(df):
    indice_invertido = defaultdict(dict)

    for index, row in df.iterrows():
        id_documento = index
        tokens = row['tokens']   # ESTO YA ES UNA LISTA

        # Contar frecuencia de cada token
        frecuencias = Counter(tokens)

        for palabra, freq in frecuencias.items():
            indice_invertido[palabra][id_documento] = freq

    return indice_invertido


In [None]:
#Construir el índice invertido
indice = construir_indice_invertido(df)

## Modelo de recuperacion
* Implementar recuperacion basada en similitud Jaccard utilizando vectores binarios
* Implementar recuperacion basada en similitud de coseno utilizando TF-IDF
* Implementar recuperacion con BM25.
* Permitir la ejecucion de consultas de texto libre.
* Mostrar un ranking de documentos ordenados por relevancia.

## Similitud Jaccard

In [None]:
#Funcion de similitud jaccard
def similitud_jaccard(lista1, lista2):
    set1 = set(lista1)
    set2 = set(lista2)

    interseccion = len(set1.intersection(set2))
    union = len(set1.union(set2))

    if union == 0:
        return 0.0

    return interseccion / union

In [None]:
#Funcion para recuperacion de datos
def recuperar_por_jaccard(df, query_tokens, top_k=10):
    resultados = []

    for index, row in df.iterrows():
        tokens_doc = row['tokens']
        score = similitud_jaccard(query_tokens, tokens_doc)
        resultados.append((index, score))

    # Ordenar por score de mayor a menor
    resultados = sorted(resultados, key=lambda x: x[1], reverse=True)

    return resultados[:top_k]


In [None]:
#Funcionalidad
consulta = ["fruity", "aroma", "wine"]

resultados = recuperar_por_jaccard(df, consulta, top_k=5)

#Visualizar resultados
for doc_id, score in resultados:
    print(f"Doc: {doc_id}, Similitud: {score}")
    print(df.loc[doc_id, 'description'])
    print("-----")

Doc: 6486, Similitud: 0.2
This wine has a geranium aroma and a strangely perfumed character. It is light and fruity although not likely to develop much.
-----
Doc: 48045, Similitud: 0.2
This is a straightforward, clean, fruity wine, with a green apple flavor.
-----
Doc: 25921, Similitud: 0.1875
Very ripe in aroma, almost thick in texture and syrupy in flavor, this wine is full bodied, super fruity and smooth.
-----
Doc: 108778, Similitud: 0.1875
With an aroma like cherry cola, and fruity, almost sweet flavors, this medium-bodied wine is easy to drink.
-----
Doc: 56946, Similitud: 0.18181818181818182
Light and fruity, the wine is dilute and gently textured. It is ready to drink.
-----


## Similitud de Coseno utilizando TF-ID

In [None]:
#Matriz TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Convertir lista de tokens a texto
df['texto'] = df['tokens'].apply(lambda x: " ".join(x))

# Crear el vectorizador TF-IDF
vectorizador = TfidfVectorizer()

# Ajustar y transformar documentos
tfidf_matrix = vectorizador.fit_transform(df['texto'])

In [None]:
#Calcular Similitud Coseno
from sklearn.metrics.pairwise import cosine_similarity

def recuperar_por_coseno(query_tokens, tfidf_matrix, vectorizador, top_k=10):
    # Convertir lista de tokens a texto igual que los docs
    query_text = " ".join(query_tokens)

    # Vectorizar la consulta con el mismo vocabulario
    query_vec = vectorizador.transform([query_text])

    # Calcular similitud del coseno entre la consulta y todos los documentos
    similitudes = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # Obtener los documentos con mayor similitud
    top_docs = similitudes.argsort()[::-1][:top_k]

    # Retornar (doc_id, score)
    resultados = [(doc_id, similitudes[doc_id]) for doc_id in top_docs]

    return resultados

In [None]:
#Funcionalidad
consulta = ["fruity", "aroma", "wine"]

resultados = recuperar_por_coseno(consulta, tfidf_matrix, vectorizador, top_k=5)

#Visualizacion
for doc, score in resultados:
    print(f"Doc {doc} → Score: {score:.4f}")
    print(df.loc[doc, 'description'])
    print("-----")

Doc 108778 → Score: 0.5110
With an aroma like cherry cola, and fruity, almost sweet flavors, this medium-bodied wine is easy to drink.
-----
Doc 25921 → Score: 0.4186
Very ripe in aroma, almost thick in texture and syrupy in flavor, this wine is full bodied, super fruity and smooth.
-----
Doc 14546 → Score: 0.4124
This is a light, soft and creamy wine, with an apricot aroma and flavor, fresh acidity and an attractive fruity aftertaste. It is crisp in green apple and grapefruit notes on the finish.
-----
Doc 120116 → Score: 0.3998
A vanilla aroma is followed by old oak flavors and sweetly textured fruits. Not a wine for aging, this is already soft and fruity.
-----
Doc 84146 → Score: 0.3992
In the house style of this producer this wine is soft and fruity. Its strawberry aroma is followed by a gentle, ripe wine with red berries and balanced acidity. The wine is ready to drink.
-----


## Recuperacion BM25

In [None]:
#Clase BM25
import math
from collections import Counter, defaultdict

class BM25:
    def __init__(self, documentos, k1=1.2, b=0.75):
        self.documentos = documentos
        self.N = len(documentos)
        self.k1 = k1
        self.b = b

        # Longitud de cada documento
        self.doc_len = [len(doc) for doc in documentos]
        self.avgdl = sum(self.doc_len) / self.N

        # Frecuencias de término por documento
        self.term_freqs = []
        for doc in documentos:
            self.term_freqs.append(Counter(doc))

        # Frecuencia en cuántos documentos aparece cada término
        self.df = defaultdict(int)
        for tf in self.term_freqs:
            for term in tf.keys():
                self.df[term] += 1

        # IDF de cada término
        self.idf = {}
        for term, df in self.df.items():
            self.idf[term] = math.log(1 + (self.N - df + 0.5) / (df + 0.5))

    def score(self, query_tokens, doc_id):
        score = 0.0
        doc_tf = self.term_freqs[doc_id]
        dl = self.doc_len[doc_id]

        for term in query_tokens:
            if term not in doc_tf:
                continue

            f = doc_tf[term]
            idf = self.idf.get(term, 0)

            s = idf * (f * (self.k1 + 1)) / (f + self.k1 * (1 - self.b + self.b * dl / self.avgdl))
            score += s

        return score

    def search(self, query_tokens, top_k=10):
        scores = []
        for doc_id in range(self.N):
            s = self.score(query_tokens, doc_id)
            scores.append((doc_id, s))

        # Ordenar de mayor a menor score
        scores = sorted(scores, key=lambda x: x[1], reverse=True)
        return scores[:top_k]


In [None]:
#Inicializar BM25 en df
bm25 = BM25(df['tokens'].tolist())

In [None]:
#Funcionalidad
consulta = ["fruity", "aroma", "wine"]

resultados = bm25.search(consulta, top_k=5)

#Visualizacion
for doc_id, score in resultados:
    print(f"Doc: {doc_id}  →  Score: {score:.4f}")
    print(df.loc[doc_id, 'description'])
    print("-----")

Doc: 6486  →  Score: 10.0350
This wine has a geranium aroma and a strangely perfumed character. It is light and fruity although not likely to develop much.
-----
Doc: 108778  →  Score: 10.0350
With an aroma like cherry cola, and fruity, almost sweet flavors, this medium-bodied wine is easy to drink.
-----
Doc: 25921  →  Score: 9.8838
Very ripe in aroma, almost thick in texture and syrupy in flavor, this wine is full bodied, super fruity and smooth.
-----
Doc: 52725  →  Score: 9.8838
A caramel aroma is followed by sweet strawberry fruit from the Touriga Nacional, lending a wine that is freshly fruity and soft.
-----
Doc: 52014  →  Score: 9.7371
This fruity wine has an aroma of tobacco and a spicy black-currant taste. It is full bodied, and it should be kept for another year.
-----


## Consultas de texto libre

In [None]:
#Funcion para procesar una Query entrante
def preprocesar_consulta(query):
    query = query.lower()
    query = re.sub(r'[^a-zñáéíóúü\s]', ' ', query)
    query = re.sub(r'\s+', ' ', query).strip()

    tokens = query.split()
    return tokens

In [None]:
#Consultar con BM25
def buscar_bm25_texto_libre(query, bm25, df, top_k=10):
    tokens_query = preprocesar_consulta(query)
    resultados = bm25.search(tokens_query, top_k=top_k)

    return resultados

In [None]:
#Consultar con TF-IDF + Coseno
def buscar_coseno_texto_libre(query, vectorizador, tfidf_matrix, df, top_k=10):
    tokens = preprocesar_consulta(query)
    query_text = " ".join(tokens)

    query_vec = vectorizador.transform([query_text])

    from sklearn.metrics.pairwise import cosine_similarity
    similitudes = cosine_similarity(query_vec, tfidf_matrix).flatten()

    top_docs = similitudes.argsort()[::-1][:top_k]
    resultados = [(doc, similitudes[doc]) for doc in top_docs]

    return resultados

In [None]:
def buscar_jaccard_texto_libre(query, df, top_k=10):
    tokens = preprocesar_consulta(query)

    resultados = []

    for index, row in df.iterrows():
        score = similitud_jaccard(tokens, row['tokens'])
        resultados.append((index, score))

    resultados = sorted(resultados, key=lambda x: x[1], reverse=True)
    return resultados[:top_k]

## Ingreso de consulta de texto libre

In [None]:
#Ingreso de la Query
query_usuario = input("Ingrese su consulta de texto libre: ")

Ingrese su consulta de texto libre: white wine bad aroma


In [None]:
#Resultados query_usuario BM25
resultados = buscar_bm25_texto_libre(
    query_usuario,
    bm25,
    df,
    top_k=5
)

for doc_id, score in resultados:
    print(f"Doc {doc_id} → Score {score:.4f}")
    print(df.loc[doc_id, 'description'])
    print("-----")


Doc 23932 → Score 11.0218
Simple and cloying in white sugar notes. Too bad, because there's good acidity and succulent citrus and tropical fruit flavors.
-----
Doc 33988 → Score 9.9634
This is pretty good Zin, fully ripened and showing classic wild berry and spice notes, although there’s an earthy, cardboardy aroma that’s a little off-putting. The tannins are very soft, and the acidity is a bit low, making it simple and one-dimensional. Still, not a bad deal at this price.
-----
Doc 117303 → Score 9.7396
Tropical fruit and citrus aromas aren't bad, but the palate is heavy. The flavor profile only offers soap and grassy white-fruit flavors, while a light, flowery finish falls flat.
-----
Doc 119183 → Score 9.4716
An aroma like white pepper and onion is strong in this medium-bodied wine that has decent olive and berry flavors underneath.
-----
Doc 56735 → Score 9.4578
Rather quiet at first, this wine unfurls into a soft but textured palate. This balanced white lives more off structure th

In [None]:
#Resultados query_usuario TF-IDF Coseno
resultados = buscar_coseno_texto_libre(
    query_usuario,
    vectorizador,
    tfidf_matrix,
    df
)

#Visualizacion
for doc, score in resultados:
    print(f"Doc {doc} → Score: {score:.4f}")
    print(df.loc[doc, 'description'])
    print("-----")

Doc 23932 → Score: 0.3636
Simple and cloying in white sugar notes. Too bad, because there's good acidity and succulent citrus and tropical fruit flavors.
-----
Doc 118689 → Score: 0.3347
This Cab is a country-style wine, simple and thin, with cherry-berry flavors and a touch of oak. It's not bad for the price.
-----
Doc 123449 → Score: 0.3301
A rustic, simple wine, with ripe, sweet cherry and cola flavors. Feels rough and sharp in the mouth, but not bad for the price.
-----
Doc 28900 → Score: 0.3242
This is a Bordeaux blend, and while it's a simple wine, the price isn't bad. It shows plenty of ripe raspberry and cherry fruit, and finishes on the sweet side.
-----
Doc 13078 → Score: 0.3240
Not bad for the price. It's thin and silky in mouthfeel and dry, with decent raspberry, cola and mint flavors.
-----
Doc 95578 → Score: 0.3025
Dry, thin and simple, with a green, minty streak and lots of acidity. Still, with some decent blackberry fruit, it's not a bad wine for the price.
-----
Doc 22

In [None]:
#Funcionalidad
resultados = buscar_jaccard_texto_libre(query_usuario, df, top_k=5)

#Visualizar resultados
for doc_id, score in resultados:
    print(f"Doc: {doc_id}, Similitud: {score}")
    print(df.loc[doc_id, 'description'])
    print("-----")

Doc: 13022, Similitud: 0.2222222222222222
A flat tasting, slightly sweet, generic white wine.
-----
Doc: 35301, Similitud: 0.16666666666666666
This is a light, nondescript white wine, with a trace of sweetness on the finish.
-----
Doc: 96172, Similitud: 0.16666666666666666
This is a light, nondescript white wine, with a trace of sweetness on the finish.
-----
Doc: 119183, Similitud: 0.16666666666666666
An aroma like white pepper and onion is strong in this medium-bodied wine that has decent olive and berry flavors underneath.
-----
Doc: 9483, Similitud: 0.14285714285714285
Imported by Bluewater Wine Company.
-----


## Ranking de documentos ordenados por relevancia.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
#Ingresar la Query
query = input("Ingresa tu consulta de texto: ")

# Preprocesar igual que tus documentos
query_tokens = preprocesar_consulta(query)

print("\nQuery procesada:", query_tokens)

Ingresa tu consulta de texto: red wine with sweet taste and good aroma

Query procesada: ['red', 'wine', 'with', 'sweet', 'taste', 'and', 'good', 'aroma']


In [None]:
# JACCARD
def similitud_jaccard(query_tokens, df):
    resultados = []

    for doc_id, row in df.iterrows():
        doc_tokens = set(row["tokens"])
        query_set = set(query_tokens)

        inter = len(doc_tokens & query_set)
        union = len(doc_tokens | query_set)

        score = inter / union if union > 0 else 0
        resultados.append((doc_id, score))

    return sorted(resultados, key=lambda x: x[1], reverse=True)


ranking_jaccard = similitud_jaccard(query_tokens, df)

In [None]:
#COSENO TF-IDF
def similitud_coseno_tfidf(query):
    query_vec = vectorizador.transform([query])
    scores = cosine_similarity(query_vec, tfidf_matrix)[0]

    ranking = list(enumerate(scores))
    return sorted(ranking, key=lambda x: x[1], reverse=True)


ranking_coseno = similitud_coseno_tfidf(query)

In [None]:
!pip install rank_bm25


Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [None]:
#BM25
def buscar_bm25(query_tokens):
    scores = bm25.get_scores(query_tokens)
    ranking = list(enumerate(scores))  # (doc_id, score)
    return sorted(ranking, key=lambda x: x[1], reverse=True)



ranking_bm25 = buscar_bm25(query_tokens)

In [None]:
#MOSTRAR RANKINGS
def mostrar_ranking(nombre, ranking, top=10):
    print(f"\n========== RANKING: {nombre} ==========")
    for doc_id, score in ranking[:top]:
        print(f"Doc {doc_id}  →  Score: {score:.4f}")

print(f"Query_usuario:  {query}")

mostrar_ranking("Jaccard", ranking_jaccard)
mostrar_ranking("Coseno TF-IDF", ranking_coseno)
mostrar_ranking("BM25", ranking_bm25)


Query_usuario:  red wine with sweet taste and good aroma

Doc 109225  →  Score: 0.3636
Doc 86525  →  Score: 0.3158
Doc 34514  →  Score: 0.3125
Doc 14735  →  Score: 0.3077
Doc 12687  →  Score: 0.2857
Doc 14721  →  Score: 0.2857
Doc 123458  →  Score: 0.2857
Doc 83844  →  Score: 0.2667
Doc 92441  →  Score: 0.2667
Doc 127162  →  Score: 0.2667

Doc 16972  →  Score: 0.4121
Doc 109225  →  Score: 0.3939
Doc 93988  →  Score: 0.3849
Doc 56181  →  Score: 0.3790
Doc 52014  →  Score: 0.3682
Doc 126484  →  Score: 0.3682
Doc 118701  →  Score: 0.3671
Doc 108778  →  Score: 0.3621
Doc 97891  →  Score: 0.3373
Doc 38977  →  Score: 0.3352

Doc 86525  →  Score: 18.8195
Doc 107703  →  Score: 18.6808
Doc 40767  →  Score: 18.4206
Doc 30065  →  Score: 18.3597
Doc 98319  →  Score: 18.0661
Doc 110236  →  Score: 17.9741
Doc 16972  →  Score: 17.9582
Doc 25427  →  Score: 17.8981
Doc 109225  →  Score: 17.7452
Doc 95278  →  Score: 17.7130
