In [32]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))




/kaggle/input/clapper-massive-rotten-tomatoes-movies-and-reviews/rotten_tomatoes_movies.csv
/kaggle/input/clapper-massive-rotten-tomatoes-movies-and-reviews/rotten_tomatoes_movie_reviews.csv
/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


# Ejercicio 1: Introducción a Recuperación de Información

## Objetivo de la práctica
- Entender el problema de **buscar información** en colecciones de texto.
- Comprender por qué se necesita un **índice invertido** en recuperación de información.
- Programar una primera solución manual y luego optimizarla con un índice.
- Evaluar la mejora en tiempos de búsqueda cuando usamos estructuras adecuadas.

## Parte 1: Búsqueda lineal en documentos

### Actividad
1. Se te proporcionará un dataset con reviews de películas.
2. Escribe una función que:
   - Lea todos los documentos.
   - Busque una palabra ingresada por el usuario.
   - Muestre en qué documentos aparece la palabra.

In [33]:
# Load the IMDB 50k movie-review CSV; its "review" column is the text searched in the cells below.
df = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [34]:
# First attempt at a linear search: flag every review whose text contains the query.
# NOTE: Series.str.contains matches substrings, is case-sensitive, and interprets
# the pattern as a regular expression by default.
query = "wonderful"
mask = df["review"].str.contains(query)
df[mask]

Unnamed: 0,review,sentiment
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
29,'War movie' is a Hollywood genre that has been...,positive
41,"This movie is based on the book, ""A Many Splen...",positive
59,"I just watched The Dresser this evening, havin...",positive
...,...,...
49852,Russ and Valerie are having discussions about ...,positive
49921,"Antonio Margheriti's ""Danza Macabra"" aka. ""Cas...",positive
49935,"""Nurse Betty"" is the kind of movie you can't d...",positive
49938,I made a big mistake going to see this film. T...,negative


In [46]:
def buscar(docs, query):
    """Linear search: return the documents whose text contains ``query``.

    Parameters
    ----------
    docs : pandas.Series
        Documents to scan. Every value is converted to ``str`` before matching.
    query : str
        Text to look for. It is matched literally (not as a regular
        expression), case-sensitively and as a substring, so "wonderful"
        also matches "wonderfully".

    Returns
    -------
    pandas.Series
        The entries of ``docs`` that contain ``query``, keeping their
        original index labels.
    """
    # regex=False: a user-typed word such as "c++" or "(" must be searched as
    # plain text instead of being compiled as a regular expression.
    mask = docs.astype(str).str.contains(query, regex=False)
    return docs[mask]

    
# Example: linear search for one word across every IMDB review.
buscar(df["review"], "wonderful")

1        A wonderful little production. <br /><br />The...
2        I thought this was a wonderful way to spend ti...
29       'War movie' is a Hollywood genre that has been...
41       This movie is based on the book, "A Many Splen...
59       I just watched The Dresser this evening, havin...
                               ...                        
49852    Russ and Valerie are having discussions about ...
49921    Antonio Margheriti's "Danza Macabra" aka. "Cas...
49935    "Nurse Betty" is the kind of movie you can't d...
49938    I made a big mistake going to see this film. T...
49941    Why did the histories of Mary and Rhoda have t...
Name: review, Length: 3083, dtype: object

## Parte 2: Construcción de un índice invertido

### Actividad
1. Escribe un programa que:
   - Recorra todos los documentos.
   - Construya un **índice invertido**, es decir, un diccionario donde:
     - Cada palabra clave apunta a una lista de documentos donde aparece.

2. Escribe una nueva función de búsqueda que:
   - Consulte directamente el índice para encontrar los documentos relevantes.
   - Sea mucho más rápida que la búsqueda lineal.

In [68]:
def construir_indice_invertido(docs):
    """Build an inverted index mapping every word to the documents that contain it.

    Each document is lower-cased, split on whitespace, and every token has
    leading/trailing punctuation (``.,!?()[]{}"'``) stripped.

    Parameters
    ----------
    docs : pandas.Series
        Documents to index. Values are converted with ``astype(str)``, so a
        missing value is indexed as whatever string that conversion yields.

    Returns
    -------
    dict[str, list[int]]
        Word -> ascending list of document positions. Positions are the
        0-based order of the documents in ``docs`` (as given by
        ``enumerate``), not the Series index labels. Each position appears
        at most once per word.
    """
    indice = {}
    for i, texto in enumerate(docs.astype(str)):
        for palabra in texto.lower().split():
            palabra = palabra.strip(".,!?()[]{}\"'")
            if not palabra:
                # Token was pure punctuation (e.g. "..."); nothing to index.
                continue
            postings = indice.setdefault(palabra, [])
            # Documents are visited in increasing order, so a duplicate can
            # only be the last entry. Checking it is O(1), whereas
            # `i not in postings` rescans the whole list for every token.
            if not postings or postings[-1] != i:
                postings.append(i)
    return indice

ERROR! Session/line number was not unique in database. History logging moved to new session 34


In [70]:
def construir_indice_invertido_filtrado(docs, palabras_objetivo):
    """Build an inverted index restricted to a given set of target words.

    Tokenisation: each document is lower-cased, split on whitespace, and
    every token has leading/trailing punctuation (``.,!?()[]{}"'``)
    stripped before being compared with the targets.

    Parameters
    ----------
    docs : pandas.Series
        Documents to index. Values are converted with ``astype(str)``.
    palabras_objetivo : iterable of str, or str
        Words to index (case-insensitive). A single string is treated as
        one word rather than being iterated character by character.

    Returns
    -------
    dict[str, list[int]]
        Target word -> ascending list of 0-based document positions (order
        in ``docs``, not index labels). Target words that never occur are
        absent from the dictionary.
    """
    if isinstance(palabras_objetivo, str):
        # Iterating a bare string would yield its characters as targets.
        palabras_objetivo = [palabras_objetivo]
    objetivos = {p.lower() for p in palabras_objetivo}

    indice = {}
    for i, texto in enumerate(docs.astype(str)):
        for palabra in texto.lower().split():
            palabra = palabra.strip(".,!?()[]{}\"'")
            if palabra not in objetivos:
                continue
            postings = indice.setdefault(palabra, [])
            # Documents are visited in increasing order, so a duplicate can
            # only be the last entry. This avoids rescanning the list.
            if not postings or postings[-1] != i:
                postings.append(i)
    return indice


In [63]:
def buscar_en_indice(indice, query):
    """Look up a query in an inverted index (AND semantics across words).

    Parameters
    ----------
    indice : mapping
        Word -> collection of document ids; only ``indice.get(word, [])``
        is used.
    query : str
        One or more whitespace-separated words. The query is lower-cased
        before lookup.

    Returns
    -------
    list
        Sorted ids of the documents that contain every word of the query.
        An empty or whitespace-only query returns an empty list.
    """
    palabras = query.lower().split()
    if not palabras:
        # No terms to look up; previously this raised IndexError on palabras[0].
        return []

    docs = set(indice.get(palabras[0], []))
    for palabra in palabras[1:]:
        if not docs:
            # The intersection is already empty; further terms cannot add results.
            break
        docs &= set(indice.get(palabra, []))
    return sorted(docs)


## Parte 3: Evaluación de tiempos de búsqueda
### Actividad

1. Realiza la búsqueda de varias palabras usando:
      -  Corpus pequeño.
      -  Corpus grande.
2. Mide el tiempo de ejecución:
      -  Para búsqueda lineal.
      -  Para búsqueda usando índice invertido.
      -  Grafica o presenta los resultados en una tabla comparativa.

In [85]:
# Load the Rotten Tomatoes critic-review CSV; its "reviewText" column serves as the large corpus below.
df_large = pd.read_csv("/kaggle/input/clapper-massive-rotten-tomatoes-movies-and-reviews/rotten_tomatoes_movie_reviews.csv")
df_large

Unnamed: 0,id,reviewId,creationDate,criticName,isTopCritic,originalScore,reviewState,publicatioName,reviewText,scoreSentiment,reviewUrl
0,beavers,1145982,2003-05-23,Ivan M. Lincoln,False,3.5/4,fresh,Deseret News (Salt Lake City),Timed to be just long enough for most youngste...,POSITIVE,http://www.deseretnews.com/article/700003233/B...
1,blood_mask,1636744,2007-06-02,The Foywonder,False,1/5,rotten,Dread Central,It doesn't matter if a movie costs 300 million...,NEGATIVE,http://www.dreadcentral.com/index.php?name=Rev...
2,city_hunter_shinjuku_private_eyes,2590987,2019-05-28,Reuben Baron,False,,fresh,CBR,The choreography is so precise and lifelike at...,POSITIVE,https://www.cbr.com/city-hunter-shinjuku-priva...
3,city_hunter_shinjuku_private_eyes,2558908,2019-02-14,Matt Schley,False,2.5/5,rotten,Japan Times,The film's out-of-touch attempts at humor may ...,NEGATIVE,https://www.japantimes.co.jp/culture/2019/02/0...
4,dangerous_men_2015,2504681,2018-08-29,Pat Padua,False,,fresh,DCist,Its clumsy determination is endearing and some...,POSITIVE,http://dcist.com/2015/11/out_of_frame_dangerou...
...,...,...,...,...,...,...,...,...,...,...,...
1444958,thor_love_and_thunder,102706151,2022-07-05,Christie Cronan,False,7/10,fresh,Raising Whasians,Solid but not totally sold&#44; Thor&#58; Ragn...,POSITIVE,https://raisingwhasians.com/thor-love-and-thun...
1444959,thor_love_and_thunder,102706150,2022-07-05,Ian Sandwell,False,4/5,fresh,Digital Spy,Thor&#58; Love and Thunder is the most enterta...,POSITIVE,https://www.digitalspy.com/movies/a40496050/th...
1444960,thor_love_and_thunder,102706149,2022-07-05,Lauren LaMagna,False,8/10,fresh,Next Best Picture,&quot;Thor&#58; Love and Thunder&quot; is a st...,POSITIVE,https://www.nextbestpicture.com/thor-love-and-...
1444961,thor_love_and_thunder,102706148,2022-07-05,Jake Cole,True,1/4,rotten,Slant Magazine,Across Taika Waititi&#8217;s film&#44; a war a...,NEGATIVE,https://www.slantmagazine.com/film/thor-love-a...


# Pruebas

In [73]:
# Build an index restricted to two target words over the large corpus and
# show the first 20 document positions recorded for "wonderful".
lista = ("wonderful","bad")
indice_invertido = construir_indice_invertido_filtrado(df_large["reviewText"], lista)
print(indice_invertido["wonderful"][:20])

[181, 487, 816, 1053, 1338, 1796, 1917, 1954, 1964, 2113, 2123, 2182, 2225, 2523, 2588, 2994, 3511, 3761, 3976, 3983]


In [74]:
# Look up a single word in the index and show the first 10 matching document positions.
resultados = buscar_en_indice(indice_invertido, "wonderful")

print("Documentos encontrados:", resultados[:10])

Documentos encontrados: [181, 487, 816, 1053, 1338, 1796, 1917, 1954, 1964, 2113]


In [66]:
# NOTE(review): `indice_invertido_prueba` is not defined in any cell visible in this
# notebook, so this cell depends on hidden kernel state and will fail on a fresh
# "Restart & Run All" - define it in an earlier cell or remove this cell.
print(indice_invertido_prueba["wonderful"][:3])
resultados = buscar_en_indice(indice_invertido_prueba, "wonderful")

print("Documentos encontrados:", resultados[:10])

[ 487 1014 1029]
Documentos encontrados: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


# Resolución de los ejercicios de la Parte 3

In [92]:
def buscar_lineal(docs, query):
    """Linear search: return the index labels of the documents containing ``query``.

    Parameters
    ----------
    docs : pandas.Series
        Documents to scan. Every value is converted to ``str`` before matching.
    query : str
        Text to look for, matched literally (not as a regular expression),
        case-sensitively and as a substring.

    Returns
    -------
    numpy.ndarray
        Index labels of the entries of ``docs`` that contain ``query``.
    """
    # regex=False: search the user's text as-is; regex metacharacters such as
    # "+" or "(" would otherwise change the match or raise an error.
    mask = docs.astype(str).str.contains(query, regex=False)
    return docs[mask].index.values

In [93]:
# Benchmark inputs: the word looked up, and the only word the filtered indexes are built for.
palabras_objetivo = ["wonderful"]
query = "wonderful"

*Corpus Pequeño*

In [94]:
import time

In [100]:
# Time the linear scan over the small corpus (IMDB reviews).
# time.perf_counter() is the high-resolution monotonic clock meant for measuring
# short intervals; time.time() follows the wall clock and may be adjusted mid-run.
t0 = time.perf_counter()
result_lineal_small = buscar(df["review"], query)
t_lineal_small = time.perf_counter() - t0

# The index is built outside the timed region: only the lookup itself is measured.
indice_small = construir_indice_invertido_filtrado(df["review"], palabras_objetivo)
t0 = time.perf_counter()
result_indice_small = buscar_en_indice(indice_small, query)
t_indice_small = time.perf_counter() - t0

*Corpus Grande*

In [101]:
# Time the linear scan over the large corpus (Rotten Tomatoes reviews).
# time.perf_counter() is the high-resolution monotonic clock meant for measuring
# short intervals; time.time() follows the wall clock and may be adjusted mid-run.
t0 = time.perf_counter()
result_lineal_large = buscar(df_large["reviewText"], query)
t_lineal_large = time.perf_counter() - t0

# The index is built outside the timed region: only the lookup itself is measured.
indice_large = construir_indice_invertido_filtrado(df_large["reviewText"], palabras_objetivo)
t0 = time.perf_counter()
result_indice_large = buscar_en_indice(indice_large, query)
t_indice_large = time.perf_counter() - t0

*Resultados*

In [102]:
# Assemble the comparison table one column at a time: corpus labels,
# then both timings, then the number of hits returned by each method.
data = {"Corpus": ["Pequeño", "Grande"]}
data["Búsqueda lineal (s)"] = [t_lineal_small, t_lineal_large]
data["Índice invertido (s)"] = [t_indice_small, t_indice_large]
data["Resultados lineal"] = [len(result_lineal_small), len(result_lineal_large)]
data["Resultados índice"] = [len(result_indice_small), len(result_indice_large)]

tabla = pd.DataFrame(data)
print(tabla)

    Corpus  Búsqueda lineal (s)  Índice invertido (s)  Resultados lineal  \
0  Pequeño             0.077024              0.000607               3083   
1   Grande             0.691195              0.001918               8393   

   Resultados índice  
0               2715  
1               5954  


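*Como se observa en la tabla anterior, la búsqueda mediante el índice invertido es mucho más rápida que la búsqueda lineal. Nota: el tiempo del índice no incluye su construcción, y el número de resultados difiere porque la búsqueda lineal busca subcadenas distinguiendo mayúsculas (p. ej., también encuentra "wonderfully"), mientras que el índice compara palabras completas en minúsculas.*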

# Obtención de todos los índices del dataset grande (teórico)

*Índice invertido de todas las palabras (celdas no ejecutadas, ya que la ejecución dura más de 20 minutos).*

In [None]:
# Full (unfiltered) inverted index over the large corpus.
# NOTE: this rebinds `indice_invertido`, the name also used for the filtered index in an earlier cell.
indice_invertido = construir_indice_invertido(df_large["reviewText"])
print(indice_invertido["wonderful"][:10])  # show the first 10 documents containing "wonderful"

In [None]:
# Look up a single word in `indice_invertido` and show the first 10 matching document positions.
resultados = buscar_en_indice(indice_invertido, "wonderful")

print("Documentos encontrados:", resultados[:10])

# Obtención de todos los índices del dataset pequeño (teórico)

In [None]:
# Full (unfiltered) inverted index over the small corpus (IMDB reviews).
indice_invertido_pequeño = construir_indice_invertido(df["review"])
# Inspect the index that was just built for the small corpus.
print(indice_invertido_pequeño["wonderful"][:10])  # show the first 10 documents containing "wonderful"

In [None]:
# Look up a single word in the small-corpus index and show the first 10 matching document positions.
resultados = buscar_en_indice(indice_invertido_pequeño, "wonderful")

print("Documentos encontrados:", resultados[:10])