# Examen B1
 Alexis Vera



### Instalacion de librerias

In [8]:
pip install nltk scikit-learn




In [9]:
pip install nltk scikit-learn pandas



### Importarcion de librerias y recursos nltk

In [25]:
#Importar librerias
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
import json
from nltk.stem import PorterStemmer, WordNetLemmatizer

#Descargar recursos nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

###Carga del corpus

In [32]:
#Cargar el archivo CSV del corpus Rotten Tomatoes
corpus_path = 'rotten_tomatoes_critic_reviews.csv'
df = pd.read_csv(corpus_path)

#Verificar las primeras filas del DataFrame
print(df.head())

  rotten_tomatoes_link      critic_name  top_critic           publisher_name  \
0            m/0814255  Andrew L. Urban       False           Urban Cinefile   
1            m/0814255    Louise Keller       False           Urban Cinefile   
2            m/0814255              NaN       False      FILMINK (Australia)   
3            m/0814255     Ben McEachen       False  Sunday Mail (Australia)   
4            m/0814255      Ethan Alter        True       Hollywood Reporter   

  review_type review_score review_date  \
0       Fresh          NaN  2010-02-06   
1       Fresh          NaN  2010-02-06   
2       Fresh          NaN  2010-02-09   
3       Fresh        3.5/5  2010-02-09   
4      Rotten          NaN  2010-02-10   

                                      review_content  
0  A fantasy adventure that fuses Greek mythology...  
1  Uma Thurman as Medusa, the gorgon with a coiff...  
2  With a top-notch cast and dazzling special eff...  
3  Whether audiences will get behind The Light

##1.- Preprocesamiento de Datos

###Funciones de limpieza

In [38]:
def preprocess_text(text):
    #Herramientas de stemming
    stemmer = PorterStemmer()

    #Convertir a minúsculas
    text = text.lower()
    #Eliminar caracteres especiales y números
    text = re.sub(r'[^a-z\s]', '', text)
    #Tokenización
    tokens = word_tokenize(text)
    #Eliminar stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    #Aplicar stemming
    stemmed_tokens = [stemmer.stem(word) for word in tokens]

    return stemmed_tokens


### Aplicar preprocesamiento

In [39]:
#Seleccionar la columna de objetivo "review_content"
text_column = "review_content"
if text_column not in df.columns:
    raise ValueError(f"La columna '{text_column}' no se encontró. Verifica las columnas disponibles: {df.columns}")

#Aplicar preprocesamiento
df['tokens'] = df[text_column].astype(str).apply(preprocess_text)

#Visualizar texto original y texto preprocesado (primeras filas)
print("Texto original y preprocesado:")
for i in range(5):
    print(f"Original: {df[text_column].iloc[i]}")
    print(f"Preprocesado: {df['tokens'].iloc[i]}")
    print("-" * 50)


Texto original y preprocesado:
Original: A fantasy adventure that fuses Greek mythology to contemporary American places and values. Anyone around 15 (give or take a couple of years) will thrill to the visual spectacle
Preprocesado: ['fantasi', 'adventur', 'fuse', 'greek', 'mytholog', 'contemporari', 'american', 'place', 'valu', 'anyon', 'around', 'give', 'take', 'coupl', 'year', 'thrill', 'visual', 'spectacl']
--------------------------------------------------
Original: Uma Thurman as Medusa, the gorgon with a coiffure of writhing snakes and stone-inducing hypnotic gaze is one of the highlights of this bewitching fantasy
Preprocesado: ['uma', 'thurman', 'medusa', 'gorgon', 'coiffur', 'writh', 'snake', 'stoneinduc', 'hypnot', 'gaze', 'one', 'highlight', 'bewitch', 'fantasi']
--------------------------------------------------
Original: With a top-notch cast and dazzling special effects, this will tide the teens over until the next Harry Potter instalment.
Preprocesado: ['topnotch', 'cast

In [40]:
#Crear índice invertido
inverted_index = defaultdict(list)

for idx, tokens in enumerate(df['tokens']):
    for token in tokens:
        inverted_index[token].append(idx)

#Guardar índice invertido como archivo JSON
with open("inverted_index.json", "w") as f:
    json.dump(inverted_index, f)

print("Índice invertido creado y guardado como 'inverted_index.json'")


Índice invertido creado y guardado como 'inverted_index.json'


## 2.- Sistema de Recuperacion

In [41]:
#Función de búsqueda en el índice invertido
def search_inverted_index(query, inverted_index, df):
    # Preprocesar la consulta
    query_tokens = preprocess_text(query)
    print(f"Términos de búsqueda (preprocesados): {query_tokens}")

    #Recuperar índices de documentos que contienen los términos
    matching_docs = set()
    for token in query_tokens:
        if token in inverted_index:
            matching_docs.update(inverted_index[token])

    #Mostrar documentos encontrados
    if matching_docs:
        print(f"Se encontraron {len(matching_docs)} documentos que coinciden:")
        for doc_id in matching_docs:
            print(f"Documento {doc_id}: {df[text_column].iloc[doc_id]}")
            print("-" * 50)
    else:
        print("No se encontraron documentos para la consulta.")

#Cargar índice invertido generado previamente
with open("inverted_index.json", "r") as f:
    inverted_index = json.load(f)




## 3.- Simulacion de Consulta

In [42]:
#Consulta de ejemplo
query = input("Introduce tu consulta: ")

#Realizar búsqueda
search_inverted_index(query, inverted_index, df)

[1;30;43mSe truncaron las últimas líneas 5000 del resultado de transmisión.[0m
Documento 779795: A handsomely-mounted, muscular, red-blooded Old West adventure ... One of the most stunning films to look at in recent years.
--------------------------------------------------
Documento 878103: A feast for the eyes and a thrillingly brazen transposition of the high-tech on the old-fashioned, fueled by a dose of what-if historical fantasy.
--------------------------------------------------
Documento 124439: A poorly written and disappointingly animated adventure that may keep the wee'uns happy for a while but will leave the adults snoozing.
--------------------------------------------------
Documento 517655: [Gibson and Glover] make a great team, and some of their early adventures are exciting. But the film runs out of gas as it turns into an extended chase sequence.
--------------------------------------------------
Documento 157211: As a noir-tinged Western hybrid, the film is a classic

In [43]:
#Cargar el archivo CSV del corpus Rotten Tomatoes movies
corpus_path = 'rotten_tomatoes_movies.csv'
movies_df = pd.read_csv(corpus_path)

#Verificar las primeras filas del DataFrame
print(movies_df.head())

                    rotten_tomatoes_link  \
0                              m/0814255   
1                              m/0878835   
2                                   m/10   
3                 m/1000013-12_angry_men   
4  m/1000079-20000_leagues_under_the_sea   

                                         movie_title  \
0  Percy Jackson & the Olympians: The Lightning T...   
1                                        Please Give   
2                                                 10   
3                    12 Angry Men (Twelve Angry Men)   
4                       20,000 Leagues Under The Sea   

                                          movie_info  \
0  Always trouble-prone, the life of teenager Per...   
1  Kate (Catherine Keener) and her husband Alex (...   
2  A successful, middle-aged Hollywood songwriter...   
3  Following the closing arguments in a murder tr...   
4  In 1866, Professor Pierre M. Aronnax (Paul Luk...   

                                   critics_consensus content_

In [52]:
# Función de búsqueda y comparación
def search_and_compare_movies(query, inverted_index, df, movies_df, text_column, movie_column, rating_column):
    # Preprocesar la consulta
    query_tokens = preprocess_text(query)
    print(f"Términos de búsqueda (preprocesados): {query_tokens}")

    # Recuperar índices de documentos que contienen los términos
    matching_docs = set()
    for token in query_tokens:
        if token in inverted_index:
            matching_docs.update(inverted_index[token])

    # Mostrar documentos encontrados
    if matching_docs:
        print(f"Se encontraron {len(matching_docs)} documentos que coinciden:")
        movie_scores = []
        for doc_id in matching_docs:
            # Recuperar texto y nombre de la película
            movie_name = df[movie_column].iloc[doc_id]
            review_text = df[text_column].iloc[doc_id]
            print(f"Película: {movie_name}")
            print(f"Reseña: {review_text}")
            print("-" * 50)

            # Buscar la película en el nuevo corpus
            movie_data = movies_df[movies_df['rotten_tomatoes_link'] == movie_name]
            if not movie_data.empty:
                score = movie_data[rating_column].iloc[0]
                movie_scores.append((movie_name, score))
            else:
                print(f"Película '{movie_name}' no encontrada en el corpus de películas.")

        # Determinar la mejor película
        if movie_scores:
            best_movie = max(movie_scores, key=lambda x: x[1])
            print(f"La mejor película según '{rating_column}' es: {best_movie[0]} con un puntaje de {best_movie[1]}.")
        else:
            print("No se encontraron datos para comparar las películas en el corpus de películas.")
    else:
        print("No se encontraron documentos para la consulta.")

# Columnas relevantes
text_column = "review_content"
movie_column = "rotten_tomatoes_link"
rating_column = "audience_rating "

# Consulta de ejemplo
query = input("Introduce tu consulta: ")

# Realizar búsqueda con comparación
search_and_compare_movies(query, inverted_index, df, movies_df, text_column, movie_column, rating_column)


Introduce tu consulta: fantasy adventure
Términos de búsqueda (preprocesados): ['fantasi', 'adventur']
Se encontraron 14014 documentos que coinciden:
Película: m/0814255
Reseña: A fantasy adventure that fuses Greek mythology to contemporary American places and values. Anyone around 15 (give or take a couple of years) will thrill to the visual spectacle
--------------------------------------------------


KeyError: 'audience_rating '