In [3]:
import pandas as pd
import sklearn as sk
import nltk 
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

### Importación de los datos y limpieza

In [4]:
import json
# Cargar la configuración desde un archivo JSON
with open("configura_local.json", encoding="utf-8") as f:
    config = json.load(f)

print("Ruta cargada:", config["data_path"])

Ruta cargada: C:/Users/jmuri/OneDrive/Escritorio/Stuff/Proyecto aprendizaje automático/archiverotten/rotten_tomatoes_critic_reviews.csv


In [5]:
file_path = json.load(open("configura_local.json", encoding="utf-8"))["data_path"]
df = pd.read_csv(file_path)

In [6]:
df.head(5)

Unnamed: 0,rotten_tomatoes_link,critic_name,top_critic,publisher_name,review_type,review_score,review_date,review_content
0,m/0814255,Andrew L. Urban,False,Urban Cinefile,Fresh,,2010-02-06,A fantasy adventure that fuses Greek mythology...
1,m/0814255,Louise Keller,False,Urban Cinefile,Fresh,,2010-02-06,"Uma Thurman as Medusa, the gorgon with a coiff..."
2,m/0814255,,False,FILMINK (Australia),Fresh,,2010-02-09,With a top-notch cast and dazzling special eff...
3,m/0814255,Ben McEachen,False,Sunday Mail (Australia),Fresh,3.5/5,2010-02-09,Whether audiences will get behind The Lightnin...
4,m/0814255,Ethan Alter,True,Hollywood Reporter,Rotten,,2010-02-10,What's really lacking in The Lightning Thief i...


In [7]:
#Filtrado de columnas que se van a utilizar
columnas = ['review_score','review_content']
df = df[columnas]
df.head(5)

Unnamed: 0,review_score,review_content
0,,A fantasy adventure that fuses Greek mythology...
1,,"Uma Thurman as Medusa, the gorgon with a coiff..."
2,,With a top-notch cast and dazzling special eff...
3,3.5/5,Whether audiences will get behind The Lightnin...
4,,What's really lacking in The Lightning Thief i...


In [8]:
# Eliminación de filas con valores nulos
df.dropna(inplace=True)

In [9]:
# Diccionario de conversión de letras a escala de 5 como fracción
letter_to_score = {
    'A': '5/5',
    'A-': '4.7/5',
    'B+': '4.3/5',
    'B': '4/5',
    'B-': '3.7/5',
    'C+': '3.3/5',
    'C': '3/5',
    'C-': '2.7/5',
    'D+': '2.3/5',
    'D': '2/5',
    'D-': '1.7/5',
    'F': '1/5',
}

# Función para reemplazar calificaciones de letra
def convert_score(score):
    score_str = str(score).strip()
    if score_str in letter_to_score:
        return letter_to_score[score_str]
    return score_str  # deja igual si ya es una fracción como "3.5/5"

# Aplica la función a la columna de calificaciones
df['review_score'] = df['review_score'].apply(convert_score)

In [10]:
df.head(5)

Unnamed: 0,review_score,review_content
3,3.5/5,Whether audiences will get behind The Lightnin...
6,1/4,Harry Potter knockoffs don't come more transpa...
7,3.5/5,"Percy Jackson isn't a great movie, but it's a ..."
8,4/5,"Fun, brisk and imaginative"
9,3/5,"Crammed with dragons, set-destroying fights an..."


In [11]:
# Convertir fracciones a número decimal
def fraction_to_float(score_str):
    try:
        num, den = score_str.split('/')
        return float(num) / float(den)
    except:
        return None  # para manejar errores o valores inesperados

df['review_score'] = df['review_score'].apply(fraction_to_float)


In [12]:
df.head(5)

Unnamed: 0,review_score,review_content
3,0.7,Whether audiences will get behind The Lightnin...
6,0.25,Harry Potter knockoffs don't come more transpa...
7,0.7,"Percy Jackson isn't a great movie, but it's a ..."
8,0.8,"Fun, brisk and imaginative"
9,0.6,"Crammed with dragons, set-destroying fights an..."


In [13]:
# Calcular la mediana
median_score = df['review_score'].median()
print("Mediana de review_score:", round(median_score, 3))

Mediana de review_score: 0.66


In [14]:
# Clasificar como 'positivo' si el score es mayor que la mediana, si no, 'negativo'
df['sentiment'] = df['review_score'].apply(
    lambda x: 'positivo' if x >= median_score else 'negativo'
)

In [15]:
df

Unnamed: 0,review_score,review_content,sentiment
3,0.70,Whether audiences will get behind The Lightnin...,positivo
6,0.25,Harry Potter knockoffs don't come more transpa...,negativo
7,0.70,"Percy Jackson isn't a great movie, but it's a ...",positivo
8,0.80,"Fun, brisk and imaginative",positivo
9,0.60,"Crammed with dragons, set-destroying fights an...",negativo
...,...,...,...
1130006,0.80,As a spectacular war film with a powerful mora...,positivo
1130013,0.70,"Seen today, it's not only a startling indictme...",positivo
1130014,0.86,A rousing visual spectacle that's a prequel of...,positivo
1130015,0.70,"A simple two-act story: Prelude to war, and th...",positivo


In [16]:
# Conteo de valores positivos y negativos de la columna 'sentiment'
print(df['sentiment'].value_counts())

sentiment
positivo    386805
negativo    371904
Name: count, dtype: int64


In [17]:
# Filtrar las columnas que se van a utilizar
df = df[['review_content', 'sentiment']]

# Cambio de etiquetas de sentimiento a valores numéricos 
# 'positivo' a 1 y 'negativo' a 0
target_map = {'positivo': 1, 'negativo': 0}
df['sentiment'] = df['sentiment'].map(target_map)
df.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment'] = df['sentiment'].map(target_map)


Unnamed: 0,review_content,sentiment
3,Whether audiences will get behind The Lightnin...,1
6,Harry Potter knockoffs don't come more transpa...,0
7,"Percy Jackson isn't a great movie, but it's a ...",1
8,"Fun, brisk and imaginative",1
9,"Crammed with dragons, set-destroying fights an...",0


### Modelamiento

In [18]:
# Descargar stopwords de NLTK
nltk.download('stopwords')

# Convertir las columnas a listas
texts = df['review_content'].astype(str).tolist()
labels = df['sentiment'].astype(int).tolist()

# Tokenizador de BERT
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Lista de stopwords
stop_words = set(stopwords.words('english'))

# Función de tokenización de los textos planos
def tokenize_and_remove_stopwords(texts):
    filtered_texts = []
    for text in texts:
        tokens = tokenizer.tokenize(text)
        tokens = [token for token in tokens if token.lower() not in stop_words]
        filtered_texts.append(" ".join(tokens))
    return filtered_texts

tokenized_texts = tokenize_and_remove_stopwords(texts)

# Separa los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(tokenized_texts, labels, test_size=0.2, random_state=42)

# Vectorización clásica (TF-IDF)
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jmuri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
