In [1]:
import pandas as pd
import sklearn as sk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
import json
# Load the local settings (including the dataset path) from a JSON file
with open("configura_local.json", encoding="utf-8") as config_file:
    raw_config = config_file.read()
config = json.loads(raw_config)

print("Ruta cargada:", config["data_path"])

Ruta cargada: C:/Users/jmuri/OneDrive/Escritorio/Stuff/Proyecto aprendizaje autmomático/archiverotten/rotten_tomatoes_critic_reviews.csv


In [3]:
# Reuse the configuration loaded earlier rather than re-reading the JSON file
# through an open() call whose handle is never closed.
file_path = config["data_path"]
df = pd.read_csv(file_path)

In [4]:
# Preview the first rows of the raw dataset (head() defaults to five rows)
df.head()

Unnamed: 0,rotten_tomatoes_link,critic_name,top_critic,publisher_name,review_type,review_score,review_date,review_content
0,m/0814255,Andrew L. Urban,False,Urban Cinefile,Fresh,,2010-02-06,A fantasy adventure that fuses Greek mythology...
1,m/0814255,Louise Keller,False,Urban Cinefile,Fresh,,2010-02-06,"Uma Thurman as Medusa, the gorgon with a coiff..."
2,m/0814255,,False,FILMINK (Australia),Fresh,,2010-02-09,With a top-notch cast and dazzling special eff...
3,m/0814255,Ben McEachen,False,Sunday Mail (Australia),Fresh,3.5/5,2010-02-09,Whether audiences will get behind The Lightnin...
4,m/0814255,Ethan Alter,True,Hollywood Reporter,Rotten,,2010-02-10,What's really lacking in The Lightning Thief i...


In [5]:
# Keep only the columns used from here on: the critic's score and the review text.
# .copy() makes the subset an independent frame, so the in-place edits and column
# assignments in later cells do not raise SettingWithCopyWarning.
columnas = ['review_score', 'review_content']
df = df[columnas].copy()
df.head(5)

Unnamed: 0,review_score,review_content
0,,A fantasy adventure that fuses Greek mythology...
1,,"Uma Thurman as Medusa, the gorgon with a coiff..."
2,,With a top-notch cast and dazzling special eff...
3,3.5/5,Whether audiences will get behind The Lightnin...
4,,What's really lacking in The Lightning Thief i...


In [6]:
# Drop rows with a missing score or missing review text. Rebinding to an explicit
# copy (instead of inplace=True) avoids mutating a frame that was derived by
# column selection, which is what triggers SettingWithCopyWarning.
df = df.dropna().copy()

In [7]:
# Letter grades mapped onto an equivalent "x/5" fraction string
letter_to_score = {
    'A': '5/5', 'A-': '4.7/5',
    'B+': '4.3/5', 'B': '4/5', 'B-': '3.7/5',
    'C+': '3.3/5', 'C': '3/5', 'C-': '2.7/5',
    'D+': '2.3/5', 'D': '2/5', 'D-': '1.7/5',
    'F': '1/5',
}


def convert_score(score):
    """Translate a letter grade into its "x/5" fraction string.

    The value is converted to ``str`` and stripped of surrounding whitespace
    first. Anything that is not a key of ``letter_to_score`` (for example a
    score that is already a fraction such as "3.5/5") is returned in that
    stripped string form.
    """
    cleaned = str(score).strip()
    return letter_to_score.get(cleaned, cleaned)

# Normalize every entry of the score column through convert_score
df['review_score'] = df['review_score'].map(convert_score)

In [8]:
# Preview the frame after letter grades were converted (head() defaults to five rows)
df.head()

Unnamed: 0,review_score,review_content
3,3.5/5,Whether audiences will get behind The Lightnin...
6,1/4,Harry Potter knockoffs don't come more transpa...
7,3.5/5,"Percy Jackson isn't a great movie, but it's a ..."
8,4/5,"Fun, brisk and imaginative"
9,3/5,"Crammed with dragons, set-destroying fights an..."


In [9]:
# Convert fraction strings to a decimal number
def fraction_to_float(score_str):
    """Convert a fraction string such as "3.5/5" into its decimal value.

    Args:
        score_str: Text of the form "<numerator>/<denominator>".

    Returns:
        The quotient as a float, or None when the input has no usable
        ``split`` method, does not split into exactly two parts on "/",
        has a part that is not a number, or has a zero denominator.
    """
    try:
        num, den = score_str.split('/')
        return float(num) / float(den)
    except (AttributeError, TypeError, ValueError, ZeroDivisionError):
        # Only the expected parse failures are absorbed. A bare `except:` would
        # also swallow KeyboardInterrupt/SystemExit, so interrupting a long
        # .apply() over the column would be silently ignored.
        return None

# Turn each fraction string into its decimal value; entries that cannot be
# parsed come back as missing values
df['review_score'] = df['review_score'].map(fraction_to_float)


In [10]:
# Preview the frame after scores were converted to decimals (head() defaults to five rows)
df.head()

Unnamed: 0,review_score,review_content
3,0.7,Whether audiences will get behind The Lightnin...
6,0.25,Harry Potter knockoffs don't come more transpa...
7,0.7,"Percy Jackson isn't a great movie, but it's a ..."
8,0.8,"Fun, brisk and imaginative"
9,0.6,"Crammed with dragons, set-destroying fights an..."


In [11]:
# Median of the decimal scores; used afterwards as the positive/negative cut-off
review_scores = df['review_score']
median_score = review_scores.median()
print("Mediana de review_score:", round(median_score, 3))

Mediana de review_score: 0.66


In [12]:
# Scores that could not be parsed are missing values, and a missing value never
# compares >= the median, so those rows would silently be labeled 'negativo'.
# Drop them before labeling; the explicit copy keeps the column assignment below
# from writing to a slice of the previous frame.
df = df.dropna(subset=['review_score']).copy()

# Label a review 'positivo' when its score is at or above the median,
# otherwise 'negativo'
df['sentiment'] = df['review_score'].apply(
    lambda x: 'positivo' if x >= median_score else 'negativo'
)

In [13]:
# Inspect the frame, now including the derived 'sentiment' column
df

Unnamed: 0,review_score,review_content,sentiment
3,0.70,Whether audiences will get behind The Lightnin...,positivo
6,0.25,Harry Potter knockoffs don't come more transpa...,negativo
7,0.70,"Percy Jackson isn't a great movie, but it's a ...",positivo
8,0.80,"Fun, brisk and imaginative",positivo
9,0.60,"Crammed with dragons, set-destroying fights an...",negativo
...,...,...,...
1130006,0.80,As a spectacular war film with a powerful mora...,positivo
1130013,0.70,"Seen today, it's not only a startling indictme...",positivo
1130014,0.86,A rousing visual spectacle that's a prequel of...,positivo
1130015,0.70,"A simple two-act story: Prelude to war, and th...",positivo


In [14]:
# Check how many reviews fall into each sentiment class
sentiment_counts = df['sentiment'].value_counts()
print(sentiment_counts)

sentiment
positivo    386805
negativo    371904
Name: count, dtype: int64


In [22]:
# Keep the review text and its label, encoding the label as 1 (positivo) / 0 (negativo).
# .copy() detaches the column subset so the assignment below does not write to a
# slice of the previous frame (SettingWithCopyWarning).
# NOTE: this cell is not idempotent. Re-running it on already-encoded labels
# maps them to NaN, because 1/0 are not keys of target_map.
df = df[['review_content', 'sentiment']].copy()
target_map = {'positivo': 1, 'negativo': 0}
df['sentiment'] = df['sentiment'].map(target_map)
df.head(5)

Unnamed: 0,review_content,sentiment
3,Whether audiences will get behind The Lightnin...,1
6,Harry Potter knockoffs don't come more transpa...,0
7,"Percy Jackson isn't a great movie, but it's a ...",1
8,"Fun, brisk and imaginative",1
9,"Crammed with dragons, set-destroying fights an...",0


In [24]:
# Hold out a test set using the default split proportions. A fixed random_state
# makes the shuffle, and therefore every downstream metric, reproducible across
# kernel restarts.
df_train, df_test = train_test_split(df, random_state=42)

In [25]:
# Inspect the training split
df_train 

Unnamed: 0,review_content,sentiment
907753,Decline with regrets...,0
125512,87 minutes of pure joy/,1
72387,Here's the bottom line: Denis Cooverman might ...,0
704865,"Woo has created a resounding epic, blending a ...",1
515060,Ill-balanced between a younger and older child...,0
...,...,...
399278,Buoyed by Steve Coogan's unapologetically go-f...,0
1087196,"The movie packs a powerful, emotional punch, a...",1
48719,A guilty pleasure,1
426692,"Ms. Kidman, in a performance of astounding bra...",1


In [26]:
# Inspect the test split
df_test 

Unnamed: 0,review_content,sentiment
466444,"Jack and Jill is total bust, a stupefyingly un...",0
376625,"While beautiful, both the film and its female ...",0
447675,a sumptuous film ... with clever social commen...,1
411084,"If you like gory action shlock, you should giv...",0
16487,"People running into each other can be funny, b...",0
...,...,...
118869,"In the end, Alice Through the Looking Glass ne...",0
756155,"Released over Super Bowl weekend, it makes per...",0
51741,A clever blend of fact and fiction.,1
665683,The predictable ending could have been improve...,0


In [27]:
# TF-IDF text features, with the vectorizer limited to max_features=2000
vectorizer = TfidfVectorizer(max_features=2000)

# Fit on the training texts only; the test texts are transformed with the
# already-fitted vectorizer so nothing is learned from the held-out split.
train_texts, test_texts = df_train['review_content'], df_test['review_content']
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)