# Analisis de comentarios de películas

## 1. Importacion de librerias

In [157]:
import numpy as np
import pandas as pd
import re

import matplotlib.pyplot as plt
import seaborn as sns

# Librerias para preprocesamiento de texto
import spacy
nlp = spacy.load('es_core_news_sm')

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Librerias para modelos de clasificacion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


import scipy.stats as stats

## 2. Carga de datos

In [158]:
# Carga del csv
file = './data/MovieReviews.csv'
raw = pd.read_csv(file, sep=',')
reviews = raw.copy()

In [159]:
reviews.shape

(5000, 3)

In [160]:
reviews.head(3)

Unnamed: 0.1,Unnamed: 0,review_es,sentimiento
0,0,Si está buscando una película de guerra típica...,positivo
1,1,Supongo que algunos directores de películas de...,positivo
2,2,Es difícil contarle más sobre esta película si...,positivo


## 3. Preparación de los datos

In [161]:
# Eliminar la columna de Unnnamed
reviews.drop('Unnamed: 0', axis=1, inplace=True)

In [162]:
# Nulos en el dataset
reviews.isnull().sum()


review_es      0
sentimiento    0
dtype: int64

In [163]:
# Funcion para limpiar el texto utilizando spacy
def clean_text(text):
    # Pasar a minusculas
    text = text.lower()

    # Eliminar las tildes
    text = re.sub(r'[á]', 'a', text)
    text = re.sub(r'[é]', 'e', text)
    text = re.sub(r'[í]', 'i', text)
    text = re.sub(r'[ó]', 'o', text)
    text = re.sub(r'[ú]', 'u', text)
    
    # Eliminar los numeros
    text = re.sub(r'\d+', ' ', text)

    # Eliminar los signos de puntuacion
    text = re.sub(r'[^\w\s]', ' ', text)

    # Tokenizar
    tokens = nlp(text)

    # Eliminar stopwords
    tokens = [token.text for token in tokens if not token.is_stop]

    # Unir los tokens
    text = ' '.join(tokens)

    return text

#### Advertencia
Esto se demora cargando como 2 minutos asi que hay que ser paciente

In [164]:
# Aplicar la funcion de limpieza a la columna 'review_es'
reviews['review_es'] = reviews['review_es'].apply(clean_text)

In [165]:
reviews.head(5)

Unnamed: 0,review_es,sentimiento
0,buscando pelicula guerra tipica nota afici...,positivo
1,supongo directores peliculas lujo sentados bus...,positivo
2,dificil contarle pelicula estropearla disfrute...,positivo
3,pelicula comienza lentamente estilo vida wal...,positivo
4,pelicula accion maxima expresion peliculas p...,positivo


In [166]:
# Creacion del vector de palabras
vectorizer = TfidfVectorizer()

# Creacion de la matriz de palabras
X = vectorizer.fit_transform(reviews["review_es"])
Y = reviews["sentimiento"]

In [167]:
clean_df = pd.DataFrame(X.toarray(), columns=vectorizer.vocabulary_)
clean_df.head(3)



Unnamed: 0,buscando,pelicula,guerra,tipica,nota,aficionados,ansiancia,carnajeria,testosterona,molesten,...,alado,bautismal,aerith,vii,cuandoesta,mante,desgraciados,inanidas,idjits,teer
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [168]:
# Creacion de los conjuntos de entrenamiento y prueba
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

## 4. Implementación algoritmos

## 4.1 Algoritmo Nathalia

## 4.2 Algoritmo Sergio

## 4.3 Algoritmo Calixto

In [169]:
# Entrenar el modelo de clasificación Naive Bayes
clf = MultinomialNB()
clf.fit(X_train, Y_train)

In [170]:
# Realizar predicciones en el conjunto de prueba
Y_pred = clf.predict(X_test)

In [171]:
# Calcular las metricas de evaluacion
print('Exactitud: ', accuracy_score(Y_train, Y_pred))
print('Precisión: ', precision_score(Y_train, Y_pred, average='weighted'))
print('Recall: ', recall_score(Y_train, Y_pred, average='weighted'))
print('F1: ', f1_score(Y_train, Y_pred, average='weighted'))

ValueError: Found input variables with inconsistent numbers of samples: [4000, 1000]

## 5. Análisis de resultados

## 6. Conclusiones