# Analisis de comentarios de películas

## 1. Importacion de librerias

In [1]:
import numpy as np
import pandas as pd
import re

import matplotlib.pyplot as plt
import seaborn as sns

# Librerias para preprocesamiento de texto
import spacy
nlp = spacy.load('es_core_news_sm')

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Librerias para modelos de clasificacion
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


import scipy.stats as stats

## 2. Carga de datos

In [2]:
# Carga del csv
file = './data/MovieReviews.csv'
raw = pd.read_csv(file, sep=',')
reviews = raw.copy()

In [3]:
reviews.shape

(5000, 3)

In [4]:
reviews.head(3)

Unnamed: 0.1,Unnamed: 0,review_es,sentimiento
0,0,Si está buscando una película de guerra típica...,positivo
1,1,Supongo que algunos directores de películas de...,positivo
2,2,Es difícil contarle más sobre esta película si...,positivo


## 3. Preparación de los datos

In [5]:
# Eliminar la columna de Unnnamed
reviews.drop('Unnamed: 0', axis=1, inplace=True)

In [6]:
# Nulos en el dataset
reviews.isnull().sum()


review_es      0
sentimiento    0
dtype: int64

In [7]:
# Funcion para limpiar el texto utilizando spacy
def clean_text(text):
    # Pasar a minusculas
    text = text.lower()

    # Eliminar las tildes
    text = re.sub(r'[á]', 'a', text)
    text = re.sub(r'[é]', 'e', text)
    text = re.sub(r'[í]', 'i', text)
    text = re.sub(r'[ó]', 'o', text)
    text = re.sub(r'[ú]', 'u', text)
    
    # Eliminar los numeros
    text = re.sub(r'\d+', ' ', text)

    # Eliminar los signos de puntuacion
    text = re.sub(r'[^\w\s]', ' ', text)

    # Tokenizar
    tokens = nlp(text)

    # Eliminar stopwords
    tokens = [token.text for token in tokens if not token.is_stop]

    # Unir los tokens
    text = ' '.join(tokens)

    return text

#### Advertencia
Esto se demora cargando como 2 minutos largos asi que hay que ser paciente

In [8]:
# Aplicar la funcion de limpieza a la columna 'review_es'
reviews['review_es'] = reviews['review_es'].apply(clean_text)

In [9]:
reviews.head(5)

Unnamed: 0,review_es,sentimiento
0,buscando pelicula guerra tipica nota afici...,positivo
1,supongo directores peliculas lujo sentados bus...,positivo
2,dificil contarle pelicula estropearla disfrute...,positivo
3,pelicula comienza lentamente estilo vida wal...,positivo
4,pelicula accion maxima expresion peliculas p...,positivo


In [10]:
# Creacion del vector de palabras
vectorizer = TfidfVectorizer()

# Creacion de la matriz de palabras
X = vectorizer.fit_transform(reviews["review_es"])
Y = reviews["sentimiento"]

In [11]:
X

<5000x55720 sparse matrix of type '<class 'numpy.float64'>'
	with 429551 stored elements in Compressed Sparse Row format>

In [12]:
# Crear un df para visualizar la matriz de palabras
clean_df = pd.DataFrame(X.toarray(), columns=vectorizer.vocabulary_)
clean_df.head(3)



Unnamed: 0,buscando,pelicula,guerra,tipica,nota,aficionados,ansiancia,carnajeria,testosterona,molesten,...,alado,bautismal,aerith,vii,cuandoesta,mante,desgraciados,inanidas,idjits,teer
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Creacion de los conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

## 4. Implementación algoritmos

## 4.1 Algoritmo Nathalia

## 4.2 Algoritmo Sergio

## 4.3 Algoritmo Calixto

In [25]:
# Entrenar el modelo de clasificación Naive Bayes
clf = MultinomialNB()
clf.fit(X_train, y_train)

In [26]:
# Realizar predicciones en el conjunto de entrenaiento y prueba
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

In [31]:
# Calcular las metricas de evaluacion
train_accuracy = accuracy_score(y_train, y_train_pred)
train_precision = precision_score(y_train, y_train_pred, pos_label='positivo')
train_recall = recall_score(y_train, y_train_pred, pos_label='positivo')
train_f1 = f1_score(y_train, y_train_pred, pos_label='positivo')

test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred, pos_label='positivo')
test_recall = recall_score(y_test, y_test_pred, pos_label='positivo')
test_f1 = f1_score(y_test, y_test_pred, pos_label='positivo')

In [34]:
# Imprimir las metricas de evaluacion
print('\nMetricas del conjunto de entrenamiento:')
print("Accuracy:",  train_accuracy)
print("Precision:", train_precision)
print("Recall:",    train_recall)
print("F1 score:",  train_f1)

# Imprimir las metricas de evaluacion
print('\n Metricas del conjunto de prueba:')
print("Accuracy:",  test_accuracy)
print("Precision:", test_precision)
print("Recall:",    test_recall)
print("F1 score:",  test_f1)


Metricas del conjunto de entrenamiento:
Accuracy: 0.96225
Precision: 0.9671551288529561
Recall: 0.957
F1 score: 0.9620507665242523

 Metricas del conjunto de prueba:
Accuracy: 0.825
Precision: 0.8336755646817249
Recall: 0.812
F1 score: 0.822695035460993


## 5. Análisis de resultados

## 6. Conclusiones