## 1. Importación de librerías

In [8]:
import pandas as pd
import numpy as np
import re

# Preprocessing libraries
import spacy
# Load the 'es_core_news_sm' spaCy pipeline once, at import time; clean_text
# relies on this module-level object for tokenization and stop-word flags.
nlp = spacy.load('es_core_news_sm')

# Librerias para transformar los datos
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin



# Librerias para vectorizar 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Librerias para el modelo
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# Librerias para la evaluacion
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score

# Librerias para la busqueda de hiperparametros
from sklearn.model_selection import GridSearchCV

# Librerias para el pipeline
from sklearn.pipeline import Pipeline

# Librerias para exportar el modelo
from joblib import dump, load

## 2. Carga de datos

In [9]:
# Load the training CSV
file_name = './data/MovieReviews.csv'
raw = pd.read_csv(
    file_name,
    delimiter=',',  # alias of `sep`
)
# Keep `raw` as loaded and do all further work on an independent copy
reviews = raw.copy()

## 4. Creación del pipeline

In [10]:
# Translation table that strips the acute accent from lowercase Spanish vowels
_ACCENT_TABLE = str.maketrans('áéíóú', 'aeiou')


def clean_text(text):
    """Normalize a raw review into a space-joined string of non-stop-word tokens.

    Steps, in order: lowercase, replace acute-accented vowels with their plain
    form, replace digit runs and punctuation with a space, tokenize with the
    module-level spaCy pipeline ``nlp`` and drop every token flagged as a
    stop word.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The remaining token texts joined by single spaces.
    """
    # Lowercase first so the accent table only needs the lowercase vowels
    text = text.lower()

    # Strip acute accents in a single pass (only á, é, í, ó, ú are mapped)
    text = text.translate(_ACCENT_TABLE)

    # Replace runs of digits with a space
    text = re.sub(r'\d+', ' ', text)

    # Replace everything that is neither a word character nor whitespace
    text = re.sub(r'[^\w\s]', ' ', text)

    # Only tokenization and the lexical `is_stop` flag are needed, so run the
    # tokenizer alone instead of the whole pipeline; the tokens are the same
    # ones `nlp(text)` would produce, at a fraction of the cost per document.
    doc = nlp.make_doc(text)

    # NOTE(review): accents are removed before the stop-word check, so accented
    # stop words may no longer match spaCy's list — confirm this is intended.
    return ' '.join(token.text for token in doc if not token.is_stop)

In [11]:
# Bag-of-words features (each document cleaned by clean_text) feeding a random forest.
# The first step keeps its original key 'tfidf' so that existing references such as
# pipe.named_steps['tfidf'] or 'tfidf__*' grid-search parameters remain valid, even
# though the step is a CountVectorizer (raw term counts), not a TF-IDF vectorizer.
pipe = Pipeline([
    ('tfidf', CountVectorizer(preprocessor=clean_text)),
    # Fixed seed so the fitted forest, and therefore the reported metrics, are
    # reproducible across runs (the train/test split in this notebook is seeded with 0 too).
    ('model', RandomForestClassifier(random_state=0)),
])

## 5. Ejecución y análisis

In [12]:
# Split the frame into the model input (review text) and the target (sentiment label)
X, Y = reviews['review_es'], reviews['sentimiento']

In [13]:
# Hold out 20% of the rows for testing; the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=0,
    test_size=0.2,
)
# Flatten the training labels to a 1-D array (y_test is left as returned by the split)
y_train = y_train.values.ravel()

In [14]:
# Fit the whole pipeline (vectorizer + classifier) on the training split
model = pipe.fit(X_train, y=y_train)

In [15]:
# Predict on the training split first, then on the test split
y_train_pred, y_test_pred = (
    model.predict(features) for features in (X_train, X_test)
)

In [16]:
# Compute the evaluation metrics for both splits
def _binary_scores(y_true, y_pred):
    """Return (accuracy, precision, recall, F1), scoring 'positivo' as the positive class."""
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, pos_label='positivo'),
        recall_score(y_true, y_pred, pos_label='positivo'),
        f1_score(y_true, y_pred, pos_label='positivo'),
    )


train_accuracy, train_precision, train_recall, train_f1 = _binary_scores(y_train, y_train_pred)

test_accuracy, test_precision, test_recall, test_f1 = _binary_scores(y_test, y_test_pred)

In [17]:
# Print the evaluation metrics: training split first, then test split
for heading, scores in (
    ('\nMetricas del conjunto de entrenamiento:',
     (train_accuracy, train_precision, train_recall, train_f1)),
    ('\nMetricas del conjunto de prueba:',
     (test_accuracy, test_precision, test_recall, test_f1)),
):
    print(heading)
    # Labels are paired positionally with the four scores of the current split
    for label, value in zip(("Accuracy:", "Precision:", "Recall:", "F1 score:"), scores):
        print(label, value)


Metricas del conjunto de entrenamiento:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 score: 1.0

Metricas del conjunto de prueba:
Accuracy: 0.786
Precision: 0.7763975155279503
Recall: 0.7796257796257796
F1 score: 0.7780082987551866
