# Preprocesamiento del texto

En esta etapa vamos a preprocesar las reviews para darles un formato más adecuado para entrenar el modelo de sentimiento.

## Librerías y conexiones


In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import tensorflow as tf
import pickle

In [None]:
# Mount Google Drive at /content/drive so the files referenced by the later
# cells (paths under /content/drive/MyDrive) can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Incluimos las funciones que vamos a utilizar:

- **`preprocess_data`**:
  - Convierte las reseñas en listas de palabras; intenta leer el resultado desde la caché si está disponible.
  - Limpia y normaliza los conjuntos de datos de entrenamiento, validación y prueba.
  - Guarda datos procesados en caché para prevenir re-procesamiento.

- **`extract_BoW_features`**:
  - Extrae características tipo "Bag of Words" (BoW) para documentos preprocesados.
  - Emplea `CountVectorizer` para transformar texto a representación numérica.
  - Almacena características BoW en caché para futura utilización.

- **`calculate_results`**:
  - Evalúa exactitud (accuracy), precisión, recall y F1 para un modelo de clasificación.
  - Basado en comparación de etiquetas verdaderas y predicciones.


In [None]:
def preprocess_data(data_train, data_val , data_test, labels_train, labels_val, labels_test,
                    cache_dir=os.path.join("cache", "sentiment_analysis"), cache_file="preprocessed_data.pkl"):
    """Convert each review to a list of words, reading from / writing to a pickle cache.

    Args:
        data_train, data_val, data_test: iterables of raw review strings.
        labels_train, labels_val, labels_test: labels matching each split.
        cache_dir: directory holding the cache file; created on write if missing.
        cache_file: cache file name, or None to disable caching entirely.

    Returns:
        Tuple ``(words_train, words_val, words_test, labels_train, labels_val,
        labels_test)``. When a readable cache exists, ALL six values come from
        the cache and the arguments are ignored, so delete the cache file (or
        pass ``cache_file=None``) whenever the input data changes.

    Raises:
        KeyError: if a cache file loads but lacks one of the expected keys.
    """
    cache_path = None if cache_file is None else os.path.join(cache_dir, cache_file)

    cache_data = None
    if cache_path is not None:
        try:
            with open(cache_path, "rb") as f:
                # NOTE: pickle can execute arbitrary code while loading; only
                # point this at cache files produced by this notebook.
                cache_data = pickle.load(f)
        except FileNotFoundError:
            # No cache yet: fall through and compute from scratch.
            pass
        except Exception as exc:
            # Unreadable/corrupt cache: stay best-effort but say why, and let
            # KeyboardInterrupt/SystemExit propagate (a bare except would not).
            print("Could not read cache file, recomputing:", cache_file, "-", exc)
            cache_data = None
        else:
            print("Read preprocessed data from cache file:", cache_file)

    if cache_data is not None:
        return (cache_data['words_train'], cache_data['words_val'], cache_data['words_test'],
                cache_data['labels_train'], cache_data['labels_val'], cache_data['labels_test'])

    # Cache miss (or caching disabled): tokenize every review of every split.
    words_train = [review_to_words(review) for review in data_train]
    words_val = [review_to_words(review) for review in data_val]
    words_test = [review_to_words(review) for review in data_test]

    if cache_path is not None:
        cache_data = dict(words_train=words_train, words_val=words_val, words_test=words_test,
                          labels_train=labels_train, labels_val=labels_val, labels_test=labels_test)
        if cache_dir:
            # Writing must not fail just because the folder was never created.
            os.makedirs(cache_dir, exist_ok=True)
        with open(cache_path, "wb") as f:
            pickle.dump(cache_data, f)
        print("Wrote preprocessed data to cache file:", cache_file)

    return words_train, words_val, words_test, labels_train, labels_val, labels_test

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import joblib


In [None]:
def extract_BoW_features(words_train, words_val , words_test, vocabulary_size=500,
                         cache_dir=os.path.join("cache", "sentiment_analysis"), cache_file="bow_features.pkl"):
    """Extract Bag-of-Words count features for documents already split into words.

    Args:
        words_train, words_val, words_test: iterables of token lists, one per document.
        vocabulary_size: upper bound on the number of terms kept (``max_features``).
        cache_dir: directory holding the cache file; created on write if missing.
        cache_file: cache file name, or None to disable caching entirely.

    Returns:
        Tuple ``(features_train, features_val, features_test, vocabulary)``:
        three dense count arrays plus the fitted term -> column-index mapping.
        When a readable cache exists, all four values come from the cache and
        the arguments (including ``vocabulary_size``) are ignored, so delete the
        cache file whenever the inputs change.

    Raises:
        KeyError: if a cache file loads but lacks one of the expected keys.
    """
    cache_path = None if cache_file is None else os.path.join(cache_dir, cache_file)

    cache_data = None
    if cache_path is not None:
        try:
            with open(cache_path, "rb") as f:
                # NOTE: joblib files are pickle-based; only load caches
                # produced by this notebook.
                cache_data = joblib.load(f)
        except FileNotFoundError:
            # No cache yet: fall through and compute from scratch.
            pass
        except Exception as exc:
            # Unreadable/corrupt cache: stay best-effort but say why, and let
            # KeyboardInterrupt/SystemExit propagate (a bare except would not).
            print("Could not read cache file, recomputing:", cache_file, "-", exc)
            cache_data = None
        else:
            print("Read features from cache file:", cache_file)

    if cache_data is not None:
        return (cache_data['features_train'], cache_data['features_val'],
                cache_data['features_test'], cache_data['vocabulary'])

    # Documents are already token lists, so preprocessing and tokenizing are
    # identity functions. min_df=0.0175 drops terms that appear in fewer than
    # 1.75% of the training documents.
    vectorizer = CountVectorizer(max_features=vocabulary_size, min_df=0.0175,
                                 preprocessor=lambda x: x, tokenizer=lambda x: x)
    # Fit the vocabulary on the training split only, then reuse it for the others.
    features_train = vectorizer.fit_transform(words_train).toarray()
    features_val = vectorizer.transform(words_val).toarray()
    features_test = vectorizer.transform(words_test).toarray()
    vocabulary = vectorizer.vocabulary_

    if cache_path is not None:
        cache_data = dict(features_train=features_train, features_val=features_val,
                          features_test=features_test, vocabulary=vocabulary)
        if cache_dir:
            # Writing must not fail just because the folder was never created.
            os.makedirs(cache_dir, exist_ok=True)
        with open(cache_path, "wb") as f:
            joblib.dump(cache_data, f)
        print("Wrote features to cache file:", cache_file)

    return features_train, features_val, features_test, vocabulary

# Function to evaluate: accuracy, precision, recall, f1-score
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Compute accuracy, precision, recall and F1 for a set of predictions.

  Precision, recall and F1 are aggregated with the "weighted" average.

  Args:
      y_true: true labels in the form of a 1D array
      y_pred: predicted labels in the form of a 1D array

  Returns a dictionary with the keys "accuracy", "precision", "recall" and "f1".
  """
  scores = {"accuracy": accuracy_score(y_true, y_pred)}
  # The fourth value returned is the support, which is not reported.
  weighted = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  scores["precision"] = weighted[0]
  scores["recall"] = weighted[1]
  scores["f1"] = weighted[2]
  return scores

In [None]:
# Base folder on the mounted Drive.
# NOTE(review): not referenced by the cells visible here; the CSV path in the
# next cell repeats this folder literally.
RUTA = "/content/drive/MyDrive/NLP"

In [None]:
# Load the raw reviews; the file is read as semicolon-separated with '.' as
# the decimal mark.
path_csv = "/content/drive/MyDrive/NLP/reviews_large.csv"
df = pd.read_csv(path_csv, sep=';', decimal='.')

In [None]:
# Shuffle every row (frac=1) with a fixed seed so the order is reproducible.
df = df.sample(frac=1,random_state=42)

In [None]:
df.head()

Unnamed: 0,overall,reviewText
33553,4,"Folks, anyone been waiting to witness PACINO s..."
9427,1,Shen Mue has received rave reviews from almost...
199,1,I had really high hopes for this mouse. It fi...
12447,2,"First, it's not really a 'game'- it's just mus..."
39489,4,Great controller for the Wii U if your used to...


In [None]:
# Count the words in a text
def contar_palabras(texto):
  """Return the number of whitespace-separated words in ``texto``; 0 for missing values."""
  return 0 if pd.isna(texto) else len(texto.split())

# Add a 'word_count' column holding the number of words in each review text.
df['word_count'] =  df['reviewText'].apply(contar_palabras)

In [None]:
# Keep only reviews with between 25 and 100 words (both bounds inclusive).
df = df[ (df['word_count'] >=25) & (df['word_count']<= 100)]

In [None]:
# Keep only the 'overall' and 'reviewText' columns; this drops the helper
# 'word_count' column.
df =df[['overall','reviewText']]

In [None]:
# Save the filtered reviews to the current working directory, without the
# index, using ';' as separator and '.' as decimal mark.
df.to_csv('reviews_large_processed.csv', sep=';', decimal='.', index=False)

In [None]:
df.head()

Unnamed: 0,overall,reviewText
10822,2,My son loves wii and wii U.He does not like th...
4144,1,Waste of money on a dead game. No one plays th...
36958,4,Plugging the red and white audio jacks intoyou...
34304,4,I purchased this for my husband since he neede...
12609,2,"It's difficult to control the cars, and the ma..."


## Datos de entrenamiento

In [None]:
# First split: 10% of the rows go to training and another 10% are held out,
# stratified on 'overall'. The remaining 80% of the rows is not used.
# NOTE(review): presumably a deliberate subsample to keep preprocessing fast — confirm.
X_train, X_test, y_train, y_test = train_test_split(df['reviewText'].to_numpy(),
                                                    df['overall'].to_numpy(),
                                                    train_size=0.1,
                                                    test_size=0.1,
                                                    shuffle=True,
                                                    stratify=df['overall'],
                                                    random_state=42)

# Second split: divide the held-out rows in half into validation and test,
# again stratified on the labels.
X_valid, X_test, y_valid, y_test = train_test_split(X_test,
                                                    y_test,
                                                    test_size=0.5,
                                                    shuffle=True,
                                                    stratify=y_test,
                                                    random_state=42)

# Report how many reviews ended up in each split.
print(f'Dimensiones del dataset de training:   {X_train.shape}')
print(f'Dimensiones del dataset de validation: {X_valid.shape}')
print(f'Dimensiones del dataset de test:       {X_test.shape}')


Dimensiones del dataset de training:   (1827,)
Dimensiones del dataset de validation: (914,)
Dimensiones del dataset de test:       (914,)


## Preprocesado BoW

In [None]:
def review_to_words(review):
    """Convert a raw review string into a list of stemmed word tokens.

    Steps: strip HTML markup with BeautifulSoup, lowercase, replace every
    character outside [a-zA-Z0-9] with a space, split on whitespace, drop
    English stopwords, drop tokens that are not purely alphabetic (which
    removes any token containing a digit), and Porter-stem what is left.

    Args:
        review: raw review text, possibly containing HTML.

    Returns:
        List of stemmed tokens, in their original order.
    """
    # Remove the HTML tags.
    text = BeautifulSoup(review, "html5lib").get_text()
    # Lowercase, then blank out everything that is not a letter or a digit.
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    # Load the stopword list and build the stemmer once per call rather than
    # once per token; a set makes each membership test O(1).
    stop_words = set(stopwords.words("english"))
    porter = PorterStemmer()
    # Split on whitespace, filter, and stem in a single pass.
    return [porter.stem(w) for w in text.split() if w not in stop_words and w.isalpha()]

In [None]:
# BeautifulSoup to easily remove HTML tags
from bs4 import BeautifulSoup

# RegEx for removing non-letter characters
import re

# NLTK library for the remaining steps
import nltk
nltk.download("stopwords")   # download list of stopwords (only once; need not run it again)
from nltk.corpus import stopwords # import stopwords

from nltk.stem.porter import *
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def review_to_words(review):
    """Convert a raw review string into a list of stemmed word tokens.

    Steps: strip HTML markup with BeautifulSoup, lowercase, replace every
    character outside [a-zA-Z0-9] with a space, split on whitespace, drop
    English stopwords, drop tokens that are not purely alphabetic (which
    removes any token containing a digit), and Porter-stem what is left.

    Args:
        review: raw review text, possibly containing HTML.

    Returns:
        List of stemmed tokens, in their original order.
    """
    # Remove the HTML tags.
    text = BeautifulSoup(review, "html5lib").get_text()
    # Lowercase, then blank out everything that is not a letter or a digit.
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    # Load the stopword list and build the stemmer once per call rather than
    # once per token; a set makes each membership test O(1).
    stop_words = set(stopwords.words("english"))
    porter = PorterStemmer()
    # Split on whitespace, filter, and stem in a single pass.
    return [porter.stem(w) for w in text.split() if w not in stop_words and w.isalpha()]

In [None]:
# Create the cache directory up front so the cache file can be written into it.
cache_dir = os.path.join("cache", "sentiment_analysis")
os.makedirs(cache_dir, exist_ok=True)

# Tokenize the three splits, or reload them from the pickle cache when one exists.
words_train , words_valid, words_test, labels_train ,labels_valid , labels_test = preprocess_data(X_train, X_valid , X_test , y_train, y_valid, y_test)

  text = BeautifulSoup(review, "html5lib").get_text()


Wrote preprocessed data to cache file: preprocessed_data.pkl


In [None]:
# Build the Bag-of-Words count features for the three splits with the default
# vocabulary_size, and keep the fitted vocabulary (term -> column index).
features_train, features_valid , features_test, vocabulary = extract_BoW_features(words_train,words_valid, words_test)

Wrote features to cache file: bow_features.pkl




## Vocabularios

In [None]:
vocabulary

{'love': 141,
 'seri': 219,
 'gameplay': 98,
 'son': 228,
 'actual': 2,
 'play': 182,
 'way': 263,
 'much': 156,
 'around': 15,
 'game': 97,
 'point': 184,
 'minut': 149,
 'time': 248,
 'good': 103,
 'movi': 155,
 'week': 264,
 'finish': 88,
 'everyth': 73,
 'music': 158,
 'new': 162,
 'version': 257,
 'get': 100,
 'bit': 26,
 'awesom': 17,
 'control': 46,
 'noth': 166,
 'bad': 19,
 'say': 213,
 'realli': 200,
 'back': 18,
 'also': 7,
 'fun': 96,
 'one': 169,
 'like': 133,
 'part': 175,
 'charact': 37,
 'mean': 147,
 'come': 41,
 'worth': 273,
 'suck': 237,
 'real': 199,
 'even': 70,
 'wii': 267,
 'quickli': 195,
 'someth': 226,
 'mani': 144,
 'featur': 82,
 'littl': 135,
 'set': 220,
 'purchas': 191,
 'sure': 239,
 'includ': 116,
 'look': 139,
 'great': 106,
 'make': 143,
 'star': 231,
 'said': 211,
 'could': 48,
 'better': 24,
 'keep': 120,
 'campaign': 33,
 'thing': 243,
 'done': 58,
 'call': 31,
 'whole': 266,
 'compar': 42,
 'sinc': 223,
 'never': 161,
 'though': 245,
 'multiplay'