# Ejercicio 2 - Etapa de preprocesado de texto

### Importaciones

In [1]:
import pandas as pd
import numpy as np
import re
import os
import joblib

from bs4 import BeautifulSoup 
import nltk
nltk.download("stopwords")  
from nltk.corpus import stopwords

from nltk.stem.porter import *
stemmer = PorterStemmer()

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier

from keras.models import Sequential
from keras.preprocessing import sequence
import sklearn.preprocessing as pr
from keras.layers import Embedding, LSTM, Dense, Dropout, GRUV2, SimpleRNN
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, precision_recall_curve

[nltk_data] Downloading package stopwords to /home/jose/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def preprocesado(path, size, max_features=5000, max_len=500):
    """Load reviews, balance the labels and encode them as padded index sequences.

    Parameters
    ----------
    path : str
        Path to a gzip-compressed JSON-lines file that contains at least the
        ``reviewText`` and ``overall`` columns.
    size : int
        Number of leading rows of the file to keep.
    max_features : int, optional
        Maximum vocabulary size handed to ``CountVectorizer`` (default 5000).
    max_len : int, optional
        Length every sequence is padded/truncated to (default 500).

    Returns
    -------
    tuple
        ``(features_train, features_test, labels_train, labels_test, vocabulary)``
        where the features are the arrays produced by ``pad_sequences``, the
        labels are NumPy arrays of 0/1 values and ``vocabulary`` maps each
        stemmed token to its integer index.

    Raises
    ------
    ValueError
        If the selected rows do not contain both positive and negative reviews.
    """
    df = pd.read_json(path, lines=True,
                      compression='gzip')[:size][['reviewText', 'overall']].copy()

    # Binary sentiment label: ratings above 2 are positive (1), the rest negative (0).
    df['overall'] = (df['overall'].astype(int) > 2).astype(int)

    # Label balancing: keep every negative review and at most twice as many
    # positive ones. Counts are taken per label (not by frequency order) and the
    # sample size is capped so that sampling can never ask for more rows than exist.
    positives = df[df.overall == 1]
    negatives = df[df.overall == 0]
    if positives.empty or negatives.empty:
        raise ValueError("Both positive and negative reviews are required to balance the labels")
    n_positives = min(len(positives), 2 * len(negatives))
    df = pd.concat([positives.sample(n_positives, random_state=42), negatives],
                   axis=0)

    X_train, X_test, y_train, y_test = train_test_split(
        df.reviewText,
        df.overall,
        test_size=0.3,
        random_state=42,
        shuffle=True
    )

    # Built once: re-reading the stopword list and re-creating the stemmer for
    # every single word dominates the running time otherwise.
    english_stopwords = set(stopwords.words("english"))
    porter = PorterStemmer()

    def review_to_words(review):
        """Convert a raw review string into a list of stemmed, stopword-free tokens."""
        text = BeautifulSoup(review, "html5lib").get_text()
        # Work on the HTML-stripped text (not on the raw review) so markup is dropped.
        text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
        return [porter.stem(w) for w in text.split() if w not in english_stopwords]

    words_train = [review_to_words(review) for review in X_train.values]
    words_test = [review_to_words(review) for review in X_test.values]

    # The reviews are already tokenised, so preprocessing/tokenising are identities.
    vectorizer = CountVectorizer(max_features=max_features,
                                 preprocessor=lambda x: x, tokenizer=lambda x: x)
    # Only the fitted vocabulary is needed; no dense bag-of-words matrix is built.
    vectorizer.fit(words_train)
    vocabulary = vectorizer.vocabulary_

    def encode(sentences):
        """Map each token to its vocabulary index, skipping out-of-vocabulary tokens."""
        return [[vocabulary[word] for word in sentence if word in vocabulary]
                for sentence in sentences]

    # NOTE(review): vocabulary indices start at 0, which is also the default padding
    # value of pad_sequences, so one real token is indistinguishable from padding.
    # Shifting indices by one would require vocabulary_size + 1 in the Embedding layer.
    features_train = sequence.pad_sequences(encode(words_train), maxlen=max_len)
    features_test = sequence.pad_sequences(encode(words_test), maxlen=max_len)

    return features_train,\
           features_test,\
           np.array(y_train.values),\
           np.array(y_test.values),\
           vocabulary

In [3]:
# One-off dataset download; uncomment the next line if the archive is not present locally.
#!wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Amazon_Instant_Video_5.json.gz
# Run the whole preprocessing pipeline on the first 37126 rows of the archive.
(
    features_train,
    features_test,
    labels_train,
    labels_test,
    vocabulary,
) = preprocesado('reviews_Amazon_Instant_Video_5.json.gz', 37126)

--2022-02-14 05:10:08--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Amazon_Instant_Video_5.json.gz
Resolviendo snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Conectando con snap.stanford.edu (snap.stanford.edu)[171.64.75.80]:80... conectado.
Petición HTTP enviada, esperando respuesta... 200 OK
Longitud: 9517526 (9,1M) [application/x-gzip]
Guardando como: “reviews_Amazon_Instant_Video_5.json.gz”


2022-02-14 05:10:17 (1,15 MB/s) - “reviews_Amazon_Instant_Video_5.json.gz” guardado [9517526/9517526]



# Ejercicio 3 -  Etapa de entrenamiento y testeo de un modelo de análisis de sentimiento

## Deep Learning

In [4]:
def crear_model(mod, emb_size, vocabulary_size, max_words, units=100):
    """Build an Embedding -> recurrent layer -> sigmoid Dense model.

    Parameters
    ----------
    mod : callable
        Recurrent layer class (e.g. ``LSTM``); it is called as ``mod(units)``.
    emb_size : int
        Output dimension of the embedding layer.
    vocabulary_size : int
        Input dimension of the embedding layer.
    max_words : int
        Value passed as ``input_length`` to the embedding layer.
    units : int, optional
        Argument passed to the recurrent layer constructor (default 100).

    Returns
    -------
    The uncompiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Embedding(vocabulary_size, emb_size, input_length=max_words))
    model.add(mod(units))
    model.add(Dense(1, activation='sigmoid'))
    # summary() prints by itself and returns None; wrapping it in print()
    # only appended a stray "None" line to the cell output.
    model.summary()
    return model

def entreno(batch_size, num_epochs, X_train, labels_train, model, validation_size=None):
    """Fit ``model`` using a leading slice of the training data for validation.

    Parameters
    ----------
    batch_size : int
        Batch size passed to ``model.fit``.
    num_epochs : int
        Number of epochs passed to ``model.fit``.
    X_train, labels_train : sliceable sequences
        Training features and labels; both are split at the same position.
    model : object
        Anything exposing a Keras-style ``fit`` method.
    validation_size : int, optional
        Number of leading samples held out for validation. Defaults to
        ``batch_size``, which is the historical behaviour of this function.

    Returns
    -------
    The same ``model`` object, after ``fit`` has been called on it.

    Raises
    ------
    ValueError
        If the split would leave the validation or the training set empty.
    """
    if validation_size is None:
        validation_size = batch_size
    if not 0 < validation_size < len(X_train):
        raise ValueError(
            "validation_size must be between 1 and len(X_train) - 1, got %r" % (validation_size,)
        )

    # First `validation_size` samples validate; everything after them trains.
    X_valid, y_valid = X_train[:validation_size], labels_train[:validation_size]
    X_fit, y_fit = X_train[validation_size:], labels_train[validation_size:]

    model.fit(X_fit, y_fit,
              validation_data=(X_valid, y_valid),
              batch_size=batch_size, epochs=num_epochs)
    return model

def evaluacion(model, X_test, labels_test):
    """Print the second value returned by ``model.evaluate`` on the test data.

    The model is evaluated silently (``verbose=0``); element 1 of the result
    is reported under the label "Test accuracy:". Nothing is returned.
    """
    metrics = model.evaluate(X_test, labels_test, verbose=0)
    accuracy = metrics[1]
    print("Test accuracy:", accuracy)

In [6]:
# Build, compile, train (one epoch) and evaluate one model per recurrent layer type.
for mod in (LSTM, GRUV2, SimpleRNN):
    print("\nNuevo modelo\n")
    model = crear_model(
        mod,
        emb_size=32,
        vocabulary_size=len(vocabulary),
        max_words=500,
    )
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )

    print("\nComienza entrenamiento\n")
    entreno(
        batch_size=32,
        num_epochs=1,
        X_train=features_train,
        labels_train=labels_train,
        model=model,
    )

    print("\nComienza evaluación\n")
    evaluacion(model, features_test, labels_test)


Nuevo modelo

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
Total params: 213,301
Trainable params: 213,301
Non-trainable params: 0
_________________________________________________________________
None

Comienza entrenamiento


Comienza evaluación

Test accuracy: 0.7941720485687256

Nuevo modelo

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 500, 32)           160000    
__________________________________________