In [1]:
import os
import csv
import pandas as pd
import numpy as np
import tensorflow as tf

# Librerías de la limpieza
import re
from stop_words import get_stop_words
import unicodedata
from num2words import num2words
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
stemmer = PorterStemmer()

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, precision_recall_curve

import matplotlib.pyplot as plt


In [2]:
dataset_path = '../dataset'

pet_json = 'reviews_Pet_Supplies_5.json.gz'

pet = pd.read_json(os.path.join(dataset_path, pet_json), lines=True, compression='gzip')


In [3]:
# Preparar los datos (codificar puntuaciones y seleccionar 10000 registros de cada grupo y ordenarlos al azar)
def prepare_data(df):
    df = df.dropna(subset=['reviewText', 'overall'])[['reviewText', 'overall']]
    df["overall"].replace({1: 0, 2: 0, 3: 0, 4: 1, 5: 1}, inplace=True)
    
    positive = df[df["overall"] > 0]
    negative = df[df["overall"] < 1]
    
    positive = positive[0:10000]
    negative = negative[0:10000]
    
    df = pd.concat([positive, negative], axis=0)
    
    dataframe = df.sample(frac=1).reset_index(drop=True)
    
    return dataframe


def lower(text):
    text = text.lower()
    
    return text

def clean_characters(text):
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    text = re.sub(r'[0-9]+', ' ', text)
    
    return text 

def eliminate_stopwords(text):
    sw_list = get_stop_words('en')
    text = ' '.join([word for word in text.split() if word not in sw_list])
    
    return text

def lemmatization(text):
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    
    return text

def clean_text(text, split=False):
    text = lower(text)
    text = clean_characters(text)
    text = eliminate_stopwords(text)
    text = lemmatization(text)
    if split:
        text = text.split()
    else:
        pass
    return text



# Juntamos las funciones que trabajan sobre cada una de las columnas
def cleaning(df, split=False):
    reviews = []
    
    for text in df.reviewText:
        reviews.append(clean_text(text, split))
    
    sentiment = []
    for item in df.overall:
        if item > 0:
            sentiment.append(1)
        else:
            sentiment.append(0)
    
    
    df = pd.DataFrame({
    'review': reviews,
    'sentiment': sentiment
    })

    df.dropna(subset=['review', 'sentiment'], inplace=True)
    df.reset_index(drop=True, inplace=True)
    
    return df



# Juntamos todas las funciones
def transform_df_to_text(df, split):
    a = prepare_data(df)
    df = cleaning(a, split)
    
    return df

In [4]:
df_pet = transform_df_to_text(pet, True)


In [5]:
df_pet[0:3]

Unnamed: 0,review,sentiment
0,"[made, mistake, ordering, little, jacs, bit, l...",0
1,"[absolutely, love, product, first, bought, dog...",1
2,"[thing, always, falling, bottom, tank, reach, ...",0


In [6]:
df_pet2 = transform_df_to_text(pet, False)


In [7]:
df_pet2[0:3]

Unnamed: 0,review,sentiment
0,job suppose recommend extra support use fluval...,1
1,bought two trying two brand unit three advanta...,1
2,first bought diffuser every room house recomme...,0


# **1. Método Clásico: Regresión logística**

In [8]:
# Separación train/test
X_train, X_test, y_train, y_test = train_test_split(
    df_pet2['review'],
    df_pet2['sentiment'],
    train_size=0.75,
    test_size=0.25,
    random_state=42,
    shuffle=True
)

In [9]:
# Extracción de features
cv = TfidfVectorizer(
    max_df=0.95,
    min_df=0.05,
    max_features=23370,
    ngram_range=(1, 1)
)
cv.fit(X_train)

TfidfVectorizer(max_df=0.95, max_features=23370, min_df=0.05)

Para el vectorizer se han escogido estos parámetros debido a:

1- *max_df* y *min_df*: eliminamos las palabras con una frecuencia excesivamente alta y baja.

2- *max_features*: Considera las 5000 palabras más frecuentes únicamente.

In [10]:
# TF IDF
X_train_ = cv.transform(X_train)
X_test_ = cv.transform(X_test)

In [11]:
lr = LogisticRegression(C=0.05, solver='lbfgs', max_iter=500)
lr.fit(X_train_, y_train)
    
train_predict = lr.predict(X_train_)
test_predict = lr.predict(X_test_)
    
print ("Accuracy for C=0.05: {}".format(accuracy_score(y_test, test_predict)))


Accuracy for C=0.05: 0.7204


# **2. Gradient Boosting**

**2.1. Bag of Words**

In [12]:
# Separación train/test
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    df_pet['review'],
    df_pet['sentiment'],
    train_size=0.75,
    test_size=0.25,
    random_state=42,
    shuffle=True
)

In [13]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import joblib

# Bag of Words.
def extract_BoW_features(words_train, words_test, vocabulary_size):
    vectorizer = CountVectorizer(max_features=vocabulary_size,
            preprocessor=lambda x: x, tokenizer=lambda x: x)  # already preprocessed
    features_train = vectorizer.fit_transform(words_train).toarray()

    features_test = vectorizer.transform(words_test).toarray()
                
    vocabulary = vectorizer.vocabulary_
    
    return features_train, features_test, vocabulary

features_train, features_test, vocabulary = extract_BoW_features(X_train2, X_test2, 23370)

In [14]:
import sklearn.preprocessing as pr

features_train = pr.normalize(features_train, axis=1)
features_test = pr.normalize(features_test, axis=1)

**2.2 Gradient Boosting**

In [15]:
from sklearn.ensemble import GradientBoostingClassifier

n_estimators = 32

def classify_gboost(X_train, X_test, y_train, y_test):        
    clf = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=1.0, max_depth=1, random_state=42)
    
    clf.fit(X_train, y_train)
    
    print("[{}] Accuracy: train = {}, test = {}".format(
            clf.__class__.__name__,
            clf.score(X_train, y_train),
            clf.score(X_test, y_test)))
    
    return clf, clf.score(X_train, y_train), clf.score(X_test, y_test)


clf2, score_train, score_test = classify_gboost(features_train, features_test, y_train2, y_test2)

[GradientBoostingClassifier] Accuracy: train = 0.7404, test = 0.723


Los Accuracy son prácticamente iguales, por lo que no parece que haya overfitting o underfitting. Debido a la gran cantidad de tiempo que lleva este proceso, lo mejor es hacer este procedimiento pero usando deep learning para evitar el pre-procesamiento.


# **3. Red Neuronal**

**GRU**

Al haber reducido los datos que estoy utilizando he optado por utilizar GRU sobre LSTM ya que suele funcionar mejor con conjuntos de datos más pequeños.

In [16]:
# Calculamos el tamaño del vocabulario y la longitud de la frase más larga.

vocab = dict()
maxlen = 0

for l in df_pet.review:
    for item in l:
        if item in vocab:
            vocab[item] += 1
        else:
            vocab.update({item : 1})

for l in df_pet.review:
    if len(l) > maxlen:
        maxlen = len(l)
    else:
        pass

In [17]:
print("---[Tamaño del vocabulario]---")
print(len(vocab))
print("---[Mayor longitud de frase]---")
print(maxlen)

---[Tamaño del vocabulario]---
23370
---[Mayor longitud de frase]---
770


In [18]:
# Hay que transformar cada palabra en un número, para ello tomaré las keys del diccionario, las asignaré un número único y cambiaré todas las palabras del df
n = 0
vocab_uniq = dict()
reviews_plane = []

for key in vocab:
    vocab_uniq.update({key : n})
    n += 1

for l in df_pet.review:
    l2 = []
    for item in l:
        l2.append(vocab_uniq[item])
    l2 = np.array(l2)
    l2 = tf.convert_to_tensor(l2)
    reviews_plane.append(l2)
        

In [19]:
data = pd.DataFrame({"review" : reviews_plane, "sentiment" : df_pet.sentiment})

# Separación train/test
X_trainDL, X_testDL, y_trainDL, y_testDL = train_test_split(
    data['review'],
    data['sentiment'],
    train_size=0.75,
    test_size=0.25,
    random_state=42,
    shuffle=True
)


from keras.preprocessing import sequence

max_words = maxlen

X_trainDL = sequence.pad_sequences(X_trainDL, maxlen=max_words)
X_testDL = sequence.pad_sequences(X_testDL, maxlen=max_words)


In [20]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, GRUV2, SimpleRNN

vocabulary_size = len(vocab)
max_words = maxlen

embedding_size = 64
model_gru = Sequential()
model_gru.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))
model_gru.add(GRUV2(100))
model_gru.add(Dense(1, activation='sigmoid'))

print(model_gru.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 770, 64)           1495680   
                                                                 
 gru (GRU)                   (None, 100)               49800     
                                                                 
 dense (Dense)               (None, 1)                 101       
                                                                 
Total params: 1,545,581
Trainable params: 1,545,581
Non-trainable params: 0
_________________________________________________________________
None


In [21]:
model_gru.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [22]:
from keras.utils.vis_utils import plot_model
plot_model(model_gru, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


In [23]:
batch_size = 64
num_epochs = 2

X_validDL, y_validDL = X_trainDL[:batch_size], y_trainDL[:batch_size]  # first batch_size samples
X_trainDL2, y_trainDL2 = X_trainDL[batch_size:], y_trainDL[batch_size:]  # rest for training

modelDL = model_gru.fit(X_trainDL2, y_trainDL2,
          validation_data=(X_validDL, y_validDL),
          batch_size=batch_size, epochs=num_epochs)

Epoch 1/2
Epoch 2/2


In [24]:
evaluation = model_gru.evaluate(X_testDL, y_testDL, verbose=0)

Vemos que la accuracy es bastante alta, aunque el error también lo es considerando que se trata de un problema binario.

# **4. Métricas**

**Comparación final y conclusiones**

In [25]:
print("\r\n--- [Regresión logística] ---")
print ("La precisión de este modelo ha sido de: [{}]".format(accuracy_score(y_test, test_predict)))
print("\r\n--- [Gradient Boosting] ---")
print ("La precisión de este modelo ha sido de: [{}] ".format(score_test))
print("\r\n--- [Deep Learning: GRU] ---")
print("La precisión de este modelo ha sido de: [{:.2f}],\r\nLa función de pérdida de este ha sido:  [{:.2f}]\r\n".format(evaluation[1], evaluation[0]))


--- [Regresión logística] ---
La precisión de este modelo ha sido de: [0.7204]

--- [Gradient Boosting] ---
La precisión de este modelo ha sido de: [0.723] 

--- [Deep Learning: GRU] ---
La precisión de este modelo ha sido de: [0.81],
La función de pérdida de este ha sido:  [0.43]



La precisión de los primeros modelos ha sido de 0.7~. En el caso del último modelo se ha obtenido una precisión de 0.81, mucho mejor que los anteriores, no obstante la función de pérdida indica que no es un modelo muy optimizado.

Pese a esto, ninguno de los modelos muestra señales de overfitting o underfitting a simple vista.

La precisión ha podido ser menor en los modelos más clásicos debido a la menor complejidad de estos. Tratándose de un problema complejo como es determinar la puntuación que los clientes han puesto a los productos a partir de la reseña, parece lógico pensar que modelos más complejos podrían obtener mejores resultados.
