In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read by path
# (relies on the google.colab helper imported above).
drive.mount('/content/drive')


Mounted at /content/drive


**CARGAMOS DATOS**

In [None]:
import pandas as pd
import gzip
import json

# Path to the gzipped review dump inside the mounted Google Drive
file_path = '/content/drive/My Drive/Appliances_5.json.gz'

# The file is read as JSON lines: one JSON object per line.
# Blank lines are skipped so a trailing newline cannot crash json.loads.
with gzip.open(file_path, 'rt', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

# Build a DataFrame with one row per decoded record
df = pd.DataFrame(data)

# Preview the first rows
print(df.head())

# DataFrame.info() prints its own summary and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" to the output.
df.info()


   overall  verified   reviewTime      reviewerID        asin  \
0      5.0      True  08 22, 2013  A34A1UP40713F8  B00009W3I4   
1      5.0      True   02 8, 2016  A1AHW6I678O6F2  B00009W3PA   
2      5.0      True   08 5, 2015   A8R48NKTGCJDQ  B00009W3PA   
3      5.0      True  04 24, 2015   AR3OHHHW01A8E  B00009W3PA   
4      5.0      True  03 21, 2015  A2CIEGHZ7L1WWR  B00009W3PA   

                       style     reviewerName  \
0  {'Style:': ' Dryer Vent'}    James. Backus   
1       {'Size:': ' 6-Foot'}           kevin.   
2       {'Size:': ' 6-Foot'}        CDBrannom   
3       {'Size:': ' 6-Foot'}  Calvin E Reames   
4       {'Size:': ' 6-Foot'}   albert j. kong   

                                          reviewText        summary  \
0  I like this as a vent as well as something tha...  Great product   
1                                          good item     Five Stars   
2                     Fit my new LG dryer perfectly.     Five Stars   
3                     Good val

**LIMPIEZA Y PREPARACIÓN DE DATOS**

In [None]:
import re
import string
from sklearn.model_selection import train_test_split

# Drop rows that are missing the review text or the rating, then remove
# repeated reviews (same reviewer, product and text). Repeated rows would
# otherwise be scattered across both sides of the train/test split and inflate
# the test scores.
# NOTE(review): confirm how many rows this removes with
# df.duplicated(subset=['reviewerID', 'asin', 'reviewText']).sum().
# The explicit .copy() makes df_clean an independent frame, so the column
# assignments below do not write into a slice of `df` (SettingWithCopyWarning).
df_clean = (
    df.dropna(subset=['reviewText', 'overall'])
      .drop_duplicates(subset=['reviewerID', 'asin', 'reviewText'])
      .copy()
)

# Coerce the rating to a number; values that cannot be parsed become NaN
df_clean['overall'] = pd.to_numeric(df_clean['overall'], errors='coerce')

# Keep non-negative ratings only. NaN fails the comparison, so rows that could
# not be parsed above are dropped here too. Copy again for the same reason.
df_clean = df_clean[df_clean['overall'] >= 0].copy()

# Binary target: 1 for ratings of 4 or more, 0 otherwise
df_clean['label'] = (df_clean['overall'] >= 4).astype(int)

# Text normalisation helper
def clean_text(text):
    """Return `text` lower-cased, with digits and ASCII punctuation removed.

    Leading and trailing whitespace is stripped; whitespace inside the text
    is left as it is.
    """
    lowered = text.lower()
    without_digits = re.compile(r'\d+').sub('', lowered)
    # Mapping every punctuation code point to None deletes it in translate()
    punctuation_table = dict.fromkeys(map(ord, string.punctuation))
    return without_digits.translate(punctuation_table).strip()

# Normalise every review and keep the result in its own column
df_clean['cleaned_reviewText'] = df_clean['reviewText'].map(clean_text)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df_clean['cleaned_reviewText'],
    df_clean['label'],
    test_size=0.2,
    random_state=42,
)

# Peek at the first training texts and their labels
for preview in (X_train, y_train):
    print(preview.head())


2008    this review is for gardus rle linteater piece ...
2142    first thing first it works the kit is great in...
2015    i bought this last october and finally got aro...
8       luved it for the few months it worked  great l...
1530    at first this contraption was a little confusi...
Name: cleaned_reviewText, dtype: object
2008    1
2142    0
2015    1
8       0
1530    0
Name: label, dtype: int64


**Clasificador en base a heurísticas (Regexp y lexicones)**

In [None]:
# Minimal sentiment lexicon
positive_words = {'good', 'great', 'excellent', 'amazing', 'fantastic', 'love'}
negative_words = {'bad', 'terrible', 'awful', 'worst', 'poor', 'hate'}

def heuristic_classifier(text):
    """Return 1 when `text` has more positive than negative lexicon words, else 0.

    The text is split on whitespace and each token is matched exactly against
    the two lexicons. Ties, including texts with no lexicon words, yield 0.
    """
    balance = 0
    for token in text.split():
        if token in positive_words:
            balance += 1
        if token in negative_words:
            balance -= 1
    return int(balance > 0)

# Label every held-out review with the lexicon rule
heuristic_predictions = X_test.map(heuristic_classifier)

# Score the heuristic against the true labels
from sklearn.metrics import accuracy_score

heuristic_accuracy = accuracy_score(y_true=y_test, y_pred=heuristic_predictions)
heuristic_accuracy


0.618421052631579

**Representación del Texto mediante BoW y Clasificación**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# Bag-of-words features: the vocabulary is learned from the training texts only
# and then reused, unchanged, to encode the test texts
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Multinomial Naive Bayes: fit, predict, score
nb_classifier = MultinomialNB().fit(X_train_bow, y_train)
nb_predictions = nb_classifier.predict(X_test_bow)
nb_accuracy = accuracy_score(y_test, nb_predictions)

# Logistic regression: fit, predict, score
lr_classifier = LogisticRegression(max_iter=1000).fit(X_train_bow, y_train)
lr_predictions = lr_classifier.predict(X_test_bow)
lr_accuracy = accuracy_score(y_test, lr_predictions)

# Show both accuracies as the cell result
(nb_accuracy, lr_accuracy)


(0.9846491228070176, 0.9956140350877193)

Vemos que el clasificador de Regresión Logística nos da un mejor resultado, aunque con muy poca diferencia.

**Empleo de Word Embeddings como representación del texto y clasificador**

In [None]:
import numpy as np
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Tokenise the training reviews on whitespace; each review is one "sentence"
sentences = [text.split() for text in X_train]

# Passing the corpus to the constructor builds the vocabulary AND trains the
# model, so the number of passes is set here through `epochs`. A separate
# w2v_model.train(...) call afterwards would train a second time on top of the
# constructor's own passes, which is why it is not used.
# `seed` fixes the initial vectors; per the gensim docs, fully reproducible
# runs additionally need workers=1 and a fixed PYTHONHASHSEED.
w2v_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    epochs=10,
    seed=42,
)

# Average-of-word-vectors document embedding
def get_w2v_embeddings(text, model=None):
    """Return the mean word vector of the whitespace tokens of `text`.

    Args:
        text: Text to embed; it is split on whitespace.
        model: Object exposing `.wv`, which must support `word in wv`,
            `wv[word]` and `wv.vector_size`. Defaults to the notebook-level
            `w2v_model`.

    Returns:
        A 1-D array of length `model.wv.vector_size`. Tokens missing from the
        vocabulary are ignored; if no token is known, a zero vector is returned.
    """
    if model is None:
        model = w2v_model
    vectors = model.wv
    known = [vectors[word] for word in text.split() if word in vectors]
    if not known:
        # Size the fallback from the model instead of a hard-coded 100 so it
        # always matches the dimension the model was trained with
        return np.zeros(vectors.vector_size)
    return np.mean(known, axis=0)

# Embed every review of both splits as one fixed-length vector per row
X_train_w2v = np.array(list(map(get_w2v_embeddings, X_train)))
X_test_w2v = np.array(list(map(get_w2v_embeddings, X_test)))

# Logistic regression on the averaged embeddings
lr_w2v_classifier = LogisticRegression(max_iter=1000)
lr_w2v_classifier.fit(X_train_w2v, y_train)
lr_w2v_predictions = lr_w2v_classifier.predict(X_test_w2v)

# Report accuracy on the held-out split
lr_w2v_accuracy = accuracy_score(y_test, lr_w2v_predictions)
print(f"Accuracy of Logistic Regression with Word Embeddings: {lr_w2v_accuracy}")




Accuracy of Logistic Regression with Word Embeddings: 0.9956140350877193


**Clasificador con WE y una arquitectura de DL.**

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from sklearn.metrics import accuracy_score

# Fit the tokenizer on the training texts only, then encode both splits as
# sequences of integer word ids
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Use the longest training sequence as the common length
max_len = max(len(seq) for seq in X_train_seq)

# Pad (or truncate) every sequence to max_len
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

# Build the embedding matrix from the Word2Vec model. Its width is taken from
# the model rather than hard-coded, so it cannot drift from the vector_size
# the embeddings were trained with.
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1  # one extra row so the largest word id is a valid index
embedding_dim = w2v_model.wv.vector_size
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
    # Words unknown to Word2Vec keep an all-zero row
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

# LSTM classifier on top of the frozen (trainable=False) embeddings
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train. The test split is passed as validation_data, so the per-epoch val_*
# figures and the final evaluation below are computed on the same rows; no
# callbacks are configured here.
history = model.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_data=(X_test_pad, y_test), verbose=2)

# Final accuracy on the test split (index 1 of evaluate() is the compiled 'accuracy' metric)
dl_accuracy = model.evaluate(X_test_pad, y_test, verbose=0)[1]
print(f"Accuracy of LSTM with Word Embeddings: {dl_accuracy}")


Epoch 1/5
29/29 - 50s - loss: 0.2193 - accuracy: 0.9379 - val_loss: 0.0639 - val_accuracy: 0.9868 - 50s/epoch - 2s/step
Epoch 2/5
29/29 - 44s - loss: 0.0572 - accuracy: 0.9857 - val_loss: 0.0395 - val_accuracy: 0.9868 - 44s/epoch - 2s/step
Epoch 3/5
29/29 - 46s - loss: 0.0402 - accuracy: 0.9879 - val_loss: 0.0318 - val_accuracy: 0.9868 - 46s/epoch - 2s/step
Epoch 4/5
29/29 - 47s - loss: 0.0316 - accuracy: 0.9890 - val_loss: 0.0216 - val_accuracy: 0.9912 - 47s/epoch - 2s/step
Epoch 5/5
29/29 - 45s - loss: 0.0253 - accuracy: 0.9918 - val_loss: 0.0165 - val_accuracy: 0.9934 - 45s/epoch - 2s/step
Accuracy of LSTM with Word Embeddings: 0.9934210777282715
