In [1]:
import joblib
import gzip
import json
import numpy as np
import random
import pandas as pd
import os

In [2]:
# Esto es solo para cargar un conjunto de reviews, puede cambiarse para leeer cualquier df con reviews
list_reviews = joblib.load("../src/etl/extract/downloaded_reviews.joblib")
ejemplo = list_reviews[0]
with gzip.open(os.path.join("../src/etl/extract/",ejemplo), 'r') as fin:
    decompressed_data = fin.read().decode('utf-8')
json_objects = [json.loads(line) for line in decompressed_data.splitlines() if line]
df = pd.DataFrame(json_objects)

In [3]:
print(df.shape)

(55933, 11)


In [4]:
from transformers import pipeline
model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"
sentiment_task = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)

  from .autonotebook import tqdm as notebook_tqdm
2023-08-01 12:00:57.633464: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a Bert

In [14]:
sample = df.sample(1).reset_index().iloc[0, :]
print(len('summary'), sample['summary'])
print(len('reviewText'), sample['reviewText'])
print(sentiment_task(df.loc[0,'summary']))
print(sentiment_task(df.loc[0,'reviewText']))

7 Great Game
10 My boyfriend and I love this game!  It has great levels and fun graphics and themes.  I defiantly recommend it.
[{'label': 'positive', 'score': 0.9819431900978088}]
[{'label': 'positive', 'score': 0.9367033839225769}]


In [15]:
df_sample = df.sample(100)

In [16]:
# Al intentar ejecutar este código puede surgir un error porque el modelo solo permite
# un máximo de 513 tokens. Como algunos reviews son muy largos, se puede recurrir a
# hacer el análisis de sentimiento sobre el 'summary'
df_sample.loc[:, 'reviewText'].apply(sentiment_task)

RuntimeError: The expanded size of the tensor (885) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 885].  Tensor sizes: [1, 514]

In [17]:
df_sample.loc[:, 'reviewText'].astype('str').apply(len).describe()

count     100.000000
mean      412.790000
std       816.987547
min         2.000000
25%        42.750000
50%       134.500000
75%       531.500000
max      6720.000000
Name: reviewText, dtype: float64

In [18]:
def safe_sentiment_task(row):
    try:
        return tuple(sentiment_task(row['reviewText'])[0].values())
    except RuntimeError:
        try:
            return tuple(sentiment_task(row['summary'])[0].values())
        except (RuntimeError, IndexError):
                return tuple(None, None)  # or some default value

In [19]:
sentiment = df_sample.apply(safe_sentiment_task, axis=1)

In [20]:
df_sample[['sentiment', 'sentiment_score']] = pd.DataFrame(sentiment.to_list(), index=df_sample.index)
df_sample.head()

Unnamed: 0,asin,overall,reviewText,reviewerID,reviewerName,summary,unixReviewTime,verified,style,vote,image,sentiment,sentiment_score
52129,B000N8Q4JA,5.0,Very nice work pants extremely tough and durab...,A3LIKTXEQFTDX3,argelyn,Very nice work pants extremely tough and durab...,1481673600,True,"{""Size:"":"" 34W x 32L"",""Color:"":"" Moss""}",,,positive,0.98649
55577,B000N96HH8,5.0,These boots are great! The fit are just like ...,A1IJ8SXXBW8FSU,Richard Chan,Great boots,1403481600,True,"{""Size:"":"" 10 EE US"",""Color:"":"" Dark Brown""}",,,positive,0.987269
7118,B00005RCQY,5.0,Ok,A3JV766PM0V45T,jezreeljordan,Five Stars,1423872000,True,,,,positive,0.487016
39532,B000067FDW,2.0,The idea is cool. The game is well thought-ou...,A1BIGCD00TMJ38,A Real Person,Ok but WHAT FOR!!!!!!,1163030400,False,"{""Format:"":"" Video Game""}",,,positive,0.364864
19227,B000MXQ2CU,5.0,"Look great, fit great, at a great price point....",A3JHUKYBSFCTAA,Vernon,Great value in pants.,1421971200,True,"{""Size:"":"" 32W x 29L"",""Color:"":"" Graphite""}",,,positive,0.965721


In [21]:
df_sample['sentiment'].value_counts()

sentiment
positive    74
neutral     13
negative    13
Name: count, dtype: int64