In [31]:
import pandas as pd
from textblob import TextBlob
from collections import Counter

### Lendo dados das reviews

In [30]:
# Load a subset of the columns from the tab-separated reviews file;
# star_rating is parsed with an integer dtype.
review_columns = [
    'product_id',
    'product_title',
    'product_category',
    'star_rating',
    'verified_purchase',
    'review_date',
    'review_body',
]
reviews = pd.read_csv(
    'data/amazon_reviews_us_PC_v1_00.tsv',
    sep='\t',
    usecols=review_columns,
    dtype={"star_rating": "int"},
)

### Criando e preenchendo as colunas para polaridade e subjetividade

In [3]:
def calculate_sentiment(review_text):
    """Score a single review with TextBlob.

    The input is converted with ``str()`` before analysis, so non-string
    values are scored on their string representation.

    Returns:
        A pandas Series with two entries, 'polarity' and 'subjectivity',
        taken from the TextBlob ``sentiment`` of the text.
    """
    sentiment = TextBlob(str(review_text)).sentiment
    scores = {'polarity': sentiment.polarity, 'subjectivity': sentiment.subjectivity}
    return pd.Series(scores)

In [4]:
# Score every review body, then attach the resulting sentiment columns
# ('polarity' and 'subjectivity') to `reviews`, aligned on the index.
sentiment_scores = reviews['review_body'].apply(calculate_sentiment)

# Drop sentiment columns left over from a previous run of this cell before
# merging. Without this, re-running the cell makes pandas suffix the overlapping
# columns ('polarity_x' / 'polarity_y'), and later cells that read
# reviews['polarity'] fail. On a first run nothing is dropped.
reviews = (
    reviews
    .drop(columns=sentiment_scores.columns, errors='ignore')
    .merge(sentiment_scores, left_index=True, right_index=True)
)

In [16]:
# Mean polarity and mean subjectivity for each (product_id, star_rating) pair.
sentiment_columns = ['polarity', 'subjectivity']
reviews_by_product = reviews.groupby(['product_id', 'star_rating'])[sentiment_columns].mean()

In [40]:
# Number of review rows for each product title.
grouped = reviews.groupby(by='product_title').size()

In [47]:
# Show the five titles with the most reviews (counts sorted in descending order).
grouped.sort_values(ascending=False).iloc[:5]

product_title
Fire HD 7, 7" HD Display, Wi-Fi, 8 GB                                                 30846
Kindle Fire HDX 7", HDX Display (Previous Generation - 3rd)                           30026
Kindle Paperwhite, 6" High-Resolution Display (212 ppi) with Built-in Light, Wi-Fi    23959
Kindle Fire (Previous Generation - 1st)                                               23310
SanDisk Ultra microSDHC Card Plus Adapter                                             22297
dtype: int64

In [43]:
# Tally how often each product title occurs in the reviews table.
title_column = reviews["product_title"]
reviews_per_product = Counter(title_column)

In [45]:
# Keep the 20 most frequent titles as (title, count) pairs, most frequent first.
most_reviewd_products = reviews_per_product.most_common(n=20)

In [48]:
# Show the first five (title, count) pairs.
most_reviewd_products[0:5]

[('Fire HD 7, 7" HD Display, Wi-Fi, 8 GB', 30846),
 ('Kindle Fire HDX 7", HDX Display (Previous Generation - 3rd)', 30026),
 ('Kindle Paperwhite, 6" High-Resolution Display (212 ppi) with Built-in Light, Wi-Fi',
  23959),
 ('Kindle Fire (Previous Generation - 1st)', 23310),
 ('SanDisk Ultra microSDHC Card Plus Adapter', 22297)]

In [51]:
import re
import string  # not used in this cell; kept in case other cells rely on it

# Cross-check the per-title count by searching the raw file text.
# This counts every occurrence of the title text anywhere in the file
# (any column, and also as a prefix of longer strings), so it does not
# have to equal the number of rows whose product_title is exactly this value.
#
# The file is read inside a `with` block so the handle is closed once the
# text has been loaded. The encoding is given explicitly (UTF-8, which is what
# pandas.read_csv uses by default) instead of depending on the locale.
with open('data/amazon_reviews_us_PC_v1_00.tsv', 'r', encoding='utf-8') as document_text:
    text_string = document_text.read()

match_pattern = re.findall(r'Fire HD 7, 7" HD Display, Wi-Fi, 8 GB', text_string)
len(match_pattern)

30913