In [2]:
import pandas as pd
import numpy as np
from textblob import TextBlob

### Amazon Alexa Reviews Dataset

This data was extracted from Amazon and made available at [Kaggle](https://www.kaggle.com/sid321axn/amazon-alexa-reviews).

This dataset consists of a nearly 3000 Amazon customer reviews (input text), star ratings, date of review, variant and feedback of various amazon Alexa products like Alexa Echo, Echo dots, and Alexa Firesticks.

In [3]:
data = pd.read_csv('data/amazon_alexa.tsv', delimiter='\t')
data.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


- Using TextBlob count the number of words in each review, put the results in a new column `words`
- Is there a correlation between the length of the review and its rating? 
- Using TextBlob extract the nouns (in a list) of each review, put the results in a new column `nouns`
- Let us create a high-level summary of review content per rating. Combine the `nouns` per rating. **HINT: you can use groupby then sum to combine lists**
- Next, convert each list (of a rating) to a dictionary with keys being words and values are the counts. Order by values descending
- Compare ratings 1 and 5 any notable differences?

In [4]:
data['words'] = data.verified_reviews.apply(lambda review: len(TextBlob(review).words))

In [5]:
np.corrcoef(data.rating, data.words)

array([[ 1.        , -0.19027298],
       [-0.19027298,  1.        ]])

In [6]:
def extract_nouns(sentence):
    blob = TextBlob(sentence)
    return [word.lower() for word, pos in blob.tags if pos.startswith('NN')]

In [7]:
data['nouns'] = data.verified_reviews.apply(extract_nouns)

In [8]:
data.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback,words,nouns
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1,3,[echo]
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1,2,[]
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1,38,"[game, question, alexa, lights, home]"
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1,34,"[lot, fun, thing, learns, dinosaurs, i, lights..."
4,5,31-Jul-18,Charcoal Fabric,Music,1,1,[]


In [9]:
summaries = data.groupby('rating').nouns.sum()
summaries

rating
1    [features, stopped, weeks, commands, fun, alex...
2    [siri, fact, siri, alexa, need, household, bar...
3    [cellphone, features, ipad, use, alarm, r, dea...
4    [game, question, alexa, lights, home, echo, fa...
5    [echo, lot, fun, thing, learns, dinosaurs, i, ...
Name: nouns, dtype: object

In [10]:
def word_freq(list_of_words):
    return pd.Series(list_of_words).value_counts().to_dict()

In [11]:
summary_freq = summaries.apply(word_freq)
summary_freq

rating
1    {'amazon': 40, 'echo': 38, 'dot': 26, 'product...
2    {'alexa': 33, 'echo': 29, 'device': 29, 'music...
3    {'alexa': 51, 'echo': 44, 'speaker': 30, 'dot'...
4    {'echo': 125, 'alexa': 112, 'music': 100, 'spe...
5    {'echo': 441, 'alexa': 375, 'music': 370, 'sou...
Name: nouns, dtype: object

In [12]:
summary_freq[1]

{'amazon': 40,
 'echo': 38,
 'dot': 26,
 'product': 25,
 'device': 25,
 't': 23,
 'time': 19,
 'screen': 18,
 'alexa': 18,
 'thing': 16,
 '’': 13,
 'phone': 13,
 'months': 13,
 'app': 12,
 'speaker': 11,
 'way': 11,
 'money': 11,
 'i': 9,
 'home': 9,
 'item': 9,
 'music': 8,
 'spot': 8,
 'things': 8,
 'work': 8,
 'devices': 8,
 'nothing': 8,
 'question': 7,
 'wifi': 7,
 'sound': 7,
 'bluetooth': 7,
 'show': 7,
 'connection': 6,
 'cycle': 6,
 'unit': 6,
 'features': 6,
 'times': 6,
 'customer': 6,
 'light': 6,
 'anything': 6,
 'cards': 6,
 'hue': 6,
 'hub': 6,
 'problems': 6,
 'day': 6,
 'bulb': 6,
 'voice': 5,
 'try': 5,
 'repair': 5,
 'bridge': 5,
 'one': 5,
 'clock': 5,
 'lights': 5,
 'piece': 5,
 'video': 5,
 'hours': 5,
 'service': 5,
 'garbage': 5,
 '*': 5,
 'number': 5,
 'instructions': 4,
 'return': 4,
 '....': 4,
 'support': 4,
 'firmware': 4,
 'waste': 4,
 'stopped': 4,
 'minutes': 4,
 'days': 4,
 'problem': 4,
 'alexi': 4,
 'jack': 4,
 'use': 4,
 'issues': 4,
 'questions': 4,

In [13]:
summary_freq[5]

{'echo': 441,
 'alexa': 375,
 'music': 370,
 'sound': 179,
 'product': 166,
 'dot': 160,
 'home': 143,
 'great': 139,
 'amazon': 121,
 'speaker': 117,
 'works': 114,
 'day': 113,
 'time': 109,
 'fun': 107,
 'device': 104,
 'room': 101,
 'things': 95,
 'house': 87,
 'quality': 85,
 'everything': 85,
 'show': 85,
 'weather': 84,
 'thing': 84,
 'prime': 83,
 'tv': 82,
 'devices': 80,
 'love': 78,
 't': 72,
 'spot': 68,
 'easy': 67,
 'lights': 67,
 'clock': 65,
 'alarm': 65,
 'bedroom': 61,
 'price': 57,
 'screen': 54,
 'hub': 53,
 'voice': 53,
 'family': 49,
 '’': 48,
 'phone': 47,
 'use': 47,
 'video': 46,
 'news': 46,
 'purchase': 46,
 'dots': 45,
 'lot': 45,
 'stick': 44,
 'fire': 44,
 'kitchen': 42,
 'speakers': 41,
 'plus': 41,
 'features': 40,
 'i': 39,
 'questions': 36,
 'app': 36,
 'information': 36,
 'gift': 35,
 'songs': 34,
 'size': 34,
 'item': 34,
 'setup': 33,
 'kids': 33,
 'way': 33,
 'feature': 32,
 'morning': 32,
 'daughter': 31,
 'bought': 30,
 'something': 28,
 'wife': 