In this notebook, we will create an HTML report for the Amazon reviews of a single product (currently the Samsung Chromebook).

TO DO:
-Classify reviews using pickled model trained on Electronics data.
-Classify sentences using this classifier rather than TextBlob's polarity score.
-Generalize, so that a product name can be given as input, its ASIN retrieved from the metadata, and the report then produced.

First read in and clean the product reviews (at some point this should be changed so that it works for any arbitrary product).

In [1]:
import pandas as pd
# Load the product's reviews from a tab-separated file into a DataFrame.
reviews_path = "Samsung_chromebook_review.csv"
popular_product = pd.read_csv(reviews_path, sep='\t')

In [2]:
import datetime
# Turn the epoch-second review timestamps into datetimes and cast the star
# rating to int, producing the updated frame in a single step.
popular_product = popular_product.assign(
    unixReviewTime=pd.to_datetime(popular_product['unixReviewTime'], unit='s'),
    overall=popular_product["overall"].astype(int),
)

Sort product reviews into positive or negative depending on star rating (this needs to be replaced with whichever pickled classifier we decide on)

In [3]:
# Make sure the star rating is an integer before comparing it to thresholds.
popular_product["overall"] = popular_product["overall"].astype(int)

# Ratings of 4 and above count as positive, 2 and below as negative; a rating
# of 3 lands in neither group. Each subset is renumbered from 0.
star_rating = popular_product["overall"]
pos_reviews = popular_product[star_rating >= 4].reset_index(drop=True)
neg_reviews = popular_product[star_rating <= 2].reset_index(drop=True)

In [4]:
def _monthly_review_counts(reviews):
    """Count reviews per calendar month.

    Parameters
    ----------
    reviews : pandas.DataFrame
        Must have a datetime-typed ``unixReviewTime`` column and an
        ``overall`` column.

    Returns
    -------
    tuple of (pandas.Series, pandas.DataFrame)
        The number of non-null ``overall`` values per (year, month) group,
        and the same counts as a frame with an extra ``date`` column holding
        the first day of each month.
    """
    stamps = reviews["unixReviewTime"]
    counts = reviews.groupby([stamps.dt.year, stamps.dt.month])["overall"].count()

    frame = counts.to_frame()
    # Assemble the month-start dates from the numeric year/month index levels
    # instead of parsing the text form of each (year, month) index tuple.
    month_parts = pd.DataFrame({
        "year": counts.index.get_level_values(0),
        "month": counts.index.get_level_values(1),
        "day": 1,
    })
    # .values drops month_parts' own 0..n-1 index so the dates are assigned
    # positionally onto the (year, month) rows.
    frame["date"] = pd.to_datetime(month_parts).values
    return counts, frame


# Bin the number of reviews by year and month: first all reviews...
time_dist, df_time = _monthly_review_counts(popular_product)

# ...then the positive and negative subsets.
pos_time_dist, pos_time = _monthly_review_counts(pos_reviews)
neg_time_dist, neg_time = _monthly_review_counts(neg_reviews)

Plot total reviews vs time and save URL

In [5]:
import plotly.plotly as py
import plotly.graph_objs as go

# Keep only months with more than one review: Amazon seems to accidentally
# include some reviews for other products from before 2012.
busy_months = df_time[df_time["overall"] > 1]

trace1 = go.Scatter(
    x=busy_months["date"],
    y=busy_months["overall"],
)
data = [trace1]

layout = go.Layout(
    title='Reviews vs time',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Number of reviews'},
    showlegend=False,
)

# The value returned by py.plot is kept as the plot URL for the HTML report.
fig = go.Figure(data=data, layout=layout)
all_reviews_plot_url = py.plot(fig, filename='reviews_vs_time')

Plot positive and negative reviews vs time and save URL

In [6]:
# Keep only months with more than one review: Amazon seems to accidentally
# include some reviews for other products from before 2012.
busy_pos_months = pos_time[pos_time["overall"] > 1]
busy_neg_months = neg_time[neg_time["overall"] > 1]

trace1 = go.Scatter(
    x=busy_pos_months["date"],
    y=busy_pos_months["overall"],
    name="positive reviews",
)
trace2 = go.Scatter(
    x=busy_neg_months["date"],
    y=busy_neg_months["overall"],
    name="negative reviews",
)
data = [trace1, trace2]

layout = go.Layout(
    title='Reviews vs time',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Number of reviews'},
    showlegend=True,
)

# The value returned by py.plot is kept as the plot URL for the HTML report.
fig = go.Figure(data=data, layout=layout)
sent_reviews_plot_url = py.plot(fig, filename='sent_reviews_vs_time')

Now clean and POS-tag the text and get noun phrases for positive and negative sentences. This mostly uses the TextBlob library.

In [7]:
import nltk.data
from textblob import TextBlob
import re

# Load NLTK's English Punkt sentence tokenizer from its pickled resource.
# NOTE(review): presumably needs the 'punkt' data to be present on the local
# NLTK data path -- confirm before running on a fresh machine.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


def review_to_sentences(review, tokenizer):
    """Split one review into its non-empty sentences.

    Parameters
    ----------
    review : str
        Raw review text. Any non-string value (for example the NaN pandas
        uses for an empty CSV field) is treated as a review with no text.
    tokenizer : object
        Anything with a ``tokenize(text)`` method returning a list of
        sentence strings.

    Returns
    -------
    list of str
        The sentences in order, with empty strings dropped.
    """
    # A missing review has no .strip(); return no sentences instead of raising.
    if not isinstance(review, str):
        return []
    return [sentence for sentence in tokenizer.tokenize(review.strip()) if sentence]

def review_to_blobs(review, tokenizer):
    """Turn one review into a list of TextBlob objects, one per sentence.

    Every character that is not an ASCII letter is replaced by a space
    before the sentence is wrapped in a TextBlob.

    Parameters
    ----------
    review : str
        Raw review text. Any non-string value (for example the NaN pandas
        uses for an empty CSV field) is treated as a review with no text.
    tokenizer : object
        Anything with a ``tokenize(text)`` method returning a list of
        sentence strings.

    Returns
    -------
    list of TextBlob
        One blob per non-empty sentence, in order.
    """
    # A missing review has no .strip(); return no blobs instead of raising.
    if not isinstance(review, str):
        return []

    blobs = []
    for sentence in tokenizer.tokenize(review.strip()):
        if sentence:
            letters_only = re.sub("[^a-zA-Z]", " ", sentence)
            blobs.append(TextBlob(letters_only))
    return blobs

In [8]:
blobs = []
review_sentences = []

print("Parsing sentences from training set")
total_reviews = len(popular_product)
# enumerate(start=1) makes icount the 1-based position of the review being
# processed, so the progress line is printed on exactly every 1000th review.
for icount, review in enumerate(popular_product["reviewText"], start=1):
    if icount % 1000 == 0:
        print("Cleaning and tokenizing review", icount, "of", total_reviews)
    review_sentences.extend(review_to_sentences(review, tokenizer))
    blobs.extend(review_to_blobs(review, tokenizer))

Parsing sentences from training set
Cleaning and tokenizing review 1000 of 4580
Cleaning and tokenizing review 2000 of 4580
Cleaning and tokenizing review 3000 of 4580
Cleaning and tokenizing review 4000 of 4580


Positive noun phrases are selected from sentences with sentiment (polarity) above 0.3. Negative noun phrases are selected from sentences with sentiment below -0.3.

In [9]:
positive_sentences, negative_sentences = [], []
positive_noun_phrases, negative_noun_phrases = [], []

# Sentences with polarity above 0.3 go to the positive lists and those below
# -0.3 to the negative lists; anything in between is skipped.
for sentence_blob in blobs:
    polarity = sentence_blob.polarity
    if polarity > 0.3:
        positive_sentences.append(sentence_blob)
        positive_noun_phrases.append(sentence_blob.noun_phrases)
    elif polarity < -0.3:
        negative_sentences.append(sentence_blob)
        negative_noun_phrases.append(sentence_blob.noun_phrases)

In [10]:
# Flatten the per-sentence noun-phrase lists into one flat list per sentiment.
negative_phrases = [
    phrase
    for sentence_phrases in negative_noun_phrases
    for phrase in sentence_phrases
]

positive_phrases = [
    phrase
    for sentence_phrases in positive_noun_phrases
    for phrase in sentence_phrases
]

Find most frequent positive and negative noun phrases

In [11]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
import numpy as np
import re
import nltk
import string

from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

def get_word_freq(words):
    """Count the most frequent 2- and 3-word n-grams across ``words``.

    Parameters
    ----------
    words : iterable of str
        Text fragments (here: noun phrases) to count n-grams in.

    Returns
    -------
    pandas.DataFrame
        Columns ``count`` (total occurrences) and ``vocab`` (the n-gram),
        sorted by ``count`` in descending order. The vocabulary is capped at
        1000 n-grams by ``max_features``.
    """
    vectorizer = CountVectorizer(
        analyzer="word",
        tokenizer=None,
        preprocessor=None,
        stop_words=None,
        max_features=1000,
        ngram_range=(2, 3),
    )

    # Sum each vocabulary column directly on the sparse matrix; converting it
    # to a dense array first only costs memory.
    term_counts = vectorizer.fit_transform(words)
    dist = np.asarray(term_counts.sum(axis=0)).ravel()

    # Newer scikit-learn releases only provide get_feature_names_out(); fall
    # back to get_feature_names() on versions that predate it.
    if hasattr(vectorizer, "get_feature_names_out"):
        vocab = list(vectorizer.get_feature_names_out())
    else:
        vocab = vectorizer.get_feature_names()

    word_freq = pd.DataFrame({'count': dist, 'vocab': vocab})
    return word_freq.sort_values(by="count", ascending=False)

In [12]:
# Build the n-gram frequency tables: positive phrases first, then negative.
freq_pos, freq_neg = (
    get_word_freq(phrases) for phrases in (positive_phrases, negative_phrases)
)

Put top 5 positive and negative noun phrases in table

In [13]:
# Five most frequent negative n-grams (first five rows of freq_neg).
top_five_neg = freq_neg.head(5)

# Render the rows as HTML, then swap pandas' default opening <table> tag for
# the Bootstrap striped-table classes used by the report.
neg_table_html = top_five_neg.to_html()
neg_table = neg_table_html.replace(
    '<table border="1" class="dataframe">',
    '<table class="table table-striped">',
)

In [14]:
# Five most frequent positive n-grams (first five rows of freq_pos).
top_five_pos = freq_pos.head(5)

# Render the rows as HTML, then swap pandas' default opening <table> tag for
# the Bootstrap striped-table classes used by the report.
pos_table_html = top_five_pos.to_html()
pos_table = pos_table_html.replace(
    '<table border="1" class="dataframe">',
    '<table class="table table-striped">',
)

In [15]:
# Report template: the two plot URLs are embedded as iframes (with ".embed"
# appended) and the two pre-rendered HTML tables are spliced in below them.
# Both figures request the same embed width as their iframe (1000).
html_string = '''
<html>
    <head>
        <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.1/css/bootstrap.min.css">
        <style>body{ margin:0 100; background:whitesmoke; }</style>
    </head>
    <body>
        <h1>Amazon Product Reviews for Samsung Chromebook</h1>

        <!-- *** Section 1 *** --->
        <h2>Figure 1: Total number of reviews over time</h2>
        <iframe width="1000" height="550" frameborder="0" seamless="seamless" scrolling="no" \
src="''' + all_reviews_plot_url + '''.embed?width=1000&height=550"></iframe>
        <p>There are many reviews immediately after product release, then decreases over time.</p>
        
        <!-- *** Section 2 *** --->
        <h2>Figure 2: Positive and negative reviews over time</h2>
        <iframe width="1000" height="550" frameborder="0" seamless="seamless" scrolling="no" \
src="''' + sent_reviews_plot_url + '''.embed?width=1000&height=550"></iframe>
        <p>Negative reviews remain roughly constant over time, whereas positive rise initially and then fall over time.\
        Slight peaks in positive reviews may be related to sales times, eg. January 2014?</p>
        <h3>Top 5 noun phrases for positive reviews</h3>
        ''' + pos_table + '''
        <h3>Top 5 noun phrases for negative reviews</h3>
        ''' + neg_table + '''
    </body>
</html>'''

In [16]:
# Write the report to disk. The context manager closes the file even if the
# write raises, and the explicit encoding keeps the output independent of the
# platform's default.
with open('Samsung_chromebook_product_review_report.html', 'w', encoding='utf-8') as report_file:
    report_file.write(html_string)