In [582]:
# Load libraries
import spacy
import pandas as pd
from spacytextblob.spacytextblob import SpacyTextBlob

# Load language model
nlp = spacy.load('en_core_web_sm')

# Add extension
nlp.add_pipe('spacytextblob')

<spacytextblob.spacytextblob.SpacyTextBlob at 0x219b829bbd0>

In [583]:
# Load dataset & inspect 
df = pd.read_csv("Amazon_Reviews.csv")
print(df.head(3))  # Check dataset loaded



                     id             dateAdded           dateUpdated  \
0  AVqVGZNvQMlgsOJE6eUY  2017-03-03T16:56:05Z  2018-10-25T16:36:31Z   
1  AVqVGZNvQMlgsOJE6eUY  2017-03-03T16:56:05Z  2018-10-25T16:36:31Z   
2  AVqVGZNvQMlgsOJE6eUY  2017-03-03T16:56:05Z  2018-10-25T16:36:31Z   

                                                name       asins   brand  \
0  Amazon Kindle E-Reader 6" Wifi (8th Generation...  B00ZV9PXP2  Amazon   
1  Amazon Kindle E-Reader 6" Wifi (8th Generation...  B00ZV9PXP2  Amazon   
2  Amazon Kindle E-Reader 6" Wifi (8th Generation...  B00ZV9PXP2  Amazon   

                                          categories primaryCategories  \
0  Computers,Electronics Features,Tablets,Electro...       Electronics   
1  Computers,Electronics Features,Tablets,Electro...       Electronics   
2  Computers,Electronics Features,Tablets,Electro...       Electronics   

                                           imageURLs  \
0  https://pisces.bbystatic.com/image2/BestBuy_US...   
1

In [584]:
print(df.columns) # Check correct column names & spellings

Index(['id', 'dateAdded', 'dateUpdated', 'name', 'asins', 'brand',
       'categories', 'primaryCategories', 'imageURLs', 'keys', 'manufacturer',
       'manufacturerNumber', 'reviews.date', 'reviews.dateAdded',
       'reviews.dateSeen', 'reviews.doRecommend', 'reviews.id',
       'reviews.numHelpful', 'reviews.rating', 'reviews.sourceURLs',
       'reviews.text', 'reviews.title', 'reviews.username', 'sourceURLs'],
      dtype='object')


In [585]:
print(df.info)    # Check general structure of dataset

<bound method DataFrame.info of                         id             dateAdded           dateUpdated  \
0     AVqVGZNvQMlgsOJE6eUY  2017-03-03T16:56:05Z  2018-10-25T16:36:31Z   
1     AVqVGZNvQMlgsOJE6eUY  2017-03-03T16:56:05Z  2018-10-25T16:36:31Z   
2     AVqVGZNvQMlgsOJE6eUY  2017-03-03T16:56:05Z  2018-10-25T16:36:31Z   
3     AVqVGZNvQMlgsOJE6eUY  2017-03-03T16:56:05Z  2018-10-25T16:36:31Z   
4     AVqVGZNvQMlgsOJE6eUY  2017-03-03T16:56:05Z  2018-10-25T16:36:31Z   
...                    ...                   ...                   ...   
4995  AVqkIdZiv8e3D1O-leaJ  2017-03-06T14:59:25Z  2017-09-04T11:19:31Z   
4996  AVqkIdZiv8e3D1O-leaJ  2017-03-06T14:59:25Z  2017-09-04T11:19:31Z   
4997  AVqkIdZiv8e3D1O-leaJ  2017-03-06T14:59:25Z  2017-09-04T11:19:31Z   
4998  AVqkIdZiv8e3D1O-leaJ  2017-03-06T14:59:25Z  2017-09-04T11:19:31Z   
4999  AVqkIdZiv8e3D1O-leaJ  2017-03-06T14:59:25Z  2017-09-04T11:19:31Z   

                                                   name       asins   brand  \


This is a comparitively small datset, compared with many NLP processing projects, with 24 features. 

The target feature here is the 'reviews.text' column.

In [586]:
reviews_data = df['reviews.text'] # Create dataset using reviews column alone



In [587]:
print(reviews_data.isnull().sum())  # Check missing values

0


A sample of reviews has been loaded, the data is non-numeric, & there are no missing values in the dataset.

In [588]:
# To create a function to analyse sentiment, we will first clean the text in the reviews
def clean_text(text):
    text = str(text).lower().strip()

    # Process with spacy
    doc = nlp(text)
    cleaned_tokens = [token.text for token in doc if not token.is_stop and token.is_alpha] 
    cleaned_text = ' '.join(cleaned_tokens) # Converts cleaned tokens into 1 long string with no gaps

    return cleaned_text

In [589]:
# Check the function works on the first review in the dataset
# Select the first review as a sample
sample = reviews_data.iloc[0]
sample

'I thought it would be as big as small paper but turn out to be just like my palm. I think it is too small to read on it... not very comfortable as regular Kindle. Would definitely recommend a paperwhite instead.'

In [590]:
# Convert the sample text
converted_sample = clean_text(sample)
converted_sample

'thought big small paper turn like palm think small read comfortable regular kindle definitely recommend paperwhite instead'

The text has been stripped of stop words, & the string re-joined together. So we can see that the function works.

In [591]:
# Create a new column of cleaned text
reviews_data_cleaned = reviews_data.apply(clean_text)

reviews_data_cleaned.head()


0    thought big small paper turn like palm think s...
1               kindle light easy use especially beach
2    nt know use kindle went lower end m happy litt...
3    happy purchase caught sale good price normally...
4    solid entry level kindle great kids gifted kid...
Name: reviews.text, dtype: object

In [592]:
# Join cleaned column to original dataframe using concatanation
df = pd.concat([reviews_data, reviews_data_cleaned], axis = 1, join= 'inner')
df.head()

Unnamed: 0,reviews.text,reviews.text.1
0,I thought it would be as big as small paper bu...,thought big small paper turn like palm think s...
1,This kindle is light and easy to use especiall...,kindle light easy use especially beach
2,Didnt know how much i'd use a kindle so went f...,nt know use kindle went lower end m happy litt...
3,I am 100 happy with my purchase. I caught it o...,happy purchase caught sale good price normally...
4,Solid entry level Kindle. Great for kids. Gift...,solid entry level kindle great kids gifted kid...


Because both columns have the same name, the name of the cleaned column will replaced using indexing.

In [593]:
# Renames 1 column using indexing, to avoid errors relating to the dataframe's characteristics
df.columns.values[1] = "cleaned_reviews.text"   
df.head()

Unnamed: 0,reviews.text,cleaned_reviews.text
0,I thought it would be as big as small paper bu...,thought big small paper turn like palm think s...
1,This kindle is light and easy to use especiall...,kindle light easy use especially beach
2,Didnt know how much i'd use a kindle so went f...,nt know use kindle went lower end m happy litt...
3,I am 100 happy with my purchase. I caught it o...,happy purchase caught sale good price normally...
4,Solid entry level Kindle. Great for kids. Gift...,solid entry level kindle great kids gifted kid...


In [594]:
# Pick 1 review
example = df.iat[0,1]
example

'thought big small paper turn like palm think small read comfortable regular kindle definitely recommend paperwhite instead'

In [595]:
# Run the natural language program on the example review
doc = nlp(example)


In [596]:
# Analyse polarity of comment
polarity = doc._.blob.polarity
polarity

-0.016666666666666663

This comment is analysed as slightly negative. Reading it, infers the same conclusion.

In [597]:
# Creating a function to evaluate polarity of the all the reviews
def analyse_sentiment(text):
    doc = nlp(text)

    polarity = doc._.blob.polarity

    if polarity > 0:
        sentiment = "Positive" 
    elif polarity < 0:
        sentiment = "Negative"
    else:
        sentiment = "Neutral"

    return sentiment

In [598]:
# Define reviews for checking analysis
review1 = df.iat[0,1]
review2 = df.iat[1,1]
review3 = df.iat[2,1]

review1, review2, review3

('thought big small paper turn like palm think small read comfortable regular kindle definitely recommend paperwhite instead',
 'kindle light easy use especially beach',
 'nt know use kindle went lower end m happy little dark')

In [599]:
# Define sentiment analysis for eac review
sentiment1 = analyse_sentiment(review1)
sentiment1

'Negative'

In [600]:
sentiment2 = analyse_sentiment(review2)
sentiment2

'Positive'

In [601]:
sentiment3 = analyse_sentiment(review3)
sentiment3

'Positive'

Looking at these 3 reviews, the sentiment analysis appears to have correctly analyse these revviews.

So this looks like a good model, based on the reviews analysed here.

In [602]:
# compare similarity
def compare_similarity(statement1, statement2):
    doc1 = nlp(statement1)
    doc2 = nlp(statement2)

    similarity = doc1.similarity(doc2)

    return similarity

In [603]:
# Compare 2 reviews for similarity
compare_similarity(review1, review2)

  similarity = doc1.similarity(doc2)


0.7042453286282411

This similarity analysis found these 2 reviews similar. This score of 0.7 (around 70%) is quite high.

This is because a score of 1 is a perfect match; a score of 0 is total dissimilarity. 
I do not agree with this analysis. Review 1. is slightly negative, & quite long. Whereas Review 2. is positive and more detailed. They are therefore not similar.

In [604]:
# Compare a different pair of reviews
compare_similarity(review1, review3)

  similarity = doc1.similarity(doc2)


0.7653237270899829

The same analysis finds Reviews 1 & 3 to also be similar, slightly more so than the comparison for Review 2. 

Although these 2 comments do seem to be more similar, than the previous comparison, this score of 77% is still higher than expected.

In order to inspect these models further, more sample reviews for each function will be analysed.

In [605]:
# First, the effectiveness of analysis function is evaluated
# Create a column of analysed, cleaned reviews. 
df_sentiment = reviews_data_cleaned.apply(analyse_sentiment)
df_sentiment.head()

0    Negative
1    Positive
2    Positive
3    Positive
4    Positive
Name: reviews.text, dtype: object

In [606]:
# Add the column to the dataset
df = pd.concat([reviews_data, reviews_data_cleaned, df_sentiment], axis = 1, join= 'inner')
df.head()

Unnamed: 0,reviews.text,reviews.text.1,reviews.text.2
0,I thought it would be as big as small paper bu...,thought big small paper turn like palm think s...,Negative
1,This kindle is light and easy to use especiall...,kindle light easy use especially beach,Positive
2,Didnt know how much i'd use a kindle so went f...,nt know use kindle went lower end m happy litt...,Positive
3,I am 100 happy with my purchase. I caught it o...,happy purchase caught sale good price normally...,Positive
4,Solid entry level Kindle. Great for kids. Gift...,solid entry level kindle great kids gifted kid...,Positive


In [607]:
# Rename all columns as before
df.columns.values[1] = "cleaned_reviews.text"   
df.columns.values[2] = 'sentiment'
df.head(30)

Unnamed: 0,reviews.text,cleaned_reviews.text,sentiment
0,I thought it would be as big as small paper bu...,thought big small paper turn like palm think s...,Negative
1,This kindle is light and easy to use especiall...,kindle light easy use especially beach,Positive
2,Didnt know how much i'd use a kindle so went f...,nt know use kindle went lower end m happy litt...,Positive
3,I am 100 happy with my purchase. I caught it o...,happy purchase caught sale good price normally...,Positive
4,Solid entry level Kindle. Great for kids. Gift...,solid entry level kindle great kids gifted kid...,Positive
5,This make an excellent ebook reader. Don't exp...,excellent ebook reader expect device read basi...,Positive
6,"I ordered this for my daughter, as I have the ...",ordered daughter black paperwhite love read bi...,Positive
7,I bought my Kindle about 2 months ago and the ...,bought kindle months ago battery dead charge,Negative
8,"amazon kindle is always the best ebook, upgrad...",amazon kindle best ebook upgrade new model,Positive
9,"It's beyond my expectation, and it can even sh...",expectation music score fast turning,Positive


All of these reviews appear to be correctly classified, but the negative reviews seem to be more nuanced.

Taking a closer look at the negative reviews will enable fuller evaluation of the sentiment model.

In [608]:
# Just negative comments.
negative_reviews = df[df["sentiment"] == "Negative"]
negative_reviews.head(30)

Unnamed: 0,reviews.text,cleaned_reviews.text,sentiment
0,I thought it would be as big as small paper bu...,thought big small paper turn like palm think s...,Negative
7,I bought my Kindle about 2 months ago and the ...,bought kindle months ago battery dead charge,Negative
27,I've wanted a kindle for a while and decided t...,wanted kindle decided bb sale disappointed,Negative
49,The battery doesn't last as long as was advert...,battery long advertised money recommend backli...,Negative
80,it would not load my books proper. took a doze...,load books proper took dozen tries erasing dre...,Negative
85,I especially like the Amazon has a system to c...,especially like amazon system convert pdf kind...,Negative
93,"The screen is too dark, and cannot adjust the ...",screen dark adjust brightness,Negative
143,I have to say it was a little confusing and fr...,little confusing frustrating getting verificat...,Negative
188,I‚Äôve asked itquestions and answers were unkn...,asked itquestions answers unknown help,Negative
209,I bought the echo after seeing it demonstrated...,bought echo seeing demonstrated brother procee...,Negative


From the portion of the reviews that is visible, the majority of these negative reviews appear to be correctly classified. Rows 85, 335, & 513 do not appear to be negative though. So again, further inspection would yield more detailed evaluation. 

On the whole, the sentiment analysis using blob.polarity is effective.

In [609]:
# Defines 12 more reviews for evaluating the similarity model
review10 = df.iat[9,1]
review11 = df.iat[10,1]
review12 = df.iat[11,1]
review13 = df.iat[12,1]
review14 = df.iat[13,1]
review15 = df.iat[14,1]
review16 = df.iat[15,1]
review17 = df.iat[16,1]
review18 = df.iat[17,1]
review19 = df.iat[18,1]
review20 = df.iat[19,1]
review21 = df.iat[20,1]

In [610]:
# Randomly compare 3 pairs of reviews again
# See reviews 10 & 13
review10, review13

('expectation music score fast turning', 'good product child need read books')

In [611]:
# Analyse similarity using user function defined above, & give score for comparing randomly selected reviews
compare_similarity(review10, review13)

  similarity = doc1.similarity(doc2)


0.6810882735658104

In [612]:
# See reviews 20 & 18
review20, review18

('great read external pdfs properly zooms little kindle books provides great experience battery life awesome',
 'item work easy read day light')

In [613]:
compare_similarity(review20, review18) # Gives score for  comparing reviews 20 & 18

  similarity = doc1.similarity(doc2)


0.7297337505338036

In [614]:
# See reviews 19 & 13
review19, review3

('great product service refer friend sales man r good',
 'nt know use kindle went lower end m happy little dark')

In [615]:
compare_similarity(review19, review3) # Gives score for comparing reviews 19 & 3


  similarity = doc1.similarity(doc2)


0.5144639768860673

From this further comparison of reviews, again, the comments do not appear to be as similar as the score indicates. 

So in conclusion, the similarity method does not appear to be as effective as the polarity attribute.   The user warning message in line 603 regarding the use of small models, may be affecting the way the sentences have been analysed. It may be possible that they have been tagged similarly more because of the subject matter, rather than the actual sentiment.


