In [145]:
# Import the necessary packages
import spacy
import pandas as pd
import numpy as np

In [147]:
# Put our raw data into a dataframe
df = pd.read_csv('amazon_product_reviews.csv')
# Output the top 5 rows so we have an initial idea of the dataset
df.head()

Unnamed: 0,id,dateAdded,dateUpdated,name,asins,brand,categories,primaryCategories,imageURLs,keys,...,reviews.dateSeen,reviews.doRecommend,reviews.id,reviews.numHelpful,reviews.rating,reviews.sourceURLs,reviews.text,reviews.title,reviews.username,sourceURLs
0,AVqVGZNvQMlgsOJE6eUY,2017-03-03T16:56:05Z,2018-10-25T16:36:31Z,"Amazon Kindle E-Reader 6"" Wifi (8th Generation...",B00ZV9PXP2,Amazon,"Computers,Electronics Features,Tablets,Electro...",Electronics,https://pisces.bbystatic.com/image2/BestBuy_US...,allnewkindleereaderblack6glarefreetouchscreend...,...,"2018-05-27T00:00:00Z,2017-09-18T00:00:00Z,2017...",False,,0,3,http://reviews.bestbuy.com/3545/5442403/review...,I thought it would be as big as small paper bu...,Too small,llyyue,https://www.newegg.com/Product/Product.aspx%25...
1,AVqVGZNvQMlgsOJE6eUY,2017-03-03T16:56:05Z,2018-10-25T16:36:31Z,"Amazon Kindle E-Reader 6"" Wifi (8th Generation...",B00ZV9PXP2,Amazon,"Computers,Electronics Features,Tablets,Electro...",Electronics,https://pisces.bbystatic.com/image2/BestBuy_US...,allnewkindleereaderblack6glarefreetouchscreend...,...,"2018-05-27T00:00:00Z,2017-07-07T00:00:00Z,2017...",True,,0,5,http://reviews.bestbuy.com/3545/5442403/review...,This kindle is light and easy to use especiall...,Great light reader. Easy to use at the beach,Charmi,https://www.newegg.com/Product/Product.aspx%25...
2,AVqVGZNvQMlgsOJE6eUY,2017-03-03T16:56:05Z,2018-10-25T16:36:31Z,"Amazon Kindle E-Reader 6"" Wifi (8th Generation...",B00ZV9PXP2,Amazon,"Computers,Electronics Features,Tablets,Electro...",Electronics,https://pisces.bbystatic.com/image2/BestBuy_US...,allnewkindleereaderblack6glarefreetouchscreend...,...,2018-05-27T00:00:00Z,True,,0,4,https://reviews.bestbuy.com/3545/5442403/revie...,Didnt know how much i'd use a kindle so went f...,Great for the price,johnnyjojojo,https://www.newegg.com/Product/Product.aspx%25...
3,AVqVGZNvQMlgsOJE6eUY,2017-03-03T16:56:05Z,2018-10-25T16:36:31Z,"Amazon Kindle E-Reader 6"" Wifi (8th Generation...",B00ZV9PXP2,Amazon,"Computers,Electronics Features,Tablets,Electro...",Electronics,https://pisces.bbystatic.com/image2/BestBuy_US...,allnewkindleereaderblack6glarefreetouchscreend...,...,2018-10-09T00:00:00Z,True,177283626.0,3,5,https://redsky.target.com/groot-domain-api/v1/...,I am 100 happy with my purchase. I caught it o...,A Great Buy,Kdperry,https://www.newegg.com/Product/Product.aspx%25...
4,AVqVGZNvQMlgsOJE6eUY,2017-03-03T16:56:05Z,2018-10-25T16:36:31Z,"Amazon Kindle E-Reader 6"" Wifi (8th Generation...",B00ZV9PXP2,Amazon,"Computers,Electronics Features,Tablets,Electro...",Electronics,https://pisces.bbystatic.com/image2/BestBuy_US...,allnewkindleereaderblack6glarefreetouchscreend...,...,2018-05-27T00:00:00Z,True,,0,5,https://reviews.bestbuy.com/3545/5442403/revie...,Solid entry level Kindle. Great for kids. Gift...,Solid entry-level Kindle. Great for kids,Johnnyblack,https://www.newegg.com/Product/Product.aspx%25...


In [148]:
# Isolate the reviews column into it's own dataframe
text_data = df[['reviews.text']]
# Output the top values to see this has worked correctly
text_data.head()

Unnamed: 0,reviews.text
0,I thought it would be as big as small paper bu...
1,This kindle is light and easy to use especiall...
2,Didnt know how much i'd use a kindle so went f...
3,I am 100 happy with my purchase. I caught it o...
4,Solid entry level Kindle. Great for kids. Gift...


In [149]:
# Check for null values, since there are none in this case, we can proceed
text_data.isnull().sum()

reviews.text    0
dtype: int64

In [170]:
# # Load the language model (in this case small)
nlp = spacy.load('en_core_web_sm')

# Create a function to process the data 
def preprocess(data):
    # Tokenise the data via our language model
    doc = nlp(data.lower().strip())
    # Remove unecessary data
    processed = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    # Append the data
    return ' '.join(processed)

In [151]:
# Create a new column called processed.text by taking the review data and applying my preprocess function to it
text_data['processed.text'] = text_data['reviews.text'].apply(preprocess)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_data['processed.text'] = text_data['reviews.text'].apply(preprocess)


In [171]:
# Check the new column
text_data.head()

Unnamed: 0,reviews.text,processed.text
0,I thought it would be as big as small paper bu...,think big small paper turn like palm think sma...
1,This kindle is light and easy to use especiall...,kindle light easy use especially beach
2,Didnt know how much i'd use a kindle so went f...,not know use kindle go low end m happy little ...
3,I am 100 happy with my purchase. I caught it o...,100 happy purchase catch sale good price norma...
4,Solid entry level Kindle. Great for kids. Gift...,solid entry level kindle great kid gift kid fr...


In [178]:
# Import an additional package
from spacytextblob.spacytextblob import SpacyTextBlob

# Create a function to find the polarity
def spacy_polarity(text):
    # Run the data through the model
    doc = nlp(text)
    # Uncomment the below line to add the pipeline
    # nlp.add_pipe('spacytextblob')
    # Find and attribute polarity
    polarity_value = doc._.blob.polarity
    # Return the polarity
    return polarity_value

In [179]:
# Isolate the values from the dataframe
data = text_data['processed.text'].values
# Create an empty array to fill
pol_data = []
# Loop through the values for each review
for text in data:
    # Call upon our function to find the polarity
    val = spacy_polarity(text)
    # Append these values to the empty array
    pol_data.append(val)

In [183]:
# Print out this array to see it has run correctly
pol_data

[-0.016666666666666663,
 0.2777777777777778,
 0.115625,
 0.2767857142857143,
 0.43200000000000005,
 0.5599999999999999,
 0.1772108843537415,
 -0.2,
 0.41818181818181815,
 0.2,
 0.327037037037037,
 0.37777777777777777,
 0.7,
 0.43333333333333335,
 0.4611111111111111,
 0.30000000000000004,
 0.8,
 0.4166666666666667,
 0.75,
 0.40208333333333335,
 0.2333333333333333,
 0.44722222222222224,
 0.2833333333333333,
 0.4166666666666667,
 0.43333333333333335,
 0.5,
 0.6666666666666666,
 0.0,
 0.4,
 0.25857142857142856,
 0.2,
 0.35000000000000003,
 0.05,
 0.5,
 0.9,
 0.10714285714285714,
 0.8,
 0.25,
 0.3333333333333333,
 0.012499999999999997,
 0.6,
 0.5,
 0.3666666666666667,
 0.47500000000000003,
 0.30727272727272725,
 0.3,
 0.05,
 0.35000000000000003,
 0.4,
 -0.05,
 0.182,
 0.7,
 0.35,
 0.05333333333333333,
 0.6166666666666667,
 0.21666666666666667,
 0.0,
 0.13999999999999999,
 0.28958333333333336,
 0.42500000000000004,
 0.0,
 0.20625,
 0.6333333333333333,
 0.43333333333333335,
 0.5,
 -0.25,
 0.3

In [184]:
# Create an empty array to fill
sentiment = []
# Run through the polarity of each review
for score in pol_data:
    # Check the score and apply an appropriate label to it
    if score > 0:
        sentiment.append('positive')
    elif score < 0:
        sentiment.append('negative')
    else:
        sentiment.append('neutral')

In [185]:
# Calculate the sum of all positive, negative and neutral reviews
posi = sentiment.count('positive')
nega = sentiment.count('negative')
neut = sentiment.count('neutral')

# Calculate the amount of all reviews 
tot = len(sentiment)

# Caluculate each of these as a percentage
posi_perc = (posi / tot) * 100
nega_perc = (nega / tot) * 100
neut_perc = (neut / tot) * 100

# Output the distribution of reviews
print(f"Positive percentage: {posi_perc:.2f}%")
print(f"Negative percentage: {nega_perc:.2f}%")
print(f"Neutral percentage: {neut_perc:.2f}%")

Positive percentage: 88.82%
Negative percentage: 5.00%
Neutral percentage: 6.18%


5.1 - The dataset here is various reviews on Amazon electronics.

5.2 - First we isolate our column, then we remove all NaN and error values from the list to clean this up. We also remove all stop words and punctuations and also decapitalise all the words for consistent comparison. This new version of the data is saved in a separate column.

5.3 - A vast majority of the reviews are positive accounting for 89% of the data. This is followed by neutral reviews at 6% and negative at 5%. 

5.4 - The language model we used was small, had we used medium, it would have been able to discernt polarity more accurately.