In [1]:
# Import and load neccesary libraries 
import pandas as pd
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('spacytextblob')

<spacytextblob.spacytextblob.SpacyTextBlob at 0x280477da0>

In [2]:
# Load in dataset using pandas
df = pd.read_csv('Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv')

In [3]:
# Assign target column to variable 
reviews = df['reviews.text']
reviews.head()

0    I order 3 of them and one of the item is bad q...
1    Bulk is always the less expensive way to go fo...
2    Well they are not Duracell but for the price i...
3    Seem to work as well as name brand batteries a...
4    These batteries are very long lasting the pric...
Name: reviews.text, dtype: object

In [4]:
# Remove NA from reviews.text column 
clean_reviews = df.dropna(subset=['reviews.text'])
review_text = clean_reviews['reviews.text']
print(review_text)

0        I order 3 of them and one of the item is bad q...
1        Bulk is always the less expensive way to go fo...
2        Well they are not Duracell but for the price i...
3        Seem to work as well as name brand batteries a...
4        These batteries are very long lasting the pric...
                               ...                        
28327    I got 2 of these for my 8 yr old twins. My 11 ...
28328    I bought this for my niece for a Christmas gif...
28329    Very nice for light internet browsing, keeping...
28330    This Tablet does absolutely everything I want!...
28331    At ninety dollars, the expectionations are low...
Name: reviews.text, Length: 28332, dtype: object


In [5]:
# Function to clean text within the column by converting to lower case, removing stop words and punctuation.
def preprocess(text):
    
    doc = nlp(text.lower().strip())
    processed = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]
    
    return ' '.join(processed)


In [6]:
# Apply processing function to reviews within the dataframe and create a new column containing these cleaned reviews
review_text['processed.text'] = review_text.apply(preprocess)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_text['processed.text'] = review_text.apply(preprocess)


In [7]:
# Preview the cleaned reviews column 
print(review_text['processed.text'])

0        order 3 item bad quality miss backup spring pc...
1                          bulk expensive way product like
2                                     duracell price happy
3                            work brand battery well price
4                            battery long last price great
                               ...                        
28327    get 2 8 yr old twin 11 yr old well perfect way...
28328         buy niece christmas gift.she 9 year old love
28329    nice light internet browsing keep email view v...
28330    tablet absolutely want watch tv show movie che...
28331    ninety dollar expectionation low good table go...
Name: reviews.text, Length: 28332, dtype: object


In [12]:
# Function to perform sentiment analysis of the cleaned reviews using spacy's polarity function
def predict_sentiment(review):
    # Process the review text using the processing function
    clean_review = preprocess(review)
    # Analyze sentiment using spacy's TextBlob capabilities
    doc = nlp(clean_review)
    # Get polarity score
    polarity = doc._.polarity
    sent = doc._.blob.sentiment
    # Classify sentiment based on polarity score
    if polarity > 0:
        return 'Positive'
    elif polarity < 0:
        return 'Negative'
    else:
        return 'Neutral'
     
# Function for sentiment analysis using spacy's sentiment attribute values
def predict_sentiment_sent(review):
    # Process the review text using the processing function
    clean_review = preprocess(review)
    # Analyze sentiment using spaCy's TextBlob capabilities
    doc = nlp(clean_review)
    # Get sentiment attribute values
    sent = doc._.blob.sentiment
    # return sentiment attribute values 
    return sent

# Variables to select the index values for reviews within the cleaned column    
index_1 = 4
index_2 = 50
index_3 = 77
index_4 = 17   
    
    # Test the sentiment analysis function on some product reviews from the dataset
sample_reviews = [review_text['processed.text'][index_1], review_text['processed.text'][index_2], 
                  review_text['processed.text'][index_3], review_text['processed.text'][index_4]
                  ]
    
print("Product Review sentiment analysis:")
for review in sample_reviews:
    s = predict_sentiment(review)
    A = predict_sentiment_sent(review)
    print(f"Review: {review} \nSentiment based upon polarity is: {s}\nSentiment values using sentiment attribute are: {A}\n________________________________________________________________________________________________________________________")

Product Review sentiment analysis:
Review: battery long last price great 
Sentiment based upon polarity is: Positive
Sentiment values using sentiment attribute are: Sentiment(polarity=0.375, subjectivity=0.575)
________________________________________________________________________________________________________________________
Review: definitely love price quantity kid tthrough fast 
Sentiment based upon polarity is: Positive
Sentiment values using sentiment attribute are: Sentiment(polarity=0.35, subjectivity=0.6)
________________________________________________________________________________________________________________________
Review: last long duracell 
Sentiment based upon polarity is: Negative
Sentiment values using sentiment attribute are: Sentiment(polarity=-0.05, subjectivity=0.4)
________________________________________________________________________________________________________________________
Review: opinion near long duracel thing like lead candle crazy trail ca

In [14]:
# Function that calculates similarity between two product reviews using spacy
def calculate_similarity(review1, review2):
    # Tokenize the reviews
    doc1 = nlp(review1)
    doc2 = nlp(review2)
    # Calculate similarity using spacy's similarity function
    similarity_score = doc1.similarity(doc2)
    return similarity_score
index1 = 0
index2 = 1

# Check if indices are within the bounds of the column
if index1 < len(review_text) and index2 < len(review_text):
    review1 = review_text['processed.text'][index1]
    review2 = review_text['processed.text'][index2]

    # clean the reviews using the preprocess function previously created
    clean_review1 = preprocess(review1)
    clean_review2 = preprocess(review2)

    # Calculate similarity between the two reviews using the .similarity function
    similarity_score = calculate_similarity(clean_review1, clean_review2)
    
    print(f'Review {index1}: {clean_review1}\nReview {index2}: {clean_review2}')
    
    print(f"The similarity between Review {index1} and Review {index2} is: {similarity_score}")
else:
    print("Index out of bounds. Please select a number within the column length.")

Review 0: order 3 item bad quality miss backup spring pc aluminum battery work
Review 1: bulk expensive way product like
The similarity between Review 0 and Review 1 is: 0.6573121598608317


  similarity_score = doc1.similarity(doc2)
