In [124]:
# Import Libraries
import spacy
import pandas as pd
from spacy.lang.en.stop_words import STOP_WORDS
import string
from spacytextblob.spacytextblob import SpacyTextBlob

In [125]:
# Load the spaCy pipeline named "en_core_web_md"; every later cell shares this `nlp` object
nlp = spacy.load("en_core_web_md")

# Register the "spacytextblob" component on the pipeline; analyze_sentiment reads
# its result through `doc._.blob`. As the last expression of the cell, the call's
# return value is what the notebook displays.
nlp.add_pipe("spacytextblob")

<spacytextblob.spacytextblob.SpacyTextBlob at 0x252e5c62a80>

In [126]:
# Load Dataset
# The CSV is referenced by a relative path, so it must be present in the
# notebook's working directory for this cell to run.
df = pd.read_csv("Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv")

In [127]:
# Data Cleaning and Preprocessing
# Count the missing values in each column; the result is displayed as the cell output
df.isnull().sum()


id                         0
dateAdded                  0
dateUpdated                0
name                       0
asins                      0
brand                      0
categories                 0
primaryCategories          0
imageURLs                  0
keys                       0
manufacturer               0
manufacturerNumber         0
reviews.date               0
reviews.dateSeen           0
reviews.didPurchase    28323
reviews.doRecommend    12246
reviews.id             28291
reviews.numHelpful     12217
reviews.rating             0
reviews.sourceURLs         0
reviews.text               0
reviews.title              0
reviews.username           5
sourceURLs                 0
dtype: int64

In [128]:
# Drop the four columns that the null count above shows to have the most missing values.
# errors="ignore" keeps this cell re-runnable: `df` is reassigned from itself, so on a
# second execution the columns are already gone and a plain drop would raise KeyError.
df = df.drop(
    columns=["reviews.didPurchase", "reviews.doRecommend", "reviews.id", "reviews.numHelpful"],
    errors="ignore",
)

In [129]:
# Review Text
# Unfiltered review-text column, taken before any rows are dropped
# (NOTE(review): not referenced again in the visible cells — confirm it is still needed)
reviews_data = df["reviews.text"]
# Keep only the rows that have a review text; all other columns are retained
clean_data = df.dropna(subset=["reviews.text"])

In [130]:
# Text Preprocessing
def preprocess_text(text):
    """
    Normalise a piece of text into a space-separated string of lemmas.

    The input is coerced to ``str``, lowercased and stripped of surrounding
    whitespace, then parsed with the module-level ``nlp`` pipeline. Stop words,
    punctuation and whitespace tokens are discarded; the lemma of every
    remaining token is kept, in document order.

    Parameters
    ----------
    text : object
        The text to clean. Non-string values are converted with ``str``;
        ``None`` and float NaN are treated as missing.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces, or ``""`` when the input
        is missing, blank, or contains nothing but discarded tokens.
    """
    # A float NaN is the only value that is not equal to itself. Without this
    # guard str(NaN) would leak the literal word "nan" into the cleaned text.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower().strip()
    if not text:
        return ""

    tokens = []
    for token in nlp(text):
        if token.text in STOP_WORDS:
            continue
        # is_punct / is_space catch punctuation and whitespace tokens as
        # classified by spaCy; runs of spaces or newlines become tokens too.
        if token.is_punct or token.is_space:
            continue
        # Character-wise test instead of a substring test against
        # string.punctuation, so multi-character runs such as "..." or "!!"
        # are removed as well as single ASCII punctuation/symbol characters.
        if all(char in string.punctuation for char in token.text):
            continue
        tokens.append(token.lemma_)
    return " ".join(tokens)

In [131]:
# Sentiment Analysis Function
def analyze_sentiment(review):
    """
    Label a review as "Positive", "Negative" or "Neutral".

    The review is parsed with the module-level ``nlp`` pipeline and the
    polarity exposed at ``doc._.blob.polarity`` decides the label: above zero
    is positive, below zero is negative, anything else is neutral.

    Parameters
    ----------
    review : str
        Review text passed directly to ``nlp``.

    Returns
    -------
    str
        One of "Positive", "Negative" or "Neutral".
    """
    score = nlp(review)._.blob.polarity

    # Guard-style returns: only the sign of the score matters.
    if score > 0:
        return "Positive"
    if score < 0:
        return "Negative"
    return "Neutral"

In [132]:
# Test Model
# Run analyze_sentiment over a few hand-written reviews and print each review
# next to its predicted label. The sample strings are kept verbatim.
sample_reviews = [
    "Amazing, I wuld defintely buy this product again!",
    "Procuct is GREAT work very well.",
    "Just absolutely terrible! Would not recommend",
]

print("Your product review results: ")
for review in sample_reviews:
    # One print per review; sep="\n" puts the prediction on its own line and the
    # trailing "\n" in the second string leaves a blank line between reviews.
    print(
        f"Customer review: {review}",
        f"Predicted Sentiment: {analyze_sentiment(review)}\n",
        sep="\n",
    )

Your product review results: 
Customer review: Amazing, I wuld defintely buy this product again!
Predicted Sentiment: Positive

Customer review: Procuct is GREAT work very well.
Predicted Sentiment: Positive

Customer review: Just absolutely terrible! Would not recommend
Predicted Sentiment: Negative



In [133]:
# Similarity Check
# Take the first two review texts by position, clean each with preprocess_text,
# re-parse the cleaned strings and compare the two resulting docs.
review_1, review_2 = clean_data["reviews.text"].iloc[[0, 1]].tolist()

doc1, doc2 = (nlp(preprocess_text(raw_review)) for raw_review in (review_1, review_2))

similarity_score = doc1.similarity(doc2)

print("Review 1: ", review_1)
print("Review 2: ", review_2)
print(f"\nSimilarity Score: {similarity_score}")
# NOTE(review): if Doc.similarity is cosine-based the score could in principle be
# negative — confirm the range stated in the message below.
print("Score ranges from 0 (not similar) to 1 (very similar)")

Review 1:  I order 3 of them and one of the item is bad quality. Is missing backup spring so I have to put a pcs of aluminum to make the battery work.
Review 2:  Bulk is always the less expensive way to go for products like these

Similarity Score: 0.7479993104934692
Score ranges from 0 (not similar) to 1 (very similar)
