""" This program performs sentiment analysis on a dataset of product reviews. """

In [1]:
# Load the packages that the program will use
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import string
import random
from textblob import TextBlob

# Load the english model.
nlp = spacy.load('en_core_web_md')

# Read the csv file that contains the dataset.
df = pd.read_csv('amazon_product_reviews.csv')

In [2]:
# Select the column that contains data we need for the analysis.
reviews = df['reviews.text']

# Call the variable to check it has the desired content.
reviews

0       I thought it would be as big as small paper bu...
1       This kindle is light and easy to use especiall...
2       Didnt know how much i'd use a kindle so went f...
3       I am 100 happy with my purchase. I caught it o...
4       Solid entry level Kindle. Great for kids. Gift...
                              ...                        
4995    This is a great tablet for the price. Amazon i...
4996    This tablet is the perfect size and so easy to...
4997    Purchased this for my son. Has room to upgrade...
4998    I had some thoughts about getting this for a 5...
4999    this is a steal, have 8 gb model as well.This ...
Name: reviews.text, Length: 5000, dtype: object

In [3]:
# Clean the data from missing values
clean_df = df.dropna(subset=['reviews.text'])

In [4]:
# Define a function to process and clean the text of the reviews

def process_data(text):

    # Lowercasing the text to have consistency between the words.
    text = text.lower()     

    # Removing special characters and punctuation from the text
    text = ''.join([char for char in text if char not in string.punctuation])

    doc = nlp(text)     # Tokenization of each word in the text

    # Removing stop words and doing lemmatization to the tokens
    tokens = [token.lemma_ for token in doc if token not in STOP_WORDS]

    processed_text = ' '.join(tokens)    # Joining tokens into a string

    return processed_text
    

In [5]:
# Create a function to do sentiment analysis on the processed_text
def analyze_sentiment(text):

    analysis = TextBlob(text)       # Create the TextBlob object to analize the sentiment polarity

    # Create a conditional statmenet to check the polarity score and express it in words
    if analysis.sentiment.polarity > 0:
        return "Positive"
    elif analysis.sentiment.polarity == 0:
        return "Neutral"
    else:
        return "Negative"

In [7]:
# Testing the program

text = "I do not like this tablet"        # Declaring a variable to store a string

cleaned_text = process_data(text)       # Applying the function to the text

analized_text = analyze_sentiment(cleaned_text)     # Analyzing the sentiment of the text

# Display the results
print(cleaned_text)
print(f"- Has a {analized_text} sentiment.")

I do not like this tablet
- Has a Neutral sentiment.


In [10]:
# Create variables to store random indexex
random_index = random.randint(0, len(df) - 1)
random_index_2 = random.randint(0, len(df) - 1)

# Select random rows from the data
chosen_review_A = df.loc[random_index, 'reviews.text']
chosen_review_B = df.loc[random_index_2, 'reviews.text']

# Apply the function to clean the text of the chosen reviews for the analysis
cleaned_text_A = process_data(chosen_review_A)
cleaned_text_B = process_data(chosen_review_B)

# Call the variable with cleaned text to check the result
analized_text = analyze_sentiment(cleaned_text_A)

# Display the results
print(f"- The review number:", random_index)
print(cleaned_text_A)
print(f"- Has a {analized_text} sentiment.")

- The review number: 2234
I m enjoy the ease of have several reading option without the bulk
- Has a Positive sentiment.


In [21]:
# Check the similarity between two reviews
doc_A = nlp(cleaned_text_A)
doc_B = nlp(cleaned_text_B)

similarity = doc_A.similarity(doc_B)

assumption = ''     # Create a variable to store the conslusion after the similarity score

# Create a conditional statement to check the similarity score and give an assumption
if similarity <= 0.5:
    assumption = "similar"
else:
    assumption = "NOT similar"

# Display the results
print(f"- The review number {random_index}: {cleaned_text_A}")
print(f"- The review number {random_index_2}: {cleaned_text_B}")
print(f"- Have a Similarity score of: {round(similarity, 2)}"
f" and we can assume that the two chosen reviews are {assumption}.")

- The review number 2234: I m enjoy the ease of have several reading option without the bulk
- The review number 2039: move all the ad app to a folder and load what I want small size fit in my sport coat pocket
- Have a Similarity score of: 0.73 and we can assume that the two chosen reviews are NOT similar.
