In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

In [None]:
# Load the pre-processed review dataset from the working directory
df = pd.read_csv(filepath_or_buffer='processed_data.csv')
# Report the dataset dimensions as (rows, columns)
print(df.shape)

In [None]:
# Preview the first five rows of the loaded frame
df.head(n=5)

In [None]:
# Pick the review text stored under row label 10 as a worked example
example_reviews = df.loc[10, 'review_content']
print(example_reviews)

In [None]:
# If you encounter a LookupError, run the following line once
# nltk.download()

In [None]:
# Tokenize the sample review with NLTK
tokens = nltk.word_tokenize(text=example_reviews)
# Show the first 20 tokens
tokens[0:20]

In [None]:
# Attach a part-of-speech tag to every token of the sample review
tags = nltk.pos_tag(tokens=tokens)
# Show the first 10 (token, tag) pairs
tags[0:10]

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
# tqdm.auto picks the notebook widget bar when it is available and falls back
# to the plain-text bar otherwise, so the progress loop does not hard-require
# ipywidgets the way tqdm.notebook does.
from tqdm.auto import tqdm

# Single analyzer instance; the cells below reuse it for every review
sia = SentimentIntensityAnalyzer()

In [None]:
# Score the sample review to see the shape of the analyzer's output
sia.polarity_scores(text=example_reviews)


In [None]:
# Run the polarity score on the entire dataset.
# `reviews` maps each row label (as a string) to the score dict returned by
# the analyzer for that row's review text.
reviews = {}
for row_label, content in tqdm(df['review_content'].items(), total=len(df)):
    # A missing review is read from CSV as NaN (a float), which the analyzer
    # cannot process. Score it as empty text so every row keeps an entry and
    # a later index-aligned merge still lines up with `df`.
    text = '' if pd.isna(content) else str(content)
    # Use `row_label` rather than `id` so the builtin id() is not shadowed
    reviews[str(row_label)] = sia.polarity_scores(text)

In [None]:
# Build a frame straight from the dict of score dicts: every review key
# becomes a column and every score name becomes a row
vaders = pd.DataFrame(data=reviews)
vaders

#### The process succeeded, but we want one row of sentiment scores per review, so we transpose the result.

In [None]:
# One row of scores per review: transpose, expose the review key as an 'Id'
# column, then attach the original columns by matching row index to row index
vaders = (
    pd.DataFrame(reviews)
    .transpose()
    .reset_index()
    .rename(columns={'index': 'Id'})
    .merge(df, how='left', left_index=True, right_index=True)
)
vaders

In [None]:
import math

# Treat missing ratings as 0, then bucket every rating by its floor
vaders['rating'] = vaders['rating'].fillna(0)
vaders['rating_bin'] = vaders['rating'].map(math.floor)

# Bar chart of the compound score for each rating bin.
# `colors` is kept as a notebook-level name because later plots reuse it.
colors = sns.color_palette('Set1', 5)
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(
    data=vaders,
    x='rating_bin',
    y='compound',
    hue='rating_bin',
    legend=False,
    palette=colors,
    ax=ax,
)
ax.set(title='Compound Score by Rating', xlabel='Rating', ylabel='Compound Score')
plt.show()


In [None]:
# Inspect the rows in rating bin 5 (or above) to see why its compound score
# looks so different from the other bins
is_top_bin = vaders['rating_bin'].ge(5)
vaders_check = vaders.loc[is_top_bin]
vaders_check.head()

#### Only 3 items fall in the rating-5 bin, and one of them has a score of 0.0000, which makes the confidence interval (CI) very wide.

In [None]:
# One bar-chart panel per VADER component score, side by side
fig, axs = plt.subplots(1, 3, figsize=(12, 3))
panels = (('pos', 'Positive'), ('neu', 'Neutral'), ('neg', 'Negative'))
for axis, (score_col, panel_title) in zip(axs, panels):
    sns.barplot(
        data=vaders,
        x='rating_bin',
        y=score_col,
        ax=axis,
        hue='rating_bin',
        legend=False,
        palette=colors,
    )
    axis.set_title(panel_title)
plt.tight_layout()
plt.show()

#### Interestingly, the sentiment analysis suggests that most of the reviews are neutral. The most significant difference across ratings is in the usage of negative words: we can observe a negative correlation between ratings and the use of negative words.

In [None]:
# Discount percentage vs compound score.
# Converting through str first keeps this cell re-runnable: after one run the
# column is already int, and a bare `.str` accessor would raise on it.
vaders['discount_percentage'] = (
    vaders['discount_percentage']
    .astype(str)
    .str.replace('%', '', regex=False)
    .astype(int)
)

fig, ax = plt.subplots(figsize=(12, 7))
# Draw on the axes created above explicitly rather than on the implicit
# "current axes", so the labels below are applied to the plotted axes
sns.regplot(data=vaders, x='discount_percentage', y='compound', dropna=True, ax=ax)
ax.set_xlabel('Discount Percentage (%)')
ax.set_ylabel('Compound Score')
ax.set_title('Discount Percentage vs Compound Score')
plt.show()

#### The compound score is only weakly correlated with the discount percentage of the product.