# Sentiment Analysis on Amazon Fine Food Reviews

### `Sentiment Analysis` is the use of Natural Language Processing to identify the emotions behind text.

### Techniques to be used:

1. `VADER` - Bag of Words Approach
2. `Roberta Pretrained Model`

=============

# Part 1

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

In [None]:
# Load the dataset. Only a missing file is handled here; the error is
# re-raised after the hint so the notebook stops at this cell instead of
# failing later with a confusing NameError on `fine_food`.
try:
    fine_food = pd.read_csv('amazon-fine-food-reviews.csv')
    print('Let\'s go.............')
except FileNotFoundError:
    print('No such dataset.........')
    raise

In [None]:
# Preview the first 5 rows as a quick sanity check of the load
fine_food.head()

In [None]:
# Preview the last 5 rows
fine_food.tail()

In [None]:
# List the column labels of the dataset
fine_food.columns

In [None]:
# Number of rows and columns in the dataset, as a (rows, columns) tuple

fine_food.shape

In [None]:
# Summary of the dataset: column names, non-null counts, dtypes and memory usage

fine_food.info()

In [None]:
# Data type of every column

fine_food.dtypes

In [None]:
# Number of non-null values in each column

fine_food.count()

In [None]:
# Treat placeholder cells as missing data. Matching is on the whole cell value,
# so only cells that are exactly one of these strings become NaN:
#   ' '   a single space
#   '?'   a question mark
#   '.'   a full stop
#   '\N'  backslash-N (written '\\N' because the backslash must be escaped)
placeholder_values = [' ', '?', '.', '\\N']
fine_food = fine_food.replace(placeholder_values, np.nan)

In [None]:
# Count rows that are exact duplicates of an earlier row
fine_food.duplicated().sum()

In [None]:
# Total number of missing values across all columns
fine_food.isna().sum().sum()

===========

# Part 2

### Quick EDA

In [None]:
# Number of reviews for each value of the Score column
fine_food['Score'].value_counts()

In [None]:
# Bar chart of how many reviews each star rating received, ordered by rating
reviews_per_star = fine_food['Score'].value_counts().sort_index()
ax = reviews_per_star.plot.bar(title='Count of Reviews by Stars', figsize=(10, 5))

ax.set_xlabel('Review Stars')
plt.show()

`Observation`: 

1. Review scores range from 1 to 5 stars.
2. 5 star reviews are most common

==========

# Basic NLTK

In [None]:
# Pick the review text stored at index label 50 as a single worked example
example = fine_food.loc[50, 'Text']
example

In [None]:
# Tokenization: split the example review into tokens with NLTK
# NOTE(review): word_tokenize presumably needs NLTK tokenizer data that is
# downloaded outside this notebook; confirm before a fresh run.
tokens = nltk.word_tokenize(example)
tokens[:10] # print the first 10 tokens

In [None]:
# Part-of-speech tagging: pair every token with its POS tag
tagged = nltk.pos_tag(tokens)
tagged[:10] # print the first 10 

In [None]:
# Named-entity chunking: group the tagged tokens into chunks/entities
# and pretty-print the resulting tree
entities = nltk.chunk.ne_chunk(tagged)
entities.pprint()

========

# Step 1: VADER Sentiment Scoring

- We will use NLTK's `SentimentIntensityAnalyzer` to get the negative/neutral/positive scores of the text

- This uses a "bag of words" approach:

1. Stop words are removed
2. Each word is scored and combined to a total score

In [4]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

# Single VADER analyzer instance, reused by the scoring cells below
sia = SentimentIntensityAnalyzer()

In [5]:
# Example 1: VADER scores (neg / neu / pos / compound) for a clearly positive sentence
sia.polarity_scores('I am so happy') # positive statement

{'neg': 0.0, 'neu': 0.334, 'pos': 0.666, 'compound': 0.6115}

In [6]:
# Example 2: VADER scores (neg / neu / pos / compound) for a clearly negative sentence
sia.polarity_scores('This is the worst thing ever') # negative statement

{'neg': 0.451, 'neu': 0.549, 'pos': 0.0, 'compound': -0.6249}

In [None]:
# Score every review with VADER, keyed by the review's Id

res = {
    review['Id']: sia.polarity_scores(review['Text'])
    for _, review in tqdm(fine_food.iterrows(), total=len(fine_food))
}

In [None]:
# Preview a handful of entries only; echoing the whole dict would print one
# entry per review and bloat the notebook output.
dict(list(res.items())[:5])

In [None]:
# Build a frame from the score dicts; transposing puts one row per Id
# with the score names as columns
vaders = pd.DataFrame(res).T

In [None]:
# Turn the Id index into a regular column. rename() is called without
# inplace=True: with inplace=True it returns None, which would leave
# `vaders` as None and make the merge below fail.
vaders = vaders.reset_index().rename(columns={'index': 'Id'})

# merge with original dataset (join key stated explicitly)
vaders = vaders.merge(fine_food, how='left', on='Id')

In [None]:
# Check the merged frame: VADER scores alongside the original review columns
vaders.head()

In [None]:
# VADER compound score aggregated per star rating
fig, ax = plt.subplots()
sns.barplot(data=vaders, x='Score', y='compound', ax=ax)
ax.set_title('Compound Score by Amazon Star Review')
plt.show()

In [None]:
# One panel per VADER component. Each panel plots the column that matches
# its title (pos / neu / neg) rather than reusing 'pos' for all three.
fig, axs = plt.subplots(1, 3, figsize=(12, 3))
panels = [('pos', 'Positive'), ('neu', 'Neutral'), ('neg', 'Negative')]
for panel_ax, (column, title) in zip(axs, panels):
    sns.barplot(data=vaders, x='Score', y=column, ax=panel_ax)
    panel_ax.set_title(title)
plt.tight_layout()
plt.show()

========================

## `Roberta Pretrained Model`

1. Use a model trained on a large corpus of data

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
# Hugging Face model id; a plain string is enough because nothing is interpolated
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)  # tokenizer matching the model
model = AutoModelForSequenceClassification.from_pretrained(MODEL)  # load weights

====

In [None]:
# Re-print the example review and its VADER scores as a baseline for comparison
print(example)
sia.polarity_scores(example) # VADER

In [None]:
# Helper that scores a single text with the RoBERTa model

def polarity_scores_roberta(example):
    """Return RoBERTa sentiment probabilities for one piece of text.

    Uses the module-level ``tokenizer`` and ``model``.

    Parameters
    ----------
    example : str
        Text to score.

    Returns
    -------
    dict
        Keys ``'roberta_neg'``, ``'roberta_neu'`` and ``'roberta_pos'``,
        holding the softmax of the model's first three output scores
        (indices 0, 1 and 2 respectively).

    Notes
    -----
    The text is not truncated before it is passed to the model.
    NOTE(review): very long reviews presumably exceed the model's input
    limit and raise; confirm whether truncation is wanted.
    """
    # Local import keeps this cell self-contained; torch is already required
    # because the tokenizer is asked for PyTorch tensors ('pt').
    import torch

    encoded_text = tokenizer(example, return_tensors='pt')
    # Inference only: skip gradient tracking to save memory and time.
    with torch.no_grad():
        output = model(**encoded_text)
    logits = output[0][0].detach().numpy()  # first item of the batch, as numpy
    probabilities = softmax(logits)
    return {
        'roberta_neg': probabilities[0],
        'roberta_neu': probabilities[1],
        'roberta_pos': probabilities[2],
    }

=======

In [None]:
# Run both VADER and the roberta model on the dataset

res = {}
for i, row in tqdm(fine_food.iterrows(), total=len(fine_food)):
    # Bind the id before the try so the except message always names the current row.
    my_id = row['Id']
    try:
        text = row['Text']
        vader_result = sia.polarity_scores(text)

        # Prefix the VADER keys so they are distinguishable from the roberta_* keys.
        vader_result_rename = {
            f'vader_{key}': value for key, value in vader_result.items()
        }

        roberta_result = polarity_scores_roberta(text)

        # Combine the *prefixed* VADER scores with the RoBERTa scores; the
        # comparison cells further down look up the vader_* column names.
        res[my_id] = {**vader_result_rename, **roberta_result}
    except RuntimeError:
        # Rows the model cannot process are reported and skipped.
        print(f'Broke for id {my_id}')

In [None]:
# One row per Id with the vader_* and roberta_* scores as columns.
# rename() is called without inplace=True: with inplace=True it returns None,
# which would leave `results_df` as None and make the merge below fail.
results_df = (
    pd.DataFrame(res).T
    .reset_index()
    .rename(columns={'index': 'Id'})
)

# merge with original dataset (join key stated explicitly)
results_df = results_df.merge(fine_food, how='left', on='Id')

In [None]:
# Check the combined frame: model scores alongside the original review columns
results_df.head()

========

# Step 3

# `Combine and Compare`

In [None]:
# Pairwise comparison of the VADER and RoBERTa scores, coloured by star rating
sentiment_columns = [
    'vader_neg', 'vader_neu', 'vader_pos',
    'roberta_neg', 'roberta_neu', 'roberta_pos',
]
sns.pairplot(data=results_df, vars=sentiment_columns, hue='Score', palette='tab10')

# Step 4

# `Review Examples`

- Positive 1-Star and Negative 5-Star Reviews

Let's look at some examples where the model scoring and the review score differ the most.

In [None]:
# Text of the 1-star review with the highest RoBERTa positive score
results_df.query('Score == 1').sort_values('roberta_pos', ascending=False)['Text'].values[0]

In [None]:
# Text of the 1-star review with the highest VADER positive score
results_df.query('Score == 1').sort_values('vader_pos', ascending=False)['Text'].values[0]

Negative-sentiment 5-star reviews:

In [None]:
# Text of the 5-star review with the highest RoBERTa negative score
results_df.query('Score == 5').sort_values('roberta_neg', ascending=False)['Text'].values[0]

In [None]:
# Text of the 5-star review with the highest VADER negative score
results_df.query('Score == 5').sort_values('vader_neg', ascending=False)['Text'].values[0]

# The END