# Sentiment Analysis: Food Reviews

Techniques used:
1. VADER (Valence Aware Dictionary and sEntiment Reasoner) – a bag-of-words approach
2. RoBERTa pretrained model from Hugging Face

## Read the Data and NLTK Basics

In [None]:
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import torch

plt.style.use('ggplot')

In [None]:
# Load only the first N_REVIEWS rows. Passing nrows stops the parser early
# instead of reading the whole CSV and then throwing most of it away.
DATA_PATH = '/kaggle/input/amazon-fine-food-reviews/Reviews.csv'
N_REVIEWS = 10000  # sample size used by every cell that reads `df`
df = pd.read_csv(DATA_PATH, nrows=N_REVIEWS)
print(df.shape)

# EDA of the dataset

In [None]:
# Preview the first five rows to see which columns the reviews table has.
df.head(5)


In [None]:
# Number of reviews per star rating, ordered by rating rather than by count.
score_counts = df['Score'].value_counts().sort_index()
ax = score_counts.plot.bar(title='Count of Reviews Based on Stars',
                           figsize=(10, 5))
ax.set_xlabel('Review Stars')
plt.show()

## NLTK 

In [None]:
# Use the review text stored under row label 50 as the running example.
example = df.loc[50, 'Text']
print(example)

In [None]:
# Split the example review into tokens with NLTK and show the first ten.
# NOTE(review): word_tokenize needs NLTK tokenizer data that this notebook
# never downloads - presumably it is preinstalled here; confirm.
tokens = nltk.word_tokenize(example)
tokens[:10]

In [None]:
# Fetch the tagger resource, then pair every token with its part-of-speech
# tag; `tagged` is a list of (token, tag) tuples.
nltk.download('averaged_perceptron_tagger_eng')
tagged = nltk.pos_tag(tokens)
tagged[:10]

In [None]:
# Fetch the named-entity chunker resource.
nltk.download('maxent_ne_chunker_tab')
# Group the POS-tagged tokens into named-entity chunks and print the
# resulting tree.
entities= nltk.chunk.ne_chunk(tagged)
entities.pprint()

# 1. VADER Sentiment Scoring

Uses NLTK's `SentimentIntensityAnalyzer` to get negative/neutral/positive (`neg`/`neu`/`pos`) scores for a piece of text, plus an overall `compound` score.
* Uses a "bag of words" approach:
   1. Stop words are removed
   2. Each word is scored, and the word scores are combined into a total score
* Doesn't account for relationships between words

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

# One shared VADER analyzer, reused by every scoring cell below.
sia = SentimentIntensityAnalyzer()

In [None]:
# Display the analyzer object to confirm it was constructed.
sia

In [None]:
# Sanity check on an obviously positive sentence.
sia.polarity_scores("I am so happy!")

In [None]:
# Sanity check on an obviously negative sentence.
sia.polarity_scores("this is the worst thing ever.")

In [None]:
# VADER scores for the example review.
sia.polarity_scores(example)

In [None]:
# Score every review with VADER; `res` maps review Id -> dict of scores.
res = {}
for _, review in tqdm(df.iterrows(), total=len(df)):
    res[review['Id']] = sia.polarity_scores(review['Text'])

In [None]:
# `res` is keyed by review Id, so transposing gives one row per review and
# one column per VADER score.
vaders = pd.DataFrame(res).T
vaders = vaders.reset_index().rename(columns={'index': 'Id'})
# Join the review metadata back on. The key is named explicitly so the join
# cannot silently start matching on some other column the two frames share.
vaders = vaders.merge(df, how='left', on='Id')

In [None]:
# Sentiment scores merged with the review metadata: one row per scored
# review, score columns first, then the columns of `df`.
vaders

# Plot VADER Results

In [None]:
# VADER compound score aggregated per star rating.
fig, ax = plt.subplots()
sns.barplot(data=vaders, x='Score', y='compound', ax=ax)
ax.set_title('Compound Score By Amazon Review Stars')
plt.show()

In [None]:
# One panel per VADER component, each plotted against the star rating.
fig, axs = plt.subplots(1, 3, figsize=(12, 3))
panels = [('pos', 'Positive'), ('neu', 'Neutral'), ('neg', 'Negative')]
for axis, (column, title) in zip(axs, panels):
    sns.barplot(data=vaders, x='Score', y=column, ax=axis)
    axis.set_title(title)
plt.tight_layout()
plt.show()

# 2. RoBERTa Pretrained Model
* The model is pretrained on a large amount of data
* As a transformer model, it accounts for each word and its context relative to the other words

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
# Hugging Face model id. The tokenizer and the classifier are both loaded
# from this same checkpoint so their vocabularies match.
# (Plain string: the original f-prefix had no placeholders to format.)
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
# VADER results on the example review, shown again for a side-by-side
# comparison with the RoBERTa scores.
print(example)
sia.polarity_scores(example)

In [None]:
# Run the RoBERTa model on the example review:
# tokenize -> forward pass -> softmax over the logits of the single input.
encoded_text = tokenizer(example, return_tensors='pt')
output = model(**encoded_text)
scores = softmax(output[0][0].detach().numpy())
labels = ('roberta_neg', 'roberta_neu', 'roberta_pos')
scores_dict = {label: scores[idx] for idx, label in enumerate(labels)}
print(scores_dict)

In [None]:
def polarity_scores_roberta(example, max_length=512):
    """Score one review with the RoBERTa sentiment model.

    Parameters
    ----------
    example : str
        Review text. Anything that is not a string (for example ``None`` or
        a NaN coming from a missing CSV field) is treated like empty text.
    max_length : int, optional
        Maximum number of tokens passed to the model; longer texts are
        truncated. NOTE(review): 512 is assumed to be the limit of this
        RoBERTa checkpoint - confirm against the model config.

    Returns
    -------
    dict
        Keys ``'roberta_neg'``, ``'roberta_neu'`` and ``'roberta_pos'``.
        All three are 0 when there is no text to score; otherwise they are
        the softmax of the model's logits for the input.
    """
    if not isinstance(example, str) or not example.strip():
        return {'roberta_neg': 0, 'roberta_neu': 0, 'roberta_pos': 0}

    # max_length is passed explicitly because truncation=True on its own
    # relies on the tokenizer declaring a maximum length; if it does not,
    # nothing is truncated and over-long reviews fail inside the model.
    encoded_text = tokenizer(example, return_tensors='pt',
                             truncation=True, padding=True,
                             max_length=max_length)
    # Inference only: skip autograd bookkeeping to save time and memory.
    with torch.no_grad():
        output = model(**encoded_text)
    scores = softmax(output[0][0].detach().numpy())
    return {'roberta_neg': scores[0],
            'roberta_neu': scores[1],
            'roberta_pos': scores[2]}

In [None]:
# Score every review with both models; `res` maps review Id -> one dict
# holding the vader_* and roberta_* scores.
res = {}
for i, row in tqdm(df.iterrows(), total=len(df)):
    # Read the row outside the try block so the id in the failure message
    # always belongs to the review that actually failed.
    text = row['Text']
    myid = row['Id']

    # Prefix the VADER keys so they are recognisable as VADER scores next to
    # the roberta_* keys.
    vader_result_rename = {
        f"vader_{key}": value
        for key, value in sia.polarity_scores(text).items()
    }

    # Only the model call is guarded: a review it cannot process is reported
    # and skipped (best effort) rather than aborting the whole run.
    try:
        roberta_result = polarity_scores_roberta(text)
    except RuntimeError:
        print(f"Failed for id {myid}")
        continue

    res[myid] = vader_result_rename | roberta_result
        

In [None]:
# `res` is keyed by review Id, so transposing gives one row per scored
# review and one column per vader_* / roberta_* score.
results_df = pd.DataFrame(res).T
results_df = results_df.reset_index().rename(columns={'index': 'Id'})
# Join the review metadata back on. The key is named explicitly so the join
# cannot silently start matching on some other column the two frames share.
results_df = results_df.merge(df, how='left', on='Id')

In [None]:
# Preview the combined VADER + RoBERTa scores alongside the review metadata.
results_df.head()

# Comparing Scores between the Models

In [None]:
# List the available columns before choosing which ones to plot.
results_df.columns

In [None]:
# Pairwise scatter plots of every VADER and RoBERTa score, coloured by the
# star rating of the review.
score_columns = [
    f'{source}_{label}'
    for source in ('vader', 'roberta')
    for label in ('neg', 'neu', 'pos')
]
sns.pairplot(data=results_df, vars=score_columns, hue='Score', palette='tab10')
plt.show()

# Reviewing Examples

In [None]:
# 1-star review that RoBERTa nevertheless rates as positive
# (position 1 = second-highest roberta_pos among the 1-star reviews).
one_star = results_df.loc[results_df['Score'] == 1]
one_star.sort_values('roberta_pos', ascending=False)['Text'].iloc[1]

In [None]:
# 1-star review with the highest VADER positive score.
one_star = results_df.loc[results_df['Score'] == 1]
one_star.sort_values('vader_pos', ascending=False)['Text'].iloc[0]

In [None]:
# 5-star review that RoBERTa nevertheless rates as negative
# (position 3 = fourth-highest roberta_neg among the 5-star reviews).
five_star = results_df.loc[results_df['Score'] == 5]
five_star.sort_values('roberta_neg', ascending=False)['Text'].iloc[3]

In [None]:
# 5-star review with the highest VADER negative score.
five_star = results_df.loc[results_df['Score'] == 5]
five_star.sort_values('vader_neg', ascending=False)['Text'].iloc[0]