In [None]:
import nltk
# Download the NLTK data packages used later in this notebook.
# The mapping to the calls below follows NLTK's resource naming — confirm if the NLTK version changes.
nltk.download('punkt_tab')  # tokenizer data, presumably for nltk.word_tokenize
nltk.download('averaged_perceptron_tagger_eng')  # POS tagger model, presumably for nltk.pos_tag
nltk.download('maxent_ne_chunker_tab')  # named-entity chunker, presumably for nltk.chunk.ne_chunk
nltk.download('words')  # word list; presumably also needed by the NE chunker
nltk.download('vader_lexicon')  # lexicon presumably backing SentimentIntensityAnalyzer


: 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Use matplotlib's 'ggplot' style sheet for the figures drawn in this notebook.
plt.style.use('ggplot')

import opendatasets as od
import nltk

In [None]:
# Fetch the Amazon Fine Food Reviews dataset from Kaggle through opendatasets.
od.download("https://www.kaggle.com/datasets/snap/amazon-fine-food-reviews")

In [None]:
# Load the reviews from the folder created by the download step.
# The path is relative to the working directory (where od.download writes by
# default) instead of the Colab-only absolute '/content/...' location, so the
# notebook also runs outside Colab.
df = pd.read_csv('amazon-fine-food-reviews/Reviews.csv')

# Quick EDA

In [None]:
# Bar chart of the number of reviews per star rating, ordered by score.
score_counts = df['Score'].value_counts().sort_index()
score_counts.plot.bar(title='Count of reviews')

# Basics of NLTK


In [None]:
# Pick the review text stored under index label 50 as a running example.
example = df['Text'].loc[50]
example

In [None]:
# Split the example review into word-level tokens and preview the first ten.
tokens = nltk.tokenize.word_tokenize(example)
tokens[0:10]

In [None]:
# Attach a part-of-speech tag to every token and preview the first ten pairs.
tag = nltk.tag.pos_tag(tokens)
tag[0:10]

In [None]:
# Group the tagged tokens into named-entity chunks and pretty-print the tree.
entities = nltk.ne_chunk(tag)
entities.pprint()

# Step 1 : VADER Sentiment Scoring

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

# Single analyzer instance reused by every VADER scoring call below.
sia = SentimentIntensityAnalyzer()

In [None]:
# Sanity check: score a short, clearly positive sentence.
sia.polarity_scores('I am so happy')

In [None]:
# Score the example review picked earlier.
sia.polarity_scores(example)

Next, we run this polarity scoring on every review in the dataset.

In [None]:
# Map each review Id to its VADER polarity scores.
res = {}
# Iterate over just the two columns that are needed: df.iterrows() would build
# a full Series for every row, which is needlessly slow on a large frame.
for myid, text in tqdm(zip(df['Id'], df['Text']), total=len(df)):
    res[myid] = sia.polarity_scores(text)

In [None]:
# One row per review: VADER scores keyed by 'Id', joined with the review
# metadata from df on their shared column(s).
vaders = (
    pd.DataFrame(res)
    .T
    .rename_axis('Id')
    .reset_index()
    .merge(df, how='left')
)

In [None]:
# Preview only the first rows instead of rendering the whole frame.
vaders.head()

Plot VADER results

In [None]:
# Bar plot of the VADER compound score for each star rating.
fig, ax = plt.subplots()
sns.barplot(data=vaders, x='Score', y='compound', ax=ax)
ax.set_title('Compound score by amazon score review')
plt.show()

# Step 2 : RoBERTa Pretrained Model

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
# Hugging Face model id shared by the tokenizer and the classifier.
# Kept in its own constant so `model` always refers to the model object and
# never to a string (the original reused the name for both).
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

In [None]:
# vader result
print(example)
sia.polarity_scores(example)

In [None]:
def polarity_scores_roberta(example):
  """Score one text with the pretrained RoBERTa sentiment classifier.

  Uses the notebook-level ``tokenizer`` and ``model`` objects.

  Args:
    example: The text to score.

  Returns:
    A dict with the keys 'roberta_neg', 'roberta_neu' and 'roberta_pos',
    holding the softmax of the model output at positions 0, 1 and 2.

  Raises:
    Nothing is caught here; any error raised by the tokenizer or the model
    propagates to the caller.
  """
  # Local import keeps this cell self-contained; torch is already required by
  # return_tensors='pt'.
  import torch

  encoded_text = tokenizer(example, return_tensors='pt')
  # Inference only: skip autograd bookkeeping for the forward pass.
  with torch.no_grad():
    output = model(**encoded_text)
  # output[0][0] is presumably the logits row for the single input text.
  scores = softmax(output[0][0].detach().numpy())
  return {
    'roberta_neg': scores[0],
    'roberta_neu': scores[1],
    'roberta_pos': scores[2],
  }

In [None]:
# Map each review Id to its combined VADER + RoBERTa scores.
res = {}
# Iterate over just the two needed columns; df.iterrows() would build a full
# Series for every row.
for myid, text in tqdm(zip(df['Id'], df['Text']), total=len(df)):
  # Prefix the VADER keys so they stay distinguishable from the RoBERTa keys
  # after both dicts are merged.
  vader_result_rename = {
    f"vader_{key}": value
    for key, value in sia.polarity_scores(text).items()
  }

  # Only the model call is guarded, and myid is bound before the try so the
  # message below can always name the failing review.
  # NOTE(review): the RuntimeError presumably comes from reviews longer than
  # the model accepts — confirm.
  try:
    roberta_result = polarity_scores_roberta(text)
  except RuntimeError:
    print(f'Broke for id {myid}')
    continue

  res[myid] = {**vader_result_rename, **roberta_result}

In [None]:
# One row per scored review: name the Id index, turn it into a column, then
# join the review metadata from df on the shared column(s).
scores_by_id = pd.DataFrame(res).T
scores_by_id.index.name = 'Id'
results_df = pd.merge(scores_by_id.reset_index(), df, how='left')

In [None]:
# Preview the first rows of the combined scores and review data.
results_df.head()

In [None]:
# List the available columns (the score columns are used in the plot below).
results_df.columns

# Comparing Scores between Models

In [None]:
# Pairwise scatter plots of every VADER and RoBERTa score, coloured by the
# star rating of the review.
score_columns = [
    'vader_neg', 'vader_neu', 'vader_pos', 'vader_compound',
    'roberta_neg', 'roberta_neu', 'roberta_pos',
]
sns.pairplot(results_df, vars=score_columns, hue='Score', palette='tab10')
plt.show()

# Review Examples

In [None]:
# 1-star review that RoBERTa rates as the most positive.
one_star = results_df.loc[results_df['Score'].eq(1)]
one_star.sort_values('roberta_pos', ascending=False)['Text'].iloc[0]

In [None]:
# 5-star review that RoBERTa rates as the most negative.
five_star = results_df.loc[results_df['Score'].eq(5)]
five_star.sort_values('roberta_neg', ascending=False)['Text'].iloc[0]