In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

# Apply the ggplot theme to every matplotlib figure drawn in this notebook.
plt.style.use("ggplot")

import nltk

In [2]:
# Load the Amazon Fine Food Reviews CSV into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='../input/amazon-fine-food-reviews/Reviews.csv'
)

In [3]:
# Keep only the first N_REVIEWS rows; every later cell works on this subset.
N_REVIEWS = 500

print(df.shape)  # shape before truncation
df = df.head(N_REVIEWS)
print(df.shape)  # shape after truncation

# Preview a few rows instead of dumping the whole frame.
df.head()

In [4]:
# How many reviews fall under each value of the Score column.
df.loc[:, 'Score'].value_counts()

In [5]:
# Bar chart of the number of reviews per score, ordered by ascending score
# (sort_index puts the bars in 1..5 order rather than by frequency).
ax = df['Score'].value_counts().sort_index().plot(kind="bar",
                                                 title="Count of Reviews by Stars",
                                                 figsize=(10, 5))
ax.set_xlabel('Review Stars')
ax.set_ylabel('Number of Reviews')
plt.show()

In [6]:
# Pick the review stored under row label 50 as the running example text.
example = df.loc[50, 'Text']
print(example)

In [7]:
# Tokenize the example review and peek at the first ten tokens.
tokens = nltk.tokenize.word_tokenize(example)
tokens[0:10]

In [8]:
# Attach a part-of-speech tag to every token; show the first ten pairs.
tagged = nltk.tag.pos_tag(tokens)
tagged[0:10]

In [9]:
# Chunk the tagged tokens into named entities and pretty-print the result.
entities = nltk.ne_chunk(tagged)
entities.pprint()

In [10]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

# One shared analyzer instance; the scoring cells below all call
# sia.polarity_scores(...) on it.
sia = SentimentIntensityAnalyzer()

In [11]:
# Quick sanity check of the analyzer on a short hand-written sentence.
sample_sentence = "this is so good!"
sia.polarity_scores(sample_sentence)

In [12]:
# Display the DataFrame that the polarity-scoring loop below iterates over.
df

In [13]:
# Score every review with the analyzer; results are keyed by the review's Id.
res = {
    review['Id']: sia.polarity_scores(review['Text'])
    for _, review in tqdm(df.iterrows(), total=len(df))
}

In [14]:
# Turn the {Id: scores} mapping into one row per review, expose the Id as a
# regular column, then left-join the original review columns onto it.
vaders = (
    pd.DataFrame(res)
    .T
    .reset_index()
    .rename(columns={'index': 'Id'})
    .merge(df, how='left')
)

In [15]:
# Preview the first five rows of the combined scores + reviews table.
vaders.head(n=5)

In [16]:
import seaborn as sns

# Compound sentiment score per star rating; seaborn's barplot aggregates
# the rows of each Score group (mean by default).
ax = sns.barplot(data=vaders, x='Score', y='compound')
ax.set_title('Compound Score by Amazon Star Review')
plt.show()

In [17]:
# One panel per sentiment component, each plotted against the star rating.
fig, axs = plt.subplots(1, 3, figsize=(12, 3))
panels = (
    ('pos', 'positive'),
    ('neu', 'neutral'),
    ('neg', 'negative'),
)
for panel, (column, title) in zip(axs, panels):
    sns.barplot(data=vaders, x='Score', y=column, ax=panel)
    panel.set_title(title)

# tight_layout must actually be *called*; the bare attribute reference
# `plt.tight_layout` is a no-op and leaves the panels overlapping.
plt.tight_layout()
plt.show()

In [18]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

In [19]:
# Name of the pretrained sentiment checkpoint; the tokenizer and the
# classification model are both loaded from this same identifier.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [20]:
# VADER results on the example review.
print(example)
# Print explicitly: this call is not the last expression of the cell, so
# without print() its result would be silently discarded.
print(sia.polarity_scores(example))

# A second, longer sample text.
example2 = """This was a good tutorial. I'm trying to get my feet wet in data analytics and found myself overwhelmed while trying to read the NLTK documentation, so thanks for the structured guidance.
I'm working on analyzing sentiment across a dataset I've gathered myself, so I wasn't following along in kaggle and hit a hiccup as AutoModelForSequenceClassification requires pytorch and I initialized a python 3.10 environment."""

In [21]:
from scipy.special import softmax
#run for roberta model
def polarity_scores_roberta(example, max_length=512):
    """Score one piece of text with the pretrained RoBERTa sentiment model.

    Parameters
    ----------
    example : str
        The text to classify.
    max_length : int, optional
        Maximum number of tokens passed to the model; longer inputs are
        truncated. The default of 512 is the usual RoBERTa-base limit
        (re-check it if MODEL is changed to a different checkpoint).

    Returns
    -------
    dict
        Softmax-normalised scores under the keys ``'roberta_neg'``,
        ``'roberta_neu'`` and ``'roberta_pos'``.
    """
    # Tokenize the *argument*. The previous version tokenized the global
    # `example2`, so every call returned the same scores regardless of input.
    # Truncation keeps long reviews from overflowing the model's input size.
    encoded_text = tokenizer(
        example,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    output = model(**encoded_text)
    # output[0] is the first model output (the logits); the second [0]
    # selects the only sequence in the batch.
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    scores_dict = {
        'roberta_neg': scores[0],
        'roberta_neu': scores[1],
        'roberta_pos': scores[2],
    }
    return scores_dict

In [22]:
# Score every review with the RoBERTa helper; results are keyed by review Id.
# Note that this rebinds `res`, replacing the scores collected earlier.
res = {
    review['Id']: polarity_scores_roberta(review['Text'])
    for _, review in tqdm(df.iterrows(), total=len(df))
}

In [26]:
# Replace the long sample text with a short, clearly positive sentence.
example2 = "the food is very good"

In [27]:
# Score the short sample sentence with the existing helper rather than
# repeating its tokenize -> model -> softmax steps inline.
scores_dict = polarity_scores_roberta(example2)
print(scores_dict)