# In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from tqdm.notebook import tqdm
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax


# Fetch the NLTK data packages this script asks for, in the original order,
# then switch matplotlib to the ggplot style for every figure drawn afterwards.
for _resource in (
    'punkt',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
    'vader_lexicon',
):
    nltk.download(_resource)

plt.style.use('ggplot')


# Load the review dataset and report its full size.
df = pd.read_csv('Reviews.csv')
print(df.shape)

# Keep only the first 500 rows for the rest of the analysis.
df = df.iloc[:500]
print(df.shape)


# Bar chart of how many reviews carry each 'Score' value, ordered by score.
score_counts = df['Score'].value_counts().sort_index()
ax = score_counts.plot(
    kind='bar',
    title='Count of Reviews by Stars',
    figsize=(10, 5),
)
ax.set_xlabel('Review Stars')
plt.show()


# Pick the review stored under row label 50 as the running example.
example = df.loc[50, "Text"]
print(example)


# Split the review into tokens and preview the first ten.
tokens = nltk.word_tokenize(example)
print(tokens[:10])


# Part-of-speech tag the tokens and preview the first ten pairs.
tagged = nltk.pos_tag(tokens)
print(tagged[:10])


# Run NLTK's named-entity chunker over the tagged tokens and pretty-print it.
entities = nltk.ne_chunk(tagged)
entities.pprint()


# Build the VADER analyzer and try it on a toy sentence, then on the example.
sia = SentimentIntensityAnalyzer()
for _sample in ('I am sad!', example):
    print(sia.polarity_scores(_sample))


# Run VADER on every review and key the score dicts by review Id.
# The two columns are iterated directly: DataFrame.iterrows() would build a
# full Series for each row only to read 'Id' and 'Text' back out of it.
res_vader = {
    myid: sia.polarity_scores(text)
    for myid, text in tqdm(zip(df['Id'], df['Text']), total=len(df))
}

# One row per review Id with the VADER score columns, then attach the review
# columns from df. The join key is named explicitly rather than left to
# whichever column names the two frames happen to share.
vaders = pd.DataFrame(res_vader).T.reset_index().rename(columns={'index': 'Id'})
vaders = vaders.merge(df, how='left', on='Id')
print(vaders.shape)


# VADER compound score plotted against the review's 'Score'.
ax = sns.barplot(data=vaders, x='Score', y='compound', color='yellow')
ax.set_title('Compound Score by Reviews')
plt.show()

# Three side-by-side panels: positive, neutral and negative VADER scores.
fig, axs = plt.subplots(1, 3, figsize=(13, 3))
panels = (('pos', 'Positive'), ('neu', 'Neutral'), ('neg', 'Negative'))
for panel_ax, (column, title) in zip(axs, panels):
    sns.barplot(data=vaders, x='Score', y=column, ax=panel_ax)
    panel_ax.set_title(title)
plt.show()


# Load the pretrained sentiment checkpoint together with its tokenizer.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)


# Score the example review: tokenize to PyTorch tensors, run the model, and
# softmax the first row of its first output into three values.
encoded_text = tokenizer(example, return_tensors='pt')
output = model(**encoded_text)
scores = softmax(output[0][0].detach().numpy())
roberta_labels = ('roberta_neg', 'roberta_neu', 'roberta_pos')
scores_dict = {label: scores[idx] for idx, label in enumerate(roberta_labels)}
print(scores_dict)


def polarity_scores_roberta(text, max_length=512):
    """Score ``text`` with the RoBERTa sentiment model.

    The text is tokenized with the module-level ``tokenizer``, passed through
    the module-level ``model``, and the first row of the model's first output
    is converted with ``softmax``.

    Inputs longer than ``max_length`` tokens are truncated before the forward
    pass. Without truncation an over-long text is handed to the model as-is;
    RoBERTa-base checkpoints only have position embeddings for roughly 512
    tokens, so such inputs fail inside the model instead of being scored.

    Args:
        text: The text to score.
        max_length: Maximum number of tokens (special tokens included) passed
            to the model. Defaults to 512.

    Returns:
        A dict with keys ``'roberta_neg'``, ``'roberta_neu'`` and
        ``'roberta_pos'`` holding elements 0, 1 and 2 of the softmaxed output.
    """
    encoded_text = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        # Passed explicitly so truncation does not depend on the tokenizer
        # config declaring a usable model_max_length.
        max_length=max_length,
    )
    output = model(**encoded_text)
    # detach() drops the autograd graph so the tensor can become a NumPy array.
    scores = softmax(output[0][0].detach().numpy())
    return {
        'roberta_neg': scores[0],
        'roberta_neu': scores[1],
        'roberta_pos': scores[2],
    }


# Review Ids deliberately left out of the combined scoring pass.
# NOTE(review): presumably reviews the RoBERTa model cannot handle; confirm.
SKIPPED_IDS = {83, 187}

# Score each remaining review with both models and store one flat dict per
# Id: VADER keys get a "vader_" prefix, RoBERTa keys already carry "roberta_".
res_roberta = {}
for myid, text in tqdm(zip(df['Id'], df['Text']), total=len(df)):
    if myid in SKIPPED_IDS:
        continue
    vader_result = sia.polarity_scores(text)
    vader_result_rename = {f"vader_{k}": v for k, v in vader_result.items()}
    # Only the transformer call is guarded, so the handler can never run
    # before myid is bound and unrelated failures are not mislabelled.
    try:
        roberta_result = polarity_scores_roberta(text)
    except RuntimeError:
        print(f'Broke for id {myid}')
        continue
    res_roberta[myid] = {**vader_result_rename, **roberta_result}

# Turn the per-Id score dicts into a frame with one row per review Id.
result_df = pd.DataFrame(res_roberta).T
result_df = result_df.reset_index()
result_df = result_df.rename(columns={'index': 'Id'})

# Cast the join key to str on both frames (df is modified in place) so the
# two 'Id' columns have the same type when merged.
for frame in (result_df, df):
    frame['Id'] = frame['Id'].astype(str)

# Attach the review columns from df to the score rows.
result_df = result_df.merge(df, how='left', on='Id')

# Fallback so the hue column exists even if the merge did not provide it.
if 'Score' not in result_df:
    result_df['Score'] = 'unknown'

# Pairwise comparison of every VADER and RoBERTa score, coloured by 'Score'.
score_columns = [
    'vader_neg', 'vader_neu', 'vader_pos',
    'roberta_neg', 'roberta_neu', 'roberta_pos',
]
sns.pairplot(data=result_df, vars=score_columns, hue='Score', palette='tab10')
plt.show()

# Output: ModuleNotFoundError: No module named 'numpy.char'