#### Sentiment Analysis in Python

In this notebook, sentiment analysis is done in Python using three different techniques:

1. VADER (Valence Aware Dictionary and sEntiment Reasoner)
2. Roberta Pretrained Model from 🤗
3. Huggingface Pipeline

STEP 0. Read in Data and NLTK Basics

In [None]:
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import torch

plt.style.use("ggplot")

In [None]:
# Load the review CSV and report its full size, then keep only the
# first 500 rows as the working sample and report the reduced size.
df = pd.read_csv("Reviews.csv")
print(df.shape)
df = df.iloc[:500]
print(df.shape)


In [None]:
# Preview the first rows of the sampled reviews.
df.head()

In [None]:
# Text of the first review in the sample.
df["Text"].values[0]

QUICK EDA

In [None]:
# Bar chart of how many reviews received each star rating, ordered by rating.
star_counts = df["Score"].value_counts().sort_index()
ax = star_counts.plot.bar(title="Count of Reviews by Stars", figsize=(10, 5))
ax.set_xlabel("Reviews Stars")
plt.show()

Basic NLTK


In [None]:
# Use the review stored under index label 50 as the running example.
example = df.loc[50, "Text"]
print(example)

In [None]:
# Download the NLTK resource used for tokenizing, then split the example
# review into tokens.
nltk.download('punkt_tab')
tokens = nltk.word_tokenize(example)
tokens[:10]   # first 10 tokens

In [None]:
# Download the tagger resource, then tag each token of the example
# with its part of speech.
nltk.download('averaged_perceptron_tagger_eng')
tagged = nltk.pos_tag(tokens)
tagged[:10]

In [None]:
# Download the chunker resources, then group the tagged tokens into
# named-entity chunks and pretty-print the result.
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')
entities = nltk.chunk.ne_chunk(tagged)
entities.pprint()

Step 1. VADER Sentiment Scoring


In [None]:
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

# One analyzer instance is reused for every VADER call in this notebook.
sia = SentimentIntensityAnalyzer()

# Quick check on a short, clearly positive sentence.
sia.polarity_scores("I am so happy")

In [None]:
# Same check on the negated sentence, for contrast with the previous cell.
sia.polarity_scores("I am so unhappy")

In [None]:
# VADER scores for the example review selected earlier.
sia.polarity_scores(example)

In [None]:
# Run VADER on every review in the sample, keyed by the review's Id.
res = {
    review_id: sia.polarity_scores(review_text)
    for review_id, review_text in tqdm(zip(df["Id"], df["Text"]), total=len(df))
}

In [None]:
# Turn the {Id: scores} dict into a frame with one row per review Id and
# one column per key of the polarity_scores result.
vaders = pd.DataFrame(res).T
vaders

In [None]:
# Move the review Id out of the index into an "Id" column, then attach the
# original review columns. The join key is named explicitly so the merge
# cannot silently also join on any other column name the two frames share.
vaders = (
    vaders
    .reset_index()
    .rename(columns={"index": "Id"})
    .merge(df, how="left", on="Id")
)
vaders

In [None]:
# Preview the VADER scores merged with the review columns.
vaders.head()

Plot VADER results

In [None]:
# VADER compound score summarized per star rating.
ax = sns.barplot(x="Score", y="compound", data=vaders)
ax.set(title="Compound Score by Amazon Star Review")
plt.show()

In [None]:
# One panel per VADER component (positive, neutral, negative), each showing
# that component's score by star rating.
fig, axs = plt.subplots(1, 3, figsize=(12, 3))
panels = [("pos", "Positive"), ("neu", "Neutral"), ("neg", "Negative")]
for panel_ax, (score_column, panel_title) in zip(axs, panels):
    sns.barplot(data=vaders, x="Score", y=score_column, ax=panel_ax)
    panel_ax.set_title(panel_title)
plt.tight_layout()
plt.show()

STEP 2. Roberta Pretrained Model

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
# Checkpoint name shared by the tokenizer and the classification model.
# A plain string is used because there is nothing to interpolate.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
# VADER result on the example review, shown again for comparison with the
# RoBERTa scores computed in the next cell.
print(example)
sia.polarity_scores(example)

In [None]:
# Score the same example with the RoBERTa model: tokenize it, run the model,
# and apply softmax to the first row of the model output.
encoded_text = tokenizer(example, return_tensors="pt")
output = model(**encoded_text)
scores = softmax(output[0][0].detach().numpy())
scores_dict = {
    label: scores[position]
    for position, label in enumerate(("reberta_neg", "reberta_neu", "reberta_pos"))
}
print(scores_dict)

In [None]:
def polarity_scores_roberta(example):
    """Score one text with the notebook's RoBERTa sentiment model.

    Tokenizes ``example`` with the notebook-level ``tokenizer``, runs the
    notebook-level ``model`` on it, and applies softmax to the first row of
    the model output.

    Args:
        example: Text to score.

    Returns:
        dict with the keys ``reberta_neg``, ``reberta_neu`` and
        ``reberta_pos`` holding the softmax values at positions 0, 1 and 2.
        The ``reberta_`` spelling is kept on purpose: later cells select
        columns by these exact names.

    Note:
        The input is not truncated here, so any error raised by the tokenizer
        or the model for a given text propagates to the caller.
    """
    encoded_text = tokenizer(example, return_tensors="pt")
    # Inference only: skip autograd bookkeeping rather than building a graph
    # that would just be detached afterwards.
    with torch.no_grad():
        output = model(**encoded_text)
    scores = softmax(output[0][0].numpy())
    return {
        "reberta_neg": scores[0],
        "reberta_neu": scores[1],
        "reberta_pos": scores[2],
    }


In [None]:
# Score every review with both models and collect the results keyed by Id.
# A review whose scoring raises RuntimeError is reported and left out.
res = {}
for i, row in tqdm(df.iterrows(), total=len(df)):
    try:
        text, myid = row["Text"], row["Id"]
        # Prefix the VADER keys so they stay distinguishable from the
        # RoBERTa keys once both dicts are combined.
        vader_scores = {
            f"vader_{name}": score
            for name, score in sia.polarity_scores(text).items()
        }
        res[myid] = {**vader_scores, **polarity_scores_roberta(text)}
    except RuntimeError:
        print(f"Broke for id {myid}")

In [None]:
# One row per scored review: expose the review Id as a column, then attach the
# original review columns. The join key is named explicitly so the merge
# cannot silently also join on any other column name the two frames share.
results_df = (
    pd.DataFrame(res).T
    .reset_index()
    .rename(columns={"index": "Id"})
    .merge(df, how="left", on="Id")
)

In [None]:
# Preview the combined model scores alongside the review columns.
results_df.head()

Compare Scores between models

In [None]:
# List the column names available in the combined results.
results_df.columns

In [None]:
# Pairwise scatter plots of the VADER and RoBERTa score columns,
# colored by the review's star rating.
score_columns = [
    'vader_neg', 'vader_neu', 'vader_pos',
    'reberta_neg', 'reberta_neu', 'reberta_pos',
]
sns.pairplot(results_df, vars=score_columns, hue="Score", palette="tab10")
plt.show()

STEP 3: Review Examples

In [None]:
# Positive sentiment with 1-star rating: text of the 1-star review with the
# highest reberta_pos score.
results_df.query("Score == 1")\
    .sort_values("reberta_pos", ascending=False)["Text"].values[0]

In [None]:
# Positive sentiment with 1-star rating: text of the 1-star review with the
# highest vader_pos score.
results_df.query("Score == 1")\
    .sort_values("vader_pos", ascending=False)["Text"].values[0]

In [None]:
# Negative sentiment with 5-star rating: text of the 5-star review with the
# highest reberta_neg score.
results_df.query("Score == 5")\
    .sort_values("reberta_neg", ascending=False)["Text"].values[0]

In [None]:
# Negative sentiment with 5-star rating: text of the 5-star review with the
# highest vader_neg score.
results_df.query("Score == 5")\
    .sort_values("vader_neg", ascending=False)["Text"].values[0]

Additional: The Transformers Pipeline

In [None]:
from transformers import pipeline
# Build a sentiment-analysis pipeline. No model is named here, so which
# checkpoint is loaded is left to the transformers library's default for
# this task. NOTE(review): consider naming the model explicitly so results
# stay reproducible across library versions.
sent_pipeline = pipeline("sentiment-analysis")

In [None]:
# Quick check of the pipeline on a short sentence.
sent_pipeline("I love sentiment analysis")

In [None]:
# Try the pipeline on informal, misspelled review-style text.
sent_pipeline("this was sooooo deliscious but too bad i ate em too fast and gained 2 pds! my fault")