In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

plt.style.use('ggplot')
import numpy as np
import tensorflow as tf
import nltk
from collections import Counter
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import tensorflow_text as text
import tensorflow_hub as hub


with open('reviews.txt', 'r') as f:
  reviews = f.read()
with open('labels.txt', 'r') as f:
  labels = f.read()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
reviews = "".join([char for char in reviews if char not in string.punctuation])

reviews = reviews.split('\n')
labels = labels.split('\n')
labels = [1 if label == "positive" else 0 for label in labels]

stop_words = set(stopwords.words('english'))

reviews_tokenized = []
for review in reviews:
  splitted_review = nltk.word_tokenize(review)
  splitted_review = [WordNetLemmatizer().lemmatize(w) for w in splitted_review]
  splitted_review = [PorterStemmer().stem(w).strip() for w in splitted_review]
#   splitted_review = [token for token in splitted_review if token.lower() not in stop_words]
  splitted_review = ' '.join(splitted_review)
  reviews_tokenized.append(splitted_review)

In [3]:
df = pd.DataFrame({'Text': reviews, 'Label': labels})
df = df.reset_index()
df['Id'] = df['index'] + 1
df.drop('index', axis = 1, inplace = True)
df.head()

Unnamed: 0,Text,Label,Id
0,bromwell high is a cartoon comedy it ran at t...,1,1
1,story of a man who has unnatural feelings for ...,0,2
2,homelessness or houselessness as george carli...,1,3
3,airport starts as a brand new luxury pla...,0,4
4,brilliant over acting by lesley ann warren b...,1,5


In [4]:
sia = SentimentIntensityAnalyzer()

res = {}
for i, row in tqdm(df.iterrows(), total = len(df)):
  text = row['Text']
  myid = row['Id']
  res[myid] = sia.polarity_scores(text)

  0%|          | 0/25001 [00:00<?, ?it/s]

In [5]:
vaders = pd.DataFrame(res).T
vaders = vaders.reset_index().rename(columns = {'index': 'Id'})
vaders = vaders.merge(df, how = 'left')
vaders['Predicted_Label'] = np.where(vaders['compound'] >= 0, 1, 0)
vaders.head()

Unnamed: 0,Id,neg,neu,pos,compound,Text,Label,Predicted_Label
0,1,0.044,0.916,0.041,-0.1027,bromwell high is a cartoon comedy it ran at t...,1,0
1,2,0.108,0.746,0.146,0.7003,story of a man who has unnatural feelings for ...,0,1
2,3,0.12,0.733,0.147,0.9311,homelessness or houselessness as george carli...,1,1
3,4,0.161,0.692,0.147,-0.918,airport starts as a brand new luxury pla...,0,0
4,5,0.077,0.738,0.185,0.9657,brilliant over acting by lesley ann warren b...,1,1


In [6]:
print('Model accuracy: {}'.format(len(vaders[vaders['Label'] == vaders['Predicted_Label']]) / len(vaders)))

Model accuracy: 0.6908523659053638


In [7]:
reviews_tokenized[0]

'bromwel high is a cartoon comedi it ran at the same time a some other program about school life such a teacher my year in the teach profess lead me to believ that bromwel high s satir is much closer to realiti than is teacher the scrambl to surviv financi the insight student who can see right through their pathet teacher pomp the petti of the whole situat all remind me of the school i knew and their student when i saw the episod in which a student repeatedli tri to burn down the school i immedi recal at high a classic line inspector i m here to sack one of your teacher student welcom to bromwel high i expect that mani adult of my age think that bromwel high is far fetch what a piti that it isn t'

In [8]:
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification
from scipy.special import softmax

MODEL = f'cardiffnlp/twitter-roberta-base-sentiment'
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [9]:
# VADER results on example

example = reviews_tokenized[0]

print(example)
sia.polarity_scores(example)

bromwel high is a cartoon comedi it ran at the same time a some other program about school life such a teacher my year in the teach profess lead me to believ that bromwel high s satir is much closer to realiti than is teacher the scrambl to surviv financi the insight student who can see right through their pathet teacher pomp the petti of the whole situat all remind me of the school i knew and their student when i saw the episod in which a student repeatedli tri to burn down the school i immedi recal at high a classic line inspector i m here to sack one of your teacher student welcom to bromwel high i expect that mani adult of my age think that bromwel high is far fetch what a piti that it isn t


{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [10]:
# Run Roberta Model

encoded_text = tokenizer(example, return_tensors = 'tf')
output = model(**encoded_text)
scores = output[0][0].numpy()
scores = softmax(scores)
scores_dict = {
  'roberta_neg': scores[0],
  'roberta_neu': scores[1],
  'roberta_pos': scores[2]
}

print(scores_dict)

{'roberta_neg': 0.17288321, 'roberta_neu': 0.7178752, 'roberta_pos': 0.109241605}


In [11]:
def polarity_scores_roberta(example):
  encoded_text = tokenizer(example, return_tensors = 'tf')
  output = model(**encoded_text)
  scores = output[0][0].numpy()
  scores = softmax(scores)
  scores_dict = {
    'roberta_neg': scores[0],
    'roberta_neu': scores[1],
    'roberta_pos': scores[2]
  }
  return scores_dict

In [None]:
res = {}
for i, row in tqdm(df.iterrows(), total = len(df)):
    text = row['Text']
    myid = row['Id']
    vader_result = sia.polarity_scores(text)

    vader_result_rename = {}
    for key, value in vader_result.items():
      vader_result_rename[f'vader_{key}'] = value

    roberta_result = polarity_scores_roberta(text)
    both_results = {**vader_result_rename, **roberta_result}
    res[myid] = both_results

  0%|          | 0/25001 [00:00<?, ?it/s]

In [21]:
roberta_results = []
for i in range(len(df)):
  roberta_results.append()

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [76]:
results_df = pd.DataFrame(res).T
results_df = results_df.reset_index().rename(columns = {'index': 'Id'})
results_df = results_df.merge(vaders, how = 'left')
results_df.drop(['neg', 'neu', 'pos', 'compound'], axis = 1, inplace = True)

results_df['Roberta_Predictions'] = results_df['roberta_pos'] > results_df['roberta_neg']
results_df['Roberta_Predictions'] = results_df['Roberta_Predictions'].astype(int)

In [77]:
results_df.head(3)

Unnamed: 0,Id,vader_neg,vader_neu,vader_pos,vader_compound,roberta_neg,roberta_neu,roberta_pos,Text,Label,Predicted_Label,Roberta_Predictions
0,1,0.044,0.916,0.041,-0.1027,0.676774,0.275353,0.047873,bromwell high is a cartoon comedy it ran at t...,1,0,0
1,2,0.108,0.746,0.146,0.7003,0.397576,0.36184,0.240584,story of a man who has unnatural feelings for ...,0,1,0
2,3,0.12,0.733,0.147,0.9311,0.450941,0.481239,0.06782,homelessness or houselessness as george carli...,1,1,0


In [78]:
print('Roberta model accuracy: {}'.format(sum(results_df['Label'] == results_df['Roberta_Predictions']) / len(results_df)))

Roberta model accuracy: 0.8517864311521477
