In [31]:
import pandas as pd
import numpy as np

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from scipy.special import softmax

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\fbeto\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [37]:
df = pd.read_csv('Input/raw_agg.csv')
df

Unnamed: 0,speaker,line_text,MBTI,F1,F2
0,Andy,"Hey, Big Tuna! You're single right? She's pre...",ESFP,Se,Fi
1,Angela,I bet it's gonna be me. Probably gonna be me. ...,ISTJ,Si,Te
2,Darryl,It's not my real name. Darryl Philbin. Then Re...,ISTP,Ti,Se
3,Dwight,[singing] Shall I play for you? Pa rum pump um...,ESTJ,Te,Si
4,Holly,"Hi. Yes, uh, I'm Holly Flax. I was told to ask...",INFP,Fi,Ne
5,Jan,[on her cell phone] Just before lunch. That wo...,ENTJ,Te,Ni
6,Jim,"Oh, I told you. I couldn't close it. So... Act...",ENTP,Ne,Ti
7,Karen,Jim's nice enough. I dont... I don't know how...,ISTJ,Si,Te
8,Kelly,I have a customer meeting. I just had the long...,ESFP,Se,Fi
9,Kevin,"Yeah, it'll be you. I don't wanna be laid off....",ISFP,Fi,Se


# 1. NLTK Vader

In [16]:
df_vader = df.copy()

sid = SentimentIntensityAnalyzer()

# Function to apply sentiment analysis and extract sentiment scores
def vader_pos(text):
    scores = sid.polarity_scores(text)
    return scores['pos']
def vader_neg(text):
    scores = sid.polarity_scores(text)
    return scores['neg']
def vader_neu(text):
    scores = sid.polarity_scores(text)
    return scores['neu']
def vader_comp(text):
    scores = sid.polarity_scores(text)
    return scores['compound']


# Apply sentiment analysis and create a new column with sentiment scores
df_vader['vader_pos'] = df_vader['line_text'].apply(lambda text: vader_pos(text))
df_vader['vader_neg'] = df_vader['line_text'].apply(lambda text: vader_neg(text))
df_vader['vader_neu'] = df_vader['line_text'].apply(lambda text: vader_neu(text))
#compound = (pos - neg) / (pos + neg + neu)
df_vader['vader_comp'] = df_vader['line_text'].apply(lambda text: vader_comp(text))

In [21]:
#add a column "comp_power"
df_vader['vader_power'] = (df_vader['vader_pos'] + df_vader['vader_neg']) \
                        / (df_vader['vader_pos'] + df_vader['vader_neg'] + df_vader['vader_neu'])
df_vader.to_csv('Output/vader_scores.csv', index=False)
#df_vader

# 2. Textblob

In [32]:
df_blob = df.copy()

# Apply sentiment analysis using TextBlob
def analyze_sentiment(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity, blob.sentiment.subjectivity

# Apply the function and expand the result into separate columns
df_blob[['polarity', 'subjectivity']] = df_blob['line_text'].apply(lambda x: analyze_sentiment(x)).apply(pd.Series)

df_blob.to_csv('Output/textblob.csv', index=False)

# Display the DataFrame with sentiment analysis results
#df_blob

In [33]:
df_blob

Unnamed: 0,line_text,pos,neg,neu,polarity,subjectivity
0,"I love this sunny weather, but I hate the heat.",0.0,0.392998,0.607002,-0.15,0.75
1,This movie is amazing!,0.0,0.982438,0.017562,0.75,0.9
2,I am not sure about this product.,0.0,0.770051,0.229949,-0.25,0.888889
3,The service was terrible.,0.0,0.927603,0.072397,-1.0,1.0


# 3. Hugging face transformers

In [30]:
##Test

# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
#model.save_pretrained(MODEL)
text = "Covid cases are increasing fast!"
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
# # TF
# model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
# model.save_pretrained(MODEL)
# text = "Covid cases are increasing fast!"
# encoded_input = tokenizer(text, return_tensors='tf')
# output = model(encoded_input)
# scores = output[0][0].numpy()
# scores = softmax(scores)
# Print labels and scores
ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    l = config.id2label[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1) negative 0.7236
2) neutral 0.2287
3) positive 0.0477


In [38]:
df_roberta = df.copy()

# Load tokenizer and model
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

#Preprocess
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

#Apply
def calculate_sentiment_scores(text):
    text = preprocess(text)
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    return scores.tolist()

# Apply sentiment analysis to each row in 'text' column
sentiment_scores = df_roberta['line_text'].apply(calculate_sentiment_scores).tolist()

# Extract scores and assign to new columns
df_roberta[['neg', 'neu', 'pos']] = pd.DataFrame(sentiment_scores, index=df_roberta.index)

# Print the DataFrame with sentiment scores
print(df_roberta)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RuntimeError: The expanded size of the tensor (19474) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 19474].  Tensor sizes: [1, 514]