In [13]:
import pandas as pd
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## Vader Analysis

In [14]:
vader = SentimentIntensityAnalyzer()

In [15]:
def analyze_vader(text):
    scores = vader.polarity_scores(text)
    return scores['compound']  # Main sentiment score [-1, 1]

def add_vader_sentiment(df, text_column='dialogue'):
    df['vader_sentiment'] = df[text_column].apply(analyze_vader)
    return df

## BERT

In [16]:
# === BERT Setup ===
model_name = "kornosk/polibertweet-mlm"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def analyze_bert(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.nn.functional.softmax(logits, dim=-1).squeeze().numpy()
    return float(probs[1])  # Probability of positive sentiment

def add_bert_sentiment(df, text_column='dialogue'):
    df['bert_sentiment'] = (2*(df[text_column].apply(analyze_bert)))-1  # Scale to [-1, 1]
    return df

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at kornosk/polibertweet-mlm and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Comparison Values

In [17]:
def add_diff_sentiment(df, text_column='dialogue'):
    df = add_vader_sentiment(df, text_column)
    df = add_bert_sentiment(df, text_column)
    df['diff_sentiment'] = df['bert_sentiment'] - df['vader_sentiment']
    return df

## First Test

In [18]:
with open('debate_transcripts_cleaned.csv', 'r') as f:
    df = pd.read_csv(f)
    df = add_diff_sentiment(df, text_column='dialogue')
    df.to_csv('debate_transcripts_sentiment.csv', index=False)

In [19]:
df.head()

Unnamed: 0,year,date,actor,dialogue,is_candidate,party,is_winner,VP_debate,is_incumbent,vader_sentiment,bert_sentiment,diff_sentiment
0,1960,1960-09-26,Kennedy,"Mr. Smith, Mr. Nixon. In the election of 1860,...",True,Democratic,True,False,False,0.9993,0.020701,-0.978599
1,1960,1960-09-26,Smith,And now the opening statement by Vice Presiden...,False,,False,False,False,0.0,0.038227,0.038227
2,1960,1960-09-26,Nixon,"Mr. Smith, Senator Kennedy. The things that Se...",True,Republican,False,False,False,0.9992,-0.00136,-1.00056
3,1960,1960-09-26,Smith,"Thank you, Mr. Nixon. That completes the openi...",False,,False,False,False,0.3612,0.034914,-0.326286
4,1960,1960-09-26,Fleming,"Senator, the Vice President in his campaign ha...",False,,False,False,False,-0.2732,0.088109,0.361309
