In [4]:
import pandas as pd
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## Vader Analysis

In [5]:
vader = SentimentIntensityAnalyzer()

In [6]:
def analyze_vader(text):
    scores = vader.polarity_scores(text)
    return scores['compound']  # Main sentiment score [-1, 1]

def add_vader_sentiment(df, text_column='dialogue'):
    df['vader_sentiment'] = df[text_column].apply(analyze_vader)
    return df

## BERT

In [10]:
# === BERT Setup ===
model_name = "kornosk/polibertweet-mlm"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def analyze_bert(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.nn.functional.softmax(logits, dim=-1).squeeze().numpy()
    return float(probs[1])  # Probability of positive sentiment

def add_bert_sentiment(df, text_column='dialogue'):
    df['bert_sentiment'] = (2*(df[text_column].apply(analyze_bert)))-1  # Scale to [-1, 1]
    return df

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at kornosk/polibertweet-mlm and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Comparison Values

In [11]:
def add_diff_sentiment(df, text_column='dialogue'):
    df = add_vader_sentiment(df, text_column)
    df = add_bert_sentiment(df, text_column)
    df['diff_sentiment'] = df['bert_sentiment'] - df['vader_sentiment']
    return df

## First Test

In [12]:
with open('debate_transcripts_cleaned.csv', 'r') as f:
    df = pd.read_csv(f)
    df = add_diff_sentiment(df, text_column='dialogue')
    df.to_csv('debate_transcripts_sentiment.csv', index=False)

KeyboardInterrupt: 