# Library import

In [1]:
import pandas as pd
import numpy as np
import torch
import os
import matplotlib.pyplot as plt

from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax
from tqdm import tqdm

#pd.set_option('display.max_colwidth', None)

  from .autonotebook import tqdm as notebook_tqdm


# Data Load

In [2]:
path = os.path.join(os.getcwd(), 'Data', 'DataTestComment.csv')
df_test = pd.read_csv(path)

  df_test = pd.read_csv(path)


In [3]:
df_test = df_test.dropna(subset=["commentBody"])                          # removes NaNs
df_test = df_test[df_test["commentBody"].str.len() > 0]                   # removes empty strings
df_sample = df_test.sample(2000, random_state=42).reset_index(drop=True)  # sample 2000 rows
df_sample.to_csv('Data/Sample.csv', index=False, mode='w+')               # save sample to CSV

quick_mode = True
if quick_mode:
    df_use = df_sample
    print("Quick mode activated. Using a smaller sample for testing.")
else:
    df_use = df_test
    print("Using the full dataset for sentiment analysis.")

comments = df_use["commentBody"].tolist()

Quick mode activated. Using a smaller sample for testing.


# Annotation

## Calling all pretrained model

In [4]:
sentiment_pipe = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")
toxicity_pipe = pipeline("text-classification", model="unitary/toxic-bert", top_k=None)

Device set to use cpu
Device set to use cpu


In [5]:
emotion_model = AutoModelForSequenceClassification.from_pretrained("nateraw/bert-base-uncased-emotion")
emotion_tokenizer = AutoTokenizer.from_pretrained("nateraw/bert-base-uncased-emotion")
emotion_model.eval()  # Set the model to evaluation mode

def get_emotion(text):
    inputs = emotion_tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = emotion_model(**inputs).logits
    predicted_class = torch.argmax(logits, dim=1).item()
    label_list = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
    return label_list[predicted_class]

In [6]:
hate_model = AutoModelForSequenceClassification.from_pretrained("facebook/roberta-hate-speech-dynabench-r4-target")
hate_tokenizer = AutoTokenizer.from_pretrained("facebook/roberta-hate-speech-dynabench-r4-target")
hate_model.eval()  # Set the model to evaluation mode

def get_hate_score(text):
    try:
        inputs = hate_tokenizer(text, return_tensors="pt", truncation=True)
        with torch.no_grad():
            outputs = hate_model(**inputs)
            scores = torch.sigmoid(outputs.logits)
        return scores[0][0].item()
    except:
        return 0.0

In [7]:
def get_sentiment(text):
    try:
        result = sentiment_pipe(text)[0]
        label_map = {
            "LABEL_0": "negative",
            "LABEL_1": "neutral",
            "LABEL_2": "positive"
        }
        return label_map.get(result["label"], "UNKNOWN")
    except:
        return "UNKNOWN"

def get_toxicity(text):
    try:
        result = toxicity_pipe(text)[0]
        return {r['label'].lower(): r['score'] for r in result}
    except:
        return {}

In [8]:
ensemble_data = []
for comment in tqdm(comments[:2000]):  # limit to 2000 rows for now
    result = {
        "comment": comment,
        "sentiment": get_sentiment(comment),
        "hate_score": get_hate_score(comment),
        "emotion": get_emotion(comment)
    }

    toxicity_scores = get_toxicity(comment)
    for key in ['toxicity', 'obscene', 'identity_attack', 'insult', 'threat', 'sexual_explicit']:
        result[key] = toxicity_scores.get(key, 0.0)
    
    result["label"] = "unknown"  # placeholder for manual or rule-based label
    ensemble_data.append(result)

100%|██████████| 2000/2000 [09:48<00:00,  3.40it/s]


In [9]:
ensemble_data_df = pd.DataFrame(ensemble_data)
ensemble_data_df.head(10)

Unnamed: 0,comment,sentiment,hate_score,emotion,toxicity,obscene,identity_attack,insult,threat,sexual_explicit,label
0,an interesting statistic came out today in a u...,neutral,0.866758,joy,0.0,0.000181,0.0,0.000184,0.000111,0.0,unknown
1,heartily agree trump offers such a smorgasbord...,negative,0.939092,sadness,0.0,0.00029,0.0,0.000248,0.000121,0.0,unknown
2,it is ever so critical now for the average ame...,neutral,0.982114,joy,0.0,0.000183,0.0,0.000209,0.000142,0.0,unknown
3,you sold your soul no less than any of them yo...,negative,0.808653,anger,0.0,0.007351,0.0,0.06812,0.015863,0.0,unknown
4,did mr trumps people forget to tell him that h...,negative,0.8701,sadness,0.0,0.000403,0.0,0.000518,0.000147,0.0,unknown
5,there are black farmers in north carolina that...,neutral,0.938146,joy,0.0,0.000279,0.0,0.000296,0.00015,0.0,unknown
6,our manchuria candidatebut it would not have b...,negative,0.975874,joy,0.0,0.000169,0.0,0.000191,0.000121,0.0,unknown
7,obama once again demonstrates that he is capab...,neutral,0.971533,joy,0.0,0.000172,0.0,0.000178,0.00013,0.0,unknown
8,with his business first preoccupations and raw...,neutral,0.896069,joy,0.0,0.000259,0.0,0.000253,0.000116,0.0,unknown
9,trump offered the same lame excuse he always h...,negative,0.928705,sadness,0.0,0.000233,0.0,0.000241,0.000104,0.0,unknown
