# Emotion Analysis

In [147]:
### Imports

import pandas as pd
from lxml.parser import result
from pandas import json_normalize, Series
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import style
from sympy.strategies.core import switch
style.use('ggplot')
from textblob import TextBlob
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
#nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
import json
from json import loads, dumps
from pprint import pprint
import torch
from transformers import pipeline
from tqdm import tqdm

In [171]:
### Prepare dataframe for analysis
# Target structure:
# [tweet_date,
# tweet_id, tweet_text, tweet_text_cleaned, tweet_emotion1-6, tweet_dominant_emotion,
# quoted_tweet_id, quoted_tweet_text, quoted_tweet_text_cleaned, quoted_emotion1-6, quoted_dominant_emotion]
# Import dataset
df_tweets = pd.read_csv(
    '../data/twitter/tweets_isTweet.csv',
    dtype={'id': 'object'},
    low_memory=False
)
df_tweets = df_tweets[['id', 'createdAt', 'text', 'quoted_tweet']]
df_tweets = df_tweets.rename(columns={'id': 'tweet_id', 'text': 'tweet_text'})

# Normalize json column
quoted_tweets_normalized = pd.json_normalize(
    df_tweets['quoted_tweet'].apply(
        lambda x: json.loads(x) if pd.notna(x) and isinstance(x, str) else None
    )
)
quoted_tweets_normalized = quoted_tweets_normalized.rename(columns={'id': 'quoted_tweet_id', 'text': 'quoted_tweet_text'})

# Link by index
df_tweets.index = quoted_tweets_normalized.index

# Concat both dataframes
df_tweets_normalized = pd.concat([
    df_tweets[['tweet_id', 'createdAt', 'tweet_text']],
    quoted_tweets_normalized[['quoted_tweet_id', 'quoted_tweet_text']]
], axis=1)

# View data
df_tweets_normalized.head()

Unnamed: 0,tweet_id,createdAt,tweet_text,quoted_tweet_id,quoted_tweet_text
0,1917726279195058338,Wed Apr 30 23:42:29 +0000 2025,https://t.co/U6tI9pdin6,,
1,1917693698281787564,Wed Apr 30 21:33:01 +0000 2025,https://t.co/1c1WjFpOva,,
2,1917225430702240067,Tue Apr 29 14:32:17 +0000 2025,This is a big deal,1.9172236516250995e+18,"Last week, Treasury went live with its first a..."
3,1917114631287718009,Tue Apr 29 07:12:01 +0000 2025,https://t.co/6xSd8l67FN,,
4,1917103264417649121,Tue Apr 29 06:26:50 +0000 2025,Whoa,1.9170112797570665e+18,🚨THE INVISIBLE PUPPET MASTERS: AI'S DISTURBING...


In [None]:
### Pre-process data for the analysis
## Variables
ekman_emotions = ['anger', 'fear', 'joy', 'sadness', 'disgust', 'surprise']

## Classifier
# Load Hugging Face's emotion classifier
device = "cuda" if torch.cuda.is_available() else "cpu"
classifier = pipeline("text-classification", model="bhadresh-savani/bert-base-uncased-emotion", top_k=None, device=0 if device == "cuda" else -1)
## Functions
# Removing noise from the text
def remove_noise(text):
    text = text.lower()
    text = re.sub(r"https\S+|www\S+httpss\S+", '', text, flags=re.MULTILINE) # Remove Url
    text = re.sub(r"\@w+|\#", '', text) # remove @ and #
    text = re.sub(r"[^\w\s]", '', text) # remove punctuation
    text_tokens = text.split()
    filtered_text = [w for w in text_tokens if not w in stop_words]
    return " ".join(filtered_text)

# Reduction of dimensionality by abstracting word to word stem
stemmer = PorterStemmer()
def stem_words(text):
    stemmed_text = [stemmer.stem(word) for word in text]
    return stemmed_text

def compute_emotions(text):
    if not isinstance(text, str) or text.strip() == "":
        print(f"Invalid text: {text[:10]}...")
        return {emotion: 0.0 for emotion in ekman_emotions}

    try:
        cleaned_text = remove_noise(text) # remove noise from text
        cleaned_text = stem_words(cleaned_text) # reduce dimensionality
        results = classifier(cleaned_text)[0]
        emotion_scores = {result['label']: result['score'] for emotion in ekman_emotions}

        return {emotion: emotion_scores.get(emotion,  for


    except Exception as e:
        print(f"Error while processing text: {text[:10]}...\nError: {e}")
        return {emotion: 0.0 for emotion in ekman_emotions}
