# Task 4 (Further Balancing Emotion Classes)

## Usage: Classifying emotions in transcribed television show data.

In [11]:
import pandas as pd
import numpy as np
import spacy
from textblob import TextBlob
from collections import Counter
from transformers import MarianMTModel, MarianTokenizer, AutoTokenizer, AutoModel
import torch
from datasets import Dataset

### Load the data

In [12]:
# Load the dataset; the path is resolved relative to the working directory
transcript = "transformers_dataset.csv"
df = pd.read_csv(filepath_or_buffer=transcript)

# Show how many rows each emotion class contains (rendered as the cell output)
df["Emotion"].value_counts()

Emotion
happiness    24583
surprise     10556
sadness       7076
neutral       6312
anger         4971
disgust       3587
fear          1881
Name: count, dtype: int64

In [13]:
# Number of rows per emotion class, most frequent first
class_counts = df["Emotion"].value_counts()

# Average class size, used as a reference point for how unbalanced the data is
mean_class_size = class_counts.mean()
print(mean_class_size)

8423.714285714286


### Adding synthetic data with TextAttack to balance the classes with the fewest instances

#### The augmented rows are then concatenated with the original DataFrame

In [14]:
import random

from textattack.augmentation import EasyDataAugmenter

# Seed the random number generators so the synthetic sentences (and therefore
# the dataset saved at the end of the notebook) are repeatable between runs.
# NOTE(review): EasyDataAugmenter is believed to draw from Python's `random`;
# numpy is seeded too in case a transformation relies on it -- confirm.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Initialize TextAttack augmenter (you can experiment with different ones)
augmenter = EasyDataAugmenter(pct_words_to_swap=0.2, transformations_per_example=2)

# Define a threshold: Augment only classes below this threshold
THRESHOLD = 9000

# Count the classes from `df` as it is *now* instead of reusing `class_counts`
# from an earlier cell: this cell overwrites `df`, so on a re-run the old
# counts would be stale and classes that already passed the threshold would be
# augmented a second time. (A class that is still below THRESHOLD after one
# pass will be augmented again if the cell is re-run.)
current_counts = df["Emotion"].value_counts()
minority_classes = [
    emotion for emotion, count in current_counts.items() if count < THRESHOLD
]

# Collect augmented sentences and their labels in parallel lists
augmented_sentences = []
augmented_labels = []

for emotion in minority_classes:
    # Missing sentences are NaN (a float), which cannot be augmented as text
    sentences = df.loc[df["Emotion"] == emotion, "Sentence"].dropna()
    for sentence in sentences:
        # augment() returns a list of new sentences for this one input
        augmented = augmenter.augment(sentence)
        augmented_sentences.extend(augmented)
        augmented_labels.extend([emotion] * len(augmented))

# Create DataFrame for augmented data
augmented_df = pd.DataFrame({"Sentence": augmented_sentences, "Emotion": augmented_labels})

# Combine with original data; any other columns of `df` are NaN for the
# synthetic rows
df = pd.concat([df, augmented_df], ignore_index=True)

# Check new class distribution
print("After Augmentation:\n", df["Emotion"].value_counts())

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


After Augmentation:
 Emotion
happiness    24583
sadness      21191
neutral      18403
anger        14834
disgust      10755
surprise     10556
fear          5526
Name: count, dtype: int64


In [22]:
# Keep an independent snapshot of the combined (original + augmented) frame
# before the columns are trimmed
df_copy = df.copy(deep=True)

In [33]:
# Keep only the text column and its emotion label, in that order
df = df.loc[:, ["Sentence", "Emotion"]]

### Save dataset

In [35]:
# Write the current `df` to the Task 4 folder as a UTF-8 CSV.
# The row index is left out so the file holds only the data columns.
cleaned_file_path = "../Task 4/balanced_classes_dataset.csv"
df.to_csv(path_or_buf=cleaned_file_path, encoding="utf-8", index=False)