# Task 4 (Dataset preprocessing for Transformer Usage)

## Usage: Classifying emotions in transcribed television show data.

In [1]:
import pandas as pd
import numpy as np
import spacy
from textblob import TextBlob
from collections import Counter
from transformers import MarianMTModel, MarianTokenizer, AutoTokenizer, AutoModel
import torch
from datasets import Dataset

2025-03-31 12:18:14.424831: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-31 12:18:14.438651: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743423494.453179   15736 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743423494.457614   15736 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1743423494.470900   15736 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

### Load the data

### Adding the Emotion detection dataset (with phrases from the Friends show)

In [2]:
transcript_2 = "emory_nlp_ds.csv"
df2 = pd.read_csv(transcript_2)

# Switching the column names to match the final dataset
df2.rename(columns={'Translated': 'Sentence', 'Sentence': 'Translated'}, inplace=True)
df2 = df2[['Sentence'] + [col for col in df2.columns if col != 'Sentence']]

# Display first few rows
print(df2.head())

                                            Sentence  \
0  Ce que vous ne comprenez pas, c'est, pour nous...   
1              Ouais, c'est vrai! ....... Y'Serious?   
2                                          Oh ouais!   
3  Tout ce que vous devez savoir est dans ce prem...   
4                                        Absolument.   

                                          Translated    Emotion  
0  What you guys don't understand is, for us, kis...  happiness  
1                      Yeah, right!.......Y'serious?    neutral  
2                                          Oh, yeah!  happiness  
3  Everything you need to know is in that first k...  happiness  
4                                        Absolutely.  happiness  


### Adding synthetic data to balance classes with the lowest nr of instances (sadness, surprise, and disgust)

In [3]:
transcript_3 = "synthetic_emotion_dataset.csv"
df3 = pd.read_csv(transcript_3)

# Display first few rows
print(df3.head())

                                           Sentence  \
0  Je ne peux pas croire que cela vient d'arriver !   
1  Je n'avais aucune idée que cela allait arriver !   
2                         Wow ! C'était inattendu !   
3                       C'est absolument choquant !   
4                                 Quelle surprise !   

                            Translated   Emotion  
0  I can't believe this just happened!  surprise  
1       I had no idea this was coming!  surprise  
2            Wow! That was unexpected!  surprise  
3         This is absolutely shocking!  surprise  
4                     What a surprise!  surprise  


### Adding the Go Emotions dataset

In [4]:
transcript_3 = "df_gotranslated.csv"
df4 = pd.read_csv(transcript_3)

# Display first few rows
print(df4.head())

                                            Sentence  \
0  C'est vrai, mais [NAME] est toujours une resso...   
1  Je ne vois pas comment le départ d'un journali...   
2  Si les partisans de [NAME] vous déclenchent pl...   
3  Et si vous êtes quelqu'un, comme moi, qui croi...   
4                     Vous êtes normal, très normal.   

                                          Translated    Emotion  
0  Thats true but [NAME] is still a valuable reso...   surprise  
1  I don't see how a NBC journalist's departure i...      anger  
2  If [NAME] supporters trigger you more than mas...    neutral  
3  What if you're someone, like me, who believes ...   surprise  
4                       You are normal. Very normal.  happiness  


### Concatanate the datasets

In [29]:
df = pd.concat([df2, df3, df4], ignore_index=True)
display(df)

Unnamed: 0,Sentence,Translated,Emotion
0,"Ce que vous ne comprenez pas, c'est, pour nous...","What you guys don't understand is, for us, kis...",happiness
1,"Ouais, c'est vrai! ....... Y'Serious?","Yeah, right!.......Y'serious?",neutral
2,Oh ouais!,"Oh, yeah!",happiness
3,Tout ce que vous devez savoir est dans ce prem...,Everything you need to know is in that first k...,happiness
4,Absolument.,Absolutely.,happiness
...,...,...,...
58961,Ma scène préférée est la tribu qui ramène le m...,Dozens. My favorite scene is the tribe bringin...,happiness
58962,"J'aime [NAME], il est littéralement le canapé ...","I love [NAME], he’s literally the craziest cou...",happiness
58963,C'est affreux. Je suis content [NAME] de faire...,"That's awful. I'm glad [NAME] doing better, bu...",sadness
58964,Je vais voir ça.,I'll check that out. Thank you!!,happiness


### POS tags

In [None]:
nlp_fr = spacy.load("fr_core_news_lg")
def extract_pos(text):
    doc = nlp_fr(text)
    pos_counts = Counter([token.pos_ for token in doc])
    return pos_counts

# Function to process the entire dataset and apply the POS extraction
def extract_pos_batch(batch):
    # Apply the POS extraction for each sentence in the batch
    pos_features = [extract_pos(text) for text in batch["Sentence"]]
    
    # Return the processed POS tags in a suitable format
    return {
        "POS_Tags": [
            " ".join([f"{key}_{value}" for key, value in pos.items()])  # Create the POS tag count representation like "ADP_1 DET_4"
            for pos in pos_features
        ]
    }

In [None]:
### FOR RETAKE

nlp_fr = spacy.load("fr_core_news_lg")
def extract_pos(text):
    doc = nlp_fr(text)
    pos_counts = Counter([token.pos_ for token in doc])
    total = sum(pos_counts.values())
    pos_ratios = {f"POS_{tag}": count / total for tag, count in pos_counts.items()}
    return pos_ratios

def extract_pos_batch(batch):
    pos_features = [extract_pos(text) for text in batch["Sentence"]]
    return pos_features  

## OUTPUT EXAMPLE: 
"""{
  "POS_NOUN": 0.35,
  "POS_VERB": 0.25,
  "POS_ADJ": 0.10,
  ...
}
"""

## The output is ready to be inputed directly into the model as numeric input

### Sentiment Analysis and Emotion Classification

In [36]:
from datasets import Dataset
import pandas as pd
from transformers import pipeline

# Initialize the emotion classifier pipeline (for sentiment) and set device to CPU
emotion_classifier = pipeline(
    "zero-shot-classification",
    model="j-hartmann/emotion-english-distilroberta-base",  # Specify a model for emotion classification
    device=0  # Using GPU 0
)

# Define get_sentiment and get_intensity functions
def get_sentiment(batch):
    candidate_labels = ["happiness", "sadness", "anger", "surprise", "fear", "disgust", "neutral"]
    texts = batch['Sentence']  # Extract all sentences in the batch
    results = emotion_classifier(texts, candidate_labels=candidate_labels)  # Process all sentences at once
    
    # Extract the sentiment (the label with the highest score)
    sentiment_scores = [result['labels'][0] for result in results]  # Get the first label (most confident)
    return {"Sentiment_Score": sentiment_scores}

def get_intensity(batch):
    texts = batch['Sentence']
    results = emotion_classifier(texts, candidate_labels=["happiness", "anger", "fear", "sadness", "surprise", "disgust"])
    
    intensities = []
    for result in results:
        top_score = max(result["scores"])  # Get highest confidence score
        
        # Define intensity levels based on the score
        if top_score < 0.40:
            intensities.append("Mild")
        elif 0.40 <= top_score < 0.55:
            intensities.append("Neutral")
        elif 0.55 <= top_score < 0.75:
            intensities.append("Moderate")
        else:
            intensities.append("Intense")
    
    return {"Intensity": intensities}

Device set to use cuda:0
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


### Extract Named Entities

In [43]:
nlp_fr = spacy.load("fr_core_news_lg")

def extract_ner(text):
    doc = nlp_fr(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

### Putting it all together

#### First dataset modifications

### THIS DATASET HAS BEEN REMOVED

#### Dataset modifications

In [2]:
from datasets import Dataset

# Use the datasets library for more efficient processing 
dataset = Dataset.from_pandas(df['Emotion'])

# Apply the functions to the datasets in batches
dataset = dataset.map(get_sentiment, batched=True)

INFO: PyTorch version 2.6.0 available.
INFO: TensorFlow version 2.19.0 available.


NameError: name 'df' is not defined

In [37]:
dataset = dataset.map(get_intensity, batched=True)

Map:   0%|          | 0/58966 [00:00<?, ? examples/s]

#### Applying the functions

In [44]:
# Extract punctuation-based features
def extract_punctuation_features(batch):
    sentences = batch["Sentence"]
    
    return {
        "Exclamations": [s.count("!") for s in sentences],
        "Ellipses": [s.count("...") for s in sentences],
        "Questions": [s.count("?") for s in sentences]
    }

dataset = dataset.map(extract_punctuation_features, batched=True)


# Apply Named Entity Recognition (NER) & POS Tags
dataset = dataset.map(extract_ner_batch, batched=True)

Map:   0%|          | 0/58966 [00:00<?, ? examples/s]

Map:   0%|          | 0/58966 [00:00<?, ? examples/s]

Map:   0%|          | 0/58966 [00:00<?, ? examples/s]

ValueError: Schema and number of arrays unequal

In [56]:
dataset = dataset.map(extract_pos_batch, batched=True)

Map:   0%|          | 0/58966 [00:00<?, ? examples/s]

In [45]:
def word_features(batch):
    word_lists = [sentence.split() for sentence in batch["Sentence"]]
    word_counts = [len(words) for words in word_lists]
    unique_word_ratios = [len(set(words)) / len(words) if words else 0 for words in word_lists]

    # Convert list of words into a comma-separated string
    word_lists_str = [", ".join(words) for words in word_lists]

    return {"Word_List": word_lists_str, "Word_Count": word_counts, "Unique_Word_Ratio": unique_word_ratios}

# Apply transformation
dataset = dataset.map(word_features, batched=True)

Map:   0%|          | 0/58966 [00:00<?, ? examples/s]

In [57]:
# Convert back to Pandas DataFrame
df = dataset.to_pandas()
df.head()

Unnamed: 0,Sentence,Translated,Emotion,Sentiment_Score,Intensity,Exclamations,Ellipses,Questions,NER,Word_List,...,ADV,PRON,DET,ADP,NUM,CONJ,PRT,X,PUNCT,POS_Tags
0,"Ce que vous ne comprenez pas, c'est, pour nous...","What you guys don't understand is, for us, kis...",happiness,surprise,Intense,0,0,0,[],"Ce, que, vous, ne, comprenez, pas,, c'est,, po...",...,4,8,1,2,0,0,0,0,4,PRON_8 ADV_4 VERB_4 PUNCT_4 ADP_2 SCONJ_1 AUX_...
1,"Ouais, c'est vrai! ....... Y'Serious?","Yeah, right!.......Y'serious?",neutral,surprise,Intense,1,2,1,[],"Ouais,, c'est, vrai!, ......., Y'Serious?",...,0,1,0,0,0,0,0,1,4,NOUN_2 PUNCT_4 PRON_1 VERB_1 ADJ_1 X_1
2,Oh ouais!,"Oh, yeah!",happiness,surprise,Intense,1,0,0,"[[Oh ouais!, MISC]]","Oh, ouais!",...,0,0,0,0,0,0,0,0,1,NOUN_1 ADJ_1 PUNCT_1
3,Tout ce que vous devez savoir est dans ce prem...,Everything you need to know is in that first k...,happiness,surprise,Intense,0,0,0,[],"Tout, ce, que, vous, devez, savoir, est, dans,...",...,0,3,1,1,0,0,0,0,1,ADJ_2 PRON_3 VERB_2 AUX_1 ADP_1 DET_1 NOUN_1 P...
4,Absolument.,Absolutely.,happiness,surprise,Intense,0,0,0,[],Absolument.,...,1,0,0,0,0,0,0,0,1,ADV_1 PUNCT_1


In [58]:
df_copy = df.copy()

In [63]:
df_copy

Unnamed: 0,Sentence,Translated,Emotion,Sentiment_Score,Intensity,Exclamations,Ellipses,Questions,NER,Word_List,...,ADV,PRON,DET,ADP,NUM,CONJ,PRT,X,PUNCT,POS_Tags
0,"Ce que vous ne comprenez pas, c'est, pour nous...","What you guys don't understand is, for us, kis...",happiness,surprise,Intense,0,0,0,[],"Ce, que, vous, ne, comprenez, pas,, c'est,, po...",...,4,8,1,2,0,0,0,0,4,PRON_8 ADV_4 VERB_4 PUNCT_4 ADP_2 SCONJ_1 AUX_...
1,"Ouais, c'est vrai! ....... Y'Serious?","Yeah, right!.......Y'serious?",neutral,surprise,Intense,1,2,1,[],"Ouais,, c'est, vrai!, ......., Y'Serious?",...,0,1,0,0,0,0,0,1,4,NOUN_2 PUNCT_4 PRON_1 VERB_1 ADJ_1 X_1
2,Oh ouais!,"Oh, yeah!",happiness,surprise,Intense,1,0,0,"[[Oh ouais!, MISC]]","Oh, ouais!",...,0,0,0,0,0,0,0,0,1,NOUN_1 ADJ_1 PUNCT_1
3,Tout ce que vous devez savoir est dans ce prem...,Everything you need to know is in that first k...,happiness,surprise,Intense,0,0,0,[],"Tout, ce, que, vous, devez, savoir, est, dans,...",...,0,3,1,1,0,0,0,0,1,ADJ_2 PRON_3 VERB_2 AUX_1 ADP_1 DET_1 NOUN_1 P...
4,Absolument.,Absolutely.,happiness,surprise,Intense,0,0,0,[],Absolument.,...,1,0,0,0,0,0,0,0,1,ADV_1 PUNCT_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58961,Ma scène préférée est la tribu qui ramène le m...,Dozens. My favorite scene is the tribe bringin...,happiness,surprise,Intense,0,0,0,[],"Ma, scène, préférée, est, la, tribu, qui, ramè...",...,0,1,4,2,0,0,0,0,5,DET_4 NOUN_4 ADJ_1 AUX_1 PRON_1 VERB_1 ADP_2 P...
58962,"J'aime [NAME], il est littéralement le canapé ...","I love [NAME], he’s literally the craziest cou...",happiness,surprise,Intense,2,0,0,[],"J'aime, [NAME],, il, est, littéralement, le, c...",...,3,3,5,2,0,0,0,0,5,PRON_3 VERB_4 PUNCT_5 PROPN_1 AUX_2 ADV_3 DET_...
58963,C'est affreux. Je suis content [NAME] de faire...,"That's awful. I'm glad [NAME] doing better, bu...",sadness,surprise,Intense,0,0,0,[],"C'est, affreux., Je, suis, content, [NAME], de...",...,1,3,2,3,0,0,0,0,5,PRON_3 AUX_2 ADJ_2 PUNCT_5 VERB_4 PROPN_1 ADP_...
58964,Je vais voir ça.,I'll check that out. Thank you!!,happiness,surprise,Intense,0,0,0,[],"Je, vais, voir, ça.",...,0,2,0,0,0,0,0,0,1,PRON_2 VERB_2 PUNCT_1


### Calculate TF-IDF

In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords

# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 5), stop_words=None)

# Compute TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(df["Sentence"].astype(str))

# Convert TF-IDF values to a list for each row
df["TF-IDF"] = tfidf_matrix.toarray().tolist()

In [70]:
df.drop(columns=["ADJ", "ADP", "ADV", "CONJ", "DET", 
    "NOUN", "NUM", "PRON", "PUNCT", "VERB", "X", "PRT"], inplace=True)

In [None]:
display(df)

In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58966 entries, 0 to 58965
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Sentence           58966 non-null  object 
 1   Translated         58966 non-null  object 
 2   Emotion            58966 non-null  object 
 3   Sentiment_Score    58966 non-null  object 
 4   Intensity          58966 non-null  object 
 5   Exclamations       58966 non-null  int64  
 6   Ellipses           58966 non-null  int64  
 7   Questions          58966 non-null  int64  
 8   NER                58966 non-null  object 
 9   Word_List          58966 non-null  object 
 10  Word_Count         58966 non-null  int64  
 11  Unique_Word_Ratio  58966 non-null  float64
 12  POS_Tags           58966 non-null  object 
 13  TF-IDF             58966 non-null  object 
dtypes: float64(1), int64(4), object(9)
memory usage: 6.3+ MB


### Testing

In [75]:
df[df["Intensity"] == "Intense"]

Unnamed: 0,Sentence,Translated,Emotion,Sentiment_Score,Intensity,Exclamations,Ellipses,Questions,NER,Word_List,Word_Count,Unique_Word_Ratio,POS_Tags,TF-IDF
0,"Ce que vous ne comprenez pas, c'est, pour nous...","What you guys don't understand is, for us, kis...",happiness,surprise,Intense,0,0,0,[],"Ce, que, vous, ne, comprenez, pas,, c'est,, po...",21,0.904762,PRON_8 ADV_4 VERB_4 PUNCT_4 ADP_2 SCONJ_1 AUX_...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"Ouais, c'est vrai! ....... Y'Serious?","Yeah, right!.......Y'serious?",neutral,surprise,Intense,1,2,1,[],"Ouais,, c'est, vrai!, ......., Y'Serious?",5,1.000000,NOUN_2 PUNCT_4 PRON_1 VERB_1 ADJ_1 X_1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,Oh ouais!,"Oh, yeah!",happiness,surprise,Intense,1,0,0,"[[Oh ouais!, MISC]]","Oh, ouais!",2,1.000000,NOUN_1 ADJ_1 PUNCT_1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,Tout ce que vous devez savoir est dans ce prem...,Everything you need to know is in that first k...,happiness,surprise,Intense,0,0,0,[],"Tout, ce, que, vous, devez, savoir, est, dans,...",11,0.909091,ADJ_2 PRON_3 VERB_2 AUX_1 ADP_1 DET_1 NOUN_1 P...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,Absolument.,Absolutely.,happiness,surprise,Intense,0,0,0,[],Absolument.,1,1.000000,ADV_1 PUNCT_1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58961,Ma scène préférée est la tribu qui ramène le m...,Dozens. My favorite scene is the tribe bringin...,happiness,surprise,Intense,0,0,0,[],"Ma, scène, préférée, est, la, tribu, qui, ramè...",17,0.823529,DET_4 NOUN_4 ADJ_1 AUX_1 PRON_1 VERB_1 ADP_2 P...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
58962,"J'aime [NAME], il est littéralement le canapé ...","I love [NAME], he’s literally the craziest cou...",happiness,surprise,Intense,2,0,0,[],"J'aime, [NAME],, il, est, littéralement, le, c...",26,0.884615,PRON_3 VERB_4 PUNCT_5 PROPN_1 AUX_2 ADV_3 DET_...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
58963,C'est affreux. Je suis content [NAME] de faire...,"That's awful. I'm glad [NAME] doing better, bu...",sadness,surprise,Intense,0,0,0,[],"C'est, affreux., Je, suis, content, [NAME], de...",21,0.904762,PRON_3 AUX_2 ADJ_2 PUNCT_5 VERB_4 PROPN_1 ADP_...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
58964,Je vais voir ça.,I'll check that out. Thank you!!,happiness,surprise,Intense,0,0,0,[],"Je, vais, voir, ça.",4,1.000000,PRON_2 VERB_2 PUNCT_1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


### Save dataset

In [None]:
# Save cleaned dataset
cleaned_file_path = "../Task 4/transformers_dataset.csv"
df.to_csv(cleaned_file_path, index=False, encoding="utf-8")