In [1]:
import os
import pandas as pd
import librosa
import numpy as np
import torch
from transformers import AutoTokenizer
import matplotlib.pyplot as plt
import nlpaug.augmenter.word as naw
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def map_sentiment_emotion(score):
    '''
    Setting up fusion between audio dataset and text dataset.
    input: score from textual dataset
    output: sentiment label matching to audio dataset
    '''
    if score=="1 star":
        return "Angry"
    
    elif score=="2 stars":
        return "Sad"
    
    elif score=="3 stars":
        return "Neutral"
    
    elif score=="4 stars":
        return "Suprised"

    elif score=="5 stars":
        return "Happy"

In [3]:
df=pd.read_csv("../datasets/textualData.csv")
df=df[['tweet', 'language', 'sentiment']].dropna()

df['emotion'] = df['sentiment'].apply(map_sentiment_emotion)
df.head()

Unnamed: 0,tweet,language,sentiment,emotion
0,"Lionel Messi, que ha estado vinculado con un t...",es,3 stars,Neutral
1,This is a guest post by The Joy of Truth. To r...,en,4 stars,Suprised
2,Nous sommes tous conscients de la popularité d...,fr,5 stars,Happy
3,El baño en el sistema de metro de la ciudad de...,es,4 stars,Suprised
4,"""Ich habe dies seit über 20 Jahren getan und i...",de,5 stars,Happy


In [4]:
print(df['emotion'].value_counts(dropna=False))

emotion
Angry       1461
Suprised    1067
Happy       1015
Neutral      937
Sad          437
Name: count, dtype: int64


In [5]:
tokenizer=AutoTokenizer.from_pretrained("xlm-roberta-base")
tokenized=tokenizer(df['tweet'].tolist(), padding=True, truncation=True, return_tensors="pt")

text_data={
    "input_ids": tokenized['input_ids'],
    "attention_mask": tokenized['attention_mask'],
    "labels": df['emotion'].values,
}

os.makedirs("../datasets/processed_text", exist_ok=True)
torch.save(text_data, "../datasets/processed_text/text_data.pt")
print("successfully saved text data")



successfully saved text data


preparing augmented dataset

In [7]:
sad_df=df[df['emotion']=='Sad']
print(sad_df)

                                                  tweet  ... emotion
8     "Molte persone dicono: 'Oh, beh, non ne abbiam...  ...     Sad
22    Après une série de décès très médiatisés la se...  ...     Sad
40    Le type le plus commun et potentiellement le p...  ...     Sad
41    Nel maggio 2009, ho scritto di una storia che ...  ...     Sad
48    Somewhere in the middle is a woman who is so s...  ...     Sad
...                                                 ...  ...     ...
4821  What if your favorite team wasn't a contender ...  ...     Sad
4830  En los últimos años, muchas organizaciones han...  ...     Sad
4846  \nThe US government has been slow to respond t...  ...     Sad
4887  \nThe "New World Order" is already in an uproa...  ...     Sad
4914  Mit all der Aufmerksamkeit, die dem Thema Abtr...  ...     Sad

[437 rows x 4 columns]


In [8]:
aug=naw.ContextualWordEmbsAug(
    model_path="xlm-roberta-base",
    action="substitute",
    device="cuda" if torch.cuda.is_available() else "cpu"
)



In [10]:
augmented_texts = []
num_aug=200

for i in range(num_aug):
    original_text=random.choice(sad_df['tweet'].tolist())
    try:
        augmented_text = aug.augment(original_text)
        augmented_texts.append({'tweet': augmented_text, 'emotion': 'Sad'})
    except Exception as e:
        print(f"Error during augmentation: {e}")
        continue

augmented_df = pd.DataFrame(augmented_texts)
final_df = pd.concat([df[['tweet', 'emotion']], augmented_df], ignore_index=True)
print(final_df['emotion'].value_counts(dropna=False))

emotion
Angry       1461
Suprised    1067
Happy       1015
Neutral      937
Sad          637
Name: count, dtype: int64


In [12]:
print(augmented_df.head())

                                               tweet emotion
0  [Ich bin ein kleiner Unternehmer in Seattle, u...     Sad
1  [Es ist nicht die beste Zeit. Ich bin enttäusc...     Sad
2  [Un peu plus d'un demi-année après l'annonce i...     Sad
3  ["Esto no es tan serio como esperábamos" "Tuvi...     Sad
4  [The European Union's European Parliament has ...     Sad


In [11]:
tokenizer=AutoTokenizer.from_pretrained("xlm-roberta-base")
tokenized=tokenizer(df['tweet'].tolist(), padding=True, truncation=True, return_tensors="pt")

text_data={
    "input_ids": tokenized['input_ids'],
    "attention_mask": tokenized['attention_mask'],
    "labels": df['emotion'].values,
}

os.makedirs("../datasets/processed_text", exist_ok=True)
torch.save(text_data, "../datasets/processed_text/augmented_text_data.pt")
print("successfully saved text data")



successfully saved text data


In [6]:
def extract_mfcc(file_path, sr=16000, n_mfcc=40):
    '''
    Extracting MFCC features from audio files.
    input: file path of audio file
    output: MFCC features
    '''
    y, sr = librosa.load(file_path, sr=sr)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    return mfccs.T

ravdess_path="../datasets/ravdess"
features=[]
labels=[]

emotion_map={
    "01": 2,
    "03": 4,
    "04": 1, 
    "05": 0,
    "08": 3,
}

for actor_folder in os.listdir(ravdess_path):
    folder_path=os.path.join(ravdess_path, actor_folder)
    if not os.path.isdir(folder_path):
        continue
    for file in os.listdir(folder_path):
        if file.endswith('.wav'):
            parts=file.split("-")
            if len(parts)!=7:
                continue
                
            modality=parts[0]
            emotion=parts[2]

            if modality=="03" and emotion in emotion_map:
                file_path=os.path.join(folder_path, file)
                mfccs=extract_mfcc(file_path)
                features.append(mfccs)
                labels.append(emotion_map[emotion])

In [7]:
max_len=200
n_mfcc=40

padded_features = []

for mfcc in features:
    if mfcc.shape[0]>max_len:
        padded=mfcc[:max_len, :]
    else:
        pad_width=max_len-mfcc.shape[0]
        padded=np.pad(mfcc, ((0, pad_width), (0, 0)), mode='constant')
    padded_features.append(padded)

padded_features = np.array(padded_features)
labels=np.array(labels)
np.save("../datasets/processed_audio/features.npy", padded_features)
np.save("../datasets/processed_audio/labels.npy", np.array(labels))
print("successfully saved audio data")

successfully saved audio data
