<h1 style="text-align:center; font-family:Georgia; font-weight:bold; ">Imports</h1>

In [2]:
import os
import random
import librosa
import whisper
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from hmmlearn import hmm
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Dropout

<h1 style="text-align:center; font-family:Georgia; font-weight:bold; ">Constants and Global Variables</h1>

In [None]:
EMOTIONS = ['happiness', 'neutral', 'sadness', 'anger', 'fear', ]
MAPPER = {emotion: [] for emotion in EMOTIONS}
DATA = {emotion: [] for emotion in EMOTIONS}
RECORDS = []


TO_TEXT = whisper.load_model("base")

In [4]:
wordMap = pd.read_csv('wordMap.csv')
for emotion in wordMap.columns.to_list():    
    if emotion.lower() in EMOTIONS: MAPPER[emotion.lower()] = wordMap[emotion].to_list()
    if emotion in ['Calm', 'Boredom']: MAPPER['neutral'] += wordMap[emotion].to_list()
    if emotion in ['Excitement', 'Pride']: MAPPER['happiness'] += wordMap[emotion].to_list()
    if emotion in ['Disgust', 'Frustration', 'Contempt']: MAPPER['anger'] += wordMap[emotion].to_list()

<h1 style="text-align:center; font-family:Georgia; font-weight:bold; ">Cleaning and Preprocessing</h1>

In [5]:
many = pd.read_csv('Raw Datasets/emotion_sentimen_dataset.csv')
for emotion, group in many.groupby('Emotion'):
    if emotion.lower() in EMOTIONS: 
        if emotion.lower() == 'neutral': 
            if len(DATA['neutral']) > 30000: 
                continue
                
        DATA[emotion.lower()].extend(group['text'].tolist())

In [6]:
encoded = pd.read_csv('Raw Datasets/text.csv')
for emotion, group in encoded.groupby('label'):
    if emotion == 0: DATA['sadness'].extend(group['text'].tolist()) 
    if emotion == 1: DATA['happiness'].extend(group['text'].tolist())
    if emotion == 3: DATA['anger'].extend(group['text'].tolist())
    if emotion == 4: DATA['fear'].extend(group['text'].tolist())

In [7]:
df = pd.DataFrame([
    {'Text': text, 'Emotion': emotion}
    for emotion, texts in DATA.items()
    for text in texts
])
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)
df.info()
df.to_csv('Collected Datasets/text.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 695131 entries, 0 to 695130
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   Text     695131 non-null  object
 1   Emotion  695131 non-null  object
dtypes: object(2)
memory usage: 10.6+ MB


In [None]:
validEmotions = {
    'a': 'anger',
    'f': 'fear',
    'h': 'happiness',
    'sa': 'sadness',
    'n': 'neutral',
}

for file in os.listdir("Raw Datasets/ALL"):
    if file.endswith('.wav'):
        parts = file.split('_')
        if len(parts) < 2:
            continue

        code = parts[1].lower()

        emotion = None
        for prefix, label in validEmotions.items():
            if code.startswith(prefix):
                emotion = label
                break
        if not emotion:
            continue  

        filePath = os.path.join("Raw Datasets/ALL", file)

        result = TO_TEXT.transcribe(filePath)
        text = result.get('text', '').strip()

        y, sr = librosa.load(filePath, sr=None)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        mfccFlat = mfcc.flatten()

        RECORDS.append({
            'text': text,
            'mfcc': ','.join(map(str, mfccFlat)),
            'emotion': emotion
        })


In [9]:
validEmotions = {
    'SAD': 'sadness',
    'ANG': 'anger',
    'FEA': 'fear',
    'HAP': 'happiness',
    'NEU': 'neutral'
}

for file in os.listdir("Raw Datasets/AudioWAV"):
    if file.endswith('.wav'):
        parts = file.split('_')
        if len(parts) < 3:
            continue

        code = parts[2].upper()
        emotion = validEmotions.get(code)
        if not emotion:
            continue

        filePath = os.path.join("Raw Datasets/AudioWAV", file)

        result = TO_TEXT.transcribe(filePath)
        text = result.get('text', '').strip()

        y, sr = librosa.load(filePath, sr=None)
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        mfccFlat = mfcc.flatten()

        RECORDS.append({
            'text': text,
            'mfcc': ','.join(map(str, mfccFlat)),
            'emotion': emotion
        })


In [10]:
df = pd.DataFrame(RECORDS)
df.drop_duplicates(inplace=True)
df.info()
df.to_csv('Collected Datasets/audioAndText.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6591 entries, 0 to 6590
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     6591 non-null   object
 1   mfcc     6591 non-null   object
 2   emotion  6591 non-null   object
dtypes: object(3)
memory usage: 154.6+ KB
