<h1 style="text-align:center; font-family:Georgia; font-weight:bold; ">Imports</h1>

In [None]:
import librosa
import numpy as np

import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.layers import (
    Attention,
    Bidirectional,
    Conv2D,
    MaxPooling2D,
    Reshape,
    Dense,
    Dropout,
    Input,
)
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
)

<h1 style="text-align:center; font-family:Georgia; font-weight:bold; ">Constants and Global Variables</h1>

In [None]:
# Load the dataset; later cells read its "mfcc" and "Emotion" columns.
audio = pd.read_csv("Collected Datasets/audio.csv")
audio.info()

# EMOTIONS sizes the softmax output in buildModel and names the classes in
# evaluateModel. MODELS lists the recurrent layer classes tried in turn by
# compareAudioModels.
EMOTIONS = ["happiness", "neutral", "sadness", "anger", "fear"]
MODELS = [layers.LSTM, layers.GRU, layers.SimpleRNN]


# Shared, module-level preprocessing objects used by scale() and encode().
scaler = StandardScaler()
encoder = LabelEncoder()
# NOTE(review): encode(..., f="train") calls encoder.fit_transform, which refits
# the encoder, so this preset class order only applies when transform is used
# without fitting first — confirm which order downstream code expects.
encoder.classes_ = np.array(EMOTIONS)

# Training hyper-parameters consumed by train().
BATCH_SIZE = 32
EPOCHS = 20

<h1 style="text-align:center; font-family:Georgia; font-weight:bold; ">Cleaning and Preprocessing</h1>

In [None]:
def _mfccStrToArray(mfccStr):
    vals = np.array(list(map(float, mfccStr.split(","))))
    return vals.reshape(13, vals.size // 13)


def _mfccArrayToStr(mfccArr):
    return ",".join(map(str, mfccArr.flatten()))

In [None]:
def scale(data, f="train"):
    """Standardise features with the module-level scaler.

    Args:
        data: Feature matrix handed to the scaler.
        f: "train" fits the scaler on data before transforming; any other
            value applies the already-fitted scaler.

    Returns:
        The transformed data.
    """
    shouldFit = f == "train"
    return scaler.fit_transform(data) if shouldFit else scaler.transform(data)

In [None]:
def encode(data, f="train"):
    """Convert labels to integer codes with the module-level encoder.

    Args:
        data: Labels handed to the encoder.
        f: "train" fits the encoder on data before transforming; any other
            value applies the encoder as it currently stands.

    Returns:
        The encoded labels.
    """
    shouldFit = f == "train"
    return encoder.fit_transform(data) if shouldFit else encoder.transform(data)

In [None]:
def extractMFCC(filePath):
    """Compute 13 MFCCs for an audio file and serialise them as a string.

    Args:
        filePath: Path of the audio file passed to librosa.load.

    Returns:
        The flattened MFCC matrix as comma-separated values.
    """
    # sr=None is forwarded to librosa.load, and the rate it returns is reused
    # for the MFCC computation.
    signal, sampleRate = librosa.load(filePath, sr=None)
    coefficients = librosa.feature.mfcc(y=signal, sr=sampleRate, n_mfcc=13)
    return ",".join(str(value) for value in coefficients.flatten())

In [None]:
def zeroPadding(data, maxLen=120):
    """Force every serialised MFCC matrix in data["mfcc"] to maxLen frames.

    Shorter matrices are right-padded with zeros along the frame axis and
    longer ones are truncated, so every row ends up with the same number of
    columns and the column can later be stacked into one regular array.

    Args:
        data: DataFrame with an "mfcc" column of comma-separated strings as
            understood by _mfccStrToArray. The column is overwritten in place.
        maxLen: Number of frames each matrix has after the call.

    Returns:
        The same DataFrame, with the "mfcc" column rewritten.
    """

    def _fitToMaxLen(mfccStr):
        mfccArr = _mfccStrToArray(mfccStr)
        numFrames = mfccArr.shape[1]
        if numFrames < maxLen:
            mfccArr = np.pad(
                mfccArr, ((0, 0), (0, maxLen - numFrames)), mode="constant"
            )
        elif numFrames > maxLen:
            # Without this, longer clips keep their length and the column
            # cannot be stacked into a single (samples, 13, maxLen) array.
            mfccArr = mfccArr[:, :maxLen]
        return _mfccArrayToStr(mfccArr)

    # Single pass: parse, pad or truncate, and re-serialise each row.
    data["mfcc"] = data["mfcc"].apply(_fitToMaxLen)
    return data

In [None]:
def evaluateModel(yTrue, yPred):
    """Print classification metrics and plot the confusion matrix.

    Args:
        yTrue: Ground-truth integer class labels (train passes the argmax of
            the one-hot validation targets).
        yPred: Predicted integer class labels.
    """
    # The integer labels are produced by the module-level LabelEncoder, so the
    # display names must follow encoder.classes_. A fitted LabelEncoder keeps
    # its classes sorted, which differs from the hand-written order of EMOTIONS
    # and would otherwise mislabel every row of the report.
    classNames = [str(name) for name in encoder.classes_]
    labelIds = list(range(len(classNames)))

    # Explicit labels keep the report and matrix at a fixed size even when a
    # class is missing from this particular split.
    print(
        classification_report(
            yTrue, yPred, labels=labelIds, target_names=classNames
        )
    )

    accuracy = accuracy_score(yTrue, yPred)
    print(f"Accuracy: {accuracy:.4f}")

    precision = precision_score(yTrue, yPred, average="weighted")
    print(f"Precision: {precision:.4f}")

    recall = recall_score(yTrue, yPred, average="weighted")
    print(f"Recall: {recall:.4f}")

    f1 = f1_score(yTrue, yPred, average="weighted")
    print(f"F1 Score: {f1:.4f}")

    cm = confusion_matrix(yTrue, yPred, labels=labelIds)
    print(f"Confusion Matrix:\n{cm}")

    plt.figure(figsize=(6, 4))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        cbar=False,
        xticklabels=classNames,
        yticklabels=classNames,
    )
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

In [None]:
def buildModel(shape, modelType):
    """Build and compile a CNN + bidirectional-RNN + attention classifier.

    Args:
        shape: Two-element shape of one sample; it is indexed as shape[0] and
            shape[1] (the caller passes xTrain.shape[1:]).
        modelType: Recurrent layer class, instantiated as
            modelType(120, return_sequences=True).

    Returns:
        A compiled Sequential model with len(EMOTIONS) softmax outputs.
    """
    print(f"\nBuilding model with {modelType.__name__}")

    model = Sequential()
    model.add(layers.Input(shape=shape))

    # Append a trailing channel axis of size 1 so Conv2D can consume the input.
    model.add(Reshape((shape[0], shape[1], 1)))

    # Two convolution blocks, each followed by 2x2 max pooling.
    model.add(Conv2D(32, (3, 3), activation="relu", padding="same"))
    model.add(MaxPooling2D((2, 2)))
    model.add(Conv2D(64, (3, 3), activation="relu", padding="same"))
    model.add(MaxPooling2D((2, 2)))

    # Merge the last two axes of the pooled output into a single feature axis;
    # the remaining (-1) axis becomes the sequence axis for the recurrent layer.
    model.add(Reshape((-1, model.output_shape[-1] * model.output_shape[-2])))

    model.add(Bidirectional(modelType(120, return_sequences=True)))

    def _attention(x):
        # Attention with the same tensor passed twice ([x, x]), then averaged
        # over axis 1 to produce one vector per sample.
        attn_layer = Attention()
        attn_output = attn_layer([x, x])
        return tf.reduce_mean(attn_output, axis=1)

    model.add(layers.Lambda(_attention))

    # Classification head.
    model.add(Dense(64, activation="relu"))
    model.add(Dropout(0.3))
    model.add(Dense(len(EMOTIONS), activation="softmax"))

    # categorical_crossentropy: the caller one-hot encodes targets with
    # to_categorical before training.
    model.compile(
        optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"]
    )

    return model

In [None]:
def train(model, xTrain, yTrain, xVal, yVal):
    """Fit a model, then evaluate it on the validation split.

    Args:
        model: Compiled Keras model.
        xTrain: Training features.
        yTrain: One-hot training targets.
        xVal: Validation features.
        yVal: One-hot validation targets.

    Returns:
        The same model after fitting.
    """
    # In the stack assembled by buildModel the recurrent layer is wrapped in
    # Bidirectional and sits after the convolutional front end, so layers[0] is
    # never the recurrent layer. Report the wrapped cell's class instead, and
    # fall back to the first layer for models without a Bidirectional wrapper.
    rnnName = next(
        (
            layer.forward_layer.__class__.__name__
            for layer in model.layers
            if isinstance(layer, Bidirectional)
        ),
        model.layers[0].__class__.__name__,
    )
    print(f"\nTraining {rnnName} model...")
    model.fit(
        xTrain,
        yTrain,
        validation_data=(xVal, yVal),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        verbose=True,
    )
    # Convert softmax outputs and one-hot targets back to class indices.
    predictions = model.predict(xVal)
    predictions = np.argmax(predictions, axis=1)
    trueLabels = np.argmax(yVal, axis=1)
    evaluateModel(trueLabels, predictions)
    return model

In [None]:
def compareAudioModels(xTrain, yTrain, xVal, yVal):
    """Build, train and evaluate one model per entry in MODELS.

    Args:
        xTrain: Training features; xTrain.shape[1:] is the model input shape.
        yTrain: One-hot training targets.
        xVal: Validation features.
        yVal: One-hot validation targets.

    Returns:
        The trained model built with SimpleRNN, or None if MODELS contains no
        SimpleRNN entry.
    """
    bestModel = None
    for modelType in MODELS:
        candidate = buildModel(xTrain.shape[1:], modelType)
        candidate = train(candidate, xTrain, yTrain, xVal, yVal)

        # Identify the architecture from the layer class being iterated.
        # Inspecting candidate.layers[0] can never match: buildModel wraps the
        # recurrent layer in Bidirectional and places it after the convolution
        # stack, so the function used to return None every time.
        # NOTE(review): the selection is hard-coded to SimpleRNN rather than
        # driven by a validation metric.
        if modelType.__name__ == "SimpleRNN":
            bestModel = candidate

    return bestModel

In [None]:
# Bring every serialised MFCC matrix to a common frame count.
audio = zeroPadding(audio)

# Parse each row into a (13, frames) array, stack them, then swap the last two
# axes so each sample is laid out as (frames, 13).
features = np.array([_mfccStrToArray(mfcc) for mfcc in audio["mfcc"]])
features = np.transpose(features, (0, 2, 1))

# Integer-encode the emotion labels (this fits the encoder), then one-hot them
# for the categorical_crossentropy loss.
targets = encode(audio["Emotion"], f="train")
targets = to_categorical(targets)
xTrain, xVal, yTrain, yVal = train_test_split(
    features, targets, test_size=0.2, stratify=targets, random_state=42
)

# StandardScaler works on 2-D input: fold samples and frames into rows so each
# of the numFeatures columns is standardised across all frames.
samplesTrain, timeSteps, numFeatures = xTrain.shape
xTrainFlat = xTrain.reshape(-1, numFeatures)
xValFlat = xVal.reshape(-1, numFeatures)

# Fit the scaler on the training split only, then reuse it for validation.
xTrainScaled = scale(xTrainFlat, f="train")
xValScaled = scale(xValFlat, f="test")

# Restore the (samples, frames, features) layout expected by the model.
xTrainScaled = xTrainScaled.reshape(samplesTrain, timeSteps, numFeatures)
xValScaled = xValScaled.reshape(xVal.shape[0], timeSteps, numFeatures)

# NOTE(review): compareAudioModels starts from bestModel = None; if it selects
# no model, the .save call below raises AttributeError.
bestModel = compareAudioModels(xTrainScaled, yTrain, xValScaled, yVal)
bestModel.save("bestAudioEmotionModel.h5")