In [None]:
import os
import numpy as np
import librosa as lb
from tensorflow.keras import models, layers, optimizers, callbacks # type: ignore
from sklearn.model_selection import train_test_split

In [2]:
# --- Preprocessing configuration ---------------------------------------------
PATH = './ravdess'                 # root directory walked for .wav files
SAMPLE_RATE = 22050                # sampling rate every clip is loaded at
DURATION = 3                       # clip length, in seconds
NOISE_FACTOR = 0.005               # scale of the Gaussian noise augmentation
LENGTH = SAMPLE_RATE * DURATION    # fixed number of samples per clip

# Emotion code (third dash-separated field of a file name) -> class index.
EMOTIONS = {
    code: index
    for index, code in enumerate(('01', '02', '03', '04', '05', '06', '07', '08'))
}

# Module-level accumulators: parse() appends to them, save() writes them out.
features, labels, genders, actors = [], [], [], []

In [None]:
def load(path):
    """Load an audio file and return it as a signal of exactly LENGTH samples.

    The file is read at SAMPLE_RATE, leading/trailing silence is trimmed, and
    the result is cropped or zero-padded at the end to LENGTH samples.

    The silence trim runs *before* the crop. Capping the read at DURATION
    seconds first would let leading silence use up part of the window and
    would make the crop a no-op.

    Args:
        path: Path of the audio file to read.

    Returns:
        numpy array holding LENGTH samples.
    """
    data, _ = lb.load(path, sr=SAMPLE_RATE)
    data, _ = lb.effects.trim(data)
    # Keep at most LENGTH samples of the trimmed signal ...
    data = data[:LENGTH]
    # ... and right-pad with zeros when the trimmed signal is shorter.
    missing = LENGTH - len(data)
    if missing > 0:
        data = np.pad(data, (0, missing), 'constant')
    return data

In [None]:
def extract(data):
    """Convert a waveform into a dB-scaled mel spectrogram with a channel axis.

    Args:
        data: Audio samples, interpreted at SAMPLE_RATE.

    Returns:
        The mel spectrogram in decibels with one extra trailing axis of
        size 1 appended.
    """
    mel_power = lb.feature.melspectrogram(y=data, sr=SAMPLE_RATE)
    mel_db = lb.power_to_db(mel_power)
    # Append a singleton last axis (same as indexing with [..., np.newaxis]).
    return np.expand_dims(mel_db, axis=-1)

In [None]:
def augument(data):
    """Create two augmented variants of a waveform.

    Args:
        data: Audio samples, interpreted at SAMPLE_RATE.

    Returns:
        Tuple ``(data_noise, data_pitch)``:
        ``data_noise`` is ``data`` plus standard-normal noise scaled by
        NOISE_FACTOR; ``data_pitch`` is ``data`` pitch-shifted by -2 steps.
    """
    # Size the noise from the signal itself instead of the global LENGTH so
    # the function also works for inputs that were not padded to LENGTH.
    noise = np.random.randn(*np.shape(data))
    data_noise = data + NOISE_FACTOR * noise
    data_pitch = lb.effects.pitch_shift(data, sr=SAMPLE_RATE, n_steps=-2)
    return (data_noise, data_pitch)

In [None]:
def parse(root, filename):
    """Process one file and append its samples to the module-level lists.

    Files that do not end in '.wav', or whose emotion code is not a key of
    EMOTIONS, are ignored. For an accepted file three feature arrays are
    appended to ``features`` (original, noisy, pitch-shifted), and the same
    label / actor / gender is appended three times to ``labels``, ``actors``
    and ``genders``.

    Args:
        root: Directory containing the file.
        filename: File name; its dash-separated fields are read by position
            (field 2 is the emotion code, field 6 the actor id).

    Returns:
        None.
    """
    suffix = '.wav'
    if not filename.endswith(suffix):
        return

    fields = filename.removesuffix(suffix).split('-')
    emotion, actor = fields[2], fields[6]
    # 0 for even actor ids, 1 for odd ones.
    gender = int(actor) % 2
    if emotion not in EMOTIONS:
        return

    clean = load(os.path.join(root, filename))
    noisy, shifted = augument(clean)
    variants = (clean, noisy, shifted)
    for variant in variants:
        features.append(extract(variant))

    # One metadata entry per feature array so all lists stay aligned.
    copies = len(variants)
    labels.extend([EMOTIONS[emotion]] * copies)
    actors.extend([actor] * copies)
    genders.extend([gender] * copies)

In [None]:
def save():
    """Write the accumulated lists to .npy files inside the 'data' directory.

    Each module-level list (features, labels, genders, actors) is converted
    to a numpy array; the directory is created if it does not exist yet.
    """
    # Convert everything first so a conversion error leaves the disk untouched.
    outputs = {
        'data/features.npy': np.array(features),
        'data/labels.npy': np.array(labels),
        'data/genders.npy': np.array(genders),
        'data/actors.npy': np.array(actors),
    }
    os.makedirs('data', exist_ok=True)
    for target, array in outputs.items():
        np.save(target, array)

In [None]:
def main():
    """Preprocess every .wav file under PATH and save the resulting arrays.

    Walks PATH, hands each .wav file to parse(), prints a progress line after
    every visited directory, and finally calls save().

    The module-level accumulators are emptied first, so running this cell
    again rebuilds the dataset instead of appending a second copy to it.
    When no .wav file is found, a message is printed and nothing is saved.
    """
    # Start from a clean slate: parse() appends to these shared lists.
    for store in (features, labels, genders, actors):
        store.clear()

    # Collect the work list up front so the total is measured, not assumed.
    listing = [
        (root, [name for name in files if name.endswith('.wav')])
        for root, _, files in os.walk(PATH)
    ]
    total = sum(len(wavs) for _, wavs in listing)
    if total == 0:
        # Avoid dividing by zero and overwriting earlier output with empty arrays.
        print(f'No .wav files found under {PATH}')
        return

    progress = 0
    for root, wavs in listing:
        for file in wavs:
            parse(root, file)
            progress += 1
        print(f'{(progress * 100) / total}% Complete [{total - progress} files remaining]')
    save()

if __name__ == "__main__":
    main()


In [None]:
# --- Training configuration ---------------------------------------------------
LEARNING_RATE = 1e-3   # learning rate passed to the Adam optimizer
EPOCHS = 20            # number of epochs passed to model.fit
# NOTE: this rebinds PATH, which earlier in the notebook points at './ravdess'.
PATH = './data'        # directory the .npy arrays are loaded from

In [None]:
def load():
    """Read the saved dataset arrays from PATH.

    NOTE: this definition reuses the name ``load`` of the one-argument audio
    loader defined earlier in the notebook and replaces it from here on.

    Returns:
        Tuple ``(features, labels, genders, actors)`` -- in that order --
        loaded from the corresponding ``.npy`` files.
    """
    file_names = ('features.npy', 'labels.npy', 'genders.npy', 'actors.npy')
    return tuple(np.load(os.path.join(PATH, name)) for name in file_names)

In [None]:
def split(x, y, a, g):
    """Split four aligned arrays into train / test / validation subsets.

    20% of the samples are held out first (stratified on ``y``); that part is
    then halved (stratified again on its labels) into validation and test.

    NOTE(review): the split is per sample and stratified on ``y`` only, so
    samples derived from the same recording can end up in different subsets.

    Args:
        x, y, a, g: Arrays with the same first dimension; ``y`` holds the
            labels used for stratification.

    Returns:
        Four tuples, one per input in the order ``x, y, a, g``, each laid out
        as ``(train, test, val)``.
    """
    seed = 42
    # train_test_split returns [x_1, x_2, y_1, y_2, ...]: even positions hold
    # the first part of each input, odd positions the second part.
    first_cut = train_test_split(x, y, a, g, test_size=0.2, random_state=seed, stratify=y)
    train, held_out = first_cut[0::2], first_cut[1::2]

    second_cut = train_test_split(*held_out, test_size=0.5, random_state=seed, stratify=held_out[1])
    val, test = second_cut[0::2], second_cut[1::2]

    return tuple(zip(train, test, val))

In [None]:
def init(shape):
    """Build and compile the CNN classifier.

    Layout: input normalization, three convolutional blocks
    (Conv2D -> BatchNormalization -> ReLU -> MaxPooling -> Dropout), global
    average pooling, a 64-unit dense layer, and an 8-way softmax output.

    Args:
        shape: Shape of one input sample (without the batch dimension).

    Returns:
        The compiled Sequential model (Adam with LEARNING_RATE,
        sparse categorical cross-entropy loss, accuracy metric).
    """
    # (number of filters, dropout rate) of each convolutional block, in order.
    conv_blocks = ((64, 0.2), (128, 0.3), (256, 0.4))

    model = models.Sequential()

    # Input scaler; its statistics are set later through adapt().
    model.add(layers.Normalization(axis=None, input_shape=shape))

    # Feature extractors: the bias is dropped because batch norm follows.
    for filters, drop_rate in conv_blocks:
        model.add(layers.Conv2D(filters, (3, 3), padding='same', use_bias=False))
        model.add(layers.BatchNormalization())
        model.add(layers.Activation('relu'))
        model.add(layers.MaxPooling2D((2, 2)))
        model.add(layers.Dropout(drop_rate))

    # Summarize each feature map by its mean instead of flattening it.
    model.add(layers.GlobalAveragePooling2D())

    # Classifier head: one output unit per emotion class.
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(8, activation='softmax'))

    model.compile(
        optimizer=optimizers.Adam(learning_rate=LEARNING_RATE),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [None]:
def main():
    """Train the CNN on the saved dataset and checkpoint the best weights.

    Loads the arrays, splits them into train / test / validation subsets,
    builds the model, adapts its normalization layer on the training
    features, and fits for EPOCHS epochs while saving the best model to
    'data/weights.keras'.

    NOTE: this definition reuses the name ``main`` of the preprocessing entry
    point defined earlier in the notebook and replaces it from here on.

    Returns:
        The History object returned by ``model.fit``.
    """
    # load() returns (features, labels, genders, actors) in exactly this
    # order; unpacking as "x, y, a, g" would swap actors and genders.
    x, y, g, a = load()
    (x_train, x_test, x_val), (y_train, y_test, y_val), actor_splits, gender_splits = split(x, y, a, g)

    model = init(x.shape[1:])
    checkpoint = callbacks.ModelCheckpoint('data/weights.keras', save_best_only=True)

    # Fit the normalization statistics on the training features only.
    model.layers[0].adapt(x_train)
    history = model.fit(
        x_train, y_train,
        validation_data=(x_val, y_val),
        epochs=EPOCHS,
        batch_size=32,
        callbacks=[checkpoint],
        verbose=2
    )
    return history

if __name__ == "__main__":
    main()
