# Create and Train Model

In [1]:
import os

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

import config
from data_loading import load_data_from_dir


### Load the Features previously extracted as part of main.py

In [2]:
wav_mfccs_dir = config.TRAIN_WAV_MFCCS
lms_mfccs_dir = config.TRAIN_LMS_MFCCS

wav_mfccs_list = []
wav_labels_list = []

for file_name in os.listdir(wav_mfccs_dir):
    if file_name.endswith('.npy'):
        mfccs = np.load(os.path.join(wav_mfccs_dir, file_name))
        wav_mfccs_list.append(mfccs)
        
        label = file_name.split('_')[0]
        wav_labels_list.append(label)

lms_mfccs_list = []
lms_labels_list = []

for file_name in os.listdir(lms_mfccs_dir):
    if file_name.endswith('.npy'):
        mfccs = np.load(os.path.join(lms_mfccs_dir, file_name))
        lms_mfccs_list.append(mfccs)
        
        label = file_name.split('_')[0]
        lms_labels_list.append(label)



Check shape match

In [3]:
print(wav_mfccs_list[0].shape)
print(lms_mfccs_list[0].shape)


(40, 775)
(40, 775)


### Load the Previously Split Directories

Store feature files into tes, val, and train

### Load Training

In [4]:
#train_dirs = [config.TRAIN_WAV_MFCCS, config.TRAIN_AUG_WAV_MFCCS]
train_dirs = [config.TRAIN_LMS_MFCCS, config.TRAIN_AUG_LMS_MFCCS]

X_train = []
y_train = []

for train_dir in train_dirs:
    for file_name in os.listdir(train_dir):
        if file_name.endswith('.npy'):
            file_path = os.path.join(train_dir, file_name)
            mfccs = np.load(file_path)
            X_train.append(mfccs)
            label = file_name.split('.npy')[0]
            y_train.append(label)

X_train = np.array(X_train)
y_train = np.array(y_train)


In [5]:
X_train.shape, y_train.shape

((1716, 40, 775), (1716,))

### Load Testing

In [6]:
#test_dirs = [config.TEST_WAV_MFCCS]
test_dirs = [config.TEST_LMS_MFCCS]

X_test = []
y_test = []

for test_dir in test_dirs:
    for file_name in os.listdir(test_dir):
        if file_name.endswith('.npy'):
            file_path = os.path.join(test_dir, file_name)
            mfccs = np.load(file_path)
            X_test.append(mfccs)
            label = file_name.split('.npy')[0]
            y_test.append(label)

X_test = np.array(X_test)
y_test = np.array(y_test)


In [7]:
X_test.shape, y_test.shape

((394, 40, 775), (394,))

### Load Validation

In [8]:
#val_dirs = [config.VAL_WAV_MFCCS]
val_dirs = [config.VAL_LMS_MFCCS]

X_val = []
y_val = []

for val_dir in val_dirs:
    for file_name in os.listdir(val_dir):
        if file_name.endswith('.npy'):
            file_path = os.path.join(val_dir, file_name)
            mfccs = np.load(file_path)
            X_val.append(mfccs)
            label = file_name.split('.npy')[0]
            y_val.append(label)

X_val = np.array(X_val)
y_val = np.array(y_val)


In [9]:
X_val.shape, y_val.shape

((344, 40, 775), (344,))

### Define the model

Get number of classes from filename

In [10]:
filenames, labels = load_data_from_dir(config.WAV_DIR_PATH)

emotion_labels = [filename.split('-')[2] for filename in filenames] # based on filename identifiers https://www.kaggle.com/datasets/uwrfkaggler/ravdess-emotional-speech-audio

num_classes = len(np.unique(emotion_labels))

print("Number of classes (emotions):", num_classes)

Number of classes (emotions): 8


In [21]:
X_train_reshaped = X_train.reshape(X_train.shape[0], X_train.shape[1], X_train.shape[2], 1)

model = models.Sequential([
    layers.Input(shape=X_train_reshaped.shape[1:]),
    layers.Conv2D(32, (3, 3), activation='relu'),
    #layers.BatchNormalization(),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(num_classes, activation='softmax')
])


### Compile the Model

In [22]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

### Need to Encode the Labels as they are String Data

In [13]:
# extract emotion labels from filenames
emotions = [label.split('-')[2] for label in y_train]

label_encoder = LabelEncoder()
emotions_encoded = label_encoder.fit_transform(emotions)
emotions_encoded = emotions_encoded.reshape(-1, 1)

encoder = OneHotEncoder(categories='auto', sparse_output=False) # try sparse true
y_train_encoded = encoder.fit_transform(emotions_encoded)

y_test_encoded = encoder.transform(label_encoder.transform([label.split('-')[2] for label in y_test]).reshape(-1, 1))
y_val_encoded = encoder.transform(label_encoder.transform([label.split('-')[2] for label in y_val]).reshape(-1, 1))


In [14]:
print(np.unique(emotions))

['01' '02' '03' '04' '05' '06' '07' '08']


Train a model using waveforms and another using spectrgrams and compare

### Train the Model

* train only wav first
* then train spectrogram
* compare results from model, enhance better performing

In [23]:
X_val_reshaped = X_val.reshape(X_val.shape[0], X_val.shape[1], X_val.shape[2], 1)

history = model.fit(X_train_reshaped, y_train_encoded,
                    epochs=10, 
                    batch_size=32,
                    validation_data=(X_val_reshaped, y_val_encoded)) 


Epoch 1/10
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 1s/step - accuracy: 0.1322 - loss: 10.0273 - val_accuracy: 0.1105 - val_loss: 2.0797
Epoch 2/10
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 1s/step - accuracy: 0.1209 - loss: 2.0775 - val_accuracy: 0.1453 - val_loss: 2.0767
Epoch 3/10
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m982s[0m 19s/step - accuracy: 0.1350 - loss: 2.0749 - val_accuracy: 0.1366 - val_loss: 2.0751
Epoch 4/10
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 2s/step - accuracy: 0.1314 - loss: 2.0718 - val_accuracy: 0.1366 - val_loss: 2.0738
Epoch 5/10
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 2s/step - accuracy: 0.1464 - loss: 2.0696 - val_accuracy: 0.1366 - val_loss: 2.0728
Epoch 6/10
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 2s/step - accuracy: 0.1377 - loss: 2.0679 - val_accuracy: 0.1366 - val_loss: 2.0720
Epoch 7/10
[1m54/54[0m [32m━━━━━━━

When provided feature data from both spectrogram and waveform, validation accuracy and loss seem to fluctuate, might suggest overfitting or insufficient generalistion to the validation set

two models, one to train with wav the other spectrgrams. Then compare on improve the better model. Document!

### Evaluate Model

In [16]:
# Evaluate the model
X_test_reshaped = X_test.reshape(X_test.shape[0], X_test.shape[1], X_test.shape[2], 1)

test_loss, test_accuracy = model.evaluate(X_test_reshaped, y_test_encoded)
print('Test accuracy:', test_accuracy)

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 139ms/step - accuracy: 0.6638 - loss: 1.7036
Test accuracy: 0.6903553009033203
