In [98]:
import librosa
import soundfile
import os, glob, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [99]:
def extract_feature(file_name, mfcc, chroma, mel):
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype = "float32")
        sample_rate = sound_file.samplerate
        if chroma:
            stft = np.abs(librosa.stft(X))
        result = np.array([])
        if mfcc:
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result = np.hstack((result, mfccs))
        if chroma:
            chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
            result = np.hstack((result, chroma))
        if mel:
            mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
            result = np.hstack((result, mel))
    return result

In [100]:
emotions = {
  '01':'neutral',
  '02':'calm',
  '03':'happy',
  '04':'sad',
  '05':'angry',
  '06':'fearful',
  '07':'disgust',
  '08':'surprised' }

observed_emotions = ['angry', 'happy', 'calm', 'fearful']

In [101]:
def load_data(test_size=0.2):
    x,y = [],[]
    for file in glob.glob("/Users/akhilajoshi/Python_Coding/music/Speech-Emotion-Recognition/SER/Actor_*//*.wav"):
        file_name = os.path.basename(file)
        emotion = emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        feature = extract_feature(file, mfcc=True, chroma=True, mel=True)
        x.append(feature)
        y.append(emotion)
    return train_test_split(np.array(x), y, test_size=test_size, random_state=101)

In [102]:
X_train, X_test, y_train, y_test = load_data(test_size=0.25)

 -6.1035156e-05 -3.0517578e-05] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
  mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
 -3.0517578e-05 -3.0517578e-05] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
 -3.0517578e-05  0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
 0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
 0.0000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel =

In [103]:
print((X_train.shape[0], X_test.shape[0]))

(576, 192)


In [104]:
print(f'Features extracted: {X_train.shape[1]}')

Features extracted: 180


In [105]:
model = MLPClassifier(alpha=0.01, batch_size=256, epsilon=1e-08, 
                      hidden_layer_sizes=(300,), learning_rate='adaptive', max_iter=500)

In [106]:
model.fit(X_train,y_train)

In [107]:
predictions = model.predict(X_test)

In [108]:
accuracy = accuracy_score(y_test, predictions)

print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 76.56%


In [109]:
from sklearn.metrics import classification_report,confusion_matrix

In [110]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

       angry       0.79      0.73      0.76        41
        calm       0.80      0.92      0.85        51
     fearful       0.81      0.68      0.74        50
       happy       0.68      0.72      0.70        50

    accuracy                           0.77       192
   macro avg       0.77      0.76      0.76       192
weighted avg       0.77      0.77      0.76       192



In [111]:
print(confusion_matrix(y_test,predictions))

[[30  0  3  8]
 [ 0 47  1  3]
 [ 3  7 34  6]
 [ 5  5  4 36]]
