In [8]:
import soundfile, librosa, pickle, glob, os, parselmouth
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import SVC 

# all emotions on RAVDESS dataset
allemotion = {
    "01": "angry",
    "02": "happy",
    "03": "neutral",
    "04": "sad"
}
# Emotions to observe
observed_emotions={
    "angry",
    "sad",
    "neutral",
    "happy"
}

In [25]:
#Extract features (mfcc, chroma, mel, contrast, tonnetz) from a sound file
def extract_features(file_name, sound_n, **kwargs):
    """
    Extract feature from audio file `file_name`
        Features supported:
            - MFCC (mfcc)
            - Chroma (chroma)
            - MEL Spectrogram Frequency (mel)
            - Contrast (contrast)
            - Tonnetz (tonnetz)
        e.g:
        `features = extract_feature(path, mel=True, mfcc=True)`
    """
    mfcc = kwargs.get("mfcc")
    chroma = kwargs.get("chroma")
    mel = kwargs.get("mel")
    
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate        
        #
        stft = np.abs(librosa.stft(X))
        result = np.array([])
        # mfcc:
        mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
        result = np.hstack((result, mfccs))
        # mel:
        mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
        result = np.hstack((result, mel))
        
        #pitch = sound_n.to_pitch()
        pitch = parselmouth.praat.call(sound_n, "To Pitch", 0.0, 75, 500) #create a praat pitch object
        #pitch = parselmouth.praat.call(sound_n, "To Pitch", 0.0, 75, 400)
        meanF0 = parselmouth.praat.call(pitch, "Get mean", 0, 0, "Hertz") # get mean pitch
        stdevF0 = parselmouth.praat.call(pitch, "Get standard deviation", 0 ,0, "Hertz") # get standard deviation
        result = np.hstack((result, meanF0))
        result = np.hstack((result, stdevF0))

    return result

In [26]:
# Load the data
def load_data(test_size):
    x, y = [], []
    try :
        for file in glob.glob("Data/NActor_*/*.wav"):
            ## read the sound
            sound = parselmouth.Sound(file)
            # get the base name of the audio file
            basename = os.path.basename(file)
            # get the emotion label
            emotion = allemotion[basename.split(" ")[0].split("-")[3]] 
            # we allow only observed_emotions we set for both gender
            if emotion not in observed_emotions:
                continue
            # extract speech features
            features = extract_features(file, sound, mfcc=True, chroma=True, mel=True)
            # add to data
            x.append(features)
            y.append(emotion)
    except :
         pass
    # split the data to training and testing and return it
    return train_test_split(np.array(x), np.array(y), test_size=test_size, random_state=9)

In [27]:
#loading_data
print("Both gender 80/20")
x_train,x_test,y_train,y_test = load_data(test_size = 0.19)
svm_model_linear = SVC(kernel = 'linear', C = 1).fit(x_train, y_train)

# print some details
# number of samples in training data
print("[+] Number of training samples:", x_train.shape[0])
# number of samples in testing data
print("[+] Number of testing samples:", x_test.shape[0])
# number of features used
# this is a vector of features extracted 
# using utils.extract_features() method
print("[+] Number of features:", x_train.shape[1])
#________________________________________________________________________
#linear kernel (best result is linear kernel)
svm_model_linear = SVC(kernel = 'linear', C = 1).fit(x_train, y_train) 
svm_predictions = svm_model_linear.predict(x_test) 

# calculate the accuracy
accuracy = accuracy_score(y_true=y_test,y_pred=svm_predictions)
print("Accuracy linear kernel: {:.2f}%".format(accuracy*100))
print(classification_report(y_test,svm_predictions))
print(confusion_matrix(y_test, svm_predictions))

Both gender 80/20
[+] Number of training samples: 466
[+] Number of testing samples: 110
[+] Number of features: 170
Accuracy linear kernel: 60.91%
              precision    recall  f1-score   support

       angry       0.77      0.71      0.74        34
       happy       0.58      0.53      0.55        34
     neutral       0.48      0.50      0.49        22
         sad       0.56      0.70      0.62        20

    accuracy                           0.61       110
   macro avg       0.60      0.61      0.60       110
weighted avg       0.62      0.61      0.61       110

[[24  3  2  5]
 [ 4 18  8  4]
 [ 2  7 11  2]
 [ 1  3  2 14]]


In [28]:
#loading_data
print("Both gender 80/20")
x_train,x_test,y_train,y_test = load_data(test_size = 0.20)
svm_model_linear = SVC(kernel = 'linear', C = 1).fit(x_train, y_train)

# print some details
# number of samples in training data
print("[+] Number of training samples:", x_train.shape[0])
# number of samples in testing data
print("[+] Number of testing samples:", x_test.shape[0])
# number of features used
# this is a vector of features extracted 
# using utils.extract_features() method
print("[+] Number of features:", x_train.shape[1])
#________________________________________________________________________
#linear kernel (best result is linear kernel)
svm_model_linear = SVC(kernel = 'linear', C = 1).fit(x_train, y_train) 
svm_predictions = svm_model_linear.predict(x_test) 

# calculate the accuracy
accuracy = accuracy_score(y_true=y_test,y_pred=svm_predictions)
print("Accuracy linear kernel: {:.2f}%".format(accuracy*100))
print(classification_report(y_test,svm_predictions))
print(confusion_matrix(y_test, svm_predictions))

Both gender 80/20
[+] Number of training samples: 460
[+] Number of testing samples: 116
[+] Number of features: 170
Accuracy linear kernel: 56.03%
              precision    recall  f1-score   support

       angry       0.73      0.69      0.71        35
       happy       0.50      0.43      0.46        35
     neutral       0.39      0.46      0.42        24
         sad       0.60      0.68      0.64        22

    accuracy                           0.56       116
   macro avg       0.56      0.56      0.56       116
weighted avg       0.57      0.56      0.56       116

[[24  4  3  4]
 [ 4 15 12  4]
 [ 3  8 11  2]
 [ 2  3  2 15]]
