In [5]:
import soundfile, librosa, pickle, glob, os, parselmouth, pywt, statistics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from parselmouth.praat import call
from scipy.io import wavfile
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import SVC 

# all emotions on RAVDESS dataset
allemotion = {
    "01": "angry",
    "02": "happy",
    "03": "neutral",
    "04": "sad"
}
# Emotions to observe
observed_emotions={
    "angry",
    "sad",
    "neutral",
    "happy"
}

In [9]:
#Extract features (mfcc, chroma, mel, contrast, tonnetz) from a sound file
def extract_features(file_name, **kwargs):
    """
    Extract feature from audio file `file_name`
        Features supported:
            - MFCC (mfcc)
            - Chroma (chroma)
            - MEL Spectrogram Frequency (mel)
            - Contrast (contrast)
            - Tonnetz (tonnetz)
        e.g:
        `features = extract_feature(path, mel=True, mfcc=True)`
    """
    mfcc = kwargs.get("mfcc")
    chroma = kwargs.get("chroma")
    mel = kwargs.get("mel")
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate
        # read the sound
        sound = parselmouth.Sound(file_name)
        #duration
        duration = sound.get_end_time() # duration
        
        # extract features from librosa (MFCC, Mel)
        
        #if chroma:
        stft = np.abs(librosa.stft(X))
        result = np.array([])
        #if mfcc:
        mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
        result = np.hstack((result, mfccs))
        #if mel:
        mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
        #result = np.hstack((result, mel))
        
        
        # extract features from Praat (pitch, intensity, formant, HNR, LTAS, Jitter, Shimmer)
        
        #pitch
        pitch = sound.to_pitch()
        #pitch = call(sound, "To Pitch", 0.0, 75.0, 600.0) #create a praat pitch object
        mean_F0 = call(pitch, "Get mean", 0, duration, "Hertz") # get mean pitch
        stdev_F0 = call(pitch, "Get standard deviation", 0 ,duration, "Hertz") # get standard deviation
        result = np.hstack((result, mean_F0))
        result = np.hstack((result, stdev_F0))

        #intensity
        intensity = sound.to_intensity()
        mean_intensity = call(intensity, "Get mean", 0, duration) # get mean intensity
        stdev_intensity = call(intensity, "Get standard deviation", 0 ,duration) # get standard deviation intensity
        result = np.hstack((result, mean_intensity))
        result = np.hstack((result, stdev_intensity))
        
        #Formant
        formant= sound.to_formant_burg()
        f1_mean = call(formant, "Get mean", 1, 0, duration, "Hertz") # get mean 1st Formant
        f2_mean = call(formant, "Get mean", 2, 0, duration, "Hertz") # get mean 2st Formant
        f3_mean = call(formant, "Get mean", 3, 0, duration, "Hertz") # get mean 3st Formant
        f1_stdev = call(formant, "Get standard deviation", 1, 0, duration, "Hertz") # get standard deviation 1st Formant
        f2_stdev = call(formant, "Get standard deviation", 2, 0, duration, "Hertz") # get standard deviation 2st Formant
        f3_stdev = call(formant, "Get standard deviation", 3, 0, duration, "Hertz") # get standard deviation 3st Formant
        result = np.hstack((result, f1_mean))
        result = np.hstack((result, f2_mean))
        result = np.hstack((result, f3_mean))
        #result = np.hstack((result, f1_stdev))
        #result = np.hstack((result, f2_stdev))
        #result = np.hstack((result, f3_stdev))
        
        """
        #LTAS
        spectrum = sound.to_spectrum()
        ltas = call(spectrum, "To Ltas (1-to-1)")
        ltas_mean = call(ltas, "Get mean", 0, 0, "dB") # get mean intensity
        result = np.hstack((result, ltas_mean))"""
        
        #HNR
        harmonicity = sound.to_harmonicity() 
        hnr = call(harmonicity, "Get mean", 0, duration)
        result = np.hstack((result, hnr))
        
        
        #Jitter
        pointProcess = call(sound, "To PointProcess (periodic, cc)", 75.0, 600.0)
        
        #local 1
        localJitter = call(pointProcess, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
        result = np.hstack((result, localJitter))
        #local-absolute 3
        #localabsoluteJitter = call(pointProcess, "Get jitter (local, absolute)", 0, 0, 0.0001, 0.02, 1.3)
        #result = np.hstack((result, localabsoluteJitter))
        #rap 2
        rapJitter = call(pointProcess, "Get jitter (rap)", 0, 0, 0.0001, 0.02, 1.3)
        result = np.hstack((result, rapJitter))
        
        
        #Shimmer
        # localShimmer
        localShimmer =  call([sound, pointProcess], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
        result = np.hstack((result, localShimmer))
        """#localdbShimmer
        localdbShimmer = call([sound, pointProcess], "Get shimmer (local_dB)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
        result = np.hstack((result, localdbShimmer))"""
        
        """
        
        # extract features from pywt (wavelet features)
        # wavelet features
        sample_rate, data = wavfile.read(file_name)# Reading the audio file
        t = np.arange(len(data)) / float(sample_rate);  # Retrieving Time

        data = data/max(data);  # Normalize Audio Data    

        #cA, cD = pywt.dwt(data, "db4", "per") #DWT

        coeffs = pywt.wavedec(data, 'db4', mode='sym', level=4);  # DWT
        
        cA4, cD4, cD3, cD2, cD1 = coeffs
        result = np.hstack((result, statistics.mean(cD1)))
        result = np.hstack((result, statistics.mean(cD2)))
        result = np.hstack((result, statistics.mean(cD3)))
        result = np.hstack((result, statistics.mean(cD4)))
        result = np.hstack((result, statistics.mean(cA4)))
        
        result = np.hstack((result, statistics.mean(cD)))
        result = np.hstack((result, statistics.mean(cA)))"""
        
    return result

In [10]:
# Load the data
def load_data(test_size):
    x, y = [], []
    try :
        for file in glob.glob("Data/NActor_*/*.wav"):
            # get the base name of the audio file
            basename = os.path.basename(file)
            # get the emotion label
            emotion = allemotion[basename.split(" ")[0].split("-")[3]] 
            # we allow only observed_emotions we set for both gender
            if emotion not in observed_emotions:
                continue
            # extract speech features
            features = extract_features(file, mfcc=True, chroma=True, mel=True)
            # add to data
            x.append(features)
            y.append(emotion)
    except :
         pass
    # split the data to training and testing and return it
    return train_test_split(np.array(x), np.array(y), test_size=test_size, random_state=9)

In [11]:
#loading_data
print("Both gender 80/20")
x_train,x_test,y_train,y_test = load_data(test_size = 0.20)
svm_model_linear = SVC(kernel = 'linear', C = 1).fit(x_train, y_train)

# print some details
# number of samples in training data
print("[+] Number of training samples:", x_train.shape[0])
# number of samples in testing data
print("[+] Number of testing samples:", x_test.shape[0])
# number of features used
# this is a vector of features extracted 
# using utils.extract_features() method
print("[+] Number of features:", x_train.shape[1])
#________________________________________________________________________
#linear kernel (best result is linear kernel)
svm_model_linear = SVC(kernel = 'linear', C = 1).fit(x_train, y_train) 
svm_predictions = svm_model_linear.predict(x_test) 

# calculate the accuracy
accuracy = accuracy_score(y_true=y_test,y_pred=svm_predictions)
print("Accuracy linear kernel: {:.2f}%".format(accuracy*100))
print(classification_report(y_test,svm_predictions))
print(confusion_matrix(y_test, svm_predictions))

Both gender 80/20
[+] Number of training samples: 515
[+] Number of testing samples: 129
[+] Number of features: 51
Accuracy linear kernel: 57.36%
              precision    recall  f1-score   support

       angry       0.57      0.79      0.67        34
       happy       0.53      0.51      0.52        37
     neutral       0.60      0.54      0.57        39
         sad       0.64      0.37      0.47        19

    accuracy                           0.57       129
   macro avg       0.58      0.55      0.56       129
weighted avg       0.58      0.57      0.57       129

[[27  3  2  2]
 [11 19  7  0]
 [ 6 10 21  2]
 [ 3  4  5  7]]
