Project based on : Speech Emotion Recognition Based on Linear Discriminant Analysis and Support Vector Machine Decision Tree
J. Mao, Y. He and Z. Liu, "Speech Emotion Recognition Based on Linear Discriminant Analysis and Support Vector Machine Decision Tree," 2018 37th Chinese Control Conference (CCC), 2018, pp. 5529-5533, doi: 10.23919/ChiCC.2018.8482931.

In [1]:
import librosa
import soundfile
import numpy as np
import glob
import os
from scipy.fftpack import dct
import neurokit
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
import pitch
import math
import warnings
warnings.filterwarnings("ignore")
import pickle
from sklearn.model_selection import GridSearchCV

In [2]:
emotions = {
    "01": "neutral", "02": "calm", "03": "sad", "04": "sad", "05": "angry", "06": "fearful",
    "07": "disgust", "08": "surprised"
}
"More emotions can be added to the list as per requirement and data available to train"
# needed_emotions = ["happy", "neutral", "sad", "angry"]
needed_emotions = ["happy","sad"]

X = []
y = []


In [3]:
def feature_extraction(file): 
    signal,sample_rate = librosa.load(file)
    signal = signal.tolist()
#     Padding data to max value of all the data files(keeping all file data of same length)
    while len(signal) != 250000:    
        signal.append(0)
    signal = np.array(signal)
    "PART 3.1"
       #Pre-emphasis (y(t)=x(t)−αx(t−1))
    emphasized_signal = np.append(signal[0], signal[1:] - 0.97 * signal[:-1])
        #HAMMIN WINDOW FOR EACH FRAME (25 ms for the frame size,a 10 ms stride (15 ms overlap))
    frame_size,frame_stride = 0.025,0.01
#     Frame length = 1200, Frame step = 480
    frame_length, frame_step = frame_size * sample_rate, frame_stride * sample_rate  # Convert from seconds to samples
    signal_length = len(emphasized_signal)
    frame_length = int(round(frame_length))
    frame_step = int(round(frame_step))
    num_frames = int(np.ceil(float(np.abs(signal_length - frame_length)) / frame_step))  # Make sure that we have at least 1 frame

    pad_signal_length = num_frames * frame_step + frame_length
    z = np.zeros((pad_signal_length - signal_length))
    pad_signal = np.append(emphasized_signal, z) # Pad Signal to make sure that all frames have equal number of samples without truncating any samples from the original signal

    indices = np.tile(np.arange(0, frame_length), (num_frames, 1)) + np.tile(np.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T
    frames = pad_signal[indices.astype(np.int32, copy=False)]

    frames *= np.hamming(frame_length)
    
    "PART 3.2 ,PART 3.3, PART 3.4 & PART 3.5"
#     FFT on each frame
    mag_frames = np.absolute(np.fft.rfft(frames, 256))
#     mag_frames_padded = np.append(mag_frames, z)
    features = []
    zero_crossings = []
    energy = []
    energy_times_zerocrossing = []
    chroma = []  # maybe pitch
    spectral_centroids = []
    spectral_cutoff = []
    mfccs = []
    formant = []
    fractal_dimension = []
    rms = []
#     for frame in frames:
    for frame in frames:
        energy_temp = np.sum(np.power(np.absolute(frame[:]),2))
        energy.append(energy_temp)
        zero_crossings_temp = librosa.feature.zero_crossing_rate(frame)
        zero_crossing_mean = (zero_crossings_temp.flatten())
        zero_crossings.extend(zero_crossing_mean)
        energy_times_zerocrossing_temp = np.dot(energy_temp,zero_crossing_mean)
        energy_times_zerocrossing.extend(energy_times_zerocrossing_temp)
        formant.append(max(frame))
        chroma_temp = librosa.feature.chroma_stft(frame, sr=sample_rate)
        chroma.extend(chroma_temp.flatten())
        mfccs_temp = librosa.feature.mfcc(frame, sr=sample_rate)
        mfccs.append(np.sum(np.square(mfccs_temp)))
        fractal_dimension_temp = neurokit.complexity(frame,sampling_rate=sample_rate, shannon=False, sampen=False, multiscale=False, spectral=False, svd=False, correlation=True, higushi=False, petrosian=False, fisher=False, hurst=False, dfa=False, lyap_r=False, lyap_e=False, emb_dim=2, tolerance="default", k_max=8, bands=None, tau=1)
        if not math.isnan(fractal_dimension_temp["Fractal_Dimension_Correlation"]):
            fractal_dimension.append(fractal_dimension_temp["Fractal_Dimension_Correlation"])
        else:
            fractal_dimension.append(0)
        spectral_centroids_temp = librosa.feature.spectral_centroid(frame, sr=sample_rate)[0]
        spectral_centroids.extend(spectral_centroids_temp)
        spectral_cutoff_temp = librosa.feature.spectral_rolloff(frame, sr=sample_rate)[0]
        spectral_cutoff.extend(spectral_cutoff_temp)   #Including rms
        rms_temp = librosa.feature.rms(y=frame)
        rms.extend(rms_temp.flatten())

    features.extend(zero_crossings)
    features.extend(energy)
    features.extend(energy_times_zerocrossing)
    features.extend(formant)
    features.extend(chroma)
    features.extend(spectral_centroids)
    features.extend(spectral_cutoff)
    features.extend(mfccs)
    features.extend(fractal_dimension)
    features.extend(rms)
    print(file," : ",np.array(zero_crossings).shape,np.array(energy).shape,np.array(energy_times_zerocrossing).shape,np.array(formant).shape,
         np.array(chroma).shape,np.array(spectral_centroids).shape,np.array(spectral_cutoff).shape,np.array(mfccs).shape,np.array(fractal_dimension).shape,np.array(rms).shape)
    return features

In [None]:
dir = "/Users/ak1050588/Downloads/301 - Crying baby/"
for filename in os.listdir(dir):
    if ".wav" in filename or ".ogg" in filename:
        file = os.path.join(dir, filename)
    #     emotion = emotions[filename.split("-")[2]]
    #     if emotion not in needed_emotions:
    #         continue
        features = feature_extraction(file)
        X.append(features)
        y.append(1)       #sad = 1

In [None]:
dir = "/Users/ak1050588/Downloads/903 - Baby laugh/"
for filename in os.listdir(dir):
    if ".wav" in filename or ".ogg" in filename:
        file = os.path.join(dir, filename)
    #     emotion = emotions[filename.split("-")[2]]
    #     if emotion not in needed_emotions:
    #         continue
        features = feature_extraction(file)
        X.append(features)
        y.append(2)     #happy=2

In [8]:
"PART 4.1"
lda = LinearDiscriminantAnalysis()
lda_temp = lda.fit_transform(X,y)
X_train,X_test,y_train,y_test = train_test_split(lda_temp,y,test_size = 0.2,random_state = 42)


In [13]:
"Trying GridSearchCV for hyperparamter tuning"
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']} 
  
grid = GridSearchCV(svm.SVC(), param_grid, refit = True, verbose = 3)
  
# fitting the model for grid search
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)*100,"%")
# Model Precision
print("Precision Score : ",metrics.precision_score(y_test, y_pred, 
                                           
                                           average='micro'))
print("Recall Score : ",metrics.recall_score(y_test, y_pred, 
                                           
                                           average='micro'))

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV 2/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV 3/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV 4/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV 5/5] END .....................C=0.1, gamma=1, kernel=rbf; total time=   0.0s
[CV 1/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 2/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 3/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 4/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 5/5] END ...................C=0.1, gamma=0.1, kernel=rbf; total time=   0.0s
[CV 1/5] END ..................C=0.1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 2/5] END ..................C=0.1, gamma=0.0

[CV 2/5] END .................C=1000, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 3/5] END .................C=1000, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 4/5] END .................C=1000, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 5/5] END .................C=1000, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 1/5] END ................C=1000, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 2/5] END ................C=1000, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 3/5] END ................C=1000, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 4/5] END ................C=1000, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 5/5] END ................C=1000, gamma=0.001, kernel=rbf; total time=   0.0s
[CV 1/5] END ...............C=1000, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 2/5] END ...............C=1000, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 3/5] END ...............C=1000, gamma=0.0001, kernel=rbf; total time=   0.0s
[CV 4/5] END ...............

In [54]:
"PART 6.1 (SVM) = kernel(Gaussian Radial Basis Function (RBF) )" 
clf = svm.SVC(kernel="rbf")
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred)*100,"%")
# Model Precision
print("Precision Score : ",metrics.precision_score(y_test, y_pred, 
                                           
                                           average='micro'))
print("Recall Score : ",metrics.recall_score(y_test, y_pred, 
                                           
                                           average='micro'))

Accuracy: 100.0 %
Precision Score :  1.0
Recall Score :  1.0


In [None]:
X_new = []
y_new = []
dir = "/Users/ak1050588/Downloads/donateacry-corpus-master/donateacry_corpus_cleaned_and_updated_data/"
for filename in os.listdir(dir):
    if ".wav" in filename:
        file = os.path.join(dir, filename)
        features = feature_extraction(file)
        X_new.append(features)
        y_new.append(1)
dir = "/Users/ak1050588/Downloads/donateacry-corpus-master/donateacry_corpus_cleaned_and_updated_data/Laugh/"
for filename in os.listdir(dir):
    if ".wav" in filename:
        file = os.path.join(dir, filename)
        features = feature_extraction(file)
        X_new.append(features)
        y_new.append(2)

In [15]:
lda_temp_2 = lda.transform(X_new)

In [16]:
y_pred_2 = grid.predict(lda_temp_2)
print("Accuracy:",metrics.accuracy_score(y_new, y_pred_2)*100,"%")
# Model Precision
print("Precision Score : ",metrics.precision_score(y_new, y_pred_2, 
                                        
                                           average='micro'))
print("Recall Score : ",metrics.recall_score(y_new, y_pred_2, 
                                           
                                           average='micro'))

Accuracy: 89.13043478260869 %
Precision Score :  0.8913043478260869
Recall Score :  0.8913043478260869


In [18]:
pickle.dump( grid , open( 'weights.pkl' , 'wb' ) )