In [52]:
# pip install librosa

In [53]:
'''
Please install this specific version of resampy for librosa to work without errors.
'''

'\nPlease install this specific version of resampy for librosa to work without errors.\n'

In [54]:
# pip install resampy==0.3.1

In [55]:
import soundfile
import os
import glob
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import librosa
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report
import warnings; warnings.filterwarnings('ignore')

In [56]:
emotions ={
  '01':'neutral',
  '02':'calm',
  '03':'happy',
  '04':'sad',
  '05':'angry',
  '06':'fearful',
  '07':'disgust',
  '08':'surprised'
}

### Data for binary classification


In [57]:
def load_extract_features(data_path):

    '''
    load_extract_features() is a function that is used to load all the audio files one at a time, compute their features and return the features as well as the target values.

    There are around 8-10 audio files which are corrupted. We hardcode zero values for such files in order to maintain consistency.

    ['calm', 'happy'] emotion data is categorized into 'positive' and  ['angry', 'fearful'] into 'negative'

    Returns:
    1. Features
    2. Binary Target Values
    '''
    final_features,target_emotions, binary_label = [],[], []
    count = 0
    
    for i in glob.glob(data_path + "/Actor_*/*.wav"): #Loop to read every file.
        
        name = os.path.basename(i)
        #We split the name of the file to understand the emotion associated with the file.
        split = name.split("-")
        #We know that the third identifier is associated with the emotion of the audio file. Hence, we use [2] as it represents the third identifier.
        emotion = emotions[split[2]]

        #Below is the code to categorize the emotions into two classes to make this a binary problem.
        if emotion in ['calm', 'happy']:
            binary_label.append(0)
        elif emotion in ['angry', 'fearful']:
            binary_label.append(1)
        else:
            continue
        
        with soundfile.SoundFile(i) as audio:
            waveform = audio.read(dtype="float32")
            sr = audio.samplerate
            
            #Below is the code to extract the Mel spectrogram features
            #128 is the standard for machine learning applications using Mel spectrograms
            m_feature = librosa.feature.melspectrogram(y=waveform, sr=sr, n_mels=128, fmax=sr / 2.0).T
            melspectrogram = np.mean(m_feature,axis=0)
            if melspectrogram.shape != (128,):
                melspectrogram = np.zeros(128)
            
            #Below is the code to extract the chromagram features
            stft_wave = librosa.stft(waveform)
            stft = np.abs(stft_wave)
            c_feature = librosa.feature.chroma_stft(S=stft, sr=sr).T
            chromagram = np.mean(c_feature,axis=0)
            
            #12 is the number of pitch classes
            if chromagram.shape != (12,):
                chromagram = np.zeros(12)
                
            features=np.array([])
            features=np.hstack((chromagram, melspectrogram))
        
            final_features.append(features)
            target_emotions.append(emotion)
            
            count += 1
            if count % 100 == 0:
                print("Processed Audio File Number: ", count)
    
    #We return the features and the binary target values.
    return np.array(final_features), np.array(binary_label)

In [58]:
#Please change the path below to the path of the folder saved in your computer.
# data_path = './Audio_Speech_Actors_01-24'
# X, binary_label = load_extract_features(data_path)
# print(X.shape)
# print(binary_label.shape)


In [59]:
# np.savetxt('matrix.txt', X, delimiter = ',')  
# np.savetxt('binary_label.txt',binary_label,delimiter = ',')

In [60]:
X = np.loadtxt(open("matrix.txt","rb"), delimiter=",", skiprows=0)
binary_label = np.loadtxt(open("binary_label.txt","rb"), delimiter=",", skiprows=0)

print(X.shape) #should be (768,140)
print(binary_label.shape) # should be (768,)

(768, 140)
(768,)


In [61]:
X_train, X_test, Y_train, Y_test = train_test_split(X, binary_label, test_size=0.3, random_state=0)

In [62]:
def compute_cost(W, X, Y, l):
    # calculate hinge loss
    n_s = X.shape[0]
    distances = 1 - Y * (np.dot(X, W))
    distances[distances < 0] = 0  # equivalent to max(0, distance)
    hinge_loss =  (np.sum(distances) / n_s)
    # calculate cost
    cost = l / 2 * np.dot(W, W) + hinge_loss 
    
    return cost

In [63]:
def calculate_gradient(W, X, Y,l):

    distance = 1 - (Y * np.dot(X, W))
    dw = np.zeros(len(W))

    for ind, d in enumerate(distance):
        if (ind == W.shape[0]):
            if max(0, d) != 0:
                di =  - ( Y[ind] * X[ind])
        else:
            if max(0, d) == 0:
                di = l*W
            else:
                di = l*W - ( Y[ind] * X[ind])
        dw += di

    dw = dw/len(Y)  # average

    return dw

In [64]:
def svm(X, Y, step, l,max_iter):
    X_bia = np.c_[X, np.ones(X.shape[0])]   # Pad 1's for the bias term
    new_Y = np.ones((Y.shape[0]))

    for i in range (Y.shape[0]):
        if Y[i] == 0:
            new_Y[i] = -1

    w = np.zeros(X_bia.shape[1])
    count = 0
    cost = compute_cost(w, X_bia, new_Y, l)
    for i in range(max_iter):
        grad = calculate_gradient(w, X_bia, new_Y, l)
        new_w = w - step * grad
        new_cost = compute_cost(new_w, X_bia, new_Y, l)

        if (new_cost<cost):
            w = new_w
            cost = new_cost
            count= count +1
        else:
            break

    return w

In [65]:
def svm_accuracy(W,X,Y):

    X_bia = np.c_[X, np.ones(X.shape[0])]   # Pad 1's for the bias term
    new_Y = np.ones((Y.shape[0]))

    for i in range (Y.shape[0]):
        if Y[i] == 0:
            new_Y[i] = -1

    count = 0

    for i in range(X.shape[0]):
        yp = np.sign(np.dot(W, X_bia[i])) #model
        if(yp == new_Y[i]):
            count = count +1

    return count / Y.shape[0]

In [66]:
l = 1  # lambda
w_result = svm(X_train, Y_train, 0.00001, l, 5000)
print("The accuracy when applied to the training subset: " + str(svm_accuracy(w_result, X_train, Y_train)))
print("The accuracy when applied to the testing subset: " + str(svm_accuracy(w_result, X_test, Y_test)))

The accuracy when applied to the training subset: 0.7188081936685289
The accuracy when applied to the testing subset: 0.7272727272727273


In [67]:
def pca(X, k):
    mean = np.mean(X, axis = 0)   # the mean value of X
    std = np.std(X, axis=0)       # the standard deviation along axis 0
    X_std = (X - mean) / std      # standardized data
    cov_mat = np.cov(X_std.T)     # calculate the covariance matrix

    values, vectors = np.linalg.eig(cov_mat)   # Eigendecomposition of covariance matrix

    # Make a list of (eigenvalue, eigenvector) tuples
    eig_pairs = [(np.abs(values[i]), vectors[i,:]) for i in range(len(values))]

    # Sort the tuples from the highest to the lowest based on eigenvalues magnitude
    eig_pairs.sort(key=lambda x: x[0], reverse=True)

    # Store the soted eigenvalues and their corresponding eigenvectors
    vecs_sorted = np.array([x[1] for x in eig_pairs])

    W = vecs_sorted[:k, :]   # Projection matrix with the top k eigenvectors selected

    return mean, W

In [68]:
mean, W = pca(X_train, 40)

In [69]:
X_proj = (X_train - mean).dot(W.T)           # transform X via W to obtain a k-dimensional feature subspace.
X_proj_test = (X_test - mean).dot(W.T)       # transform X via W to obtain a k-dimensional feature subspace.
w = svm(X_proj, Y_train, 0.0001, l, 5000)    # Train the resulted data with svm

print("The accuracy when applied to the training subset: " + str(svm_accuracy(w, X_proj, Y_train)))
print("The accuracy when applied to the testing subset: " + str(svm_accuracy(w, X_proj_test, Y_test)))

The accuracy when applied to the training subset: 0.6350093109869647
The accuracy when applied to the testing subset: 0.6926406926406926
