In [311]:
# pip install librosa

In [312]:
'''
Please install this specific version of resampy for librosa to work without errors.
'''

'\nPlease install this specific version of resampy for librosa to work without errors.\n'

In [313]:
# pip install resampy==0.3.1

In [314]:
import soundfile
import os
import glob
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import librosa
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report
import warnings; warnings.filterwarnings('ignore')

In [315]:
emotions ={
  '01':'neutral',
  '02':'calm',
  '03':'happy',
  '04':'sad',
  '05':'angry',
  '06':'fearful',
  '07':'disgust',
  '08':'surprised'
}

### Data for binary classification


In [316]:
def load_extract_features(data_path):

    '''
    load_extract_features() is a function that is used to load all the audio files one at a time, compute their features and return the features as well as the target values.

    There are around 8-10 audio files which are corrupted. We hardcode zero values for such files in order to maintain consistency.

    ['calm', 'happy'] emotion data is categorized into 'positive' and  ['angry', 'fearful'] into 'negative'

    Returns:
    1. Features
    2. Binary Target Values
    '''
    final_features,target_emotions, binary_label = [],[], []
    count = 0
    
    for i in glob.glob(data_path + "/Actor_*/*.wav"): #Loop to read every file.
        
        name = os.path.basename(i)
        #We split the name of the file to understand the emotion associated with the file.
        split = name.split("-")
        #We know that the third identifier is associated with the emotion of the audio file. Hence, we use [2] as it represents the third identifier.
        emotion = emotions[split[2]]

        #Below is the code to categorize the emotions into two classes to make this a binary problem.
        if emotion in ['calm', 'happy']:
            binary_label.append(0)
        elif emotion in ['angry', 'fearful']:
            binary_label.append(1)
        else:
            continue
        
        with soundfile.SoundFile(i) as audio:
            waveform = audio.read(dtype="float32")
            sr = audio.samplerate
            
            #Below is the code to extract the Mel spectrogram features
            #128 is the standard for machine learning applications using Mel spectrograms
            m_feature = librosa.feature.melspectrogram(y=waveform, sr=sr, n_mels=128, fmax=sr / 2.0).T
            melspectrogram = np.mean(m_feature,axis=0)
            if melspectrogram.shape != (128,):
                melspectrogram = np.zeros(128)
            
            #Below is the code to extract the chromagram features
            stft_wave = librosa.stft(waveform)
            stft = np.abs(stft_wave)
            c_feature = librosa.feature.chroma_stft(S=stft, sr=sr).T
            chromagram = np.mean(c_feature,axis=0)
            
            #12 is the number of pitch classes
            if chromagram.shape != (12,):
                chromagram = np.zeros(12)
                
            features=np.array([])
            features=np.hstack((chromagram, melspectrogram))
        
            final_features.append(features)
            target_emotions.append(emotion)
            
            count += 1
            if count % 100 == 0:
                print("Processed Audio File Number: ", count)
    
    #We return the features and the binary target values.
    return np.array(final_features), np.array(binary_label)

In [317]:
#Please change the path below to the path of the folder saved in your computer.
# data_path = './Audio_Speech_Actors_01-24'
# X, binary_label = load_extract_features(data_path)
# print(X.shape)
# print(binary_label.shape)


In [318]:
# np.savetxt('matrix.txt', X, delimiter = ',')  
# np.savetxt('binary_label.txt',binary_label,delimiter = ',')

In [319]:
X = np.loadtxt(open("matrix.txt","rb"), delimiter=",", skiprows=0)
binary_label = np.loadtxt(open("binary_label.txt","rb"), delimiter=",", skiprows=0)

print(X.shape) #should be (768,140)
print(binary_label.shape) # should be (768,)

(768, 140)
(768,)


In [320]:
X_train, X_test, Y_train, Y_test = train_test_split(X, binary_label, test_size=0.3, random_state=0)

In [321]:
def compute_cost(W, X, Y, l):
    # calculate hinge loss
    n_s = X.shape[0]
    distances = 1 - Y * (np.dot(X, W))
    distances[distances < 0] = 0  # equivalent to max(0, distance)
    hinge_loss =  (np.sum(distances) / n_s)
    # calculate cost
    cost = l / 2 * np.dot(W, W) + hinge_loss 
    
    return cost

In [322]:
def calculate_gradient(W, X, Y,l):

    distance = 1 - (Y * np.dot(X, W))
    dw = np.zeros(len(W))

    for ind, d in enumerate(distance):
        if (ind == W.shape[0]):
            if max(0, d) != 0:
                di =  - ( Y[ind] * X[ind])
        else:
            if max(0, d) == 0:
                di = l*W
            else:
                di = l*W - ( Y[ind] * X[ind])
        dw += di

    dw = dw/len(Y)  # average

    return dw

In [323]:
def svm(X, Y, step, l,max_iter):
    X_bia = np.c_[X, np.ones(X.shape[0])]   # Pad 1's for the bias term
    new_Y = np.ones((Y.shape[0]))

    for i in range (Y.shape[0]):
        if Y[i] == 0:
            new_Y[i] = -1

    w = np.zeros(X_bia.shape[1])
    count = 0
    cost = compute_cost(w, X_bia, new_Y, l)
    for i in range(max_iter):
        grad = calculate_gradient(w, X_bia, new_Y, l)
        new_w = w - step * grad
        new_cost = compute_cost(new_w, X_bia, new_Y, l)
        print(cost)
        if (new_cost<cost):
            w = new_w
            cost = new_cost
            count= count +1
        else:
            break
    print(count)
    return w

In [324]:
def svm_accuracy(W,X,Y):

    X_bia = np.c_[X, np.ones(X.shape[0])]   # Pad 1's for the bias term
    new_Y = np.ones((Y.shape[0]))

    for i in range (Y.shape[0]):
        if Y[i] == 0:
            new_Y[i] = -1

    count = 0

    for i in range(X.shape[0]):
        yp = np.sign(np.dot(W, X_bia[i])) #model
        if(yp == new_Y[i]):
            count = count +1

    return count / Y.shape[0]

In [325]:
l = 13  # lambda
w_result = svm(X_train, Y_train, 0.00001, l,10000)
print(svm_accuracy(w_result, X_train, Y_train))
print(svm_accuracy(w_result, X_test, Y_test))

1.0
0.9998896765502112
0.9997793817559524
0.999669115609784
0.9995588781042685
0.99944866923197
0.9993384889854545
0.9992283373572912
0.9991182143400491
0.9990081199263009
0.9988980541086202
0.9987880168795829
0.9986780082317672
0.9985680281577523
0.9984580766501202
0.9983481537014547
0.9982382593043408
0.9981283934513664
0.9980185561351205
0.9979087473481946
0.9977989670831817
0.9976892153326773
0.9975794920892784
0.9974697973455832
0.997360131094194
0.9972504933277122
0.9971408840387436
0.9970313032198944
0.9969217508637732
0.9968122269629903
0.9967027315101588
0.9965932644978924
0.9964838259188072
0.9963744157655219
0.9962650340306564
0.9961556807068327
0.9960463557866748
0.9959370592628083
0.9958277911278609
0.9957185513744625
0.9956093399952446
0.9955001569828404
0.995391002329886
0.995281876029018
0.995172778072876
0.9950637084541009
0.9949546671653356
0.9948456541992258
0.9947366695484174
0.9946277132055596
0.9945187851633032
0.9944098854143008
0.9943010139512065
0.9941921707666

In [326]:
def pca(X, k):
    mean = np.mean(X, axis = 0)   # the mean value of X
    std = np.std(X, axis=0)       # the standard deviation along axis 0
    X_std = (X - mean) / std      # standardized data
    cov_mat = np.cov(X_std.T)     # calculate the covariance matrix

    values, vectors = np.linalg.eig(cov_mat)   # Eigendecomposition of covariance matrix

    # Make a list of (eigenvalue, eigenvector) tuples
    eig_pairs = [(np.abs(values[i]), vectors[i,:]) for i in range(len(values))]

    # Sort the tuples from the highest to the lowest based on eigenvalues magnitude
    eig_pairs.sort(key=lambda x: x[0], reverse=True)

    # Store the soted eigenvalues and their corresponding eigenvectors
    vecs_sorted = np.array([x[1] for x in eig_pairs])

    W = vecs_sorted[:k, :]   # Projection matrix with the top k eigenvectors selected

    return W

In [327]:
W = pca(X_train, 40)
W.shape

(40, 140)

In [328]:
X_proj = X_train.dot(W.T)             # transform X via W to obtain a k-dimensional feature subspace.
w = svm(X_proj, Y_train, 0.0001, l,2000)
print(svm_accuracy(w, X_proj, Y_train))

1.0
0.9996992390175755
0.9993992587787395
0.9991000572577092
0.998801632433957
0.9985039822921963
0.9982071048223687
0.9979109980196299
0.9976156598843364
0.9973210884220318
0.9970272816434335
0.9967342375644197
0.9964419542060147
0.9961504295943777
0.9958596617607869
0.9955696487416283
0.9952803885783815
0.9949918793176069
0.9947041190109314
0.9944171057150377
0.9941308374916483
0.9938453124075144
0.993560528534402
0.9932764839490794
0.9929931767333035
0.9927106049738076
0.9924287667622885
0.9921476601953925
0.9918672833747041
0.9915876344067325
0.9913087114028981
0.9910305124795211
0.990753035757808
0.9904762793638395
0.9902002414285563
0.9899249200877486
0.9896503134820422
0.9893764197568866
0.989103237062542
0.9888307635540668
0.988558997391306
0.9882879367388773
0.9880175797661609
0.9877479246472837
0.9874789695611116
0.9872107126912326
0.9869431522259482
0.986676286358259
0.9864101132858529
0.9861446312110941
0.985879838341009
0.985615732887276
0.985352313066212
0.985089577098761

In [329]:
X_proj = X_test.dot(W.T)       # transform X via W to obtain a k-dimensional feature subspace.
print(svm_accuracy(w, X_proj, Y_test))

0.6406926406926406
