In [134]:
import tensorflow
import pandas as pd
import time
import numpy as np
import pylab as plt
import multiprocessing as mp
import statistics

import librosa
from librosa import feature
from glob import glob
import os

from sklearn.preprocessing import StandardScaler, OneHotEncoder

# tensorflow libraries
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization
from tensorflow.keras.callbacks import Callback
from keras.utils import np_utils

# sklearn libraries are useful for preprocessing, performance measures, etc.
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
min_max_scaler = preprocessing.MinMaxScaler()

In [14]:
# rootdir = '/Users/abhishekvaidyanathan/Desktop/NNDL-project/audio-files'
rootdir = '/Users/abhishekvaidyanathan/Desktop/NNDL-project/audio-files'
audio_files = []
for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        audio_files.append(os.path.join(subdir, file))
audio_files = audio_files[1:]

In [15]:
audio_files[1][-33:-25]

'Actor_16'

In [16]:
dict_actors = {}
for audio_file in audio_files:
    try :
        if (len(dict_actors[audio_file[-33:-25]])>0):
            dict_actors[audio_file[-33:-25]].append(audio_file)
    except:
        dict_actors[audio_file[-33:-25]] = []
        dict_actors[audio_file[-33:-25]].append(audio_file)

In [17]:
data = pd.DataFrame(columns = ['Actor','Modality','Vocal_channel','Emotion','Emotional_intensity','Statement','Repetion','Gender','Audio_file'])

In [18]:
def get_gender(value):
    if(int(value)%2==0):
        return 1
    else :
        return 0

for keys in dict_actors:
    for files in dict_actors[keys]:
        data.loc[len(data)] = [keys,int(files[-24:-22]),int(files[-21:-19]),int(files[-18:-16]),int(files[-15:-13]),int(files[-12:-10]),int(files[-9:-7]),get_gender(files[-6:-4]),files]

In [19]:
data.head()

Unnamed: 0,Actor,Modality,Vocal_channel,Emotion,Emotional_intensity,Statement,Repetion,Gender,Audio_file
0,Actor_16,3,1,5,1,2,1,1,/Users/abhishekvaidyanathan/Desktop/NNDL-proje...
1,Actor_16,3,1,6,1,2,2,1,/Users/abhishekvaidyanathan/Desktop/NNDL-proje...
2,Actor_16,3,1,6,2,1,2,1,/Users/abhishekvaidyanathan/Desktop/NNDL-proje...
3,Actor_16,3,1,5,2,1,1,1,/Users/abhishekvaidyanathan/Desktop/NNDL-proje...
4,Actor_16,3,1,7,1,1,1,1,/Users/abhishekvaidyanathan/Desktop/NNDL-proje...


## can change the below code cell to include different set of features. 

#### right now uses mean values, can change to using raw values for each of the features. 

In [38]:
# y, sr = librosa.load(librosa.util.example_audio_file())
# temp_func = feature.spectral_bandwidth
# temp_res = temp_func(y=y, sr=sr)
# print(temp_res)
# print(temp_res.shape)
# print(np.mean(temp_res))

[[2803.66001659 1379.6672431  1562.99924373 ... 2549.85781492
  2456.23949636 2515.19654634]]
(1, 2647)
1364.8838771312614


In [79]:
# to add:
# mfcc - use n_mfcc=13 and take mean along axis 1 [13 features]
# chroma_stft - take mean along axis 1 [12 features]
# librosa.onset.onset_strength(y=y, sr=sr) - take direct mean
# zero_crossing_rate - take direct mean
# spectral_rolloff - direct mean
# librosa.piptrack - returns pitch and magnitude, take direct means of both
# melspectrogram - take direct mean
# spectral_contrast - use axis=1 [7 features]
# tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0) [6 features]
# rms - take direct mean
# spectral_centroid - take direct mean
# spectral_bandwidth - take direct mean


def get_feature_vector(y, sr):
    feature_vector = []
    
    # multi-dim features
    feature_vector.extend(np.mean(feature.mfcc(y=y, sr=sr, n_mfcc=13), axis=1))
    feature_vector.extend(np.mean(feature.chroma_stft(y=y, sr=sr), axis=1))
    feature_vector.extend(np.mean(feature.spectral_contrast(y=y, sr=sr), axis=1))
    feature_vector.extend(np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr),axis=1))
    
    # single-dim features with special requirements
    feature_vector.append(np.mean(feature.rms(y=y)))
    feature_vector.append(np.mean(feature.zero_crossing_rate(y=y)))
    feature_vector.extend([np.mean(x) for x in librosa.piptrack(y=y, sr=sr)])
    
    # single-dim features
    feat_list = [
        librosa.onset.onset_strength,
        feature.spectral_rolloff,
        feature.melspectrogram,
        feature.spectral_centroid,
        feature.spectral_bandwidth
    ]
    
    for temp_func in feat_list:
        feature_vector.append(np.mean(temp_func(y=y, sr=sr)))
    
    return feature_vector

In [None]:
audio_features = []
for i in range(data.shape[0]):
   y , sr = librosa.load(data.iloc[i]['Audio_file'],sr=None)
   feature_vector = get_feature_vector(y, sr)
   audio_features.append(feature_vector) 

In [43]:
data_features = data.copy()

In [None]:
data_features = data_features.drop("Audio_file",axis=1)
data_features.head()

In [45]:
data_features['librosa'] = audio_features

In [46]:
features = pd.DataFrame(data_features['librosa'].values.tolist())

In [47]:
statement1 = 'Kids are talking by the door'
statement2 = 'Dogs are sitting by the door'

In [48]:
features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,37,38,39,40,41,42,43,44,45,46
0,-719.127808,70.20224,1.169071,13.123216,7.837617,14.41195,-4.110705,4.469619,-3.53873,-3.657982,...,0.004175,0.002258,0.052904,40.032833,0.002662,0.992758,13191.718251,0.003416,7135.753114,5653.712371
1,-714.994934,69.690376,3.925557,11.925324,6.423343,11.014113,-2.874456,4.514386,-4.470305,-2.665093,...,0.016821,0.002707,0.046627,37.690022,0.003359,0.860653,13280.28298,0.004721,7240.619346,5640.892215
2,-710.959839,67.579193,5.783356,13.227695,6.194669,12.640195,-1.662046,5.663977,-4.953693,-3.484669,...,0.0111,0.002521,0.053835,39.524185,0.003164,0.918893,13273.018037,0.004341,7009.490125,5802.602446
3,-759.917847,75.788948,6.028997,14.562723,6.459432,14.636641,-2.999552,4.625813,-5.19535,-0.702961,...,0.002851,0.001579,0.045929,38.651924,0.001842,0.988106,12649.614081,0.001624,6997.114097,5518.781643
4,-735.006592,79.093056,8.141059,11.41356,5.174132,15.393293,-2.752063,2.964593,-5.388961,-1.691822,...,0.032999,0.001676,0.033038,43.06493,0.002004,0.899759,12202.824519,0.001781,6874.562103,5416.338418


In [50]:
features.shape

(1439, 47)

In [49]:
labels = data['Emotion']

In [71]:
actor_labels = pd.Series([int(x[-2:]) for x in data['Actor']], name='Actor')

In [52]:
features_values = features.values

In [53]:
features_scaled_values = min_max_scaler.fit_transform(features_values)
features_normalised = pd.DataFrame(features_scaled_values)

In [54]:
features_normalised.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,37,38,39,40,41,42,43,44,45,46
0,0.285469,0.532704,0.711228,0.566807,0.698543,0.659506,0.64472,0.642219,0.649567,0.615905,...,0.490164,0.021892,0.178884,0.686954,0.020258,0.232647,0.850756,0.000818,0.897211,0.802389
1,0.293124,0.527383,0.74811,0.544651,0.666486,0.574399,0.675547,0.643559,0.615851,0.650318,...,0.621171,0.026986,0.1343,0.64359,0.026162,0.091577,0.859952,0.001138,0.917975,0.798843
2,0.300599,0.505434,0.772967,0.568739,0.661303,0.615128,0.70578,0.67799,0.598355,0.621912,...,0.561902,0.024872,0.185497,0.677539,0.024511,0.153769,0.859198,0.001045,0.872211,0.843582
3,0.209913,0.590785,0.776254,0.593432,0.667304,0.665134,0.672428,0.646897,0.589609,0.718323,...,0.476445,0.014187,0.129345,0.661394,0.013311,0.227678,0.794469,0.000379,0.86976,0.765059
4,0.256056,0.625135,0.804513,0.535186,0.638171,0.684086,0.678599,0.597143,0.582602,0.68405,...,0.788763,0.015293,0.037792,0.743076,0.014689,0.133337,0.748078,0.000417,0.845494,0.736717


In [60]:
X_train, X_test, y_train, y_test = train_test_split(features_normalised, labels, test_size=0.30, random_state=42)

In [73]:
X_train.to_csv('Speaker_Classification_data/X_train.csv', index=False)
X_test.to_csv('Speaker_Classification_data/X_test.csv', index=False)
y_train.to_csv('Speaker_Classification_data/y_train.csv', index=False)
y_test.to_csv('Speaker_Classification_data/y_test.csv', index=False)

In [61]:
X_train.to_csv('SER_data/X_train.csv', index=False)
X_test.to_csv('SER_data/X_test.csv', index=False)
y_train.to_csv('SER_data/y_train.csv', index=False)
y_test.to_csv('SER_data/y_test.csv', index=False)

In [72]:
X_train, X_test, y_train, y_test = train_test_split(features_normalised, actor_labels, test_size=0.30, random_state=42)

In [108]:
X_train = pd.read_csv("./SER_data/X_train.csv")
X_test = pd.read_csv("./SER_data/X_test.csv")
y_train = pd.read_csv("./SER_data/y_train.csv")
y_test = pd.read_csv("./SER_data/y_test.csv")

In [109]:
X_train_array = X_train.to_numpy()
X_test_array = X_test.to_numpy()
y_train_array = y_train.to_numpy()
y_test_array = y_test.to_numpy()
lb = LabelEncoder()
y_train_cnn = np_utils.to_categorical(lb.fit_transform(y_train_array))
y_test_cnn = np_utils.to_categorical(lb.fit_transform(y_test_array))

  return f(*args, **kwargs)


In [137]:
encoder = OneHotEncoder()
Y_train_cnn = encoder.fit_transform(np.array(y_train).reshape(-1,1)).toarray()
Y_test_cnn = encoder.fit_transform(np.array(y_test).reshape(-1,1)).toarray()

In [50]:

class time_for_batch(Callback):
    def on_train_begin(self, logs={}):
        self.times=[]
    def on_train_batch_begin(self, batch, logs={}):
        self.starttime = time.time()
    def on_train_batch_end(self, batch, logs={}):
        self.times.append(time.time()-self.starttime)
        
class time_for_epoch(Callback):
    def on_train_begin(self, logs={}):
        self.times = []
    def on_epoch_begin(self, epoch, logs={}):
        self.epoch_time_start = time.time()
    def on_epoch_end(self, epoch, logs={}):
        self.times.append(time.time() - self.epoch_time_start)

In [98]:
def K_fold_cross_validation(no_folds,no_epochs,batch_size,X_train,y_train,X_test,y_test,epoch_times_dict):
  hidden_neurons = 16
  
  print("Model training for:")
  print("Number of neurons",hidden_neurons)
  print("Number of epcohs:" ,no_epochs)
  print("No folds used for k-fold cross validation:", no_folds)
  print("")

  kf = KFold(n_splits=no_folds,random_state=None, shuffle=False)
  KFold(n_splits=no_folds, random_state=None, shuffle=False)
  history_results = []
  # for train_index, test_index in kf.split(X_train):
  # X_train_K, X_test_K = X[train_index], X[test_index]
  # y_train_K, y_test_K = Y[train_index], Y[test_index]

  model = Sequential()
  model.add(Dense(32, activation='relu'))
  # model.add(Dropout(0.3))
  model.add(Dense(24, activation='relu'))
  # model.add(Dropout(0.3))
  model.add(Dense(16, activation='relu'))
  # model.add(Dropout(0.3))
  model.add(Dense(16, activation='relu'))
  model.add(Dense(8, activation='softmax'))

  model.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

  tb = time_for_batch()
  te = time_for_epoch()

  history = model.fit(X_train, y_train,
                      batch_size=batch_size,
                      epochs=no_epochs,
                      verbose=2,
                      use_multiprocessing=False,
                      callbacks = [tb, te],
                      validation_data=(X_test, y_test))
  epoch_times_dict[hidden_neurons] = te.times
  print("")
  history_results.append(history)
  return history_results

In [99]:
Batch_sizes = [2]
model_results = {}
epoch_times_dict = {}
for size in Batch_sizes:
  history = K_fold_cross_validation(3,100,size,X_train_array,y_train_array-1,X_test_array,y_test_array-1,epoch_times_dict)
  model_results[size] = history

Model training for:
Number of neurons 16
Number of epcohs: 100
No folds used for k-fold cross validation: 3

Epoch 1/100
504/504 - 1s - loss: 2.0630 - accuracy: 0.1351 - val_loss: 1.9898 - val_accuracy: 0.1574
Epoch 2/100
504/504 - 0s - loss: 1.9453 - accuracy: 0.2095 - val_loss: 1.9376 - val_accuracy: 0.1736
Epoch 3/100
504/504 - 0s - loss: 1.8503 - accuracy: 0.2572 - val_loss: 1.7509 - val_accuracy: 0.2940
Epoch 4/100
504/504 - 1s - loss: 1.7929 - accuracy: 0.2602 - val_loss: 1.7692 - val_accuracy: 0.2963
Epoch 5/100
504/504 - 1s - loss: 1.7493 - accuracy: 0.2790 - val_loss: 1.6696 - val_accuracy: 0.3287
Epoch 6/100
504/504 - 1s - loss: 1.7435 - accuracy: 0.2701 - val_loss: 1.7001 - val_accuracy: 0.3403
Epoch 7/100
504/504 - 1s - loss: 1.7424 - accuracy: 0.2860 - val_loss: 1.6677 - val_accuracy: 0.3495
Epoch 8/100
504/504 - 0s - loss: 1.7206 - accuracy: 0.2939 - val_loss: 1.6574 - val_accuracy: 0.3519
Epoch 9/100
504/504 - 0s - loss: 1.6954 - accuracy: 0.2920 - val_loss: 1.6639 - val

In [100]:
X_train_cnn = np.expand_dims(X_train, axis=2)
X_test_cnn = np.expand_dims(X_test,axis=2)

In [170]:
x_train_new = pd.read_csv("./SER_data/X_train.csv")
x_test_new = pd.read_csv("./SER_data/X_test.csv")
y_train_new = pd.read_csv("./SER_data/y_train.csv")
y_test_new = pd.read_csv("./SER_data/y_test.csv")

In [178]:
frames = [x_train_new, x_test_new]
result_train = pd.concat(frames)
frames = [y_train_new, y_test_new]
results_test = pd.concat(frames)

In [206]:
def cnn_layers(no_folds,no_epochs,batch_size,X_train,y_train,X_test,y_test,epoch_times_dict):  
  hidden_neurons = 16
  print("Model training for:")
  print("Number of neurons",hidden_neurons)
  print("Number of epcohs:" ,no_epochs)
  print("No folds used for k-fold cross validation:", no_folds)
  print("")

  kf = KFold(n_splits=no_folds,random_state=None, shuffle=False)
  KFold(n_splits=no_folds, random_state=None, shuffle=False)
  history_results = []
#   for train_index, test_index in kf.split(X_train):
#         X_train_K, X_test_K = X_train[train_index], X_train[test_index]
#         y_train_K, y_test_K = y_train[train_index], y_train[test_index]
  model_cnn = Sequential()
  model_cnn.add(Conv1D(64,kernel_size=5,strides=1,activation='relu',input_shape=(X_train.shape[1],1)))
  model_cnn.add(BatchNormalization())
  # model_cnn.add(Conv1D(64,kernel_size=5,strides=1,activation='relu'))
  # model_cnn.add(Dropout(0.1))
  # model_cnn.add(BatchNormalization())
  # model_cnn.add(MaxPooling1D(pool_size=8,strides=2))
  model_cnn.add(Conv1D(32,kernel_size=5,activation='relu'))
  # model_cnn.add(Conv1D(32,kernel_size=5,activation='relu'))
  # model_cnn.add(Conv1D(32,kernel_size=5))
  model_cnn.add(BatchNormalization())
  model_cnn.add(Dropout(0.2))
  model_cnn.add(Flatten())
  # model_cnn.add(Dense(16))
  # model_cnn.add(Dropout(0.2))
  model_cnn.add(Dense(8))
  model_cnn.compile(optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'])
  print(model_cnn.summary())

  history = model.fit(X_train, y_train,
                batch_size=batch_size,
                epochs=no_epochs,
                verbose=2,
                use_multiprocessing=False,
                callbacks = [tb, te],
                validation_data=(X_test, y_test))

  epoch_times_dict[hidden_neurons] = te.times
  print("")
  history_results.append(history)
  return history_results

In [207]:
x_train_cnn_new = np.expand_dims(x_train_new, axis=2)
y_train_cnn_new = encoder.fit_transform(np.array(y_train_new).reshape(-1,1)).toarray()

In [208]:
Batch_sizes = [1]
model_results = {}
epoch_times_dict = {}
for size in Batch_sizes:
  history = cnn_layers(2,100,size,X_train_cnn,Y_train_cnn,X_test_cnn,Y_test_cnn,epoch_times_dict)
  model_results[size] = history

Model training for:
Number of neurons 16
Number of epcohs: 100
No folds used for k-fold cross validation: 2

Model: "sequential_57"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_85 (Conv1D)           (None, 43, 64)            384       
_________________________________________________________________
batch_normalization_50 (Batc (None, 43, 64)            256       
_________________________________________________________________
conv1d_86 (Conv1D)           (None, 39, 32)            10272     
_________________________________________________________________
batch_normalization_51 (Batc (None, 39, 32)            128       
_________________________________________________________________
dropout_89 (Dropout)         (None, 39, 32)            0         
_________________________________________________________________
flatten_31 (Flatten)         (None, 1248)              0         
__________

In [143]:
model=Sequential()
model.add(Conv1D(256, kernel_size=5, strides=1, padding='same', activation='relu', input_shape=(X_train_cnn.shape[1], 1)))
model.add(MaxPooling1D(pool_size=5, strides = 2, padding = 'same'))

model.add(Conv1D(256, kernel_size=5, strides=1, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=5, strides = 2, padding = 'same'))

model.add(Conv1D(128, kernel_size=5, strides=1, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=5, strides = 2, padding = 'same'))
model.add(Dropout(0.2))

model.add(Conv1D(64, kernel_size=5, strides=1, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=5, strides = 2, padding = 'same'))

model.add(Flatten())
model.add(Dense(units=32, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(units=8, activation='softmax'))
model.compile(optimizer = 'adam' , loss = 'categorical_crossentropy' , metrics = ['accuracy'])

model.summary()

Model: "sequential_33"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_24 (Conv1D)           (None, 47, 256)           1536      
_________________________________________________________________
max_pooling1d_11 (MaxPooling (None, 24, 256)           0         
_________________________________________________________________
conv1d_25 (Conv1D)           (None, 24, 256)           327936    
_________________________________________________________________
max_pooling1d_12 (MaxPooling (None, 12, 256)           0         
_________________________________________________________________
conv1d_26 (Conv1D)           (None, 12, 128)           163968    
_________________________________________________________________
max_pooling1d_13 (MaxPooling (None, 6, 128)            0         
_________________________________________________________________
dropout_35 (Dropout)         (None, 6, 128)          

In [155]:
model_cnn = Sequential()
model_cnn.add(Conv1D(64,kernel_size=5,strides=1,activation='relu',input_shape=(X_train_cnn.shape[1],1)))
model_cnn.add(BatchNormalization())
# model_cnn.add(Conv1D(64,kernel_size=5,strides=1,activation='relu'))
model_cnn.add(Dropout(0.1))
model_cnn.add(BatchNormalization())
model_cnn.add(MaxPooling1D(pool_size=8,strides=2))
model_cnn.add(Conv1D(32,kernel_size=5,activation='relu'))
# model_cnn.add(Conv1D(32,kernel_size=5,activation='relu'))
# model_cnn.add(Conv1D(32,kernel_size=5))
model_cnn.add(BatchNormalization())
model_cnn.add(Dropout(0.2))
model_cnn.add(Flatten())
model_cnn.add(Dense(32))
model_cnn.add(Dropout(0.2))
model_cnn.add(Dense(8))
model_cnn.compile(optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'])

model_cnn.summary()

Model: "sequential_38"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_44 (Conv1D)           (None, 43, 64)            384       
_________________________________________________________________
batch_normalization_6 (Batch (None, 43, 64)            256       
_________________________________________________________________
dropout_48 (Dropout)         (None, 43, 64)            0         
_________________________________________________________________
batch_normalization_7 (Batch (None, 43, 64)            256       
_________________________________________________________________
max_pooling1d_19 (MaxPooling (None, 18, 64)            0         
_________________________________________________________________
conv1d_45 (Conv1D)           (None, 14, 32)            10272     
_________________________________________________________________
batch_normalization_8 (Batch (None, 14, 32)          

In [156]:
tb = time_for_batch()
te = time_for_epoch()

history = model_cnn.fit(X_train_cnn, Y_train_cnn,
                      batch_size=4,
                      epochs=100,
                      verbose=2,
                      use_multiprocessing=False,
                      callbacks = [tb, te],
                      validation_data=(X_test_cnn, Y_test_cnn))

Epoch 1/100
252/252 - 2s - loss: 7.4596 - accuracy: 0.1420 - val_loss: 9.3587 - val_accuracy: 0.1250
Epoch 2/100
252/252 - 1s - loss: 6.5239 - accuracy: 0.1480 - val_loss: 5.1971 - val_accuracy: 0.1019
Epoch 3/100
252/252 - 1s - loss: 5.4774 - accuracy: 0.1221 - val_loss: 4.7286 - val_accuracy: 0.1574
Epoch 4/100
252/252 - 1s - loss: 4.8253 - accuracy: 0.1460 - val_loss: 4.6585 - val_accuracy: 0.1343
Epoch 5/100
252/252 - 1s - loss: 4.3213 - accuracy: 0.1460 - val_loss: 4.6593 - val_accuracy: 0.1343
Epoch 6/100
252/252 - 1s - loss: 4.3158 - accuracy: 0.1450 - val_loss: 4.6137 - val_accuracy: 0.1250
Epoch 7/100
252/252 - 1s - loss: 4.3195 - accuracy: 0.1400 - val_loss: 4.6410 - val_accuracy: 0.1481
Epoch 8/100
252/252 - 1s - loss: 4.4920 - accuracy: 0.1400 - val_loss: 5.4338 - val_accuracy: 0.1458
Epoch 9/100
252/252 - 1s - loss: 7.5099 - accuracy: 0.1281 - val_loss: 6.7937 - val_accuracy: 0.1412
Epoch 10/100
252/252 - 1s - loss: 7.3924 - accuracy: 0.1380 - val_loss: 6.6392 - val_accura