In [52]:
import pandas as pd
import numpy as np
import tensorflow as tf
import keras as kr
import librosa as lib
import IPython.display as ipd
import matplotlib.pyplot as plt

In [53]:
# Data processing
valid_data = pd.read_csv("../valid_data.csv")

# we need phone position, affect, file_path, then the local file path
valid_data["file_path"] = valid_data["file_name"].str.replace("/media/data/shout-data/", "/dataset/")
valid_data["file_path"] = valid_data["file_path"].str.replace(".wav", "")
valid_data["file_path"] = valid_data["file_path"].astype(str) + valid_data["chunk_name"].astype(str)

processingSet = valid_data[["affect", "file_path", "phone_position"]]

testSet = valid_data.sample(n=100, random_state=100)
testSet.to_csv("../test_set.csv")

trainingSet = valid_data.drop(testSet.index)
trainingSet.to_csv("../training_set.csv")
valid_data = pd.read_csv("../valid_data.csv")

# we need phone position, affect, file_path, then the local file path
valid_data["file_path"] = valid_data["file_name"].str.replace("/media/data/shout-data/", "../dataset/")
valid_data["file_path"] = valid_data["file_path"].str.replace(".wav", "")
valid_data["file_path"] = valid_data["file_path"].astype(str) + valid_data["chunk_name"].astype(str)

processingSet = valid_data[["affect", "file_path", "phone_position"]]
processingSet
FRAME_SIZE = 512 # the size of the frame
HOP_LENGTH = 256 # the distance each frame jumps by
FRAME_LENGTH = 512 # length of a frame

def getFeatures(audioSignal, sample_rate):
    
    # duration of 1 sample
    oneSampleDuration = 1/sample_rate
    
    # duration of audio signal in seconds
    sampleDuration = oneSampleDuration*len(audioFile)
    
    ae, time = getAmplitudeEnvelope(audioSignal)
    rms = getRMS(audioSignal)
    zcr = getZCR(audioSignal)
    
    return time, ae, rms, zcr
    
def getAmplitudeEnvelope(audioSignal):
    amplitudeEnvelope = []
    
    for i in range(0, len(audioSignal), HOP_LENGTH):
        currentFrameAE = max(audioSignal[i:i+FRAME_SIZE])
        amplitudeEnvelope.append(currentFrameAE)
        
    amplitudeEnvelope = np.array(amplitudeEnvelope)
    amplitudeFrames = range(0, amplitudeEnvelope.size)
    time = lib.frames_to_time(amplitudeFrames, hop_length=HOP_LENGTH)
    
    return amplitudeEnvelope, time
    
def getRMS(audioSignal):
    rms = []
    for i in range(0, len(audioSignal), HOP_LENGTH):
        rms_current = np.sqrt(np.sum(audioSignal[i:i+FRAME_LENGTH]**2) / FRAME_LENGTH)
        rms.append(rms_current)
    return rms

def getZCR(audioSignal):
    return lib.feature.zero_crossing_rate(audioSignal, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)[0]

def calc_sfb(spec, freq, sr):
    freq_range = sr/2
    freq_delta = freq_range / spec.shape[0]
    return int(np.floor(freq / freq_delta))
def calc_ber(spec, freq, sr):
    sfb = calc_sfb(spec, freq, sr)
    pSpec = np.abs(spec) ** 2
    pSpec = pSpec.T
    ber = []

    for freqs in pSpec:
        sum_low = np.sum(freqs[:sfb])
        sum_high = np.sum(freqs[sfb:])
        ber_current = sum_low/sum_high
        ber.append(ber_current)

    return np.array(ber)
def getFrequencyFeatures(audioSignal, sample_rate):
    audio_spec = lib.stft(audioFile, n_fft=FRAME_SIZE, hop_length=HOP_LENGTH)
    centroid = lib.feature.spectral_centroid(y=audioFile, sr=sample_rate, n_fft=FRAME_SIZE, hop_length=HOP_LENGTH)[0]
    bandwidth = lib.feature.spectral_bandwidth(y=audioFile, sr=sample_rate, n_fft=FRAME_SIZE, hop_length=HOP_LENGTH)[0]
    ber = calc_ber(audio_spec, 2000, sample_rate)
    return centroid, bandwidth, ber

zeroCrossingRateList = []
amplitudeEnvelopeList = []
rootMeanSquareEnergyList = []
spectralCentroidList = []
spectralBandwidthList = []
bandEnergyRatioList = []
timings = []
audioFileLengths = []

for fileDir in processingSet["file_path"]:
    audioFile, sample_rate = lib.load(fileDir)
    audioFileLengths.append(len(audioFile))

maxAudioLength = max(audioFileLengths)    

for fileDir in processingSet["file_path"]:
    audioFile, sample_rate = lib.load(fileDir)
    audioFile = lib.util.pad_center(audioFile, size=maxAudioLength)
    time, ae, rmse, zcr = getFeatures(audioFile, sample_rate)
    centroid, bandwidth, ber = getFrequencyFeatures(audioFile, sample_rate)
    zeroCrossingRateList.append(zcr)
    amplitudeEnvelopeList.append(ae)
    rootMeanSquareEnergyList.append(rmse)
    spectralCentroidList.append(centroid)
    spectralBandwidthList.append(bandwidth)
    bandEnergyRatioList.append(ber)
    timings.append(time)
    
processingSet["Timings"] = timings
processingSet["Amplitude_Envelope"] = amplitudeEnvelopeList
processingSet["Root_Mean_Square_Energy"] = rootMeanSquareEnergyList
processingSet["Zero-Crossing Rate"] = zeroCrossingRateList
processingSet["Spectral Centroid"] = spectralCentroidList
processingSet["Spectral Bandwidth"] = spectralBandwidthList
processingSet["Band Energy Ratio"] = bandEnergyRatioList
trainingSetProcessed = processingSet.drop(testSet.index)
testSetProcessed = processingSet.drop(trainingSetProcessed.index)
processingSet

  ber_current = sum_low/sum_high
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processingSet["Timings"] = timings
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processingSet["Amplitude_Envelope"] = amplitudeEnvelopeList
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processingSet["Root_Mean_Square_Energy"] = rootMeanSquareEnergyList
A value i

Unnamed: 0,affect,file_path,phone_position,Timings,Amplitude_Envelope,Root_Mean_Square_Energy,Zero-Crossing Rate,Spectral Centroid,Spectral Bandwidth,Band Energy Ratio
0,sadness,../dataset/shout_data_3afd7208-7987-4c73-8e01-...,Place phone on the opposite side of the room i...,"[0.0, 0.011609977324263039, 0.0232199546485260...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
1,neutral,../dataset/shout_data_ae6feb35-cf7a-4805-8d05-...,Hold your phone next to your face but with the...,"[0.0, 0.011609977324263039, 0.0232199546485260...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
2,neutral,../dataset/shout_data_fdb758a5-9c86-40f7-8870-...,Place phone on the opposite side of the room f...,"[0.0, 0.011609977324263039, 0.0232199546485260...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
3,neutral,../dataset/shout_data_fdb758a5-9c86-40f7-8870-...,Place phone on the opposite side of the room f...,"[0.0, 0.011609977324263039, 0.0232199546485260...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
4,disgust,../dataset/shout_data_739dbf1c-ec21-42db-9e49-...,Place phone 1-2 meters away face up on any sur...,"[0.0, 0.011609977324263039, 0.0232199546485260...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
...,...,...,...,...,...,...,...,...,...,...
3048,sadness,../dataset/shout_data_f8d1719f-8a4b-4d7a-a2a7-...,Place phone as far away as possible while on o...,"[0.0, 0.011609977324263039, 0.0232199546485260...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
3049,sadness,../dataset/shout_data_f8d1719f-8a4b-4d7a-a2a7-...,Place phone as far away as possible while on o...,"[0.0, 0.011609977324263039, 0.0232199546485260...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
3050,sadness,../dataset/shout_data_f8d1719f-8a4b-4d7a-a2a7-...,Place phone as far away as possible while on o...,"[0.0, 0.011609977324263039, 0.0232199546485260...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
3051,fear,../dataset/shout_data_d5d4e179-d411-4652-b80b-...,Place phone 1-2 meters away face down on a har...,"[0.0, 0.011609977324263039, 0.0232199546485260...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."


Recurrent Neural Network
========================

In [54]:
emotions = {"neutral": 0, "anger": 1, "joy":2, "fear":3, "disgust":4, "surprise": 5, "sadness": 6}
positions = {"Hold your phone next to your face with the mic facing your mouth as you would in a phone conversation 1/19": 1,
            "Hold your phone next to your face but with the mic/phone facing away from your face 2/19": 2,
            "Hold your phone next to your face with your hand covering the mic 3/19": 3,
            "Hold your phone next to your hip with the back of phone on your palm 4/19": 4,
            "Hold your phone next to your hip with your hand covering the mic 5/19": 5,
            "Place your phone in your pocket 6/19": 6,
            "Place your phone in a bag and hold it next to your hip 7/19": 7,
            "Place phone 1-2 meters away face up on any surface 8/19": 8,
            "Place phone 1-2 meters away face down on a hard surface 9/19": 9,
            "Place phone 1-2 meters away face down on a soft surface 10/19": 10,
            "Place phone 1-2 meters away in a bag 11/19": 11,
            "Place phone on the opposite side of the room face up on any surface 12/19": 12,
            "Place phone on the opposite side of the room face down on a hard surface 13/19": 13,
            "Place phone on the opposite side of the room face down on a soft surface 14/19": 14,
            "Place phone on the opposite side of the room in a bag 15/19": 15,
            "Place phone as far away as possible while on on the opposite side of a wall from you face up on any surface 16/19": 16,
            "Place phone as far away as possible while on on the opposite side of a wall from you face down on a hard surface 17/19": 17,
            "Place phone as far away as possible while on on the opposite side of a wall from you face down on a soft surface 18/19": 18,
            "Place phone as far away as possible while on on the opposite side of a wall from you in a bag 19/19": 19}

In [55]:
# Read in processed audio data
validSet = trainingSetProcessed.sample(n = 100, random_state=100)
trainSet = trainingSetProcessed.drop(validSet.index)
testSet = testSetProcessed
trainSetY = trainSet['affect']
trainSetY = trainSetY.map(emotions)
validSetY = validSet['affect']
validSetY = validSetY.map(emotions)
testSetY = testSet['affect']
testSetY = testSetY.map(emotions)
trainSet.shape

(2853, 10)

In [56]:
# Process feature data into an input
trainData = []
for datapoint in np.array(trainSet[["Amplitude_Envelope", "Root_Mean_Square_Energy", "Zero-Crossing Rate", "Spectral Centroid", "Spectral Bandwidth", "Band Energy Ratio"]]):
    AE, RMSE, ZCR, SC, SB, BER = datapoint
    trainData.append([AE, RMSE, ZCR, SC, SB, BER])
trainX = np.array(trainData)
validData = []
for datapoint in np.array(validSet[["Amplitude_Envelope", "Root_Mean_Square_Energy", "Zero-Crossing Rate", "Spectral Centroid", "Spectral Bandwidth", "Band Energy Ratio"]]):
    AE, RMSE, ZCR, SC, SB, BER = datapoint
    validData.append([AE, RMSE, ZCR, SC, SB, BER])
validX = np.array(validData)
testData = []
for datapoint in np.array(testSet[["Amplitude_Envelope", "Root_Mean_Square_Energy", "Zero-Crossing Rate", "Spectral Centroid", "Spectral Bandwidth", "Band Energy Ratio"]]):
    AE, RMSE, ZCR, SC, SB, BER = datapoint
    testData.append([AE, RMSE, ZCR, SC, SB, BER])
testX = np.array(testData)

In [109]:
# Construct the Recurrent Neural Net
RNN = kr.Sequential()
RNN.add(kr.layers.SimpleRNN(100))
RNN.add(kr.layers.Dense(64))
RNN.add(kr.layers.Dense(16))
RNN.add(kr.layers.Dense(8))
RNN.add(kr.layers.Dense(6, activation="softmax"))
RNN.compile(optimizer='rmsprop', loss='mean_squared_error', metrics=['accuracy'])

In [121]:
# Fit the Recurrent Neural Net to the Data
RNN.fit(trainX, trainSetY, validation_data=(validX, validSetY), epochs=20, batch_size=200)

Epoch 1/20
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.1439 - loss: nan - val_accuracy: 0.1200 - val_loss: nan
Epoch 2/20
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.1406 - loss: nan - val_accuracy: 0.1200 - val_loss: nan
Epoch 3/20
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.1492 - loss: nan - val_accuracy: 0.1200 - val_loss: nan
Epoch 4/20
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.1442 - loss: nan - val_accuracy: 0.1200 - val_loss: nan
Epoch 5/20
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.1534 - loss: nan - val_accuracy: 0.1200 - val_loss: nan
Epoch 6/20
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.1474 - loss: nan - val_accuracy: 0.1200 - val_loss: nan
Epoch 7/20
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4

<keras.src.callbacks.history.History at 0x194220b8b10>

In [122]:
# Evaluate the Recurrent Neural Net
RNN.evaluate(testX, testSetY)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.1517 - loss: nan 


[nan, 0.15000000596046448]