# DEAM Dataset - Feed Forward Neural Network
## Essentia All & openSMILE eGeMAPS Featureset

## Import relevant libraries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torcheval.metrics import R2Score

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import math

import sys
sys.path.insert(1, '../../utils')
from paths import *

## Neural Network Training

### Import annotations dataset

In [2]:
df_annotations = pd.read_csv(get_deam_path('processed/annotations/deam_static_annotations.csv'))
df_annotations

Unnamed: 0,song_id,valence_mean_mapped,arousal_mean_mapped
0,2,-0.475,-0.500
1,3,-0.375,-0.425
2,4,0.175,0.125
3,5,-0.150,0.075
4,7,0.200,0.350
...,...,...,...
1739,1996,-0.275,0.225
1740,1997,0.075,-0.275
1741,1998,0.350,0.300
1742,1999,-0.100,0.100


### Import the featureset

In [3]:
df_essentia_all_opensmile_egemaps_features = pd.read_csv(get_deam_path('processed/features/integrated/essentia_all_opensmile_egemaps_features.csv'))

# drop Unnamed:0 column
df_essentia_all_opensmile_egemaps_features = df_essentia_all_opensmile_egemaps_features[df_essentia_all_opensmile_egemaps_features.columns[1:]]

df_essentia_all_opensmile_egemaps_features

Unnamed: 0,song_id,lowlevel.average_loudness,lowlevel.barkbands_crest.dmean,lowlevel.barkbands_crest.dmean2,lowlevel.barkbands_crest.dvar,lowlevel.barkbands_crest.dvar2,lowlevel.barkbands_crest.max,lowlevel.barkbands_crest.mean,lowlevel.barkbands_crest.median,lowlevel.barkbands_crest.min,...,slopeUV0-500_sma3nz_amean,slopeUV500-1500_sma3nz_amean,spectralFluxUV_sma3nz_amean,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp
0,2,0.960248,2.258284,3.723765,4.063393,11.560330,22.870403,11.212613,10.802777,2.674608,...,-0.024486,-0.007114,2.023631,2.375139,0.311180,3.170000,3.689382,0.037143,0.026573,-19.159882
1,3,0.577547,1.755316,2.888824,3.826239,10.070616,26.567934,16.432838,16.015923,4.081916,...,-0.104570,-0.008604,2.370560,1.865837,1.957295,0.402500,0.548327,0.104521,0.106012,-17.587570
2,4,0.978169,2.483856,4.020749,5.345178,14.364439,22.231218,10.575248,10.239008,2.738014,...,-0.052437,-0.006673,3.990483,3.131941,1.868327,0.473095,0.416942,0.054833,0.033689,-14.016479
3,5,0.902877,2.403809,3.848961,4.573623,11.834862,25.005966,11.478149,10.657066,3.504722,...,0.054624,-0.023808,2.539289,2.310084,0.267082,3.697500,3.349172,0.038571,0.033987,-16.089987
4,7,0.978392,1.974828,3.087666,5.040214,11.790000,24.543375,15.758171,15.867741,2.866729,...,-0.055579,-0.009088,4.034995,4.639290,0.377862,2.605294,3.186398,0.041667,0.039756,-11.279402
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1739,1996,0.983662,3.163405,5.194232,8.003455,18.861530,23.048399,10.288649,9.318178,3.273469,...,-0.004562,-0.020547,2.735432,5.512336,1.023815,0.933913,1.089083,0.032593,0.018377,-17.564531
1740,1997,0.985275,2.380977,3.982027,4.592145,12.498706,21.505104,10.112698,9.651812,3.108957,...,-0.003737,-0.019565,1.726589,4.578795,0.868403,1.104872,1.144158,0.043500,0.041021,-21.888250
1741,1998,0.899922,2.692476,4.314886,6.035235,14.309047,24.130383,9.502740,8.967997,2.275360,...,-0.035106,-0.014033,2.011096,4.512114,2.939212,0.269924,0.482651,0.061154,0.043816,-18.990843
1742,1999,0.962358,2.572920,4.367725,4.447225,12.239031,21.957045,9.768277,9.358503,3.522282,...,-0.012904,-0.014340,2.047864,3.311847,2.448253,0.348182,0.316909,0.051875,0.039784,-20.438614


In [4]:
df_essentia_all_opensmile_egemaps_features.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1744 entries, 0 to 1743
Data columns (total 4602 columns):
 #     Column                                           Dtype  
---    ------                                           -----  
 0     song_id                                          int64  
 1     lowlevel.average_loudness                        float64
 2     lowlevel.barkbands_crest.dmean                   float64
 3     lowlevel.barkbands_crest.dmean2                  float64
 4     lowlevel.barkbands_crest.dvar                    float64
 5     lowlevel.barkbands_crest.dvar2                   float64
 6     lowlevel.barkbands_crest.max                     float64
 7     lowlevel.barkbands_crest.mean                    float64
 8     lowlevel.barkbands_crest.median                  float64
 9     lowlevel.barkbands_crest.min                     float64
 10    lowlevel.barkbands_crest.stdev                   float64
 11    lowlevel.barkbands_crest.var                     flo

Join both the featureset and annotation set together

In [5]:
df_essentia_all_opensmile_egemaps_whole = pd.merge(df_essentia_all_opensmile_egemaps_features, df_annotations, how='inner', on='song_id')
df_essentia_all_opensmile_egemaps_whole = df_essentia_all_opensmile_egemaps_whole.drop('song_id', axis=1)
df_essentia_all_opensmile_egemaps_whole

Unnamed: 0,lowlevel.average_loudness,lowlevel.barkbands_crest.dmean,lowlevel.barkbands_crest.dmean2,lowlevel.barkbands_crest.dvar,lowlevel.barkbands_crest.dvar2,lowlevel.barkbands_crest.max,lowlevel.barkbands_crest.mean,lowlevel.barkbands_crest.median,lowlevel.barkbands_crest.min,lowlevel.barkbands_crest.stdev,...,spectralFluxUV_sma3nz_amean,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp,valence_mean_mapped,arousal_mean_mapped
0,0.960248,2.258284,3.723765,4.063393,11.560330,22.870403,11.212613,10.802777,2.674608,3.560884,...,2.023631,2.375139,0.311180,3.170000,3.689382,0.037143,0.026573,-19.159882,-0.475,-0.500
1,0.577547,1.755316,2.888824,3.826239,10.070616,26.567934,16.432838,16.015923,4.081916,4.687601,...,2.370560,1.865837,1.957295,0.402500,0.548327,0.104521,0.106012,-17.587570,-0.375,-0.425
2,0.978169,2.483856,4.020749,5.345178,14.364439,22.231218,10.575248,10.239008,2.738014,3.461200,...,3.990483,3.131941,1.868327,0.473095,0.416942,0.054833,0.033689,-14.016479,0.175,0.125
3,0.902877,2.403809,3.848961,4.573623,11.834862,25.005966,11.478149,10.657066,3.504722,4.084919,...,2.539289,2.310084,0.267082,3.697500,3.349172,0.038571,0.033987,-16.089987,-0.150,0.075
4,0.978392,1.974828,3.087666,5.040214,11.790000,24.543375,15.758171,15.867741,2.866729,3.723713,...,4.034995,4.639290,0.377862,2.605294,3.186398,0.041667,0.039756,-11.279402,0.200,0.350
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1739,0.983662,3.163405,5.194232,8.003455,18.861530,23.048399,10.288649,9.318178,3.273469,3.827384,...,2.735432,5.512336,1.023815,0.933913,1.089083,0.032593,0.018377,-17.564531,-0.275,0.225
1740,0.985275,2.380977,3.982027,4.592145,12.498706,21.505104,10.112698,9.651812,3.108957,3.459206,...,1.726589,4.578795,0.868403,1.104872,1.144158,0.043500,0.041021,-21.888250,0.075,-0.275
1741,0.899922,2.692476,4.314886,6.035235,14.309047,24.130383,9.502740,8.967997,2.275360,3.772545,...,2.011096,4.512114,2.939212,0.269924,0.482651,0.061154,0.043816,-18.990843,0.350,0.300
1742,0.962358,2.572920,4.367725,4.447225,12.239031,21.957045,9.768277,9.358503,3.522282,2.892330,...,2.047864,3.311847,2.448253,0.348182,0.316909,0.051875,0.039784,-20.438614,-0.100,0.100


#### Prepare dataframes for the neural network

Perform splitting of the dataframe into training and testing sets

In [6]:
features = df_essentia_all_opensmile_egemaps_features.drop('song_id', axis=1)
features

Unnamed: 0,lowlevel.average_loudness,lowlevel.barkbands_crest.dmean,lowlevel.barkbands_crest.dmean2,lowlevel.barkbands_crest.dvar,lowlevel.barkbands_crest.dvar2,lowlevel.barkbands_crest.max,lowlevel.barkbands_crest.mean,lowlevel.barkbands_crest.median,lowlevel.barkbands_crest.min,lowlevel.barkbands_crest.stdev,...,slopeUV0-500_sma3nz_amean,slopeUV500-1500_sma3nz_amean,spectralFluxUV_sma3nz_amean,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp
0,0.960248,2.258284,3.723765,4.063393,11.560330,22.870403,11.212613,10.802777,2.674608,3.560884,...,-0.024486,-0.007114,2.023631,2.375139,0.311180,3.170000,3.689382,0.037143,0.026573,-19.159882
1,0.577547,1.755316,2.888824,3.826239,10.070616,26.567934,16.432838,16.015923,4.081916,4.687601,...,-0.104570,-0.008604,2.370560,1.865837,1.957295,0.402500,0.548327,0.104521,0.106012,-17.587570
2,0.978169,2.483856,4.020749,5.345178,14.364439,22.231218,10.575248,10.239008,2.738014,3.461200,...,-0.052437,-0.006673,3.990483,3.131941,1.868327,0.473095,0.416942,0.054833,0.033689,-14.016479
3,0.902877,2.403809,3.848961,4.573623,11.834862,25.005966,11.478149,10.657066,3.504722,4.084919,...,0.054624,-0.023808,2.539289,2.310084,0.267082,3.697500,3.349172,0.038571,0.033987,-16.089987
4,0.978392,1.974828,3.087666,5.040214,11.790000,24.543375,15.758171,15.867741,2.866729,3.723713,...,-0.055579,-0.009088,4.034995,4.639290,0.377862,2.605294,3.186398,0.041667,0.039756,-11.279402
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1739,0.983662,3.163405,5.194232,8.003455,18.861530,23.048399,10.288649,9.318178,3.273469,3.827384,...,-0.004562,-0.020547,2.735432,5.512336,1.023815,0.933913,1.089083,0.032593,0.018377,-17.564531
1740,0.985275,2.380977,3.982027,4.592145,12.498706,21.505104,10.112698,9.651812,3.108957,3.459206,...,-0.003737,-0.019565,1.726589,4.578795,0.868403,1.104872,1.144158,0.043500,0.041021,-21.888250
1741,0.899922,2.692476,4.314886,6.035235,14.309047,24.130383,9.502740,8.967997,2.275360,3.772545,...,-0.035106,-0.014033,2.011096,4.512114,2.939212,0.269924,0.482651,0.061154,0.043816,-18.990843
1742,0.962358,2.572920,4.367725,4.447225,12.239031,21.957045,9.768277,9.358503,3.522282,2.892330,...,-0.012904,-0.014340,2.047864,3.311847,2.448253,0.348182,0.316909,0.051875,0.039784,-20.438614


In [7]:
targets = df_annotations.drop('song_id', axis=1)
targets

Unnamed: 0,valence_mean_mapped,arousal_mean_mapped
0,-0.475,-0.500
1,-0.375,-0.425
2,0.175,0.125
3,-0.150,0.075
4,0.200,0.350
...,...,...
1739,-0.275,0.225
1740,0.075,-0.275
1741,0.350,0.300
1742,-0.100,0.100


Perform 80-20 train-test split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.2, random_state=42)

Create tensors for X_train and X_test

In [9]:
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float64)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float64)

Create tensors for Y_train and Y_test

In [10]:
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float64)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float64)

Define neural network parameters and instantitate neural network

In [11]:
input_size = 1
hidden_size = 20 
output_size = 2  # Output size for valence and arousal
learning_rate = 0.001
criterion = nn.MSELoss()
num_epochs = 300

Define the neural network

In [12]:
class NeuralNetwork(nn.Module):
    def __init__(self, input_size):
        super(NeuralNetwork, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, math.ceil((input_size**0.5) * 2)),
            nn.ReLU(),
            nn.Linear(math.ceil((input_size**0.5) * 2), math.ceil((input_size**0.5) * 2)),
            nn.ReLU(),
            nn.Linear(math.ceil((input_size**0.5) * 2), math.ceil((input_size**0.5) * 2)),
            nn.ReLU(),
            nn.Linear(math.ceil((input_size**0.5) * 2), math.ceil((input_size**0.5) * 2)),
            nn.ReLU(),
            nn.Linear(math.ceil((input_size**0.5) * 2), math.ceil((input_size**0.5) * 2)),
            nn.ReLU(),
            nn.Linear(math.ceil((input_size**0.5) * 2), math.ceil((input_size**0.5) * 2)),
            nn.ReLU(),
            nn.Linear(math.ceil((input_size**0.5) * 2), math.ceil((input_size**0.5) * 2)),
            nn.ReLU(),
            nn.Linear(math.ceil((input_size**0.5) * 2), math.ceil((input_size**0.5) * 2)),
            nn.ReLU(),
            nn.Linear(math.ceil((input_size**0.5) * 2), math.ceil((input_size**0.5) * 2)),
            nn.ReLU(),
            nn.Linear(math.ceil((input_size**0.5) * 2), math.ceil((input_size**0.5) * 2)),
            nn.ReLU(),
            nn.Linear(math.ceil((input_size**0.5) * 2), math.ceil((input_size**0.5) * 2)),
            nn.ReLU(),
            nn.Linear(math.ceil((input_size**0.5) * 2), 2)
        )

    def forward(self, x):
        return self.layers(x)

#### Training

Prepare input_train_data and target_train_labels

In [13]:
input_train_data = X_train_tensor.float()

# input_train_data = input_train_data.view(input_train_data.shape[1], -1)
print(input_train_data.shape)

target_train_labels = y_train_tensor

torch.Size([1395, 4601])


Training loop

In [14]:
model = NeuralNetwork(input_size=input_train_data.shape[1])
optimiser = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
  optimiser.zero_grad()
  
  # forward pass
  output = model(input_train_data)

  # calculate loss
  loss = torch.sqrt(criterion(output.float(), target_train_labels.float()))

  # backward pass
  loss.backward()
  # update weights
  optimiser.step()

  print(f'Epoch {epoch + 1}, Loss: {math.sqrt(loss.item())}')

print("Training completed.")

Epoch 1, Loss: 1601.7479514580314
Epoch 2, Loss: 1812.8812702435866
Epoch 3, Loss: 564.1912353094472
Epoch 4, Loss: 1117.0134287464946
Epoch 5, Loss: 559.7532659574216
Epoch 6, Loss: 963.1412798753878
Epoch 7, Loss: 660.2903480666668
Epoch 8, Loss: 784.5933660183471
Epoch 9, Loss: 582.565312862
Epoch 10, Loss: 520.125675918811
Epoch 11, Loss: 458.93358043403185
Epoch 12, Loss: 455.01708415728746
Epoch 13, Loss: 330.40185049799584
Epoch 14, Loss: 378.08611370294994
Epoch 15, Loss: 266.16618399037844
Epoch 16, Loss: 333.32134223223693
Epoch 17, Loss: 311.88528289661247
Epoch 18, Loss: 263.8599741103224
Epoch 19, Loss: 286.67027008394155
Epoch 20, Loss: 111.76946838247018
Epoch 21, Loss: 212.12471456963706
Epoch 22, Loss: 193.2402063333353
Epoch 23, Loss: 132.15153174954122
Epoch 24, Loss: 175.0538533209138
Epoch 25, Loss: 159.8696307249285
Epoch 26, Loss: 214.9783964436892
Epoch 27, Loss: 146.42652274263702
Epoch 28, Loss: 238.9726138206008
Epoch 29, Loss: 203.0593547771193
Epoch 30, Los

#### Testing

Prepare input_test_data and target_test_labels

In [15]:
input_test_data = X_test_tensor.float()

# input_test_data = input_test_data.view(input_test_data.shape[1], -1)
print(input_test_data.shape)

target_test_labels = y_test_tensor

torch.Size([349, 4601])


Generating scores

In [16]:
with torch.no_grad():
  test_pred = model(input_test_data)
  test_loss = criterion(test_pred.float(), target_test_labels)

print(f'Test RMSE: {math.sqrt(test_loss.item())}')

metric = R2Score()
metric.update(test_pred, target_test_labels)
r2_score = metric.compute()
print(f'Test R2 score: {r2_score.item()}')


Test RMSE: 334.56372177206
Test R2 score: -1160778.0994591925


True values (test set)

In [17]:
target_test_labels

tensor([[-0.1500, -0.1500],
        [-0.3000, -0.1000],
        [ 0.2000,  0.3500],
        [ 0.2250,  0.4500],
        [-0.1750, -0.2000],
        [-0.5250, -0.3000],
        [-0.2500, -0.7750],
        [ 0.3000,  0.3000],
        [-0.1750, -0.4000],
        [ 0.4500,  0.1500],
        [ 0.1750,  0.0250],
        [-0.1750, -0.0250],
        [-0.0500, -0.3000],
        [ 0.1250,  0.3000],
        [-0.0750, -0.1500],
        [-0.2000, -0.2750],
        [-0.6000, -0.2250],
        [ 0.1500, -0.2000],
        [ 0.2750,  0.6000],
        [-0.1500, -0.4500],
        [-0.2250, -0.6250],
        [-0.0250, -0.4500],
        [-0.5250, -0.1250],
        [ 0.0000,  0.3250],
        [ 0.1250,  0.3750],
        [ 0.1500, -0.2500],
        [ 0.4500,  0.3250],
        [ 0.2500,  0.2250],
        [-0.1000,  0.0750],
        [ 0.4250,  0.1250],
        [-0.4500, -0.3500],
        [-0.0500,  0.3750],
        [-0.4750, -0.2000],
        [-0.2750, -0.4000],
        [-0.4000, -0.2250],
        [ 0.1000, -0

Predicted values

In [18]:
test_pred

tensor([[-5.8645e+01,  1.4829e+02],
        [ 2.1657e+02, -1.2655e+02],
        [-7.7523e+00,  4.3314e+02],
        [ 4.3176e+02,  2.1941e+01],
        [-9.4945e+01,  6.9571e+01],
        [ 6.8489e+01,  1.6249e+02],
        [ 8.8180e+00,  3.7546e+01],
        [-1.6728e+02, -1.1482e+02],
        [-6.3924e+01,  6.9149e+01],
        [-1.9066e+02,  6.7364e+01],
        [ 2.4771e+00,  3.7352e+02],
        [ 1.4577e+02, -6.9498e+01],
        [ 3.6751e+01,  3.5142e+01],
        [ 3.4457e+01, -3.4889e+01],
        [ 7.2866e+01,  3.0646e+02],
        [-1.2478e+01,  3.2971e+02],
        [-1.3689e+01,  2.6707e+01],
        [ 2.3578e+02, -1.8883e+01],
        [ 1.3781e+02, -8.5089e+01],
        [-7.3347e+01,  1.2548e+02],
        [-1.9619e+01,  7.0544e+01],
        [ 7.0676e+01,  4.6114e+01],
        [-1.2852e+01, -5.5564e+00],
        [-3.5327e+00,  7.9869e+00],
        [-1.0557e+02,  1.2746e+02],
        [-3.0800e+01,  1.1858e+03],
        [ 3.7943e+01,  2.7954e+01],
        [ 7.2564e+01,  2.257