# DEAM Dataset - Convolutional Neural Network

Done on GPU cluster using CUDA

## Import relevant libraries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torcheval.metrics import R2Score
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd

import numpy as np
import math

import os

import sys
sys.path.insert(1, '../../utils')
from paths import *

## Neural Network training

Import the annotations dataset

In [2]:
df_annotations = pd.read_csv(get_deam_path('processed/annotations/deam_static_annotations.csv'))
df_annotations

Unnamed: 0,song_id,valence_mean_mapped,arousal_mean_mapped
0,2,-0.475,-0.500
1,3,-0.375,-0.425
2,4,0.175,0.125
3,5,-0.150,0.075
4,7,0.200,0.350
...,...,...,...
1739,1996,-0.275,0.225
1740,1997,0.075,-0.275
1741,1998,0.350,0.300
1742,1999,-0.100,0.100


In [3]:
targets = df_annotations.drop('song_id', axis=1)
targets

Unnamed: 0,valence_mean_mapped,arousal_mean_mapped
0,-0.475,-0.500
1,-0.375,-0.425
2,0.175,0.125
3,-0.150,0.075
4,0.200,0.350
...,...,...
1739,-0.275,0.225
1740,0.075,-0.275
1741,0.350,0.300
1742,-0.100,0.100


Prepare a list of filepaths to the .png files of the mel spectrograms

Prepare song_ids first

In [1]:
audio_path = get_deam_path('MEMD_audio')
song_ids_temp = []

# Iterate through all files in the directory
for filename in os.listdir(audio_path):
    # Check if the path is a file (not a subdirectory)
    if os.path.isfile(os.path.join(audio_path, filename)):
        song_ids_temp.append(int(filename[:-4]))

song_ids_temp.sort()
song_ids = []

# remove all song_ids from 2015 (song_id 2001 onwards)
for song_id in song_ids_temp:
    if song_id <= 2000:
        song_ids.append(song_id)

print(song_ids)

NameError: name 'get_deam_path' is not defined

Then prepare the paths for each of the .png of the mel spectrograms

In [5]:
mel_spects_path = get_deam_path('processed/mel_spects')
mel_spect_pngs = []

for song_id in song_ids:
  mel_spect_png = f'{mel_spects_path}/{song_id}.png'
  mel_spect_pngs.append(mel_spect_png)

print(mel_spect_pngs)
print(len(mel_spect_pngs))

['../../data/DEAM/processed/mel_spects/2.png', '../../data/DEAM/processed/mel_spects/3.png', '../../data/DEAM/processed/mel_spects/4.png', '../../data/DEAM/processed/mel_spects/5.png', '../../data/DEAM/processed/mel_spects/7.png', '../../data/DEAM/processed/mel_spects/8.png', '../../data/DEAM/processed/mel_spects/10.png', '../../data/DEAM/processed/mel_spects/12.png', '../../data/DEAM/processed/mel_spects/13.png', '../../data/DEAM/processed/mel_spects/17.png', '../../data/DEAM/processed/mel_spects/18.png', '../../data/DEAM/processed/mel_spects/19.png', '../../data/DEAM/processed/mel_spects/20.png', '../../data/DEAM/processed/mel_spects/21.png', '../../data/DEAM/processed/mel_spects/22.png', '../../data/DEAM/processed/mel_spects/24.png', '../../data/DEAM/processed/mel_spects/25.png', '../../data/DEAM/processed/mel_spects/31.png', '../../data/DEAM/processed/mel_spects/32.png', '../../data/DEAM/processed/mel_spects/35.png', '../../data/DEAM/processed/mel_spects/37.png', '../../data/DEAM/p

Define the CNN for regression

In [6]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.layers = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            nn.Flatten(),
            nn.Linear(460800, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 2)  # Output: 2 continuous values (valence and arousal)
        )

    def forward(self, x):
        return self.layers(x)

In [7]:
class CNNDataset(Dataset):
    def __init__(self, file_paths, targets, transform=None):
        self.file_paths = file_paths
        self.targets = targets
        self.transform = transform

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        img = Image.open(self.file_paths[idx]).convert('RGB')
        img = self.transform(img)
        target = torch.Tensor(self.targets.iloc[idx])
        return img, target

Perform 80-20 train-test split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(mel_spect_pngs,
                                                    targets,
                                                    test_size=0.2,
                                                    random_state=42)

Define some neural network parameters

In [9]:
learning_rate = 0.001
num_epochs = 10

Define the RMSE loss function, iniitalise the CNN and optimiser

In [10]:
# Define RMSE loss function
criterion = nn.MSELoss()

# Initialize the model and optimizer
model = NeuralNetwork()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

Define the data loading and transformation

In [11]:
transformation = transforms.Compose([transforms.ToTensor()])
train_set = CNNDataset(X_train, y_train, transform=transformation)
test_set = CNNDataset(X_test, y_test, transform=transformation)

train_dataloader = DataLoader(train_set, batch_size=256, shuffle=True)
test_dataloader = DataLoader(test_set, batch_size=256)

Training loop

In [12]:
for epoch in range(num_epochs):
    model.train()

    for input, target in train_dataloader:
        optimizer.zero_grad()

        # forward pass
        output = model(input)

        # calculate loss
        loss = torch.sqrt(criterion(output, target))

        # backward pass
        loss.backward()
        #u update weights
        optimizer.step()

        print(f'Epoch {epoch + 1}, Loss: {math.sqrt(loss.item())}')

print('Training completed.')

  target = torch.Tensor(self.targets.iloc[idx])


Epoch 1, Loss: 0.5481259615899484
Epoch 1, Loss: 0.5774208368398639
Epoch 1, Loss: 0.9774830865982139
Epoch 1, Loss: 0.6365082515697106
Epoch 1, Loss: 0.6503265853375648
Epoch 1, Loss: 0.5653073404138884
Epoch 2, Loss: 0.5778863517393857
Epoch 2, Loss: 0.5519543593083084
Epoch 2, Loss: 0.5571229150368044
Epoch 2, Loss: 0.5509143154569389
Epoch 2, Loss: 0.5346850806158677
Epoch 2, Loss: 0.5534728815108578
Epoch 3, Loss: 0.5415335149211893
Epoch 3, Loss: 0.5231591808496078
Epoch 3, Loss: 0.5243459476156849
Epoch 3, Loss: 0.5355891683337899
Epoch 3, Loss: 0.5050447008914449
Epoch 3, Loss: 0.5169723996942869
Epoch 4, Loss: 0.5128194894726947
Epoch 4, Loss: 0.5121152084565376
Epoch 4, Loss: 0.5174025039698523
Epoch 4, Loss: 0.5165451970926609
Epoch 4, Loss: 0.492864839027766
Epoch 4, Loss: 0.49229168167738446
Epoch 5, Loss: 0.5118114591292822
Epoch 5, Loss: 0.49765359647273977
Epoch 5, Loss: 0.48704659578233916
Epoch 5, Loss: 0.48851970190790006
Epoch 5, Loss: 0.5099377688605778
Epoch 5, Lo

## Neural Network testing

Generating scores

In [13]:
metric = R2Score()
test_loss = 0.0
all_predictions = []
all_true_values = []

model.eval()
with torch.no_grad():
    for input, target in test_dataloader:
        prediction = model(input)
        all_predictions.append(prediction)
        all_true_values.append(target)

all_predictions = torch.cat(all_predictions)
all_true_values = torch.cat(all_true_values)

test_loss += criterion(all_predictions, all_true_values).item()
metric.update(all_predictions, all_true_values)

test_loss /= len(test_dataloader)
r2_score = metric.compute()

print(f'Test RMSE: {test_loss}')
print(f'Test R^2 score: {r2_score}')

  target = torch.Tensor(self.targets.iloc[idx])


Test RMSE: 0.024986613541841507
Test R^2 score: 0.4584353566169739


True values (test set)

In [14]:
all_true_values

tensor([[-0.1500, -0.1500],
        [-0.3000, -0.1000],
        [ 0.2000,  0.3500],
        [ 0.2250,  0.4500],
        [-0.1750, -0.2000],
        [-0.5250, -0.3000],
        [-0.2500, -0.7750],
        [ 0.3000,  0.3000],
        [-0.1750, -0.4000],
        [ 0.4500,  0.1500],
        [ 0.1750,  0.0250],
        [-0.1750, -0.0250],
        [-0.0500, -0.3000],
        [ 0.1250,  0.3000],
        [-0.0750, -0.1500],
        [-0.2000, -0.2750],
        [-0.6000, -0.2250],
        [ 0.1500, -0.2000],
        [ 0.2750,  0.6000],
        [-0.1500, -0.4500],
        [-0.2250, -0.6250],
        [-0.0250, -0.4500],
        [-0.5250, -0.1250],
        [ 0.0000,  0.3250],
        [ 0.1250,  0.3750],
        [ 0.1500, -0.2500],
        [ 0.4500,  0.3250],
        [ 0.2500,  0.2250],
        [-0.1000,  0.0750],
        [ 0.4250,  0.1250],
        [-0.4500, -0.3500],
        [-0.0500,  0.3750],
        [-0.4750, -0.2000],
        [-0.2750, -0.4000],
        [-0.4000, -0.2250],
        [ 0.1000, -0

Predicted values

In [15]:
all_predictions

tensor([[-0.0779, -0.1199],
        [-0.2241, -0.2677],
        [ 0.1651,  0.1541],
        [ 0.1176,  0.0682],
        [-0.1214, -0.1660],
        [-0.1765, -0.2310],
        [-0.3832, -0.4670],
        [ 0.2032,  0.2060],
        [-0.2551, -0.3050],
        [ 0.1612,  0.1480],
        [ 0.1248,  0.0895],
        [-0.1541, -0.2079],
        [-0.2488, -0.3168],
        [ 0.1959,  0.1962],
        [-0.0191, -0.0697],
        [-0.0714, -0.1133],
        [-0.3936, -0.4578],
        [-0.1643, -0.2162],
        [ 0.1203,  0.0940],
        [-0.2539, -0.3244],
        [-0.3620, -0.4124],
        [-0.2455, -0.3001],
        [-0.3050, -0.3357],
        [ 0.0886,  0.0443],
        [ 0.1848,  0.1802],
        [-0.0577, -0.1109],
        [ 0.1774,  0.1761],
        [ 0.1994,  0.1924],
        [ 0.0717,  0.0279],
        [ 0.1528,  0.1338],
        [-0.3515, -0.4461],
        [-0.0694, -0.1154],
        [-0.1361, -0.1781],
        [-0.0498, -0.0943],
        [-0.1846, -0.2317],
        [-0.0161, -0

Using a CNN-RNN architecture from this research paper: https://www.semanticscholar.org/paper/DNN-Based-Music-Emotion-Recognition-from-Raw-Audio-Orjesek-Jarina/d709d2ce0019e532f6ae95dd9040121bb1b66581

In [2]:
class CNN_RNN(nn.Module):
    def __init__(self):
        super(CNN_RNN, self).__init__()
        
        # CNN layers
        self.cnn = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=8, kernel_size=3),
            nn.ReLU(),
            nn.BatchNorm2d(8)
        )
        
        # Time distributed fully connected layer
        self.fc = nn.Sequential(
            nn.Linear(in_features=8, out_features=8)
        )
        
        # Bidirectional GRU layer
        self.gru = nn.Sequential(
            nn.GRU(input_size=8, hidden_size=8, bidirectional=True),
            nn.Tanh()
        )
        
        # Time distributed maxout layer
        self.maxout = nn.MaxPool1d(kernel_size=2)
        
        # Output layers for valence and arousal
        self.valence_output = nn.Linear(in_features=8, out_features=1)
        self.arousal_output = nn.Linear(in_features=8, out_features=1)
        
    def forward(self, x):
        x = self.cnn(x)
        x = x.view(x.size(0), x.size(1), -1)  # Flatten for RNN
        x = self.fc(x)
        x, _ = self.gru(x)
        x = self.maxout(x)
        
        valence_out = self.valence_output(x)
        arousal_out = self.arousal_output(x)
        
        return valence_out, arousal_out

NameError: name 'nn' is not defined