# DEAM Dataset - CNN with Inception-GRU Residual Structure
## Essentia Best Overall Featureset

### Import relevant libraries

In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torchaudio
# from torchaudio.models.wav2vec2 import Wav2Vec2Model
from transformers import Wav2Vec2Model
from torcheval.metrics import R2Score

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import math

import sys
sys.path.insert(1, '../../utils')
from paths import *

### Import the annotations dataset

In [None]:
# Load the DEAM static annotations CSV and drop the song_id column so that
# only the remaining annotation columns are kept as targets.
annotations_csv = get_deam_path('processed/annotations/deam_static_annotations.csv')
targets = pd.read_csv(annotations_csv).drop(columns='song_id')
targets

### Prepare the song_id of all .mp3 files

In [None]:
audio_path = get_deam_path('MEMD_audio')

# Collect the numeric id of every .mp3 file directly inside the audio
# directory. Filtering on the extension keeps stray entries (hidden files,
# other formats, sub-directories) from reaching int(); an .mp3 whose stem is
# not an integer still raises ValueError so that a malformed name is noticed.
song_ids_temp = sorted(
    int(os.path.splitext(filename)[0])
    for filename in os.listdir(audio_path)
    if filename.lower().endswith('.mp3')
    and os.path.isfile(os.path.join(audio_path, filename))
)

# remove all song_ids from 2015 (song_id 2001 onwards)
song_ids = [song_id for song_id in song_ids_temp if song_id <= 2000]

print(song_ids)

### Perform 80-20 train-test split and create tensors

In [None]:
# Hold out 20% of the songs for testing; the fixed random_state makes the
# split repeatable.
# NOTE(review): the split pairs song_ids[i] with row i of targets — confirm
# that both have the same length and ordering.
X_train, X_test, y_train, y_test = train_test_split(
    song_ids,
    targets,
    test_size=0.2,
    random_state=42,
)

# Label tensors in float64; each one is also reachable through its
# target_*_labels alias (same tensor object, not a copy).
target_train_labels = y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float64)
target_test_labels = y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.float64)

### Define the dataset class

In [None]:
class DEAMDataset(Dataset):
    """Dataset of DEAM .mp3 waveforms, zero-padded to a common length.

    Each item is the waveform tensor returned by ``torchaudio.load`` for one
    song, right-padded with zeros along its last dimension up to the length
    of the longest waveform found when the dataset was constructed.

    Args:
        song_ids: Sequence of song ids. Each id is resolved to
            ``<get_deam_path("MEMD_audio")>/<song_id>.mp3``.

    Attributes set by ``__init__``:
        song_ids: the ids passed in.
        mp3s: the file path of each song, in the same order as ``song_ids``.
        waveform_lengths: size of the last dimension of each decoded waveform.
        max_waveform_length: the largest entry of ``waveform_lengths``
            (0 when there are no songs).
    """

    def __init__(self, song_ids):
        self.song_ids = song_ids
        self.mp3s = [f'{get_deam_path("MEMD_audio")}/{song_id}.mp3' for song_id in self.song_ids]

        # Only the lengths are needed here, so every decoded waveform is
        # released as soon as it has been measured rather than keeping the
        # whole decoded dataset alive in a list.
        self.waveform_lengths = []
        for mp3 in self.mp3s:
            print(f'Sampling {mp3}')
            waveform, _ = torchaudio.load(mp3, format='mp3')
            self.waveform_lengths.append(waveform.shape[-1])

        # default=0 keeps an empty dataset constructible instead of raising
        # ValueError from max() on an empty list.
        self.max_waveform_length = max(self.waveform_lengths, default=0)

    def __len__(self):
        """Return the number of songs in the dataset."""
        return len(self.song_ids)

    def __getitem__(self, idx):
        """Decode song ``idx`` and return its waveform padded to the common length.

        The file is decoded again on every access; nothing is cached.
        """
        mp3_file = self.mp3s[idx]
        waveform, _ = torchaudio.load(mp3_file, format='mp3')
        # Right-pad the last dimension with zeros so that items can be stacked.
        # NOTE(review): stacking also needs every file to have the same number
        # of channels — confirm for this dataset.
        missing = self.max_waveform_length - waveform.size(-1)
        return F.pad(waveform, (0, missing), mode='constant')

### Define the Neural Network

In [None]:
class InceptionBlock(nn.Module):
    """Four parallel 1-D branches whose outputs are concatenated on the channel axis.

    Every branch produces ``out_channels // 4`` channels and preserves the
    input length (stride 1, padding matched to each kernel size), so the
    block outputs ``4 * (out_channels // 4)`` channels.

    Args:
        in_channels: number of channels of the input.
        out_channels: total number of output channels, split evenly over the
            four branches.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        branch_width = out_channels // 4

        # Branch 1: pointwise (kernel size 1) convolution.
        self.branch1 = nn.Conv1d(in_channels, branch_width, kernel_size=1)

        # Branch 2: pointwise convolution followed by a kernel-size-3 convolution.
        self.branch2 = nn.Sequential(
            nn.Conv1d(in_channels, branch_width, kernel_size=1),
            nn.Conv1d(branch_width, branch_width, kernel_size=3, padding=1),
        )

        # Branch 3: pointwise convolution followed by a kernel-size-5 convolution.
        self.branch3 = nn.Sequential(
            nn.Conv1d(in_channels, branch_width, kernel_size=1),
            nn.Conv1d(branch_width, branch_width, kernel_size=5, padding=2),
        )

        # Branch 4: max pooling, then a pointwise convolution to set the width.
        self.branch4 = nn.MaxPool1d(kernel_size=3, stride=1, padding=1)
        self.branch4_conv = nn.Conv1d(in_channels, branch_width, kernel_size=1)

    def forward(self, x):
        """Apply all four branches to ``x`` and concatenate them along dim 1."""
        branch_outputs = (
            self.branch1(x),
            self.branch2(x),
            self.branch3(x),
            self.branch4_conv(self.branch4(x)),
        )
        return torch.cat(branch_outputs, dim=1)

class MusicEmotionRegressionModel(nn.Module):
    """Conv1d stem, two Inception blocks, a bidirectional GRU and a linear head.

    Args:
        input_size: number of channels of the input given to the first Conv1d.
        hidden_size: GRU hidden size per direction.
        num_layers: number of stacked GRU layers.
        output_size: number of values predicted per example.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.conv1 = nn.Conv1d(input_size, 64, kernel_size=3, padding=1)
        self.inception1 = InceptionBlock(64, 128)
        self.inception2 = InceptionBlock(128, 256)
        self.gru = nn.GRU(
            input_size=256,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )
        # The head consumes the forward and backward final states side by side.
        self.fc = nn.Linear(2 * hidden_size, output_size)

    def forward(self, x):
        """Map a batch of inputs to ``output_size`` predictions per example."""
        features = self.inception2(self.inception1(self.conv1(x)))

        # Conv layers emit (batch, channels, length); the batch_first GRU
        # expects (batch, length, channels).
        sequence = features.permute(0, 2, 1)
        _, final_states = self.gru(sequence)

        # The last two entries of the final hidden state belong to the top
        # GRU layer (forward direction, then backward direction).
        summary = torch.cat((final_states[-2], final_states[-1]), dim=1)
        return self.fc(summary)

### Define the model parameters

In [None]:
input_size = 2  # Input channels of the model's first Conv1d (presumably stereo waveforms — TODO confirm)
hidden_size = 128  # GRU hidden size per direction
num_layers = 2  # Number of stacked GRU layers
output_size = 2  # Valence and arousal regression targets
learning_rate = 0.001  # Learning rate passed to the Adam optimiser
criterion = nn.MSELoss()  # train_model takes the square root of this, i.e. it optimises RMSE

In [2]:
num_epochs = 10  # Number of full passes over the training dataloader

### Set a random seed to ensure reproducibility

In [None]:
# Seed PyTorch's global random number generator so that repeated runs of the
# cells below start from the same random state.
seed = 42
torch.manual_seed(seed)

### Instantiate the dataset and dataloader

In [None]:
# Build the training dataset from the training song ids. Constructing
# DEAMDataset decodes every file once to find the longest waveform, so this
# cell can take a while.
dataset = DEAMDataset(X_train)
# Batches of 32 padded waveforms, reshuffled on every pass.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

### Training loop

In [None]:
class _LabelledWaveforms(Dataset):
    """Pair each item of a waveform dataset with the label row at the same index."""

    def __init__(self, waveforms, labels):
        if len(waveforms) != len(labels):
            raise ValueError(
                f'Got {len(waveforms)} waveforms but {len(labels)} label rows'
            )
        self.waveforms = waveforms
        self.labels = labels

    def __len__(self):
        return len(self.waveforms)

    def __getitem__(self, idx):
        return self.waveforms[idx], self.labels[idx]


def train_model(num_epochs):
    """Train a fresh MusicEmotionRegressionModel and return it.

    The model is built from the module-level hyper-parameters and optimised
    with Adam on the RMSE (square root of ``criterion``) between its
    predictions and the matching rows of ``target_train_labels``.

    Args:
        num_epochs: number of passes over the training data.

    Returns:
        The trained model.

    Raises:
        ValueError: if the training dataset and ``target_train_labels`` do not
            have the same number of entries.
    """
    model = MusicEmotionRegressionModel(input_size, hidden_size, num_layers, output_size)
    optimiser = optim.Adam(model.parameters(), lr=learning_rate)

    # The module-level dataloader yields shuffled waveforms only, so a batch
    # cannot be matched to its labels afterwards. Re-wrap its dataset so that
    # every waveform travels together with the label row at the same index,
    # and each loss is computed against the labels of that batch only.
    labelled_loader = DataLoader(
        _LabelledWaveforms(dataloader.dataset, target_train_labels),
        batch_size=dataloader.batch_size,
        shuffle=True,
    )

    for epoch in range(num_epochs):
        rmse_sum = 0.0
        num_batches = 0

        for waveforms, batch_targets in labelled_loader:
            optimiser.zero_grad()

            # forward pass
            output = model(waveforms)

            # RMSE between this batch's predictions and this batch's labels
            loss = torch.sqrt(criterion(output.float(), batch_targets.float()))

            # backward pass
            loss.backward()
            # update weights
            optimiser.step()

            rmse_sum += loss.item()
            num_batches += 1

        # Mean batch RMSE of the epoch. The loss is already a root, so no
        # further square root is applied; max() guards an empty loader.
        print(f'Epoch {epoch + 1}, Loss: {rmse_sum / max(num_batches, 1)}')

    print("Training completed.")
    return model

In [12]:
# Run the training loop for num_epochs epochs and keep the returned model.
model = train_model(num_epochs)