In [1]:
import IPython.display as ipd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as skl
import sklearn.utils, sklearn.preprocessing, sklearn.decomposition, sklearn.svm
import librosa
import librosa.display
import os


import torchvision.transforms.v2 as v2
import torch
from torchvision.transforms import Compose, ToTensor
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import random
import utils
import utils_mgr
from utils_mgr import getAudio
import warnings

# Default figure size (width, height) applied to every matplotlib figure created in this notebook.
plt.rcParams['figure.figsize'] = (17, 5)

2024-01-10 17:19:51.390429: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Seed every random number generator used in this notebook (Python's `random`,
# PyTorch and NumPy, in that order) so that repeated runs draw the same numbers.
seed = 0
for seed_rng in (random.seed, torch.manual_seed, np.random.seed):
    seed_rng(seed)

In [3]:
# Load the FMA track metadata table through the project's `utils.load` helper.
tracks = utils.load('Data/fma_metadata/tracks.csv')

# Sanity check: dimensions of the metadata table as (rows, columns).
tracks.shape 

(106574, 52)

In [4]:
# Choose which subset of the full dataset to work with and keep only its tracks.
# NOTE(review): the `<=` comparison presumably relies on the ('set', 'subset')
# column being an ordered categorical prepared by `utils.load` — confirm there.
sub = 'small'
in_subset = tracks['set', 'subset'] <= sub
raw_subset = tracks.loc[in_subset]

In [5]:
# Classification targets: the top-level genre of every track in the subset,
# materialised as a NumPy array.
labels = np.array(raw_subset['track']['genre_top'])

In [6]:
# Build the cleaned metadata table from which the training, validation and test sets are derived.
# `create_subset` lives in utils_mgr (not shown here); later cells read the
# 'index', 'split' and 'labels' columns of its result.

meta_subset= utils_mgr.create_subset(raw_subset)

In [7]:
# Track ids to exclude because their audio is corrupted.
corrupted = [98565, 98567, 98569, 99134, 108925, 133297]

# Drop all corrupted tracks in a single pass and store the result back in
# `meta_subset`, the name the following cells read. The previous loop assigned
# to a misspelled variable and restarted from the unfiltered table on every
# iteration, so no track was ever removed from `meta_subset`.
# Re-running this cell is harmless: filtering an already-clean table is a no-op.
meta_subset = meta_subset[~meta_subset['index'].isin(corrupted)]

# Guard against regressions: none of the corrupted ids may remain.
assert not meta_subset['index'].isin(corrupted).any()

# Preview the first rows of the cleaned table.
meta_subset.head(10)


In [8]:
# Split into training, validation and test sets by filtering on the 'split'
# column (the original FMA split), one frame per split name.
train_set, val_set, test_set = (
    meta_subset[meta_subset["split"] == split_name]
    for split_name in ("training", "validation", "test")
)

In [9]:
# Inspect one label entry; per the output below it is a float32 vector with a single 1.
# NOTE(review): `[1]` is resolved against the Series index — presumably a label
# lookup rather than a position; confirm against how `meta_subset` is indexed.
train_set['labels'][1]

array([0., 0., 0., 1., 0., 0., 0., 0.], dtype=float32)

In [22]:
# Scratch cell: run the audio -> mel-spectrogram steps on a single track.
audio, sr = getAudio(5)

# Cut a random window of 2**17 samples out of the track.
start = np.random.randint(0, audio.shape[0] - 2**17)
audio = audio[start:][:2**17]

# Magnitude STFT, squared to power and projected onto 513 mel bands.
stft = np.abs(librosa.stft(audio, hop_length=1024, n_fft=2048))
mel = librosa.feature.melspectrogram(S=np.square(stft), sr=22050, n_mels=513)

# Convert power to dB, transpose, and keep the first 128 rows
# (this drops the last time bin, giving the (128, 513) shape shown below).
mel = librosa.power_to_db(mel).T[:128]

mel.shape




(128, 513)

In [11]:
class DataAudio(Dataset):
    """Dataset that serves one random fixed-length clip per track.

    Each item is a pair ``(x, y)``. ``x`` is either the raw waveform clip or,
    when ``type == "2D"``, its dB-scaled mel spectrogram; ``y`` is the matching
    entry of ``df['labels']``, returned unchanged.

    Args:
        df: table with an ``'index'`` column (track ids handed to the audio
            loader) and a ``'labels'`` column (targets).
        transform: optional callable applied to ``x`` before it is returned.
        type: ``"2D"`` selects the mel spectrogram; any other value selects
            the waveform clip. The name shadows the builtin but is kept so
            existing callers keep working.
        loader: optional callable ``track_id -> (audio, sr)``. When ``None``
            (the default) the module-level ``getAudio`` is used.
    """

    # Number of samples in every clip.
    CLIP_LEN = 2**17
    # STFT window size and hop, in samples.
    N_FFT = 2048
    HOP_LENGTH = 1024
    # Number of mel bands and of time frames kept in the spectrogram.
    N_MELS = 513
    N_FRAMES = 128
    # Sample rate used to build the mel filterbank.
    # NOTE(review): the rate returned by the loader is ignored here — confirm
    # that the loader always yields audio at this rate.
    MEL_SR = 22050

    def __init__(self, df, transform = None, type = "1D", loader = None):

        # Track ids, one per dataset item
        self.track_ids = df['index'].values

        # Genre labels, aligned with track_ids
        self.label = df['labels'].values

        # Optional transform applied to each input
        self.transform = transform

        # Kind of input to produce ("2D" -> mel spectrogram, otherwise waveform)
        self.type = type

        # Optional replacement for getAudio
        self.loader = loader

    def __len__(self):
        """Return the number of tracks in the dataset."""
        return len(self.track_ids)

    def _random_clip(self, audio):
        """Return a CLIP_LEN-sample window of `audio`, zero-padding short input."""
        surplus = audio.shape[0] - self.CLIP_LEN

        if surplus > 0:
            # Same draw as before: start is uniform over [0, surplus).
            start = np.random.randint(0, surplus)
            return audio[start:start + self.CLIP_LEN]

        if surplus == 0:
            # Exactly one clip long: randint(0, 0) would raise, so return as-is.
            return audio

        # Shorter than one clip: pad the end of the first axis with zeros
        # instead of crashing in randint on a negative upper bound.
        pad_width = [(0, -surplus)] + [(0, 0)] * (audio.ndim - 1)
        return np.pad(audio, pad_width)

    def _mel_spectrogram(self, audio):
        """Return the dB-scaled mel spectrogram of `audio`, transposed."""
        stft = np.abs(librosa.stft(audio, n_fft=self.N_FFT, hop_length=self.HOP_LENGTH))
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            mel = librosa.feature.melspectrogram(
                sr=self.MEL_SR, S=stft**2, n_mels=self.N_MELS
            )[:, :self.N_FRAMES]
            mel = librosa.power_to_db(mel).T
        return mel

    def create_input(self, i):
        """Load track `i` and return its random clip, or that clip's mel spectrogram.

        Args:
            i: position of the track in the dataset.

        Returns:
            The waveform clip, or the mel spectrogram when ``self.type == "2D"``.
        """
        # Resolve the loader at call time so the module-level getAudio is only
        # needed when no custom loader was supplied.
        load = self.loader if self.loader is not None else getAudio

        # Load the audio track, silencing decoder warnings
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            audio, sr = load(self.track_ids[i])

        audio = self._random_clip(audio)

        if self.type == "2D":
            return self._mel_spectrogram(audio)

        return audio

    def __getitem__(self, idx):
        """Return ``(x, y)`` for item `idx`, applying `transform` to ``x`` if set."""
        x = self.create_input(idx)
        y = self.label[idx]

        # Explicit None check: a transform object must not be skipped just
        # because it happens to evaluate as falsy.
        if self.transform is not None:
            x = self.transform(x)

        return x,y


    

In [12]:
# Preprocessing / augmentation steps applied, in order, to every input.
# NOTE(review): torchvision documents v2.ToTensor as deprecated in favour of
# v2.ToImage followed by v2.ToDtype — worth migrating once verified.
_pipeline_steps = [
    # array -> tensor
    v2.ToTensor(),
    # random crop, resized back to 128 x 513
    v2.RandomResizedCrop(size=(128, 513), antialias=True),
    # mirror along the last axis half of the time
    v2.RandomHorizontalFlip(p=0.5),
    # make sure the tensor is float32
    v2.ToDtype(torch.float32, scale=True),
    # single-channel normalisation; presumably dataset statistics — confirm
    v2.Normalize(mean=[1.0784853], std=[4.0071154]),
]
transforms = v2.Compose(_pipeline_steps)



In [13]:
# Build the test dataset from the class defined in this notebook. The name
# `MelDataset` is not defined or imported anywhere here, so it raised NameError
# on a fresh kernel. type="2D" selects the mel-spectrogram input, matching the
# (1, 128, 513) per-item shape printed further down.
test_dataset = DataAudio(test_set, transform=transforms, type="2D")

In [14]:
# Batch the test dataset, 64 items at a time, in shuffled order.
# os.cpu_count() returns None when the core count cannot be determined, which
# DataLoader rejects; fall back to 0 (load in the main process) in that case.
test_dataloader = DataLoader(
    test_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=os.cpu_count() or 0,
)

In [15]:
# Fetch only the first batch and print the shape of its inputs.
# Nothing is printed if the loader yields no batches.
first_batch = next(iter(test_dataloader), None)
if first_batch is not None:
    x, y = first_batch
    print(x.shape)



torch.Size([64, 1, 128, 513])


In [16]:
# Print the installed librosa version so the environment can be reproduced.
# NOTE(review): librosa is already imported in the first cell, so this repeated
# import is redundant.
import librosa 


print(librosa.__version__)

0.10.1
