## <center> Imports </center>

In [1]:
import torch
from torch.utils.data import Dataset

import torchaudio
import torchaudio.transforms

import sys, os

from pprint import pprint

from tqdm import tqdm

import json

import numpy as np

import matplotlib.pylab as plt
import seaborn as sns

import librosa
import librosa.display

import pandas as pd

from pathlib import Path

import gc



## <center> Data pre-processing </center>

In [2]:
def label_from_str_to_one_hot(label_str: str):
  """Map a genre name to its one-hot class vector.

  The class order is Pop, Hip-Hop, Electronic, Rock, Folk, Jazz. A fresh
  integer tensor of length 6 is built on every call, so callers may mutate
  the result freely.

  Args:
    label_str: genre name, matched exactly (case-sensitive).

  Returns:
    A 1-D ``torch.Tensor`` with a single 1 at the genre's position, or
    ``None`` when ``label_str`` is not one of the six known genres.
  """
  genre_order = ("Pop", "Hip-Hop", "Electronic", "Rock", "Folk", "Jazz")

  if label_str not in genre_order:
    # Unknown genres produce no vector; callers receive None.
    return None

  one_hot = torch.zeros(len(genre_order), dtype=torch.int64)
  one_hot[genre_order.index(label_str)] = 1
  return one_hot

In [3]:
def load_raw_audio_data(
  dataset_path, normalize_audio: bool, audio_num_frames: int
):
  """Load every audio file found under ``dataset_path`` into a DataFrame.

  The directory tree is walked recursively. The name of the folder that
  directly contains a file is used as that file's genre label.

  Args:
    dataset_path: root folder of the dataset.
    normalize_audio: forwarded to ``torchaudio.load`` as ``normalize``.
    audio_num_frames: forwarded to ``torchaudio.load`` as ``num_frames``.

  Returns:
    A ``pd.DataFrame`` with one row per successfully loaded file and the
    columns ``waveform``, ``og_sample_rate``, ``label_one_hot``, ``label``,
    ``path`` and ``hop_length`` (set to ``-1`` for every row). The columns
    are present even when no file could be loaded. Files that fail to load
    are reported on stdout and skipped.
  """
  columns = [
    "waveform", "og_sample_rate", "label_one_hot", "label", "path",
    "hop_length",
  ]

  audio_tensor_list = []

  num_audio_files_unable_to_open = 0

  # TODO use glob instead of this
  for path, _subdirs, files in os.walk(dataset_path):
    for name in tqdm(files, colour="red"):

      file_audio_path = os.path.join(path, name)

      try:
        waveform, sample_rate = torchaudio.load(
          file_audio_path, normalize=normalize_audio,
          num_frames=audio_num_frames
        )

        # Parent folder name, resolved with os.path so it does not depend on
        # "/" being the separator or on a trailing slash in dataset_path.
        label = os.path.basename(os.path.dirname(file_audio_path))
        label_one_hot = label_from_str_to_one_hot(label)

        audio_tensor_list.append(
          {
            "waveform": waveform,
            "og_sample_rate": sample_rate,
            "label_one_hot": label_one_hot,
            "label": label,
            "path": file_audio_path,
            "hop_length": -1
          }
        )

      except Exception as error:
        # Best effort: one unreadable file must not abort the whole load.
        # Catching Exception (not a bare except) keeps Ctrl-C working.
        print(
          f"[load_audio_data] error while loading {file_audio_path}: {error!r}"
        )
        num_audio_files_unable_to_open += 1

  if num_audio_files_unable_to_open:
    print(
      f"[load_audio_data] unable to open "
      f"{num_audio_files_unable_to_open} audio file(s)"
    )

  return pd.DataFrame(audio_tensor_list, columns=columns)
        
        

In [4]:
# Root folder that holds the pre-processed audio datasets.
DATASET_FOLDER = "../data/audio"

# Values embedded in the dataset folder name below.
DATASET_SIZE = "large"
SAMPLE_FREQ = 8000
NUM_CHANNELS = 1
HOP_LENGTH = 16

DATASET_NAME = (
  f"fma_{DATASET_SIZE}"
  f"_resampled_{SAMPLE_FREQ}"
  f"_rechanneled_{NUM_CHANNELS}"
  f"_hopped_{HOP_LENGTH}"
)

dataset_path = f"{DATASET_FOLDER}/{DATASET_NAME}"

# Passed to load_raw_audio_data as `audio_num_frames`, which forwards it to
# torchaudio.load as `num_frames`.
AUDIO_NUM_FRAMES = 238000

In [6]:
# Read every clip under `dataset_path` into a DataFrame, one row per file.
og_audio_pd = load_raw_audio_data(
  dataset_path,
  normalize_audio=True,
  audio_num_frames=AUDIO_NUM_FRAMES,
)

In [6]:
# Shape of the first loaded waveform tensor.
og_audio_pd["waveform"].iloc[0].shape

torch.Size([1, 238000])

In [13]:
# "audio_hop" --> take one sample every hop_length elements
def resample_audio(og_audio_pd, hop_length: int):
  """Build a new DataFrame whose waveforms are shortened by ``hop_length``.

  Each row's ``waveform`` is passed through
  ``torchaudio.functional.resample`` with ``orig_freq=hop_length`` and
  ``new_freq=1``. The input DataFrame is not modified.

  NOTE(review): ``resample`` is presumably a band-limited resampler rather
  than a literal "keep every N-th sample" pick - confirm against the
  torchaudio documentation.

  Args:
    og_audio_pd: DataFrame with the columns ``waveform``, ``og_sample_rate``,
      ``label_one_hot`` (a tensor), ``label`` and ``path``.
    hop_length: factor handed to the resampler as ``orig_freq``.

  Returns:
    A ``pd.DataFrame`` with the columns ``waveform``, ``og_sample_rate``,
    ``hop_length``, ``label_one_hot`` (converted to a NumPy array),
    ``label`` and ``path``.
  """

  def _hop_row(audio):
    # Original author's note: per the Torch Audio docs, this orig_freq/new_freq
    # pairing performs "hopping" similarly to the Mel Spectrogram transform.
    shortened_waveform = torchaudio.functional.resample(
      audio["waveform"], orig_freq=hop_length, new_freq=1
    )
    return {
      "waveform": shortened_waveform,
      "og_sample_rate": audio["og_sample_rate"],
      "hop_length": hop_length,
      "label_one_hot": audio["label_one_hot"].numpy(),
      "label": audio["label"],
      "path": audio["path"],
    }

  rows = tqdm(og_audio_pd.iterrows(), colour="green")
  return pd.DataFrame([_hop_row(audio) for _, audio in rows])

In [14]:
# Reuse HOP_LENGTH from the configuration cell so the hop factor applied here
# cannot drift from the value encoded in DATASET_NAME.
hop_length = HOP_LENGTH

hopped_audio_pd = resample_audio(og_audio_pd, hop_length)

30it [00:00, 419.42it/s]


In [15]:
# Shape of the first waveform after hopping.
hopped_audio_pd["waveform"].iloc[0].shape

torch.Size([1, 14875])

In [10]:
# code taken from: https://stackoverflow.com/a/47626762

class NumpyEncoder(json.JSONEncoder):
  """JSON encoder that serializes NumPy arrays as (nested) lists."""

  def default(self, obj):
    """Return a JSON-serializable stand-in for ``obj``.

    ``np.ndarray`` values are converted with ``tolist()``. Anything else is
    delegated to ``json.JSONEncoder.default``.
    """
    if not isinstance(obj, np.ndarray):
      return super().default(obj)

    return obj.tolist()

def export_audio_data(audio_pd, audio_data_type):
  """Save one column of audio tensors to disk, one file per row.

  Each tensor is written with ``torchaudio.save`` to the row's own ``path``
  using the row's ``og_sample_rate``.

  NOTE(review): if ``path`` is the file the row was loaded from, this
  overwrites that file in place - confirm this is intended.

  Args:
    audio_pd: DataFrame with the columns ``path``, ``og_sample_rate`` and
      the column named by ``audio_data_type``.
    audio_data_type: name of the column holding the tensor to save.
  """
  rows = tqdm(audio_pd.iterrows(), colour="yellow")

  for _, row in rows:
    destination = row["path"]
    tensor = row[audio_data_type]
    torchaudio.save(destination, tensor, row["og_sample_rate"])
    
def export_mel_spectrogram(audio_pd, path, file_name):
  """Serialize ``audio_pd`` as JSON into the file ``<path>/<file_name>``.

  Args:
    audio_pd: DataFrame to export.
    path: destination folder.
    file_name: name of the JSON file inside ``path``.
  """
  target_file = f"{path}/{file_name}"
  audio_pd.to_json(target_file)
    
    
def export_audio_metadata(audio_pd, metadata_path, metadata_cols):
  """Write the selected columns of ``audio_pd`` to a JSON file.

  Args:
    audio_pd: DataFrame holding the audio records.
    metadata_path: path of the JSON file to write.
    metadata_cols: column labels to keep in the export.
  """
  audio_pd[metadata_cols].to_json(metadata_path)

In [16]:
# Write the hopped waveforms to disk; each row is saved at its own `path`.
export_audio_data(hopped_audio_pd, "waveform")

# export_audio_metadata(
#   og_audio_pd, 
#   f"./data/audio/fma_{DATASET_SIZE}_organized_by_label_resampled_rechanneled/metadata.json",
#   og_audio_pd.columns
# )

30it [00:00, 88.50it/s]
