In [5]:
import pandas as pd

# Folder holding the dataset, and the CSV inside it that tags every recording.
download_path = "../vocal_patterns/data"
metadata_file = f"{download_path}/dataset_tags.csv"

# Read the tag table into a DataFrame and preview its first rows.
df = pd.read_csv(metadata_file)
df.head()


Unnamed: 0,path,exercise,technique,filename
0,/Users/jake/code/jchaselubitz/vocal_patterns/v...,Other,vibrato,m6_row_vibrato.wav
1,/Users/jake/code/jchaselubitz/vocal_patterns/v...,Other,vibrato,m6_caro_vibrato.wav
2,/Users/jake/code/jchaselubitz/vocal_patterns/v...,Other,vibrato,m6_dona_vibrato.wav
3,/Users/jake/code/jchaselubitz/vocal_patterns/v...,Other,straight,m6_caro_straight.wav
4,/Users/jake/code/jchaselubitz/vocal_patterns/v...,Other,straight,m6_row_straight.wav


In [42]:
import librosa
from librosa.feature import melspectrogram
import numpy as np
from IPython.display import Audio

class AudioUtilLibrosa():
  """Stateless helpers that turn an audio file into a fixed-length mel spectrogram.

  Every method is a ``@staticmethod``. Audio travels between them as an
  ``aud = (sig, sr)`` tuple where ``sig`` is a NumPy array, either 1-D
  ``(samples,)`` or 2-D ``(channels, samples)``, and ``sr`` is the sample
  rate in Hz.
  """

  # ----------------------------
  # Load an audio file. Return the signal as a numpy array and the sample rate
  # ----------------------------
  @staticmethod
  def open(audio_file, sr=22050):
    """Load ``audio_file`` with librosa, resampling to ``sr`` Hz.

    Args:
      audio_file: Path (or anything ``librosa.load`` accepts) of the recording.
      sr: Target sample rate passed to ``librosa.load``. Defaults to 22050.

    Returns:
      ``(sig, sr)`` as produced by ``librosa.load``.
    """
    # mono=False asks librosa NOT to down-mix, so the file's own channel
    # layout is kept; use rechannel() to force a specific channel count.
    sig, sr = librosa.load(audio_file, sr=sr, mono=False)
    return (sig, sr)

  # ----------------------------
  # Convert the given audio to the desired number of channels
  # ----------------------------
  @staticmethod
  def rechannel(aud, new_channel):
    """Convert ``aud`` to ``new_channel`` channels.

    Args:
      aud: ``(sig, sr)`` tuple; ``sig`` is 1-D (mono) or ``(channels, samples)``.
      new_channel: Desired number of channels.

    Returns:
      ``aud`` itself when the channel count already matches. Otherwise a new
      ``(sig, sr)`` tuple:
        * ``new_channel == 1``: channels are averaged, result shape ``(1, samples)``.
        * mono input, ``new_channel > 1``: the single channel is copied
          ``new_channel`` times, result shape ``(new_channel, samples)``.
        * multi-channel input to a different multi-channel count: the signal
          is returned unchanged (no up/down-mix rule is defined for that case).
    """
    sig, sr = aud

    # A 1-D array is a single channel; otherwise channels are along axis 0.
    num_channels = 1 if sig.ndim == 1 else sig.shape[0]

    if num_channels == new_channel:
      # Nothing to do
      return aud

    if new_channel == 1:
      # Down-mix by averaging across channels (keeps a leading channel axis)
      resig = np.mean(sig, axis=0, keepdims=True)
    elif num_channels == 1:
      # Up-mix mono by duplicating its only channel new_channel times
      resig = np.repeat(np.atleast_2d(sig), new_channel, axis=0)
    else:
      resig = sig

    return (resig, sr)

  # ----------------------------
  # Pad (or truncate) the signal to a fixed length 'max_ms' in milliseconds
  # ----------------------------
  @staticmethod
  def pad_trunc(aud, max_ms):
    """Force the signal to last exactly ``max_ms`` milliseconds.

    Longer signals are cut at the end. Shorter signals are zero-padded, with
    the total padding split at a random point between the start and the end
    (drawn from NumPy's global RNG, so seed ``np.random`` for reproducibility).

    Args:
      aud: ``(sig, sr)`` tuple; samples must be along the last axis of ``sig``.
      max_ms: Target duration in milliseconds.

    Returns:
      ``(sig, sr)`` where ``sig`` has ``sr * max_ms // 1000`` samples along its
      last axis and the same number of dimensions and dtype as the input.
    """
    sig, sr = aud
    sig_len = sig.shape[-1]

    # Multiply before dividing: `sr // 1000 * max_ms` would floor 22050 Hz to
    # 22 samples/ms and come up 150 samples short on a 3 s clip.
    max_len = int(sr * max_ms // 1000)

    if sig_len > max_len:
      # Truncate along the sample axis, whatever the number of dimensions
      sig = sig[..., :max_len]

    elif sig_len < max_len:
      pad_total = max_len - sig_len
      # randint's upper bound is exclusive; +1 lets the signal land flush
      # with the end of the clip as well as the beginning.
      pad_begin_len = np.random.randint(0, pad_total + 1)
      pad_end_len = pad_total - pad_begin_len

      # Pad only the last (sample) axis with zeros; np.pad keeps sig's dtype
      # and works for both (samples,) and (channels, samples) layouts.
      pad_width = [(0, 0)] * (sig.ndim - 1) + [(pad_begin_len, pad_end_len)]
      sig = np.pad(sig, pad_width, mode="constant")

    return (sig, sr)

  # ----------------------------
  # Generate a mel spectrogram
  # ----------------------------
  @staticmethod
  def spectro_gram(aud, n_mels=64, n_fft=1024, hop_len=None, to_db=False, top_db=80):
    """Compute a mel spectrogram of ``aud``.

    Args:
      aud: ``(sig, sr)`` tuple.
      n_mels: Number of mel bands.
      n_fft: FFT window size.
      hop_len: Hop length forwarded to librosa as ``hop_length``; ``None``
        leaves the choice to librosa.
      to_db: When true, convert the power spectrogram to decibels with
        ``librosa.power_to_db``. Off by default, so the raw power values are
        returned.
      top_db: Dynamic-range threshold handed to ``librosa.power_to_db``; only
        used when ``to_db`` is true.

    Returns:
      The spectrogram array. For a ``(channels, samples)`` input the shape is
      ``[channel, n_mels, time]``.
    """
    sig, sr = aud

    # spec has shape [channel, n_mels, time], where channel is mono, stereo etc
    spec = melspectrogram(y=sig, sr=sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)

    if to_db:
      # Convert to decibels
      spec = librosa.power_to_db(spec, top_db=top_db)

    return spec


# Example usage: push the first recording listed in the metadata through the
# whole pipeline (load -> 2 channels -> fixed 3000 ms length -> mel spectrogram).
audio_path = df['path'].iloc[0]
audio_file = Audio(audio_path)

audio_data = AudioUtilLibrosa.open(audio_path)
stereo_audio = AudioUtilLibrosa.rechannel(audio_data, new_channel=2)
truncate_audio = AudioUtilLibrosa.pad_trunc(stereo_audio, max_ms=3000)
spectrogram = AudioUtilLibrosa.spectro_gram(truncate_audio)

In [44]:
# Echo the `spectrogram` array so the notebook displays its values.
# NOTE(review): a bare echo prints a long value dump; `spectrogram.shape` may be a more useful sanity check.
spectrogram

array([[[7.2527339e-04, 1.0437759e-03, 8.2105841e-04, ...,
         2.9717278e-04, 3.6787073e-04, 5.3571846e-04],
        [4.0481711e-05, 4.5241413e-05, 7.7480501e-05, ...,
         1.7673824e-02, 1.9115394e-02, 2.0751132e-02],
        [1.9886307e-05, 2.4756480e-05, 1.2733021e-05, ...,
         3.3418998e-02, 3.4498762e-02, 3.0974934e-02],
        ...,
        [1.3768407e-08, 2.7684589e-08, 1.9972399e-08, ...,
         6.6443326e-06, 5.0011495e-06, 3.8063206e-06],
        [7.7784952e-09, 2.1309575e-08, 2.9308078e-08, ...,
         6.1979767e-07, 6.9433014e-07, 4.8507110e-07],
        [6.4210677e-09, 1.3846731e-08, 1.6351084e-08, ...,
         5.0427047e-08, 2.4790195e-08, 1.0913085e-07]],

       [[7.2527339e-04, 1.0437759e-03, 8.2105841e-04, ...,
         2.9717278e-04, 3.6787073e-04, 5.3571846e-04],
        [4.0481711e-05, 4.5241413e-05, 7.7480501e-05, ...,
         1.7673824e-02, 1.9115394e-02, 2.0751132e-02],
        [1.9886307e-05, 2.4756480e-05, 1.2733021e-05, ...,
         3.341