In [5]:
import numpy as np      
import matplotlib.pyplot as plt 
import scipy.io.wavfile 
import subprocess
import librosa
import librosa.display
import IPython.display as ipd

from pathlib import Path, PurePath   
from tqdm.notebook import tqdm

## Utility functions

In [15]:
def convert_mp3_to_wav(audio: str) -> str:
    """Convert an input MP3 audio track into a WAV file.

    The WAV file is written next to the MP3, with the trailing ``mp3`` of the
    name replaced by ``wav``. If that file already exists the conversion is
    skipped. Inputs whose name does not end in ``mp3`` are returned unchanged.

    Args:
        audio (str): Path of an input audio track.

    Returns:
        str: The WAV filename for an MP3 input, otherwise ``audio`` itself.

    Raises:
        subprocess.CalledProcessError: If ``ffmpeg`` exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    if not audio.endswith("mp3"):
        return audio

    wav_audio = audio[:-3] + "wav"
    if not Path(wav_audio).exists():
        # Pass an argument list (no shell) so that paths containing spaces,
        # quotes or shell metacharacters reach ffmpeg verbatim instead of
        # being split or interpreted by the shell.
        subprocess.check_output(["ffmpeg", "-i", audio, wav_audio])
    return wav_audio

def plot_spectrogram_and_picks(track:np.ndarray, sr:int, peaks:np.ndarray, onset_env:np.ndarray, hop_length:"int | None"=None) -> None:
    """Plot the onset strength envelope with the selected peaks above the track's spectrogram.

    The two panels share their x axis, so the same ``sr`` and ``hop_length``
    are used for the frame-to-time conversion, the STFT and the spectrogram
    display; otherwise the panels would not line up.

    Args:
        track (np.ndarray): Audio time series.
        sr (int): Sampling rate of ``track``.
        peaks (np.ndarray): Indices into ``onset_env`` of the selected peaks.
        onset_env (np.ndarray): Vector containing the onset strength envelope.
        hop_length (int, optional): Hop length used to compute ``onset_env``.
            When omitted, the notebook-level ``HOP_SIZE`` setting is used.
    """
    if hop_length is None:
        # Backward-compatible fallback to the notebook-wide setting.
        hop_length = HOP_SIZE

    times = librosa.frames_to_time(np.arange(len(onset_env)),
                            sr=sr, hop_length=hop_length)

    plt.figure(figsize=(20, 5))

    # Bottom panel: log-frequency spectrogram of the track.
    ax = plt.subplot(2, 1, 2)
    D = librosa.stft(track, hop_length=hop_length)
    librosa.display.specshow(librosa.amplitude_to_db(np.abs(D), ref=np.max),
                            sr=sr, hop_length=hop_length,
                            y_axis='log', x_axis='time')

    # Top panel: onset envelope with one vertical line per selected peak.
    plt.subplot(2, 1, 1, sharex=ax)
    plt.plot(times, onset_env, alpha=0.8, label='Onset strength')
    plt.vlines(times[peaks], 0,
            onset_env.max(), color='r', alpha=0.8,
            label='Selected peaks')
    plt.legend(frameon=True, framealpha=0.8)
    plt.axis('tight')
    plt.tight_layout()
    plt.show()

def load_audio_picks(audio, duration, hop_size):
    """Load an audio track and pick the peaks of its onset strength envelope.

    Args:
        audio (string, int, pathlib.Path or file-like object): Audio source handed to ``librosa.load``.
        duration (int): Passed to ``librosa.load`` as ``duration`` to limit how much audio is loaded.
        hop_size (int): Hop length used when computing the onset strength envelope.

    Returns:
        tuple: Returns the audio time series (track) and sampling rate (sr), a vector containing the onset strength envelope
        (onset_env), and the indices of peaks in onset_env (peaks).

    Raises:
        Exception: Any error raised while loading or analysing the track is
            re-raised after a diagnostic naming the offending input is printed.
    """
    try:
        track, sr = librosa.load(audio, duration=duration)
        # Keyword arguments are used throughout: recent librosa releases make
        # these parameters keyword-only, and keywords also work on older ones.
        onset_env = librosa.onset.onset_strength(y=track, sr=sr, hop_length=hop_size)
        # NOTE(review): `wait` is kept at the original 0.5; librosa documents it
        # as an integer number of samples — TODO confirm the intended value.
        peaks = librosa.util.peak_pick(onset_env, pre_max=10, post_max=10,
                                       pre_avg=10, post_avg=10,
                                       delta=0.5, wait=0.5)
    except Exception as e:
        print('An error occurred processing ', str(audio))
        print(e)
        # Nothing meaningful can be returned, so surface the real error to the
        # caller instead of failing later on unbound local variables.
        raise

    return track, sr, onset_env, peaks
    
    

## Settings

In [7]:
# Expected number of MP3 tracks; only used as the progress-bar total of the conversion loop.
N_TRACKS = 1413
# Hop length passed to the onset-strength computation and the frame-to-time conversion.
HOP_SIZE = 512
# Passed to librosa.load as `duration`, i.e. how much of each track is loaded.
DURATION = 30 # TODO: to be tuned!
# Not referenced by any cell visible here yet.
THRESHOLD = 0 # TODO: to be tuned!

In [22]:
# Root of the dataset; audio files are matched exactly two directory levels below it.
data_folder = Path("data/mp3s-32k/")
# Path.glob returns lazy, single-use generators: the file system is only scanned
# when they are iterated, and once exhausted this cell must be re-run before
# they can be iterated again.
mp3_tracks = data_folder.glob("*/*/*.mp3")
tracks = data_folder.glob("*/*/*.wav")

## Preprocessing

In [11]:
# Convert every MP3 found under the data folder to WAV. This consumes the
# `mp3_tracks` generator; N_TRACKS only sets the length of the progress bar.
for track in tqdm(mp3_tracks, total=N_TRACKS):
    convert_mp3_to_wav(str(track))

  0%|          | 0/1413 [00:00<?, ?it/s]

## Audio signals

In [24]:
# Sanity check: load the first two WAV tracks yielded by the `tracks` generator
# and print the raw audio, sampling rate, onset envelope and picked peaks.
for idx, audio in enumerate(tracks):
    # Only the first two tracks are inspected.
    if idx >= 2:
        break
    print(audio)
    track, sr, onset_env, peaks = load_audio_picks(audio, DURATION, HOP_SIZE)
    print(f"track:\n{track}\n\nsr:\n{sr}\n\nonset_env:\n{onset_env}\n\npeaks:\n{peaks}")
    print()
    print()

    # Plotting is currently disabled; uncomment to draw each inspected track.
    #plot_spectrogram_and_picks(track, sr, peaks, onset_env)
        
        

data/mp3s-32k/metallica/Metallica/08-Nothing_Else_Matters.wav
track:
[ 0.          0.          0.         ... -0.02821597 -0.03107723
 -0.02004772]

sr:
22050

onset_env:
[0.         0.         0.         ... 0.74391496 0.91533065 0.8716322 ]

peaks:
[  13   32   50   67   86  105  122  141  159  177  196  213  232  249
  268  285  303  322  340  359  376  393  412  430  448  467  485  503
  539  556  575  593  611  630  641  652  666  684  702  720  739  757
  774  811  829  847  866  884  901  919  938  974  991 1010 1028 1046
 1082 1119 1137 1174 1191 1209 1228 1247 1264 1284]


data/mp3s-32k/metallica/Metallica/11-My_Friend_of_Misery.wav
track:
[ 0.          0.          0.         ... -0.13603407 -0.19693983
 -0.14158335]

sr:
22050

onset_env:
[0.        0.        0.        ... 0.7102401 0.7900685 1.2807925]

peaks:
[  14   25   47   57   68   90  101  133  145  166  187  198  209  231
  253  263  274  306  339  360  372  383  394  405  437  448  459  480
  491  512  523  545  556

## Minhash

In [None]:
# TODO