In [4]:
# parsing and downloading mp3 files
# youtube_audiolibrary.html was recived from https://www.youtube.com/audiolibrary/music?nv=1

import re
import requests
from tqdm.notebook import tqdm
import os
import time

def download_file(url, dir):
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        local_filename = os.path.join(dir, r.headers['Content-Disposition'][22:-1])
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192): 
                if chunk: # filter out keep-alive new chunks
                    f.write(chunk)
                    # f.flush()
    return local_filename

with open("youtube_audiolibrary.html") as f:
    page = f.read()

os.makedirs("mp3", exist_ok=True)

urls = (re.findall("\<a href\=\"(.+?download\?vid\=.+?)\" class=", page))
for url in tqdm(urls):
    status = False
    while not status:
      try:
        name = download_file(url, "mp3")
        status = True
      except Exception as e:
        print(name)
        print(e)
        

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [0]:
import os
import gc
import pickle

import librosa
from tqdm.notebook import tqdm
import numpy as np
from scipy.ndimage.filters import maximum_filter
import scipy.ndimage as ndimage
import matplotlib.pyplot as plt

In [0]:
# Defining constants

sample_rate = 9000
time_resolution = 0.005 # time to make fingerprint
target = (
    int(sample_rate*time_resolution),
    int(10*sample_rate*time_resolution),
    -50, 100
)    # start, end, low filter, high filter
score_threshold = 30
n_mels = 64
filter_size = (20, 20)
time2fft = 0.2 # in seconds


In [0]:
def read_and_resample(path, sample_rate):
    y, sr = librosa.load(path, sr=sample_rate)   
     
    print(f"{path} length is {y.shape[0] / sample_rate / 60.:.2f} min")
    return y

In [0]:
def read_and_resample_with_noise(path, sample_rate, scale=0.01):
    # read and add noise to audio
    
    y, sr = librosa.load(path, sr=sample_rate)    
    noise = np.random.normal(scale=scale, size=y.shape)
    y+= noise
    return y

In [0]:
def form_constellation(name, wav, sample_rate, time2fft=0.2):
    window_size = int(sample_rate * time2fft)
    hop_length = int(sample_rate * time2fft / 4)
    S = librosa.feature.melspectrogram(
        wav,
        n_fft=window_size,
        hop_length=hop_length,
        n_mels=n_mels,

    )
    S = librosa.power_to_db(S, ref=np.min(S))

    Sb = maximum_filter(S, size=filter_size) == S

    Sbd, num_objects = ndimage.label(Sb)
    objs = ndimage.find_objects(Sbd)
    points = []
    for dy, dx in objs:
        x_center = (dx.start + dx.stop - 1) // 2
        y_center = (dy.start + dy.stop - 1) // 2
        if (dx.stop - dx.start) * (dy.stop - dy.start) == 1:
            points.append((x_center, y_center))

    return points

In [0]:
def build_constellation_index(constellation_collection, target, show_progress=True):
    result_index = {}
    range_ = constellation_collection.items()    
    if show_progress:
      range_ = tqdm(range_) 
    for name, constellation in range_:

        constellation = np.array(constellation)
        for anchor in constellation:
            start_t = anchor[0]+target[0]
            stop_t = anchor[0]+target[1]
            min_f = anchor[1]+target[2]
            max_f = anchor[1]+target[3]

            mask = (
                (constellation[:, 0] >= start_t) &
                (constellation[:, 0] <= stop_t) &
                (constellation[:, 1] >= min_f) &
                (constellation[:, 1] <= max_f)
                )
            points = constellation[mask]
            for point in points:
                key = (anchor[1], point[1], point[0]-anchor[0])
                value = (anchor[0], name)
                if key in result_index:
                    result_index[key].append(value)
                else:
                    result_index[key] = [value]
    
    return result_index

In [11]:
# create and save index
audio_dir = "mp3"
names = sorted(os.listdir(audio_dir))
n_splits = 10
names = np.array_split(names, n_splits)

for i in range(n_splits):
  constellations = {}
  for name in tqdm(names[i]):
    full_path = os.path.join(audio_dir, name)
    wav = read_and_resample(full_path, sample_rate)
    constellations[name] = form_constellation(name, wav, sample_rate, time_resolution)
    gc.collect()

  index = build_constellation_index(constellations, target)

  with open("index%s.pckl"%i, "wb") as f:
    pickle.dump(index, f)
  gc.collect()
  # !cp index{i}.pckl drive/My\ Drive/Colab\ Notebooks/Study/ir/project/

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

mp3/Arkansas_Traveler.mp3 length is 3.27 min







HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

mp3/Billy_in_the_Lowground.mp3 length is 3.03 min







HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

mp3/Bird_Therapist.mp3 length is 2.59 min







HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

mp3/Cats_Searching_for_the_Truth.mp3 length is 3.73 min







HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

mp3/Creeping_Spiders.mp3 length is 3.12 min







HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

mp3/I_Feel_Like_Partying_Right_Now.mp3 length is 3.00 min







HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

mp3/Infiltration_Device.mp3 length is 2.97 min







HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

mp3/Late_Night_Drive.mp3 length is 4.88 min







HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

mp3/Power_Shutoff.mp3 length is 2.53 min







HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

mp3/The_Old_RV.mp3 length is 2.33 min







HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




Compute metrics


In [0]:
score_threshold = 50

def get_scores(index, request, score_threshold=50):
    intersected_keys = (index.keys() & request.keys())
    time_offsets = {}
    scores = {}
    offsets = {}
    for key in intersected_keys:
        for request_time, _ in request[key]:
            for index_time, matched_name in index[key]:
                delta_time = (index_time-request_time)//5

                match = time_offsets.setdefault(matched_name, {})
                for delta in range(delta_time-1, delta_time+2):
                    if delta in match:
                        match[delta]+=1
                    else:
                        match[delta]=1
    
    for name in time_offsets.keys():
        offset, score = max(time_offsets[name].items(), key=lambda x: x[1])
        if score > score_threshold:
            scores[score_threshold < scores] = score
            offsets[name] = offset
    return scores, offsets

In [0]:
audio_dir = "mp3"
names = sorted(os.listdir(audio_dir))

top1 = 0
top5 = 0

for k, name in enumerate(tqdm(names), 1):
  full_path = os.path.join(audio_dir, name)

  wav = read_and_resample_with_noise(full_path, sample_rate, scale=0.1)
  randint = np.random.randint(0, max(len(wav)-sample_rate*30, 1))
  wav = wav[randint:randint+sample_rate*30]

  constellation = form_constellation(name, wav, sample_rate, time_resolution)

  request_index = build_constellation_index(
          {
            name: constellation
           },
           target,
           show_progress=False
           )
  
  scores = get_scores(index, request_index)[0].items()
  scores = list(get_scores(index, request_index)[0].items())  
  scores.sort(key=lambda x: -x[1])

  if len(scores)>0:
    if name == scores[0][0]:
      top1+=1

    answers_top5 = [scores[i][0] for i in range(min(5, len(scores))) ]
    if name in answers_top5:
      top5+=1
    else:
      print(name)
  else:
      print(name)
  if k and k%50==0:
    print("Samples %s: Top1 %s; Top5 %s"%(k, top1/k, top5/k))



HBox(children=(IntProgress(value=0, max=1445), HTML(value='')))

1940_s_Slow_Dance_Sting.mp3
Samples 50: Top1 0.98; Top5 0.98
Samples 100: Top1 0.99; Top5 0.99
Samples 150: Top1 0.9933333333333333; Top5 0.9933333333333333
Samples 200: Top1 0.995; Top5 0.995
Cavern.mp3
Samples 250: Top1 0.988; Top5 0.992
Samples 300: Top1 0.99; Top5 0.9933333333333333
Samples 350: Top1 0.9914285714285714; Top5 0.9942857142857143
Elegy.mp3
Samples 400: Top1 0.99; Top5 0.9925
Fear_The_Wind.mp3
First_Love.mp3
Samples 450: Top1 0.9866666666666667; Top5 0.9888888888888889
Fresno_Alley.mp3
Samples 500: Top1 0.986; Top5 0.988
Samples 550: Top1 0.9872727272727273; Top5 0.9890909090909091
Horror_House.mp3
Samples 600: Top1 0.9866666666666667; Top5 0.9883333333333333
Samples 650: Top1 0.9876923076923076; Top5 0.9892307692307692
Samples 700: Top1 0.9885714285714285; Top5 0.99
Samples 750: Top1 0.9893333333333333; Top5 0.9906666666666667
Macedon_is_Ours.mp3
Samples 800: Top1 0.98875; Top5 0.99
Samples 850: Top1 0.9894117647058823; Top5 0.9905882352941177
Omonia.mp3
Samples 900: 

In [0]:
print("Samples %s: Top1 %s; Top5 %s"%(k, top1/k, top5/k))

Samples 1445: Top1 0.9910034602076124; Top5 0.9937716262975779
