In [17]:
import numpy as np
import librosa as lr
import os
from tqdm import tqdm

import spleeter
from spleeter.separator import Separator
from spleeter.audio.adapter import get_default_audio_adapter

import crepe

import mir_eval
from mir_eval import melody

import jams

EXAMPLE_AUDIO_PATH = "./Excerpt.3.15b.wav"

In [2]:
#spleeter extraction w/ 2stems

#load audio with built-in audio loader
audio_loader = get_default_audio_adapter()
x_t, sr = audio_loader.load(EXAMPLE_AUDIO_PATH)
sr=int(sr)

#call a separator for vox/accompaniment and extract vox
separator = Separator('spleeter:2stems') #default for vox+accomp
prediction_test = separator.separate(x_t)
vox_t = lr.to_mono(prediction_test['vocals'].T)

INFO:tensorflow:Apply unet for vocals_spectrogram
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Apply unet for accompaniment_spectrogram



INFO:spleeter:Downloading model archive https://github.com/deezer/spleeter/releases/download/v1.4.0/2stems.tar.gz
INFO:spleeter:Validating archive checksum
INFO:spleeter:Extracting downloaded 2stems archive
INFO:spleeter:2stems model file(s) extracted



INFO:tensorflow:Restoring parameters from pretrained_models/2stems/model


In [3]:
#extract pyin_curves
frame_length = 2048
hop_length = frame_length//4
pyin_f0, pyin_vox_flag, pyin_vox_prob = lr.pyin(vox_t, lr.note_to_hz('C2'), lr.note_to_hz('C7'),\
                                                sr=sr, frame_length=frame_length, hop_length=hop_length)
pyin_freq, pyin_voc = melody.freq_to_voicing(pyin_f0, voicing=pyin_vox_flag)
pyin_time = melody.constant_hop_timebase(hop_length, hop_length*(len(pyin_f0)-1))/sr

In [4]:
#extract crepe predictions
crepe_time, frequency, confidence, activation = crepe.predict(vox_t, sr)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [5]:
threshold = 0.5
crepe_vox_flag = confidence>threshold
crepe_freq, crepe_voc = melody.freq_to_voicing(frequency, voicing=crepe_vox_flag)

In [6]:
def drop_unvoiced(time, frequency, voicings):
    voiced_idx = np.where(voicings)
    return time[voiced_idx], frequency[voiced_idx]

In [7]:
#compute comparison metrics

#pyin as reference
pyin_ref_time, pyin_ref_freq = drop_unvoiced(pyin_time, pyin_freq, pyin_voc)
pyin_ref_results = melody.evaluate(pyin_ref_time, melody.hz2cents(pyin_ref_freq), crepe_time, melody.hz2cents(crepe_freq), est_voicing=crepe_voc)
pyin_ref_results = dict(pyin_ref_results)
print(pyin_ref_results)

#crepe as reference
crepe_ref_time, crepe_ref_freq = drop_unvoiced(crepe_time, crepe_freq, crepe_voc)
crepe_ref_results = melody.evaluate(crepe_ref_time, melody.hz2cents(crepe_ref_freq), pyin_time, melody.hz2cents(pyin_freq), est_voicing=pyin_voc)
crepe_ref_results = dict(crepe_ref_results)
print(crepe_ref_results)

all_ref_results = {'pyin_ref_metrics':pyin_ref_results, 'crepe_ref_metrics':crepe_ref_results}

{'Voicing Recall': 0.8281838733986435, 'Voicing False Alarm': 0, 'Raw Pitch Accuracy': 0.9035418236623964, 'Raw Chroma Accuracy': 0.9035418236623964, 'Overall Accuracy': 0.7822155237377544}
{'Voicing Recall': 0.8892857142857142, 'Voicing False Alarm': 0, 'Raw Pitch Accuracy': 0.8371428571428572, 'Raw Chroma Accuracy': 0.8371428571428572, 'Overall Accuracy': 0.8371428571428572}


  correct_frequencies = freq_diff_cents < cent_tolerance
  correct_chroma = np.abs(freq_diff_cents - octave) < cent_tolerance
  correct_frequencies = freq_diff_cents < cent_tolerance


In [8]:
jam = jams.JAMS()
jam.sandbox = all_ref_results

track_duration = lr.get_duration(y=x_t, sr=sr)
jam.file_metadata.duration = track_duration

pitch_ann = jams.Annotation(namespace='pitch_contour')

for i in range(len(pyin_freq)):
    value = {'index':0, 'frequency': pyin_freq[i], 'voiced':bool(pyin_voc[i])} 
    pitch_ann.append(time=pyin_time[i], value=value, duration=0, confidence=pyin_vox_prob[i])
    
for i in range(len(crepe_freq)):
    value = {'index':1, 'frequency': crepe_freq[i], 'voiced':bool(crepe_voc[i])} 
    pitch_ann.append(time=crepe_time[i], value=value, duration=0, confidence=confidence[i])
    
jam.annotations.append(pitch_ann)

In [9]:
jam.save('test_countours.jamz')

In [33]:
def drop_unvoiced(time, frequency, voicings):
    voiced_idx = np.where(voicings)
    return time[voiced_idx], frequency[voiced_idx]

def process_contours(inpath, outpath, separator=Separator('spleeter:2stems')):
    #load audio and separate vox
    audio_loader = get_default_audio_adapter()
    x_t, sr = audio_loader.load(inpath)
    sr=int(sr)

    #use separator for vox/accompaniment and extract vox
    prediction_test = separator.separate(x_t)
    vox_t = lr.to_mono(prediction_test['vocals'].T)
    
    #extract pyin_curves
    frame_length = 2048
    hop_length = frame_length//4
    pyin_f0, pyin_vox_flag, pyin_vox_prob = lr.pyin(vox_t, lr.note_to_hz('C2'), lr.note_to_hz('C7'),\
                                                    sr=sr, frame_length=frame_length, hop_length=hop_length)
    pyin_freq, pyin_voc = melody.freq_to_voicing(pyin_f0, voicing=pyin_vox_flag)
    pyin_time = melody.constant_hop_timebase(hop_length, hop_length*(len(pyin_f0)-1))/sr
    
    #extract crepe predictions
    crepe_time, frequency, confidence, activation = crepe.predict(vox_t, sr, verbose=0)
    threshold = 0.5
    crepe_vox_flag = confidence>threshold
    crepe_freq, crepe_voc = melody.freq_to_voicing(frequency, voicing=crepe_vox_flag)
    
    #compute comparison metrics

   #pyin as reference
    pyin_ref_time, pyin_ref_freq = drop_unvoiced(pyin_time, pyin_freq, pyin_voc)
    if len(pyin_ref_time>0):
        pyin_ref_results = melody.evaluate(pyin_ref_time, melody.hz2cents(pyin_ref_freq), crepe_time, melody.hz2cents(crepe_freq), est_voicing=crepe_voc)
        pyin_ref_results = dict(pyin_ref_results)
        print(pyin_ref_results)
    else:
        print("No PYIN voicings")
        pyin_ref_results = float('NaN')

    #crepe as reference
    crepe_ref_time, crepe_ref_freq = drop_unvoiced(crepe_time, crepe_freq, crepe_voc)
    if len(crepe_ref_time)>0:    
        crepe_ref_results = melody.evaluate(crepe_ref_time, melody.hz2cents(crepe_ref_freq), pyin_time, melody.hz2cents(pyin_freq), est_voicing=pyin_voc)
        crepe_ref_results = dict(crepe_ref_results)
        print(crepe_ref_results)
    else:
        print("No CREPE voicings")
        crepe_ref_results = float('NaN')

    all_ref_results = {'pyin_ref_metrics':pyin_ref_results, 'crepe_ref_metrics':crepe_ref_results}
    
    #save results to jams
    jam = jams.JAMS()
    jam.sandbox = all_ref_results

    track_duration = lr.get_duration(y=x_t, sr=sr)
    jam.file_metadata.duration = track_duration

    pitch_ann = jams.Annotation(namespace='pitch_contour')

    for i in range(len(pyin_freq)):
        value = {'index':0, 'frequency': pyin_freq[i], 'voiced':bool(pyin_voc[i])} 
        pitch_ann.append(time=pyin_time[i], value=value, duration=0, confidence=pyin_vox_prob[i])

    for i in range(len(crepe_freq)):
        value = {'index':1, 'frequency': crepe_freq[i], 'voiced':bool(crepe_voc[i])} 
        pitch_ann.append(time=crepe_time[i], value=value, duration=0, confidence=confidence[i])

    jam.annotations.append(pitch_ann)
    jam.save(outpath)

In [34]:
separator = Separator('spleeter:2stems')
out_path_test = 'test_countours_fun.jams'
process_countours(EXAMPLE_AUDIO_PATH, out_path_test, separator=separator)


INFO:tensorflow:Apply unet for vocals_spectrogram
INFO:tensorflow:Apply unet for accompaniment_spectrogram
INFO:tensorflow:Restoring parameters from pretrained_models/2stems/model
{'Voicing Recall': 0.8281838733986435, 'Voicing False Alarm': 0, 'Raw Pitch Accuracy': 0.9035418236623964, 'Raw Chroma Accuracy': 0.9035418236623964, 'Overall Accuracy': 0.7822155237377544}
{'Voicing Recall': 0.8892857142857142, 'Voicing False Alarm': 0, 'Raw Pitch Accuracy': 0.8371428571428572, 'Raw Chroma Accuracy': 0.8371428571428572, 'Overall Accuracy': 0.8371428571428572}


  correct_frequencies = freq_diff_cents < cent_tolerance
  correct_chroma = np.abs(freq_diff_cents - octave) < cent_tolerance
  correct_frequencies = freq_diff_cents < cent_tolerance


In [12]:
jams_load = jams.load(out_path_test)

In [21]:
audio_path = '/scratch/work/sonyc/marl/private_datasets/FMA/fma_small/fma_small/000'
output_path = '/scratch/ci411/rap_data/jams_test/000'

for path, subdirs, files in os.walk(datapath):
    for file in tqdm(files):
        print(file)
        inpath = os.path.join(audio_path, file)
        songname = file.split('.')[0]
        outpath = os.path.join(output_path, songname+'.jamz')
        process_contours(inpath, outpath, separator=separator)

  0%|          | 0/62 [00:00<?, ?it/s]

000853.mp3
{'Voicing Recall': 0.7556608741442865, 'Voicing False Alarm': 0, 'Raw Pitch Accuracy': 0.7198525539757767, 'Raw Chroma Accuracy': 0.7198525539757767, 'Overall Accuracy': 0.6034755134281201}
{'Voicing Recall': 0.8293293293293293, 'Voicing False Alarm': 0, 'Raw Pitch Accuracy': 0.6566566566566566, 'Raw Chroma Accuracy': 0.6566566566566566, 'Overall Accuracy': 0.6566566566566566}


  correct_frequencies = freq_diff_cents < cent_tolerance
  correct_chroma = np.abs(freq_diff_cents - octave) < cent_tolerance
  correct_frequencies = freq_diff_cents < cent_tolerance
  2%|▏         | 1/62 [01:04<1:05:46, 64.70s/it]

000200.mp3
{'Voicing Recall': 0.8811349693251533, 'Voicing False Alarm': 0, 'Raw Pitch Accuracy': 0.8619631901840491, 'Raw Chroma Accuracy': 0.8619631901840491, 'Overall Accuracy': 0.7883435582822086}
{'Voicing Recall': 0.7793594306049823, 'Voicing False Alarm': 0, 'Raw Pitch Accuracy': 0.697508896797153, 'Raw Chroma Accuracy': 0.697508896797153, 'Overall Accuracy': 0.697508896797153}


  correct_frequencies = freq_diff_cents < cent_tolerance
  correct_chroma = np.abs(freq_diff_cents - octave) < cent_tolerance
  correct_frequencies = freq_diff_cents < cent_tolerance
  3%|▎         | 2/62 [02:09<1:04:28, 64.48s/it]

000193.mp3
{'Voicing Recall': 0.876750700280112, 'Voicing False Alarm': 0, 'Raw Pitch Accuracy': 0.9467787114845938, 'Raw Chroma Accuracy': 0.9467787114845938, 'Overall Accuracy': 0.8515406162464986}
{'Voicing Recall': 0.40795454545454546, 'Voicing False Alarm': 0, 'Raw Pitch Accuracy': 0.39545454545454545, 'Raw Chroma Accuracy': 0.39545454545454545, 'Overall Accuracy': 0.39545454545454545}


  correct_frequencies = freq_diff_cents < cent_tolerance
  correct_chroma = np.abs(freq_diff_cents - octave) < cent_tolerance
  correct_frequencies = freq_diff_cents < cent_tolerance
  5%|▍         | 3/62 [03:12<1:03:04, 64.14s/it]

000002.mp3
{'Voicing Recall': 0.7627281460134486, 'Voicing False Alarm': 0, 'Raw Pitch Accuracy': 0.7550432276657061, 'Raw Chroma Accuracy': 0.7550432276657061, 'Overall Accuracy': 0.6340057636887608}
{'Voicing Recall': 0.5511713933415536, 'Voicing False Alarm': 0, 'Raw Pitch Accuracy': 0.46177558569667077, 'Raw Chroma Accuracy': 0.46177558569667077, 'Overall Accuracy': 0.46177558569667077}


  correct_frequencies = freq_diff_cents < cent_tolerance
  correct_chroma = np.abs(freq_diff_cents - octave) < cent_tolerance
  correct_frequencies = freq_diff_cents < cent_tolerance
  6%|▋         | 4/62 [04:17<1:02:14, 64.40s/it]

000707.mp3
{'Voicing Recall': 0.296373779637378, 'Voicing False Alarm': 0, 'Raw Pitch Accuracy': 0.5711297071129707, 'Raw Chroma Accuracy': 0.5711297071129707, 'Overall Accuracy': 0.27266387726638774}
{'Voicing Recall': 0.5719257540603249, 'Voicing False Alarm': 0, 'Raw Pitch Accuracy': 0.5150812064965197, 'Raw Chroma Accuracy': 0.5150812064965197, 'Overall Accuracy': 0.5150812064965197}


  correct_frequencies = freq_diff_cents < cent_tolerance
  correct_chroma = np.abs(freq_diff_cents - octave) < cent_tolerance
  correct_frequencies = freq_diff_cents < cent_tolerance
  8%|▊         | 5/62 [05:22<1:01:15, 64.49s/it]

000203.mp3
{'Voicing Recall': 0.7026476578411406, 'Voicing False Alarm': 0, 'Raw Pitch Accuracy': 0.8533604887983707, 'Raw Chroma Accuracy': 0.8574338085539714, 'Overall Accuracy': 0.6619144602851323}
{'Voicing Recall': 0.6021341463414634, 'Voicing False Alarm': 0, 'Raw Pitch Accuracy': 0.5579268292682927, 'Raw Chroma Accuracy': 0.5579268292682927, 'Overall Accuracy': 0.5579268292682927}


  correct_frequencies = freq_diff_cents < cent_tolerance
  correct_chroma = np.abs(freq_diff_cents - octave) < cent_tolerance
  correct_frequencies = freq_diff_cents < cent_tolerance
 10%|▉         | 6/62 [06:26<1:00:00, 64.29s/it]

000709.mp3
{'Voicing Recall': 0.865902293120638, 'Voicing False Alarm': 0, 'Raw Pitch Accuracy': 0.7956131605184447, 'Raw Chroma Accuracy': 0.7956131605184447, 'Overall Accuracy': 0.734297108673978}
{'Voicing Recall': 0.8546312178387651, 'Voicing False Alarm': 0, 'Raw Pitch Accuracy': 0.7242710120068611, 'Raw Chroma Accuracy': 0.7242710120068611, 'Overall Accuracy': 0.7242710120068611}


  correct_frequencies = freq_diff_cents < cent_tolerance
  correct_chroma = np.abs(freq_diff_cents - octave) < cent_tolerance
  correct_frequencies = freq_diff_cents < cent_tolerance
 11%|█▏        | 7/62 [07:30<58:59, 64.36s/it]  

000690.mp3
{'Voicing Recall': 0.38391502276176026, 'Voicing False Alarm': 0, 'Raw Pitch Accuracy': 0.6122913505311077, 'Raw Chroma Accuracy': 0.6130500758725341, 'Overall Accuracy': 0.30500758725341426}
{'Voicing Recall': 0.7073170731707317, 'Voicing False Alarm': 0, 'Raw Pitch Accuracy': 0.5658536585365853, 'Raw Chroma Accuracy': 0.5658536585365853, 'Overall Accuracy': 0.5658536585365853}


  correct_frequencies = freq_diff_cents < cent_tolerance
  correct_chroma = np.abs(freq_diff_cents - octave) < cent_tolerance
  correct_frequencies = freq_diff_cents < cent_tolerance
 13%|█▎        | 8/62 [08:35<58:01, 64.47s/it]

000197.mp3


 13%|█▎        | 8/62 [09:38<1:05:04, 72.30s/it]


IndexError: index 0 is out of bounds for axis 0 with size 0

In [32]:
inpath = '/scratch/work/sonyc/marl/private_datasets/FMA/fma_small/fma_small/000/000197.mp3'

#load audio and separate vox
audio_loader = get_default_audio_adapter()
x_t, sr = audio_loader.load(inpath)
sr=int(sr)

#use separator for vox/accompaniment and extract vox
prediction_test = separator.separate(x_t)
vox_t = lr.to_mono(prediction_test['vocals'].T)

#extract pyin_curves
frame_length = 2048
hop_length = frame_length//4
pyin_f0, pyin_vox_flag, pyin_vox_prob = lr.pyin(vox_t, lr.note_to_hz('C2'), lr.note_to_hz('C7'),\
                                                sr=sr, frame_length=frame_length, hop_length=hop_length)
pyin_freq, pyin_voc = melody.freq_to_voicing(pyin_f0, voicing=pyin_vox_flag)
pyin_time = melody.constant_hop_timebase(hop_length, hop_length*(len(pyin_f0)-1))/sr

#extract crepe predictions
crepe_time, frequency, confidence, activation = crepe.predict(vox_t, sr, verbose=0)
threshold = 0.5
crepe_vox_flag = confidence>threshold
crepe_freq, crepe_voc = melody.freq_to_voicing(frequency, voicing=crepe_vox_flag)

#compute comparison metrics

#pyin as reference
pyin_ref_time, pyin_ref_freq = drop_unvoiced(pyin_time, pyin_freq, pyin_voc)
if len(pyin_ref_time>0):
    pyin_ref_results = melody.evaluate(pyin_ref_time, melody.hz2cents(pyin_ref_freq), crepe_time, melody.hz2cents(crepe_freq), est_voicing=crepe_voc)
    pyin_ref_results = dict(pyin_ref_results)
    print(pyin_ref_results)
else:
    print("No PYIN voicings")
    pyin_ref_results = float('NaN')

#crepe as reference
crepe_ref_time, crepe_ref_freq = drop_unvoiced(crepe_time, crepe_freq, crepe_voc)
if len(crepe_ref_time)>0:    
    crepe_ref_results = melody.evaluate(crepe_ref_time, melody.hz2cents(crepe_ref_freq), pyin_time, melody.hz2cents(pyin_freq), est_voicing=pyin_voc)
    crepe_ref_results = dict(crepe_ref_results)
    print(crepe_ref_results)
else:
    print("No CREPE voicings")
    crepe_ref_results = float('NaN')

all_ref_results = {'pyin_ref_metrics':pyin_ref_results, 'crepe_ref_metrics':crepe_ref_results}

#save results to jams
jam = jams.JAMS()
jam.sandbox = all_ref_results

track_duration = lr.get_duration(y=x_t, sr=sr)
jam.file_metadata.duration = track_duration

pitch_ann = jams.Annotation(namespace='pitch_contour')

for i in range(len(pyin_freq)):
    value = {'index':0, 'frequency': pyin_freq[i], 'voiced':bool(pyin_voc[i])} 
    pitch_ann.append(time=pyin_time[i], value=value, duration=0, confidence=pyin_vox_prob[i])

for i in range(len(crepe_freq)):
    value = {'index':1, 'frequency': crepe_freq[i], 'voiced':bool(crepe_voc[i])} 
    pitch_ann.append(time=crepe_time[i], value=value, duration=0, confidence=confidence[i])

No PYIN voicings
{'Voicing Recall': 0.0, 'Voicing False Alarm': 0, 'Raw Pitch Accuracy': 0.0, 'Raw Chroma Accuracy': 0.0, 'Overall Accuracy': 0.0}


  correct_frequencies = freq_diff_cents < cent_tolerance
  correct_chroma = np.abs(freq_diff_cents - octave) < cent_tolerance
  correct_frequencies = freq_diff_cents < cent_tolerance
