In [1]:
import soundfile as sf
import argparse
import musdb
import museval
import test
import multiprocessing
import functools
from pathlib import Path
import torch
import json
import tqdm
import numpy as np
import os
import pickle
import crepe
import scipy.io.wavfile
import mir_eval
import essentia.standard as ess
from matplotlib import pyplot as plt
%matplotlib inline

1. Within each experiment folder, there is one folder for each model being evaluated.
2. Within each model's folder, there is one folder for each test example.
3. Within each test example's folder, we save the model output (vocals and accompaniments) as well as a JSON file containing the evaluation metrics for that example and model.

In [2]:
# Create the folder, inside the given experiment folder, for the model being evaluated.
exp_no = 7
model = '../new_models/test/model_tabla_mse_pretrain1/' #Path to the saved model
model_name = 'model_tabla_mse_pretrain1'

# Results for this model live under ../test_out/Exp_<exp_no>/<model_name>/
exp_model_res_path = '../test_out/Exp_' + str(exp_no) + '/' + model_name + '/'

# makedirs (unlike mkdir) also creates the missing experiment folder, and
# exist_ok=True makes the cell safe to re-run.
os.makedirs(exp_model_res_path, exist_ok=True)

In [3]:
# Create one result folder per test example, mirroring the entries of the test data folder.
test_data_folder = '../rec_data_final/test/'
for test_eg in os.listdir(test_data_folder):
    test_eg_path = exp_model_res_path + test_eg + '/'
    # exist_ok=True keeps the cell re-runnable; makedirs also tolerates a
    # missing parent folder instead of raising FileNotFoundError.
    os.makedirs(test_eg_path, exist_ok=True)

In [4]:
# # Find output of the model for each test example and save in corresponding folder
# for test_eg in os.listdir(test_data_folder):
#     #print(test_eg)
#     test_eg_source = test_data_folder + test_eg #path to test data
#     test_eg_eval = exp_model_res_path + test_eg + '/'
    

In [5]:
#Obtain SDR values for each test example as well as save estimate audios
# Define required functions
def pad_or_truncate(
    audio_reference,
    audio_estimates
):
    """Make the estimates as long as the references along axis 1.

    - If the estimates are shorter than the references, zeros are appended
      at the end of the estimates.
    - If the estimates are longer than the references, the estimates are
      cut to the length of the references.
    - If both already have the same length, nothing is changed.

    Parameters
    ----------
    audio_reference : np.ndarray, shape=(nsrc, nsampl, nchan)
        array containing true reference sources
    audio_estimates : np.ndarray, shape=(nsrc, nsampl, nchan)
        array containing estimated sources

    Returns
    -------
    audio_reference : np.ndarray, shape=(nsrc, nsampl, nchan)
        the reference array, returned unchanged
    audio_estimates : np.ndarray, shape=(nsrc, nsampl, nchan)
        the estimates, padded or truncated to the reference length
    """
    n_ref = audio_reference.shape[1]
    n_est = audio_estimates.shape[1]

    if n_est > n_ref:
        # estimates too long: drop the trailing samples
        audio_estimates = audio_estimates[:, :n_ref, :]
    elif n_est < n_ref:
        # estimates too short: append zeros along the sample axis only
        n_missing = n_ref - n_est
        pad_widths = [(0, 0), (0, n_missing), (0, 0)]
        audio_estimates = np.pad(audio_estimates, pad_widths, mode='constant')

    return audio_reference, audio_estimates

def evaluate(
    references,
    estimates,
    win=1*44100,
    hop=1*44100,
    mode='v4',
    padding=True
):
    """Compute BSS_EVAL image metrics with ``museval.metrics.bss_eval``.

    Parameters
    ----------
    references : np.ndarray, shape=(nsrc, nsampl, nchan)
        array containing true reference sources
    estimates : np.ndarray, shape=(nsrc, nsampl, nchan)
        array containing estimated sources
    win : int, defaults to 44100
        window size in samples
    hop : int, defaults to 44100
        hop size in samples
    mode : str
        BSSEval version, defaults to `v4`; framewise filters are enabled
        only when this is `v3`
    padding : bool, defaults to True
        when True, the estimates are padded or truncated to the length of
        the references before the metrics are computed

    Returns
    -------
    SDR : np.ndarray
        Signal to Distortion Ratios (SDR)
    ISR : np.ndarray
        Source to Spatial Distortion Image (ISR)
    SIR : np.ndarray
        Source to Interference Ratios (SIR)
    SAR : np.ndarray
        Sources to Artifacts Ratios (SAR)

    All four are passed through exactly as returned by
    ``museval.metrics.bss_eval``.
    """
    # accept lists of per-source arrays as well as ready-made arrays
    estimates = np.array(estimates)
    references = np.array(references)

    if padding:
        references, estimates = pad_or_truncate(references, estimates)

    use_framewise_filters = (mode == "v3")

    # the fifth return value is discarded here
    sdr, isr, sir, sar, _unused = museval.metrics.bss_eval(
        references,
        estimates,
        compute_permutation=False,
        window=win,
        hop=hop,
        framewise_filters=use_framewise_filters,
        bsseval_sources_version=False
    )

    return sdr, isr, sir, sar



In [6]:
# Separation target(s) requested from the model
#targets = ['vocals']
targets = ['tabla']

# Dataset location and layout
root = '../rec_data_final/'
subset = 'test'
is_wav = True
samplerate = 44100

# Execution settings
cores = 1
no_cuda = False
use_cuda = (not no_cuda) and torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")

In [7]:
# Open the dataset at `root` through musdb, restricted to the `subset` split.
# `download` is only True when no root path is given, so with the path set in
# the configuration cell nothing is downloaded.
mus = musdb.DB(
    root=root,
    download=root is None,
    subsets=subset,
    is_wav=is_wav
)

In [8]:
# Iterate over all tracks of the loaded subset, run the separation model on
# each one and save every returned estimate as a wav file.
for track in mus.tracks:
    outdir = exp_model_res_path + track.name + '/'
    # Do not rely on an earlier cell having created this folder: a track whose
    # name has no matching pre-created folder would otherwise make sf.write fail.
    os.makedirs(outdir, exist_ok=True)
    print(track.name)
    estimates = test.separate(
        audio=track.audio,
        targets=targets,
        model_name=model,
        niter=2,
        alpha=1,
        softmask=False,
        #device=device
    )

    # One file per key of `estimates`, named <outdir>/<target>.wav
    for target, estimate in estimates.items():
        sf.write(
            Path(outdir) / Path(target).with_suffix('.wav'),
            estimate,
            samplerate
        )

    print("SAVED SEPARATED VOCALS AND ACCOMPANIMENTS!")
    
#     audio_estimates = []
#     audio_reference = []
#     eval_targets = []

#     for key, target in list(track.targets.items()):
#         try:
#             # try to fetch the audio from the user_results of a given key
#             estimates[key]
#         except KeyError:
#             # ignore wrong key and continue
#             continue
#         eval_targets.append(key)
        
#     mode='v4'
#     win=1.0
#     hop=1.0
#     data = museval.aggregate.TrackStore(win=win, hop=hop, track_name=track.name)

#     # check if vocals and accompaniment is among the targets
#     #has_acc = all(x in eval_targets for x in ['vocals', 'accompaniment'])
#     has_acc = all(x in eval_targets for x in ['tabla', 'accompaniment'])
#     if has_acc:
#         # remove accompaniment from list of targets, because
#         # the voc/acc scenario will be evaluated separately
#         eval_targets.remove('accompaniment')
        
#     #audio_estimates.append(estimates['vocals'])
#     #audio_reference.append(track.targets['vocals'].audio)
    
#     audio_estimates.append(estimates['tabla'])
#     audio_reference.append(track.targets['tabla'].audio)
    
    
#     SDR, ISR, SIR, SAR = evaluate(
#             audio_reference,
#             audio_estimates,
#             win=int(win*track.rate),
#             hop=int(hop*track.rate),
#             mode=mode
#         )
    
#     save_dict = {}
#     save_dict['SDR'] = SDR[0].tolist()
#     save_dict['ISR'] = ISR[0].tolist()
#     save_dict['SDR_median'] = np.median(SDR[0])
#     save_dict['ISR_median'] = np.median(ISR[0])
    
#     save_file = outdir + "evaluation.json"
    
#     print("Saving json file")
    
#     with open(save_file, 'w') as outfile:
#         json.dump(save_dict, outfile)

#     print("Saved!")


  0%|          | 0/1 [00:00<?, ?it/s]

Darbari_10dB
CHECK OpenUnmix(
  (stft): STFT()
  (spec): Spectrogram()
  (transform): Sequential(
    (0): STFT()
    (1): Spectrogram()
  )
  (fc1): Linear(in_features=2974, out_features=512, bias=False)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm): LSTM(512, 256, num_layers=3, dropout=0.4, bidirectional=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=False)
  (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=512, out_features=4098, bias=False)
  (bn3): BatchNorm1d(4098, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


100%|██████████| 1/1 [00:02<00:00,  2.97s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

SAVED SEPARATED VOCALS AND ACCOMPANIMENTS!
Darbari_15dB
CHECK OpenUnmix(
  (stft): STFT()
  (spec): Spectrogram()
  (transform): Sequential(
    (0): STFT()
    (1): Spectrogram()
  )
  (fc1): Linear(in_features=2974, out_features=512, bias=False)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm): LSTM(512, 256, num_layers=3, dropout=0.4, bidirectional=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=False)
  (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=512, out_features=4098, bias=False)
  (bn3): BatchNorm1d(4098, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


100%|██████████| 1/1 [00:02<00:00,  2.13s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

SAVED SEPARATED VOCALS AND ACCOMPANIMENTS!
Darbari_20dB
CHECK OpenUnmix(
  (stft): STFT()
  (spec): Spectrogram()
  (transform): Sequential(
    (0): STFT()
    (1): Spectrogram()
  )
  (fc1): Linear(in_features=2974, out_features=512, bias=False)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm): LSTM(512, 256, num_layers=3, dropout=0.4, bidirectional=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=False)
  (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=512, out_features=4098, bias=False)
  (bn3): BatchNorm1d(4098, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


100%|██████████| 1/1 [00:02<00:00,  2.18s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

SAVED SEPARATED VOCALS AND ACCOMPANIMENTS!
Deepak-pooriya dhanashri_10dB
CHECK OpenUnmix(
  (stft): STFT()
  (spec): Spectrogram()
  (transform): Sequential(
    (0): STFT()
    (1): Spectrogram()
  )
  (fc1): Linear(in_features=2974, out_features=512, bias=False)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm): LSTM(512, 256, num_layers=3, dropout=0.4, bidirectional=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=False)
  (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=512, out_features=4098, bias=False)
  (bn3): BatchNorm1d(4098, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


100%|██████████| 1/1 [00:04<00:00,  4.21s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

SAVED SEPARATED VOCALS AND ACCOMPANIMENTS!
Deepak-pooriya dhanashri_15dB
CHECK OpenUnmix(
  (stft): STFT()
  (spec): Spectrogram()
  (transform): Sequential(
    (0): STFT()
    (1): Spectrogram()
  )
  (fc1): Linear(in_features=2974, out_features=512, bias=False)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm): LSTM(512, 256, num_layers=3, dropout=0.4, bidirectional=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=False)
  (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=512, out_features=4098, bias=False)
  (bn3): BatchNorm1d(4098, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


100%|██████████| 1/1 [00:03<00:00,  3.73s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

SAVED SEPARATED VOCALS AND ACCOMPANIMENTS!
Deepak-pooriya dhanashri_20dB
CHECK OpenUnmix(
  (stft): STFT()
  (spec): Spectrogram()
  (transform): Sequential(
    (0): STFT()
    (1): Spectrogram()
  )
  (fc1): Linear(in_features=2974, out_features=512, bias=False)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm): LSTM(512, 256, num_layers=3, dropout=0.4, bidirectional=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=False)
  (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=512, out_features=4098, bias=False)
  (bn3): BatchNorm1d(4098, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


100%|██████████| 1/1 [00:03<00:00,  3.56s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

SAVED SEPARATED VOCALS AND ACCOMPANIMENTS!
Deepak_jog_kauns_10dB
CHECK OpenUnmix(
  (stft): STFT()
  (spec): Spectrogram()
  (transform): Sequential(
    (0): STFT()
    (1): Spectrogram()
  )
  (fc1): Linear(in_features=2974, out_features=512, bias=False)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm): LSTM(512, 256, num_layers=3, dropout=0.4, bidirectional=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=False)
  (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=512, out_features=4098, bias=False)
  (bn3): BatchNorm1d(4098, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


100%|██████████| 1/1 [00:03<00:00,  3.63s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

SAVED SEPARATED VOCALS AND ACCOMPANIMENTS!
Deepak_jog_kauns_15dB
CHECK OpenUnmix(
  (stft): STFT()
  (spec): Spectrogram()
  (transform): Sequential(
    (0): STFT()
    (1): Spectrogram()
  )
  (fc1): Linear(in_features=2974, out_features=512, bias=False)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm): LSTM(512, 256, num_layers=3, dropout=0.4, bidirectional=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=False)
  (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=512, out_features=4098, bias=False)
  (bn3): BatchNorm1d(4098, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


100%|██████████| 1/1 [00:03<00:00,  3.92s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

SAVED SEPARATED VOCALS AND ACCOMPANIMENTS!
Deepak_jog_kauns_20dB
CHECK OpenUnmix(
  (stft): STFT()
  (spec): Spectrogram()
  (transform): Sequential(
    (0): STFT()
    (1): Spectrogram()
  )
  (fc1): Linear(in_features=2974, out_features=512, bias=False)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm): LSTM(512, 256, num_layers=3, dropout=0.4, bidirectional=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=False)
  (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=512, out_features=4098, bias=False)
  (bn3): BatchNorm1d(4098, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


100%|██████████| 1/1 [00:03<00:00,  3.57s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

SAVED SEPARATED VOCALS AND ACCOMPANIMENTS!
Deepak_maari_bihag_10dB
CHECK OpenUnmix(
  (stft): STFT()
  (spec): Spectrogram()
  (transform): Sequential(
    (0): STFT()
    (1): Spectrogram()
  )
  (fc1): Linear(in_features=2974, out_features=512, bias=False)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm): LSTM(512, 256, num_layers=3, dropout=0.4, bidirectional=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=False)
  (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=512, out_features=4098, bias=False)
  (bn3): BatchNorm1d(4098, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


100%|██████████| 1/1 [00:03<00:00,  3.89s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

SAVED SEPARATED VOCALS AND ACCOMPANIMENTS!
Deepak_maari_bihag_15dB
CHECK OpenUnmix(
  (stft): STFT()
  (spec): Spectrogram()
  (transform): Sequential(
    (0): STFT()
    (1): Spectrogram()
  )
  (fc1): Linear(in_features=2974, out_features=512, bias=False)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm): LSTM(512, 256, num_layers=3, dropout=0.4, bidirectional=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=False)
  (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=512, out_features=4098, bias=False)
  (bn3): BatchNorm1d(4098, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


100%|██████████| 1/1 [00:03<00:00,  3.82s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

SAVED SEPARATED VOCALS AND ACCOMPANIMENTS!
Deepak_maari_bihag_20dB
CHECK OpenUnmix(
  (stft): STFT()
  (spec): Spectrogram()
  (transform): Sequential(
    (0): STFT()
    (1): Spectrogram()
  )
  (fc1): Linear(in_features=2974, out_features=512, bias=False)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm): LSTM(512, 256, num_layers=3, dropout=0.4, bidirectional=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=False)
  (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=512, out_features=4098, bias=False)
  (bn3): BatchNorm1d(4098, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


100%|██████████| 1/1 [00:03<00:00,  3.75s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

SAVED SEPARATED VOCALS AND ACCOMPANIMENTS!
YAMAN_ALAP_10dB
CHECK OpenUnmix(
  (stft): STFT()
  (spec): Spectrogram()
  (transform): Sequential(
    (0): STFT()
    (1): Spectrogram()
  )
  (fc1): Linear(in_features=2974, out_features=512, bias=False)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm): LSTM(512, 256, num_layers=3, dropout=0.4, bidirectional=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=False)
  (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=512, out_features=4098, bias=False)
  (bn3): BatchNorm1d(4098, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


100%|██████████| 1/1 [00:03<00:00,  3.77s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

SAVED SEPARATED VOCALS AND ACCOMPANIMENTS!
YAMAN_ALAP_15dB
CHECK OpenUnmix(
  (stft): STFT()
  (spec): Spectrogram()
  (transform): Sequential(
    (0): STFT()
    (1): Spectrogram()
  )
  (fc1): Linear(in_features=2974, out_features=512, bias=False)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm): LSTM(512, 256, num_layers=3, dropout=0.4, bidirectional=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=False)
  (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=512, out_features=4098, bias=False)
  (bn3): BatchNorm1d(4098, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


100%|██████████| 1/1 [00:03<00:00,  3.66s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

SAVED SEPARATED VOCALS AND ACCOMPANIMENTS!
YAMAN_ALAP_20dB
CHECK OpenUnmix(
  (stft): STFT()
  (spec): Spectrogram()
  (transform): Sequential(
    (0): STFT()
    (1): Spectrogram()
  )
  (fc1): Linear(in_features=2974, out_features=512, bias=False)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm): LSTM(512, 256, num_layers=3, dropout=0.4, bidirectional=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=False)
  (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=512, out_features=4098, bias=False)
  (bn3): BatchNorm1d(4098, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


100%|██████████| 1/1 [00:03<00:00,  3.67s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

SAVED SEPARATED VOCALS AND ACCOMPANIMENTS!
Yaman_res_10dB
CHECK OpenUnmix(
  (stft): STFT()
  (spec): Spectrogram()
  (transform): Sequential(
    (0): STFT()
    (1): Spectrogram()
  )
  (fc1): Linear(in_features=2974, out_features=512, bias=False)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm): LSTM(512, 256, num_layers=3, dropout=0.4, bidirectional=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=False)
  (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=512, out_features=4098, bias=False)
  (bn3): BatchNorm1d(4098, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


100%|██████████| 1/1 [00:06<00:00,  6.21s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

SAVED SEPARATED VOCALS AND ACCOMPANIMENTS!
Yaman_res_15dB
CHECK OpenUnmix(
  (stft): STFT()
  (spec): Spectrogram()
  (transform): Sequential(
    (0): STFT()
    (1): Spectrogram()
  )
  (fc1): Linear(in_features=2974, out_features=512, bias=False)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm): LSTM(512, 256, num_layers=3, dropout=0.4, bidirectional=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=False)
  (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=512, out_features=4098, bias=False)
  (bn3): BatchNorm1d(4098, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


100%|██████████| 1/1 [00:06<00:00,  6.02s/it]
  0%|          | 0/1 [00:00<?, ?it/s]

SAVED SEPARATED VOCALS AND ACCOMPANIMENTS!
Yaman_res_20dB
CHECK OpenUnmix(
  (stft): STFT()
  (spec): Spectrogram()
  (transform): Sequential(
    (0): STFT()
    (1): Spectrogram()
  )
  (fc1): Linear(in_features=2974, out_features=512, bias=False)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm): LSTM(512, 256, num_layers=3, dropout=0.4, bidirectional=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=False)
  (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=512, out_features=4098, bias=False)
  (bn3): BatchNorm1d(4098, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


100%|██████████| 1/1 [00:06<00:00,  6.32s/it]


SAVED SEPARATED VOCALS AND ACCOMPANIMENTS!


In [None]:
# Begin evaluation of pitch accuracy (PYIN)
# For every track folder in the model's result folder: extract pitch from the
# clean and the separated vocals, save a plot of the squared pitch error (in
# cents) and compute mir_eval melody metrics.
W = 1024  # frame size passed to PitchYinProbabilistic
H = 512   # hop size passed to PitchYinProbabilistic

for track in os.listdir(exp_model_res_path):
    print(track)
    print("Extracting Pitch")

    outdir = exp_model_res_path + track + '/'

    # NOTE(review): the separation cell writes one wav per key returned by
    # test.separate with targets=['tabla']; confirm that 'vocals.wav' really
    # exists in outdir before running this cell.
    clean_vocs_path = test_data_folder + track + '/vocals.wav'
    extr_vocs_path = outdir + 'vocals.wav'

    clean_vocs = ess.MonoLoader(sampleRate=44100 , filename=clean_vocs_path)()
    extr_vocs = ess.MonoLoader(sampleRate=44100 , filename=extr_vocs_path)()

    #     t_clean_vocs, f_clean_vocs, conf_clean_vocs, act_clean_vocs = crepe.predict(clean_vocs, r1, viterbi=True)
    #     t_extr_vocs, f_extr_vocs, conf_extr_vocs, act_extr_vocs = crepe.predict(extr_vocs, r2, viterbi=True)

    pyin = ess.PitchYinProbabilistic(frameSize=W, hopSize=H)

    f_clean_vocs, conf_clean_vocs = pyin(clean_vocs)
    f_extr_vocs, conf_extr_vocs = pyin(extr_vocs)

    print("Pitch extracted!")

    # Make the entire frequency array positive
    f_clean_vocs = np.absolute(f_clean_vocs)
    f_extr_vocs = np.absolute(f_extr_vocs)

    # Truncate both tracks (frequencies and confidences) to the shorter one so
    # they can be compared frame by frame.
    n_frames = min(f_clean_vocs.shape[0], f_extr_vocs.shape[0])
    f_clean_vocs = f_clean_vocs[:n_frames]
    conf_clean_vocs = conf_clean_vocs[:n_frames]
    f_extr_vocs = f_extr_vocs[:n_frames]
    conf_extr_vocs = conf_extr_vocs[:n_frames]

    #plt.plot((f_clean_vocs-f_extr_vocs)**2)
    #plt.plot(f_clean_vocs)
    #plt.show()

    # Convert frequency to cents
    f_clean_vocs_cents = mir_eval.melody.hz2cents(f_clean_vocs)
    f_extr_vocs_cents = mir_eval.melody.hz2cents(f_extr_vocs)

    # Use a fresh figure per track: plotting on the implicit current figure
    # would pile the curves of all previous tracks into every saved image.
    # Closing the figure afterwards keeps memory bounded across the loop.
    fig, ax = plt.subplots()
    ax.plot((f_clean_vocs_cents-f_extr_vocs_cents)**2)
    fig.savefig(outdir + "freq_mse.png")
    plt.close(fig)

#     np.savetxt(outdir+'clean_pitch.csv', f_clean_vocs, delimiter=',')
#     np.savetxt(outdir+'extr_pitch.csv', f_extr_vocs , delimiter=',')

    # Convert voicing probabilities to binary vectors (1.0 = voiced) with a
    # given threshold
    thresh = 0.5
    conf_clean_vocs_bin = (np.asarray(conf_clean_vocs) > thresh).astype(float)
    conf_extr_vocs_bin = (np.asarray(conf_extr_vocs) > thresh).astype(float)

    print("Calculating pitch accuracy measures")

    # Voicing accuracy measures
    v_rec, fal_al = mir_eval.melody.voicing_measures(conf_clean_vocs_bin, conf_extr_vocs_bin)

    # Calculate raw pitch accuracy - Default cent_tolerance=50
    rpa = mir_eval.melody.raw_pitch_accuracy(conf_clean_vocs_bin, f_clean_vocs_cents, conf_extr_vocs_bin, f_extr_vocs_cents)

    # Calculate raw chroma accuracy - Default cent_tolerance=50
    rca = mir_eval.melody.raw_chroma_accuracy(conf_clean_vocs_bin, f_clean_vocs_cents, conf_extr_vocs_bin, f_extr_vocs_cents)

    # Calculate overall accuracy
    ov_acc = mir_eval.melody.overall_accuracy(conf_clean_vocs_bin, f_clean_vocs_cents, conf_extr_vocs_bin, f_extr_vocs_cents)

    save_file = outdir + "pitch_evaluation.json"

    # Collect the pitch evaluations for this track (the json dump that would
    # write them to save_file is currently commented out)
    pitch_track_eval = {
        'Voicing recall': v_rec,
        'False alarm': fal_al,
        'Raw pitch accuracy': rpa,
        'Raw chroma accuracy': rca,
        'Overall accuracy': ov_acc,
    }

#     with open(save_file, 'w') as fp:
#         json.dump(pitch_track_eval, fp)
    
#     print("Pitch accuracy measures saved!")
    
    


In [None]:
# Begin evaluation of pitch accuracy (CREPE)
# For every track folder in the model's result folder: extract pitch from the
# clean and the separated vocals with CREPE, save the pitch tracks and a plot
# of the squared pitch error (in cents), and compute mir_eval melody metrics.
W = 1024  # only used by the commented-out PYIN variant below
H = 512   # only used by the commented-out PYIN variant below

for track in os.listdir(exp_model_res_path):
    print(track)
    print("Extracting Pitch")

    outdir = exp_model_res_path + track + '/'

    # NOTE(review): the separation cell writes one wav per key returned by
    # test.separate with targets=['tabla']; confirm that 'vocals.wav' really
    # exists in outdir before running this cell.
    clean_vocs_path = test_data_folder + track + '/vocals.wav'
    extr_vocs_path = outdir + 'vocals.wav'

    clean_vocs = ess.MonoLoader(sampleRate=44100 , filename=clean_vocs_path)()
    extr_vocs = ess.MonoLoader(sampleRate=44100 , filename=extr_vocs_path)()

    t_clean_vocs, f_clean_vocs, conf_clean_vocs, act_clean_vocs = crepe.predict(clean_vocs, 44100, viterbi=True)
    t_extr_vocs, f_extr_vocs, conf_extr_vocs, act_extr_vocs = crepe.predict(extr_vocs, 44100, viterbi=True)

#     pyin = ess.PitchYinProbabilistic(frameSize=W, hopSize=H)

#     f_clean_vocs, conf_clean_vocs = pyin(clean_vocs)
#     f_extr_vocs, conf_extr_vocs = pyin(extr_vocs)

    print(f_clean_vocs.shape)
    print(t_clean_vocs.shape)

    print("Pitch extracted!")

    # Make the entire frequency array positive
    f_clean_vocs = np.absolute(f_clean_vocs)
    f_extr_vocs = np.absolute(f_extr_vocs)

    # Truncate both tracks (times, frequencies and confidences) to the shorter
    # one so they can be compared frame by frame.
    n_frames = min(f_clean_vocs.shape[0], f_extr_vocs.shape[0])
    t_clean_vocs = t_clean_vocs[:n_frames]
    f_clean_vocs = f_clean_vocs[:n_frames]
    conf_clean_vocs = conf_clean_vocs[:n_frames]
    t_extr_vocs = t_extr_vocs[:n_frames]
    f_extr_vocs = f_extr_vocs[:n_frames]
    conf_extr_vocs = conf_extr_vocs[:n_frames]

    #plt.plot((f_clean_vocs-f_extr_vocs)**2)
    #plt.plot(f_clean_vocs)
    #plt.show()

    # Convert frequency to cents
    f_clean_vocs_cents = mir_eval.melody.hz2cents(f_clean_vocs)
    f_extr_vocs_cents = mir_eval.melody.hz2cents(f_extr_vocs)

    # Use a fresh figure per track: plotting on the implicit current figure
    # would pile the curves of all previous tracks into every saved image.
    # Closing the figure afterwards keeps memory bounded across the loop.
    fig, ax = plt.subplots()
    ax.plot((f_clean_vocs_cents-f_extr_vocs_cents)**2)
    fig.savefig(outdir + "freq_mse_CREPE.png")
    plt.close(fig)

    # Two-column tables: time stamp, frequency in Hz
    clean_pitch_track = np.column_stack((t_clean_vocs, f_clean_vocs))
    extr_pitch_track = np.column_stack((t_extr_vocs, f_extr_vocs))

    np.savetxt(outdir+'clean_pitch_crepe.csv', clean_pitch_track, delimiter=',')
    np.savetxt(outdir+'extr_pitch_crepe.csv', extr_pitch_track , delimiter=',')

#     np.savetxt(outdir+'clean_pitch.csv', f_clean_vocs, delimiter=',')
#     np.savetxt(outdir+'extr_pitch.csv', f_extr_vocs , delimiter=',')

    # Convert voicing probabilities to binary vectors (1.0 = voiced) with a
    # given threshold
    thresh = 0.5
    conf_clean_vocs_bin = (np.asarray(conf_clean_vocs) > thresh).astype(float)
    conf_extr_vocs_bin = (np.asarray(conf_extr_vocs) > thresh).astype(float)

    print("Calculating pitch accuracy measures")

    # Voicing accuracy measures
    v_rec, fal_al = mir_eval.melody.voicing_measures(conf_clean_vocs_bin, conf_extr_vocs_bin)

    # Calculate raw pitch accuracy - Default cent_tolerance=50
    rpa = mir_eval.melody.raw_pitch_accuracy(conf_clean_vocs_bin, f_clean_vocs_cents, conf_extr_vocs_bin, f_extr_vocs_cents)

    # Calculate raw chroma accuracy - Default cent_tolerance=50
    rca = mir_eval.melody.raw_chroma_accuracy(conf_clean_vocs_bin, f_clean_vocs_cents, conf_extr_vocs_bin, f_extr_vocs_cents)

    # Calculate overall accuracy
    ov_acc = mir_eval.melody.overall_accuracy(conf_clean_vocs_bin, f_clean_vocs_cents, conf_extr_vocs_bin, f_extr_vocs_cents)

    save_file = outdir + "pitch_evaluation_CREPE.json"

    # Collect the pitch evaluations for this track (the json dump that would
    # write them to save_file is currently commented out)
    pitch_track_eval = {
        'Voicing recall': v_rec,
        'False alarm': fal_al,
        'Raw pitch accuracy': rpa,
        'Raw chroma accuracy': rca,
        'Overall accuracy': ov_acc,
    }

#     with open(save_file, 'w') as fp:
#         json.dump(pitch_track_eval, fp)
    
#     print("Pitch accuracy measures saved!")
    
    


In [None]:
# Scratch inspection: display the clean-vocals confidence array left over from
# the last track processed by the pitch evaluation loop above.
conf_clean_vocs

In [None]:
track = m