In [19]:
import argparse
import musdb
import museval
import test
import multiprocessing
import functools
from pathlib import Path
import torch
import tqdm

In [20]:
def separate_and_evaluate(
    track,
    targets,
    model_name,
    niter,
    alpha,
    softmask,
    output_dir,
    eval_dir,
    device='cpu'
):
    estimates = test.separate(
        audio=track.audio,
        targets=targets,
        model_name=model_name,
        niter=niter,
        alpha=alpha,
        softmask=softmask,
        device=device
    )
    if output_dir:
        mus.save_estimates(estimates, track, output_dir)

    scores = museval.eval_mus_track(
        track, estimates, output_dir=eval_dir
    )
    return scores


In [21]:
parser = argparse.ArgumentParser(
    description='MUSDB18 Evaluation',
    add_help=False
)

parser.add_argument(
    '--targets',
    nargs='+',
    #default=['vocals', 'drums', 'bass', 'other'],
    default=['vocals'],
    type=str,
    help='provide targets to be processed. \
          If none, all available targets will be computed'
)

parser.add_argument(
    '--model',
    default='umxhq',
    type=str,
    help='path to mode base directory of pretrained models',
    #default = '../out_unmix/'
)

parser.add_argument(
    '--outdir',
    default='../out_dir_estimates/Exp1_umxhq',
    type=str,
    help='Results path where audio evaluation results are stored'
)

parser.add_argument(
    '--evaldir',
    default = '../out_dir_evals/Exp1_umxhq',
    type=str,
    help='Results path for museval estimates'
)

parser.add_argument(
    '--root',
    default = '../test_out/Exp_1/exp1_tracks/',
    type=str,
    help='Path to MUSDB18'
)

parser.add_argument(
    '--subset',
    type=str,
    default='train',
    help='MUSDB subset (`train`/`test`)'
)

parser.add_argument(
    '--cores',
    type=int,
    default=1
)

parser.add_argument(
    '--no-cuda',
    action='store_true',
    default=False,
    help='disables CUDA inference'
)

parser.add_argument(
    '--is-wav',
    action='store_true', default=True,
    help='flags wav version of the dataset'
)

args, _ = parser.parse_known_args()
args = test.inference_args(parser, args)

use_cuda = not args.no_cuda and torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")


usage: ipykernel_launcher.py [-h] [--targets TARGETS [TARGETS ...]]
                             [--model MODEL] [--outdir OUTDIR]
                             [--evaldir EVALDIR] [--root ROOT]
                             [--subset SUBSET] [--cores CORES] [--no-cuda]
                             [--is-wav] [--softmask] [--niter NITER]
                             [--alpha ALPHA] [--samplerate SAMPLERATE]
                             [--residual-model]
ipykernel_launcher.py: error: unrecognized arguments: -f /run/user/1010/jupyter/kernel-015d3d98-1663-43da-94a2-1350847ad1de.json


SystemExit: 2

In [22]:
mus = musdb.DB(
    root=args.root,
    download=args.root is None,
    subsets=args.subset,
    is_wav=args.is_wav
)

In [23]:
mus


<musdb.DB at 0x7ff13e892278>

In [24]:
results = museval.EvalStore()

In [25]:
results

KeyError: 'target'

In [26]:
for track in tqdm.tqdm(mus.tracks):
    scores = separate_and_evaluate(
        track,
        targets=args.targets,
        model_name=args.model,
        niter=2,
        alpha=1,
        softmask=False,
        output_dir=None,
        eval_dir=args.evaldir
        #device=device
    )
    results.add_track(scores)

  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s][A

CHECK OpenUnmix(
  (stft): STFT()
  (spec): Spectrogram()
  (transform): Sequential(
    (0): STFT()
    (1): Spectrogram()
  )
  (fc1): Linear(in_features=2974, out_features=512, bias=False)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm): LSTM(512, 256, num_layers=3, dropout=0.4, bidirectional=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=False)
  (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=512, out_features=4098, bias=False)
  (bn3): BatchNorm1d(4098, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)



100%|██████████| 1/1 [00:00<00:00,  2.88it/s][A
  0%|          | 0/3 [00:01<?, ?it/s]


KeyError: "The following 'id_vars' are not present in the DataFrame: ['name', 'time']"

In [None]:
track = mus.tracks[0]

In [27]:
track

musdb_eg1

In [28]:
scores = separate_and_evaluate(
    track,
    targets=args.targets,
    model_name=args.model,
    niter=2,
    alpha=1,
    softmask=False,
    output_dir=args.outdir,
    eval_dir=args.evaldir
    #device=device
)

  0%|          | 0/1 [00:00<?, ?it/s]

CHECK OpenUnmix(
  (stft): STFT()
  (spec): Spectrogram()
  (transform): Sequential(
    (0): STFT()
    (1): Spectrogram()
  )
  (fc1): Linear(in_features=2974, out_features=512, bias=False)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm): LSTM(512, 256, num_layers=3, dropout=0.4, bidirectional=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=False)
  (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=512, out_features=4098, bias=False)
  (bn3): BatchNorm1d(4098, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


100%|██████████| 1/1 [00:00<00:00,  1.53it/s]


In [50]:
#targets = ['vocals', 'bass' , 'drums']
targets = ['vocals']

In [51]:
estimates = test.separate(
    audio=track.audio,
    targets=targets,
    model_name=args.model,
    niter=2,
    alpha=1,
    softmask=False,
    #device=device
)

  0%|          | 0/1 [00:00<?, ?it/s]

CHECK OpenUnmix(
  (stft): STFT()
  (spec): Spectrogram()
  (transform): Sequential(
    (0): STFT()
    (1): Spectrogram()
  )
  (fc1): Linear(in_features=2974, out_features=512, bias=False)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm): LSTM(512, 256, num_layers=3, dropout=0.4, bidirectional=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=False)
  (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=512, out_features=4098, bias=False)
  (bn3): BatchNorm1d(4098, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


100%|██████████| 1/1 [00:00<00:00,  2.90it/s]


In [52]:
estimates

{'vocals': array([[-0.06713487,  0.06091061],
        [-0.05950908,  0.0561809 ],
        [-0.03747256, -0.00544321],
        ...,
        [ 0.05709865,  0.06730049],
        [ 0.05682895,  0.0662374 ],
        [ 0.0542154 ,  0.06809763]]),
 'accompaniment': array([[ 0.02007374,  0.09089083],
        [ 0.02619111,  0.09239524],
        [ 0.05245196,  0.05696818],
        ...,
        [-0.05764364, -0.06863255],
        [-0.05842136, -0.06709738],
        [-0.05447776, -0.06814776]])}

In [53]:
scores = museval.eval_mus_track(
        track, estimates, output_dir=args.evaldir
    )

  "Incorrect usage of BSSeval : at least two estimates must be provided. Target score will be empty."


In [54]:
scores



In [55]:
track.targets['vocals'].audio

array([[-0.03469849,  0.09182739],
       [-0.02337646,  0.07556152],
       [ 0.00708008, -0.00714111],
       ...,
       [-0.0005188 , -0.00039673],
       [-0.00073242, -0.00045776],
       [-0.00091553, -0.00045776]])

In [56]:
audio_estimates = []
audio_reference = []

In [57]:
eval_targets = []

In [58]:
a = list(track.targets.items())

In [59]:
a

[('vocals', vocals), ('linear_mixture', vocals)]

In [60]:
for key, target in a:
    try:
        # try to fetch the audio from the user_results of a given key
        estimates[key]
    except KeyError:
        # ignore wrong key and continue
        continue
    eval_targets.append(key)

In [61]:
mode='v4'
win=1.0
hop=1.0
data = museval.aggregate.TrackStore(win=win, hop=hop, track_name=track.name)

In [62]:
# check if vocals and accompaniment is among the targets
has_acc = all(x in eval_targets for x in ['vocals', 'accompaniment'])
if has_acc:
    # remove accompaniment from list of targets, because
    # the voc/acc scenario will be evaluated separately
    eval_targets.remove('accompaniment')


In [65]:
eval_targets


['vocals']

In [66]:
audio_estimates.append(estimates['vocals'])

In [67]:
audio_reference.append(track.targets['vocals'].audio)

In [69]:
audio_reference

[array([[-0.03469849,  0.09182739],
        [-0.02337646,  0.07556152],
        [ 0.00708008, -0.00714111],
        ...,
        [-0.0005188 , -0.00039673],
        [-0.00073242, -0.00045776],
        [-0.00091553, -0.00045776]])]

In [70]:
audio_estimates

[array([[-0.06713487,  0.06091061],
        [-0.05950908,  0.0561809 ],
        [-0.03747256, -0.00544321],
        ...,
        [ 0.05709865,  0.06730049],
        [ 0.05682895,  0.0662374 ],
        [ 0.0542154 ,  0.06809763]])]

In [74]:
def pad_or_truncate(
    audio_reference,
    audio_estimates
):
    """Pad or truncate estimates by duration of references:
    - If reference > estimates: add zeros at the and of the estimated signal
    - If estimates > references: truncate estimates to duration of references

    Parameters
    ----------
    references : np.ndarray, shape=(nsrc, nsampl, nchan)
        array containing true reference sources
    estimates : np.ndarray, shape=(nsrc, nsampl, nchan)
        array containing estimated sources
    Returns
    -------
    references : np.ndarray, shape=(nsrc, nsampl, nchan)
        array containing true reference sources
    estimates : np.ndarray, shape=(nsrc, nsampl, nchan)
        array containing estimated sources
    """
    est_shape = audio_estimates.shape
    ref_shape = audio_reference.shape
    if est_shape[1] != ref_shape[1]:
        if est_shape[1] >= ref_shape[1]:
            audio_estimates = audio_estimates[:, :ref_shape[1], :]
        else:
            # pad end with zeros
            audio_estimates = np.pad(
                audio_estimates,
                [
                    (0, 0),
                    (0, ref_shape[1] - est_shape[1]),
                    (0, 0)
                ],
                mode='constant'
            )

    return audio_reference, audio_estimates

In [78]:
def evaluate(
    references,
    estimates,
    win=1*44100,
    hop=1*44100,
    mode='v4',
    padding=True
):
    """BSS_EVAL images evaluation using metrics module

    Parameters
    ----------
    references : np.ndarray, shape=(nsrc, nsampl, nchan)
        array containing true reference sources
    estimates : np.ndarray, shape=(nsrc, nsampl, nchan)
        array containing estimated sources
    window : int, defaults to 44100
        window size in samples
    hop : int
        hop size in samples, defaults to 44100 (no overlap)
    mode : str
        BSSEval version, default to `v4`
    Returns
    -------
    SDR : np.ndarray, shape=(nsrc,)
        vector of Signal to Distortion Ratios (SDR)
    ISR : np.ndarray, shape=(nsrc,)
        vector of Source to Spatial Distortion Image (ISR)
    SIR : np.ndarray, shape=(nsrc,)
        vector of Source to Interference Ratios (SIR)
    SAR : np.ndarray, shape=(nsrc,)
        vector of Sources to Artifacts Ratios (SAR)
    """

    estimates = np.array(estimates)
    references = np.array(references)

    if padding:
        references, estimates = pad_or_truncate(references, estimates)

    SDR, ISR, SIR, SAR, _ = museval.metrics.bss_eval(
        references,
        estimates,
        compute_permutation=False,
        window=win,
        hop=hop,
        framewise_filters=(mode == "v3"),
        bsseval_sources_version=False
    )

    return SDR, ISR, SIR, SAR

In [79]:
SDR, ISR, SIR, SAR = evaluate(
        audio_reference,
        audio_estimates,
        win=int(win*track.rate),
        hop=int(hop*track.rate),
        mode=mode
    )

In [95]:
SDR[0]


(6,)

In [101]:
audio_reference[0][:,0].shape

(300032,)

In [104]:
300032/44100.0

6.8034467120181406

In [105]:
SDR

array([[4.27694499, 5.7724543 , 7.05702004, 7.77372717, 6.27157259,
        5.96342381]])

In [106]:
ISR

array([[ 9.11331123, 12.35062742, 12.56146264, 12.74114344, 13.33562681,
        13.0517256 ]])

In [109]:
SIR

array([[inf, inf, inf, inf, inf, inf]])

In [108]:
SAR

array([[5.12293843, 6.95114838, 6.70250868, 8.22178515, 6.48200614,
        5.927545  ]])

In [110]:
ISR

array([[ 9.11331123, 12.35062742, 12.56146264, 12.74114344, 13.33562681,
        13.0517256 ]])