In [1]:
import argparse
import musdb
import museval
import test
import multiprocessing
import functools
from pathlib import Path
import torch
import tqdm

In [2]:
def separate_and_evaluate(
    track,
    targets,
    model_name,
    niter,
    alpha,
    softmask,
    output_dir,
    eval_dir,
    device='cpu'
):
    estimates = test.separate(
        audio=track.audio,
        targets=targets,
        model_name=model_name,
        niter=niter,
        alpha=alpha,
        softmask=softmask,
        device=device
    )
    if output_dir:
        mus.save_estimates(estimates, track, output_dir)

    scores = museval.eval_mus_track(
        track, estimates, output_dir=eval_dir
    )
    return scores


In [33]:
parser = argparse.ArgumentParser(
    description='MUSDB18 Evaluation',
    add_help=False
)

parser.add_argument(
    '--targets',
    nargs='+',
    #default=['vocals', 'drums', 'bass', 'other'],
    default=['vocals'],
    type=str,
    help='provide targets to be processed. \
          If none, all available targets will be computed'
)

parser.add_argument(
    '--model',
    default='umxhq',
    type=str,
    help='path to mode base directory of pretrained models',
    #default = '../out_unmix/'
)

parser.add_argument(
    '--outdir',
    default='../out_dir_estimates/Exp1_umxhq',
    type=str,
    help='Results path where audio evaluation results are stored'
)

parser.add_argument(
    '--evaldir',
    default = '../out_dir_evals/Exp1_umxhq',
    type=str,
    help='Results path for museval estimates'
)

parser.add_argument(
    '--root',
    default = '../test_out/Exp_1/exp1_tracks/',
    type=str,
    help='Path to MUSDB18'
)

parser.add_argument(
    '--subset',
    type=str,
    default='train',
    help='MUSDB subset (`train`/`test`)'
)

parser.add_argument(
    '--cores',
    type=int,
    default=1
)

parser.add_argument(
    '--no-cuda',
    action='store_true',
    default=False,
    help='disables CUDA inference'
)

parser.add_argument(
    '--is-wav',
    action='store_true', default=True,
    help='flags wav version of the dataset'
)

args, _ = parser.parse_known_args()
args = test.inference_args(parser, args)

use_cuda = not args.no_cuda and torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")


usage: ipykernel_launcher.py [-h] [--targets TARGETS [TARGETS ...]]
                             [--model MODEL] [--outdir OUTDIR]
                             [--evaldir EVALDIR] [--root ROOT]
                             [--subset SUBSET] [--cores CORES] [--no-cuda]
                             [--is-wav] [--softmask] [--niter NITER]
                             [--alpha ALPHA] [--samplerate SAMPLERATE]
                             [--residual-model]
ipykernel_launcher.py: error: unrecognized arguments: -f /run/user/1010/jupyter/kernel-3ade00a1-d4d4-4f4d-a631-23fb5ba096fa.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [34]:
mus = musdb.DB(
    root=args.root,
    download=args.root is None,
    subsets=args.subset,
    is_wav=args.is_wav
)

In [35]:
results = museval.EvalStore()

In [36]:
for track in tqdm.tqdm(mus.tracks):
    scores = separate_and_evaluate(
        track,
        targets=args.targets,
        model_name=args.model,
        niter=2,
        alpha=1,
        softmask=False,
        output_dir=None,
        eval_dir=args.evaldir
        #device=device
    )
    results.add_track(scores)

  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s][A

CHECK OpenUnmix(
  (stft): STFT()
  (spec): Spectrogram()
  (transform): Sequential(
    (0): STFT()
    (1): Spectrogram()
  )
  (fc1): Linear(in_features=2974, out_features=512, bias=False)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm): LSTM(512, 256, num_layers=3, dropout=0.4, bidirectional=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=False)
  (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=512, out_features=4098, bias=False)
  (bn3): BatchNorm1d(4098, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)



100%|██████████| 1/1 [00:00<00:00,  3.00it/s][A
  "Incorrect usage of BSSeval : at least two estimates must be provided. Target score will be empty."
  0%|          | 0/3 [00:01<?, ?it/s]


KeyError: "The following 'id_vars' are not present in the DataFrame: ['name', 'time']"

In [None]:
scores

In [37]:
track

musdb_eg1

In [38]:
scores = separate_and_evaluate(
    track,
    targets=args.targets,
    model_name=args.model,
    niter=2,
    alpha=1,
    softmask=False,
    output_dir=args.outdir,
    eval_dir=args.evaldir
    #device=device
)

  0%|          | 0/1 [00:00<?, ?it/s]

CHECK OpenUnmix(
  (stft): STFT()
  (spec): Spectrogram()
  (transform): Sequential(
    (0): STFT()
    (1): Spectrogram()
  )
  (fc1): Linear(in_features=2974, out_features=512, bias=False)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm): LSTM(512, 256, num_layers=3, dropout=0.4, bidirectional=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=False)
  (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=512, out_features=4098, bias=False)
  (bn3): BatchNorm1d(4098, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


100%|██████████| 1/1 [00:00<00:00,  3.38it/s]


In [39]:
type(scores)

museval.aggregate.TrackStore

In [40]:
scores



In [41]:
type(results)

museval.aggregate.EvalStore

In [42]:
track

musdb_eg1

In [43]:
estimates = test.separate(
    audio=track.audio,
    targets=['vocals'],
    model_name='umxhq',
    niter=2,
    alpha=1.0,
    softmask=False
    #device=device
)

  0%|          | 0/1 [00:00<?, ?it/s]

CHECK OpenUnmix(
  (stft): STFT()
  (spec): Spectrogram()
  (transform): Sequential(
    (0): STFT()
    (1): Spectrogram()
  )
  (fc1): Linear(in_features=2974, out_features=512, bias=False)
  (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lstm): LSTM(512, 256, num_layers=3, dropout=0.4, bidirectional=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=False)
  (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=512, out_features=4098, bias=False)
  (bn3): BatchNorm1d(4098, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


100%|██████████| 1/1 [00:00<00:00,  3.22it/s]


In [44]:
estimates

{'vocals': array([[-0.06713487,  0.06091061],
        [-0.05950908,  0.0561809 ],
        [-0.03747256, -0.00544321],
        ...,
        [ 0.05709865,  0.06730049],
        [ 0.05682895,  0.0662374 ],
        [ 0.0542154 ,  0.06809763]]),
 'accompaniment': array([[ 0.02007374,  0.09089083],
        [ 0.02619111,  0.09239524],
        [ 0.05245196,  0.05696818],
        ...,
        [-0.05764364, -0.06863255],
        [-0.05842136, -0.06709738],
        [-0.05447776, -0.06814776]])}

In [45]:
estimates['tabla'] = estimates['accompaniment']

In [46]:
estimates

{'vocals': array([[-0.06713487,  0.06091061],
        [-0.05950908,  0.0561809 ],
        [-0.03747256, -0.00544321],
        ...,
        [ 0.05709865,  0.06730049],
        [ 0.05682895,  0.0662374 ],
        [ 0.0542154 ,  0.06809763]]),
 'accompaniment': array([[ 0.02007374,  0.09089083],
        [ 0.02619111,  0.09239524],
        [ 0.05245196,  0.05696818],
        ...,
        [-0.05764364, -0.06863255],
        [-0.05842136, -0.06709738],
        [-0.05447776, -0.06814776]]),
 'tabla': array([[ 0.02007374,  0.09089083],
        [ 0.02619111,  0.09239524],
        [ 0.05245196,  0.05696818],
        ...,
        [-0.05764364, -0.06863255],
        [-0.05842136, -0.06709738],
        [-0.05447776, -0.06814776]])}

In [47]:
scores = museval.eval_mus_track(track, estimates, output_dir=args.evaldir)

In [48]:
track.targets.items()

odict_items([('vocals', vocals), ('linear_mixture', vocals)])

In [49]:
museval.

SyntaxError: invalid syntax (<ipython-input-49-dd0a85768bc5>, line 1)

In [50]:
track.audio


array([[-0.04705811,  0.15176392],
       [-0.0333252 ,  0.14862061],
       [ 0.01498413,  0.05148315],
       ...,
       [-0.0005188 , -0.0012207 ],
       [-0.00167847, -0.0010376 ],
       [-0.00015259,  0.00015259]])

In [52]:
track.stems

(2, 300032, 2)