In [3]:
import torch
import subprocess, os


In [4]:
def separate_vocals(input_path, output_dir="separated", model="htdemucs"):
    """
    Split a track into vocals / no_vocals stems with the Demucs command-line tool.

    Args:
        input_path: path of the audio file to separate.
        output_dir: directory Demucs writes into (created if missing).
        model: name of the pretrained Demucs model, passed via ``-n``. It is
            passed explicitly so that the directory used in the returned path
            always matches the model that actually ran, even if the Demucs
            default changes.

    Returns:
        Path where the vocals stem is expected:
        ``<output_dir>/<model>/<track name without extension>/vocals.wav``.

    Raises:
        subprocess.CalledProcessError: if the Demucs process exits non-zero.
    """
    # Local import: the notebook's import cell does not bring in ``sys``.
    import sys

    os.makedirs(output_dir, exist_ok=True)
    subprocess.run(
        [
            # Use the kernel's own interpreter; a bare "python" resolved from
            # PATH may be a different environment without demucs installed.
            sys.executable, "-m", "demucs.separate",
            "--two-stems", "vocals",
            "-n", model,
            "-o", output_dir,
            input_path,
        ],
        check=True,
    )
    track_name = os.path.splitext(os.path.basename(input_path))[0]
    return os.path.join(output_dir, model, track_name, "vocals.wav")

In [None]:
# Configure the track to separate. Paths are relative to the notebook directory.
base_path = '../Music'
song_path = 'wav_files/waitingforlove-avicii.wav'
output_path = 'raw_vocals'

final_song_path = os.path.join(base_path, song_path)
final_output_path = os.path.join(base_path, output_path)

# Derive the absolute paths from the configuration above instead of
# hardcoding a second, independent set of paths that could drift apart.
input_path = os.path.abspath(final_song_path)
output_dir = os.path.abspath(final_output_path)

separate_vocals(input_path, output_dir)

# this works extremely well

[1mImportant: the default model was recently changed to `htdemucs`[0m the latest Hybrid Transformer Demucs model. In some cases, this model can actually perform worse than previous models. To get back the old default model use `-n mdx_extra_q`.
Selected model is a bag of 1 models. You will see that many progress bars per track.
Separated tracks will be stored in /Users/alexpower/Documents/Music-Dataset-Tool/Music/raw_vocals/htdemucs
Separating track /Users/alexpower/Documents/Music-Dataset-Tool/Music/wav_files/waitingforlove-avicii.wav


100%|████████████████████████████████████████████████████████████████████████| 234.0/234.0 [01:34<00:00,  2.47seconds/s]
  return save_with_torchcodec(
  return save_with_torchcodec(


'/Users/alexpower/Documents/Music-Dataset-Tool/Music/raw_vocals/htdemucs/waitingforlove-avicii/vocals.wav'

In [33]:
import librosa
import numpy as np

In [83]:
# now we can use those vocals to more accurately check for vocals at a specific moment
def vocal_activity_segments_spectral(vocals_path, frame_sec=0.25,
                                     rms_factor=1.2, centroid_factor=0.9,
                                     bandpass=True):
    """
    Locate the time spans of an isolated vocals track that contain singing.

    A frame is marked active only when BOTH its RMS energy and its spectral
    centroid exceed a threshold. Each feature is min-max normalised over the
    whole track, and its threshold is the track's median normalised value
    times the matching factor. Requiring both is meant to suppress
    background bleed.

    Args:
        vocals_path: audio file to analyse; loaded as mono at 16 kHz.
        frame_sec: analysis frame length in seconds (hop is half a frame).
        rms_factor: multiplier applied to the median normalised RMS.
        centroid_factor: multiplier applied to the median normalised centroid.
        bandpass: when True, apply pre-emphasis and keep only the harmonic
            component of an HPSS split before measuring the features.
            Despite the name, no band-pass filter is applied.

    Returns:
        List of ``(start, end)`` tuples in seconds. A segment ends at the time
        of the first inactive frame that follows it, or at the last frame time
        if the track is still active when it ends.
    """
    audio, rate = librosa.load(vocals_path, sr=16000, mono=True)
    win = int(rate * frame_sec)
    step = win // 2

    if bandpass:
        audio = librosa.effects.preemphasis(audio)  # mild high-frequency emphasis
        audio = librosa.effects.hpss(audio)[0]      # keep the harmonic component

    energy = librosa.feature.rms(y=audio, frame_length=win, hop_length=step)[0]
    brightness = librosa.feature.spectral_centroid(
        y=audio, sr=rate, n_fft=win, hop_length=step
    )[0]
    frame_times = librosa.frames_to_time(
        np.arange(len(energy)), sr=rate, hop_length=step
    )

    def _unit_scale(values):
        # Min-max scale to [0, 1]; the epsilon avoids dividing by zero on flat input.
        return (values - values.min()) / (values.max() - values.min() + 1e-9)

    energy_n = _unit_scale(energy)
    brightness_n = _unit_scale(brightness)

    # Thresholds adapt to each track via its own median.
    energy_gate = np.median(energy_n) * rms_factor
    brightness_gate = np.median(brightness_n) * centroid_factor
    voiced = (energy_n > energy_gate) & (brightness_n > brightness_gate)

    # Pad with False on both sides so every active run has a rising edge
    # (its first active frame) and a falling edge (first inactive frame after it).
    edges = np.diff(np.concatenate(([False], voiced, [False])).astype(np.int8))
    run_starts = np.flatnonzero(edges == 1)
    run_stops = np.flatnonzero(edges == -1)

    # A run reaching the end of the track has no following inactive frame,
    # so its end is clamped to the last frame time.
    final_idx = len(frame_times) - 1
    return [
        (frame_times[first], frame_times[min(stop, final_idx)])
        for first, stop in zip(run_starts, run_stops)
    ]

def has_vocals_at(vocals_path, t, window=0.35, segments=None, **kwargs):
    """
    Report whether any vocal segment overlaps the interval ``t ± window``.

    Args:
        vocals_path: path of the isolated vocals track; only read when
            ``segments`` is not supplied.
        t: time of interest, in seconds.
        window: half-width, in seconds, of the interval checked around ``t``.
        segments: optional precomputed list of ``(start, end)`` tuples in
            seconds. Detecting segments means loading and analysing the whole
            track, so callers that query many times on the same track can
            compute them once and pass them in here.
        **kwargs: forwarded to ``vocal_activity_segments_spectral`` when
            segments have to be computed.

    Returns:
        True if at least one segment overlaps (or touches) the interval,
        otherwise False.
    """
    if segments is None:
        segments = vocal_activity_segments_spectral(vocals_path, **kwargs)

    lo, hi = t - window, t + window
    # Two closed intervals overlap when each starts no later than the other ends.
    return any(seg_start <= hi and seg_end >= lo
               for seg_start, seg_end in segments)

In [84]:
# Vocals stem written by the Demucs separation cell earlier in this notebook;
# handy for ad-hoc checks of the detector.
example_path = "../Music/raw_vocals/htdemucs/waitingforlove-avicii/vocals.wav"

# The detector works in float seconds (one minute is 60.0), while timestamps
# are written as "MM:SS", so we need a function that converts one to the other.

def time_to_secs(time):
    """
    Convert an "MM:SS" timestamp to a number of seconds.

    The minutes field may have any number of digits ("3:05", "03:05",
    "12:30"), and the seconds field may carry a fractional part ("01:30.5").

    Args:
        time: timestamp as a string (anything else is passed through ``str``).

    Returns:
        Total seconds as a float, e.g. "03:05" -> 185.0.

    Raises:
        ValueError: if the value is not of the form "<minutes>:<seconds>".
    """
    text = str(time).strip()
    minutes, colon, seconds = text.partition(':')
    if not colon:
        raise ValueError(f"expected a timestamp like 'MM:SS', got {time!r}")

    # Parse the whole minutes field; int() already ignores leading zeros, so
    # there is no need to pick out a single digit.
    return float(int(minutes) * 60 + float(seconds))
    
# has_vocals_at(example_path, time_to_secs("01:40"))

In [88]:
# so now we need to devise a pipeline that, for each song in results.json, gets the phrase boundaries,
# finds the corresponding separated vocals track, checks whether vocals exist at those points, then exports
# let's make it so that phrase_boundary[i] corresponds to has_vocals[i]
# have it so that we do not overwrite the current json, but instead write a new file so we do not
# corrupt the original
import json

# Input JSON holding the per-song phrase boundaries, and the directory that
# contains one sub-folder of separated stems per song.
json_path = os.path.abspath('../results.json') 
raw_vocals_path = os.path.abspath('../Music/raw_vocals/htdemucs')

# Both are resolved to absolute paths once, here, so that later cells keep
# pointing at the same files even if the working directory changes.
def update_json_phrase_boundaries_vocals(json_path, raw_vocals_path=None, output_path=None):
    """
    Flag, for every phrase boundary of every song, whether vocals are present.

    Reads the list of songs in ``json_path``. Each song is a dict with a
    ``'song_name'`` and a ``'features'`` dict holding
    ``'first_phrase_boundaries'`` and ``'last_phrase_boundaries'`` (lists of
    "MM:SS" strings). For each boundary the song's separated vocals track
    (``<raw_vocals_path>/<song name without .wav>/vocals.wav``) is checked
    with ``has_vocals_at``, and the per-song results are printed.

    When ``output_path`` is given, the flags are stored next to the boundaries
    under ``'first_phrase_boundaries_has_vocals'`` and
    ``'last_phrase_boundaries_has_vocals'`` (index i of a flag list corresponds
    to index i of its boundary list), and the augmented data is written to
    ``output_path``. The input file is never modified.

    Args:
        json_path: path of the JSON file to read.
        raw_vocals_path: directory containing one stem folder per song.
        output_path: where to write the augmented JSON; None (default) only
            prints the results.

    Raises:
        TypeError: if ``raw_vocals_path`` is not provided.
        ValueError: if ``output_path`` points at the input file.
    """
    if raw_vocals_path is None:
        raise TypeError("raw_vocals_path is required to locate the separated vocals")
    if output_path is not None and os.path.abspath(output_path) == os.path.abspath(json_path):
        raise ValueError("output_path must differ from json_path so the original is not overwritten")

    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # each song is a dict
    for song in data:
        song_name = song['song_name']
        features = song['features']
        entry_phrase_boundaries = features['first_phrase_boundaries']
        exit_phrase_boundaries = features['last_phrase_boundaries']

        # Strip only a trailing ".wav" so a name that happens to contain
        # ".wav" elsewhere still maps to the right stem folder.
        stem_dir = song_name[:-len(".wav")] if song_name.endswith(".wav") else song_name
        vocal_file_path = os.path.join(raw_vocals_path, stem_dir, 'vocals.wav')

        # The two lists are handled independently, so they may differ in length.
        has_vocals_entry = [
            bool(has_vocals_at(vocal_file_path, time_to_secs(boundary)))
            for boundary in entry_phrase_boundaries
        ]
        has_vocals_exit = [
            bool(has_vocals_at(vocal_file_path, time_to_secs(boundary)))
            for boundary in exit_phrase_boundaries
        ]

        print(f"song name: {song_name}")
        print(f"entry phrase boundaries: {entry_phrase_boundaries}")
        print(f"bool results: {has_vocals_entry}")
        print(f"exit phrase boundaries: {exit_phrase_boundaries}")
        print(f"bool results: {has_vocals_exit}")

        features['first_phrase_boundaries_has_vocals'] = has_vocals_entry
        features['last_phrase_boundaries_has_vocals'] = has_vocals_exit

    if output_path is not None:
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)
        

# Run the vocal check for every song in results.json; called like this it only prints the flags.
update_json_phrase_boundaries_vocals(json_path, raw_vocals_path)

song name: waitingforlove-avicii.wav
entry phrase boundaries: ['00:00', '00:15', '00:30', '00:46']
bool results: [False, True, True, True]
exit phrase boundaries: ['03:05', '03:20', '03:35', '03:49']
bool results: [False, False, False, False]
song name: wakemeup-avicii.wav
entry phrase boundaries: ['00:00', '00:15', '00:31', '00:47']
bool results: [False, False, True, True]
exit phrase boundaries: ['03:06', '03:22', '03:37', '03:53']
bool results: [True, False, False, False]
song name: stargazing-kygo.wav
entry phrase boundaries: ['00:00', '00:17', '00:34', '00:52']
bool results: [False, False, True, True]
exit phrase boundaries: ['02:49', '03:08', '03:27', '03:47']
bool results: [True, True, True, True]
