# Separating A Capella Songs into their Separated Vocal Tracks using Spleeter

In [1]:
# Turn mono songs into stereo songs
from IPython.display import Audio
import csv
import os
import pandas as pd
import sox

paths = []
with open('configs/jacapella_train.csv', newline='') as csvfile:
    # ignore header
    reader = csv.reader(csvfile, delimiter=',')
    next(reader)
    for row in reader:
        paths.append(row[0])
        paths.append(row[1])
        paths.append(row[2])
        paths.append(row[3])
        paths.append(row[4])
        paths.append(row[5])
        paths.append(row[6])
 
for path in paths:
    !sox {path} abc\\{path} channels 2
    

In [None]:
# Train model using CPU
!spleeter train -p configs/jacapella_config.json --verbose -d C:\Users\ssyda\DataspellProjects\IS424\Notebooks\configs\jacapelladb                                  

In [2]:
# Train model using GPU
!docker run --gpus all -v "%cd%"/output:/output -v "%cd%"/configs:/configs -v "%cd%"/jacapella_model:/jacapella_model deezer/spleeter-gpu:3.8 train -p configs/jacapella_config.json -d /configs/jacapelladb

INFO:spleeter:Start model training
INFO:spleeter:Loading audio b'/configs/jacapelladb/Dataset/Jacapella/popular/koinobori/mixture.wav' from 3.901919083422843 to 23.901919083422843
INFO:spleeter:Loading audio b'/configs/jacapelladb/Dataset/Jacapella/reggae/shoujoujinotanukibayashi/mixture.wav' from 14.238966463778494 to 34.238966463778496
INFO:spleeter:Loading audio b'/configs/jacapelladb/Dataset/Jacapella/bossa_nova/ryousen/mixture.wav' from 2.664351354576919 to 22.66435135457692
INFO:spleeter:Loading audio b'/configs/jacapelladb/Dataset/Jacapella/neutral/harugakita/mixture.wav' from 1.4068791025182006 to 21.4068791025182
INFO:spleeter:Audio data loaded successfully
INFO:spleeter:Loading audio b'/configs/jacapelladb/Dataset/Jacapella/neutral/hato/mixture.wav' from 9.289970163504 to 29.289970163504
INFO:spleeter:Audio data loaded successfully
INFO:spleeter:Loading audio b'/configs/jacapelladb/Dataset/Jacapella/neutral/harugakita/mixture.wav' from 5.034395512591003 to 25.034395512591004


In [None]:
# Split the song into its vocal tracks
!spleeter separate -p configs/jacapella_config.json -o output Dataset/Jacapella/popular/akatonbo/mixture.wav

In [16]:
# Evaluate the model
import mir_eval
import numpy as np

def eval_audio(signal1_path, signal2_path):
    
    
    signal1, _ = mir_eval.io.load_wav(signal1_path)
    signal2, _ = mir_eval.io.load_wav(signal2_path)

    # Make sure both signals have the same length
    min_length = min(len(signal1), len(signal2))
    signal1 = signal1[:min_length]
    signal2 = signal2[:min_length]
    
    # Compute SDR
    sdr, sir, sar, _ = mir_eval.separation.bss_eval_sources(reference_sources=np.array([signal1, signal2]), estimated_sources=np.array([signal1, signal2]))
    
    print(f"SDR: {sdr[0]} dB")

print("Alto")
eval_audio('Dataset/Jacapella/popular/akatonbo/alto.wav', 'output/mixture/alto.wav')
print("Bass")
eval_audio('Dataset/Jacapella/popular/akatonbo/bass.wav','output/mixture/bass.wav')
print("Lead Vocal")
eval_audio('Dataset/Jacapella/popular/akatonbo/lead_vocal.wav','output/mixture/lead_vocal.wav')
print("Soprano")
eval_audio('Dataset/Jacapella/popular/akatonbo/soprano.wav','output/mixture/soprano.wav')
print("Tenor")
eval_audio('Dataset/Jacapella/popular/akatonbo/tenor.wav','output/mixture/tenor.wav')
print("Vocal Percussion")
eval_audio('Dataset/Jacapella/popular/akatonbo/vocal_percussion.wav','output/mixture/vocal_percussion.wav')



Alto
SDR: 242.2009451705427 dB
Bass
SDR: 229.24778453305157 dB
Lead Vocal
SDR: 242.92454291889453 dB
Soprano
SDR: 196.82245528493604 dB
Tenor
SDR: 226.91228244413938 dB
Vocal Percussion
SDR: 201.29209448601856 dB


In [4]:
# HELPER FUNCTIONS
# get duration of a song
import librosa
import soundfile as sf
import numpy as np
import os

def get_duration(filename):
    y, sr = librosa.load(filename)
    duration = librosa.get_duration(y=y, sr=sr)
    return duration

# Add to csv file
def add_to_csv(mix_path, alto_path, bass_path, lead_vocal_path, soprano_path, tenor_path, vocal_percussion_path, csv_path):
    print("adding")
    duration = get_duration(mix_path)
    with open(csv_path, 'a') as f:
        f.write(mix_path + ',' + alto_path + ',' + bass_path + ',' + lead_vocal_path + ',' + soprano_path + ',' + tenor_path + ',' + vocal_percussion_path + ',' + str(duration) + '\n')

# Dataset/Jacapella/popular/akatonbo/mixture.wav
# for loop through all of the possible paths in Jacapella folder
# for loop through all of the possible genre folders in Jacapella folder
# for loop through all of the possible song folders in each genre folder
# add the paths to the csv file
for genre in [filename for filename in os.listdir('Dataset/Jacapella') if os.path.isdir(os.path.join('Dataset/Jacapella',filename))]:
    for song in [filename for filename in os.listdir('Dataset/Jacapella/'+genre) if os.path.isdir(os.path.join('Dataset/Jacapella/'+genre,filename))]:
        mix_path = 'Dataset/Jacapella/' + genre + '/' + song + '/mixture.wav'
        alto_path = 'Dataset/Jacapella/' + genre + '/' + song + '/alto.wav'
        bass_path = 'Dataset/Jacapella/' + genre + '/' + song + '/bass.wav'
        lead_vocal_path = 'Dataset/Jacapella/' + genre + '/' + song + '/lead_vocal.wav'
        soprano_path = 'Dataset/Jacapella/' + genre + '/' + song + '/soprano.wav'
        tenor_path = 'Dataset/Jacapella/' + genre + '/' + song + '/tenor.wav'
        vocal_percussion_path = 'Dataset/Jacapella/' + genre + '/' + song + '/vocal_percussion.wav'
        add_to_csv(mix_path, alto_path, bass_path, lead_vocal_path, soprano_path, tenor_path, vocal_percussion_path, 'configs/jacapella_train.csv')

adding
adding
adding
adding
adding
adding
adding
adding
adding
adding
adding
adding
adding
adding
adding
adding
adding
adding
adding
adding
adding
adding
adding
adding
adding
adding
adding
adding
adding
adding
adding
adding
adding
adding
adding
