# Audio Source Separation

## Setup

In [1]:
# Imports and setup

import os
import glob
import json

import numpy as np
import pandas as pd
import soundfile as sf
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from scipy.stats import kruskal, mannwhitneyu

import musdb
import museval
import torch, torchaudio

import openunmix.predict as openunmix

from demucs.pretrained import get_model
from demucs.apply import apply_model
from demucs.audio import convert_audio

from asteroid.models import XUMX

In [2]:
# Device configuration: pick CUDA when torch reports it, otherwise the CPU,
# and print what was selected.
cuda_ok = torch.cuda.is_available()
device = torch.device("cuda") if cuda_ok else torch.device("cpu")
print("CUDA available:", cuda_ok)
if not cuda_ok:
    print("Using CPU")
else:
    # Looking up the device name is informational only, so a failure here is
    # reported instead of aborting the notebook.
    try:
        gpu_name = torch.cuda.get_device_name(0)
    except Exception as err:
        print("Could not get CUDA device name:", err)
    else:
        print("CUDA device:", gpu_name)

CUDA available: True
CUDA device: AMD Radeon RX 9070 XT


In [3]:
# Paths
path_to_folder = "/home/teraflops/Documents/code/music_source_separation"
musdb_root = os.path.join(path_to_folder,"datasets", "musdb18hq")
estimates_base_path = os.path.join(path_to_folder, "estimates")
output_base_path = os.path.join(path_to_folder, "outputs")

# Ensure directories exist
os.makedirs(estimates_base_path, exist_ok=True)
os.makedirs(output_base_path, exist_ok=True)
!ls

[1;34md[33mr[31mw[32mx[0m[33mr[1;90m-[0m[32mx[33mr[1;90m-[0m[32mx[0m    [1;90m-[0m [1;33mteraflops[0m [34m 6 paź 19:31[0m [34m [1m.git[0m
[1;34md[33mr[31mw[32mx[0m[33mr[1;90m-[0m[32mx[33mr[1;90m-[0m[32mx[0m    [1;90m-[0m [1;33mteraflops[0m [34m11 paź 00:50[0m [34m [1mexamples[0m
.[1;33mr[31mw[90m-[0m[33mr[1;90m--[0m[33mr[1;90m--[0m   [32m18[0m [1;33mteraflops[0m [34m 5 wrz 01:02[0m 󰊢 .gitignore
.[1;33mr[31mw[90m-[0m[33mr[1;90m--[0m[33mr[1;90m--[0m    [32m8[0m [1;33mteraflops[0m [34m 4 wrz 23:19[0m  .python-version
.[1;33mr[31mw[90m-[0m[33mr[1;90m--[0m[33mr[1;90m--[0m [1;32m4,1k[0m [1;33mteraflops[0m [34m14 paź 01:30[0m  median_mean_scores.csv
.[1;33mr[31mw[90m-[0m[33mr[1;90m--[0m[33mr[1;90m--[0m [1;32m2,9k[0m [1;33mteraflops[0m [34m14 paź 01:30[0m  median_median_scores.csv
.[1;33mr[31mw[4;32mx[0m[33mr[1;90m-[0m[32mx[33mr[1;90m-[0m[32mx[0m [1;32m112k[0m [1;3

In [4]:
# Load MUSDB dataset: test subset only, read from the WAV layout under musdb_root.
mus = musdb.DB(root=musdb_root, subsets="test", is_wav=True)

## Open-Unmix

In [None]:
# --- SETTINGS ---
model_name = "openunmix"
targets = ["vocals", "drums", "bass", "other"]
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# --- PATHS ---
# One estimates folder and one evaluation-output folder per model.
estimates_path, output_path = (
    os.path.join(base, model_name)
    for base in (estimates_base_path, output_base_path)
)
for model_dir in (estimates_path, output_path):
    os.makedirs(model_dir, exist_ok=True)

# --- CREATE TRACK SUBDIRECTORIES BEFORE SEPARATION ---
# Layout: <estimates_path>/test/<track name>/
for track in mus:
    track_path = os.path.join(estimates_path, "test", track.name)
    os.makedirs(track_path, exist_ok=True)

print(f"[✓] Created subdirectories for all {len(mus)} tracks.")

In [None]:
# --- SEPARATION LOOP ---
# For every track in `mus`, run openunmix.separate on the mixture and write one
# WAV per returned target to <estimates_path>/test/<track name>/<target>.wav.
for track in mus:
    print(f"[→] Separating: {track.name}")

    track_path = os.path.join(estimates_path, "test", track.name)
    # Do not rely on an earlier cell having created the folder.
    os.makedirs(track_path, exist_ok=True)

    # Resume support: skip a track only when *every* requested stem is already
    # on disk. Checking just the first stem would permanently skip a track
    # whose run was interrupted after that first file had been written.
    stems_done = all(
        os.path.exists(os.path.join(track_path, f"{stem}.wav")) for stem in targets
    )
    if stems_done:
        print("   ↳ Already exists, skipping.")
        continue

    # Load audio and separate. track.audio is transposed so that channels come
    # first, matching the (channels, samples) layout passed to separate().
    audio = torch.tensor(track.audio.T).float().to(device)
    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        estimates = openunmix.separate(
            audio=audio,
            rate=track.rate,
            targets=targets,
            residual=False,  # no residual stem is requested
            device=device,
        )

    # --- Save separated stems ---
    for target, audio_tensor in estimates.items():
        # Transpose + squeeze drops singleton axes so soundfile receives
        # (samples, channels) for multi-channel audio or (samples,) for mono.
        audio_np = np.squeeze(audio_tensor.detach().cpu().numpy().T)
        out_file = f"{target}.wav"
        out_path = os.path.join(track_path, out_file)
        sf.write(out_path, audio_np, track.rate)

print("\n[✓] Separation finished.")

In [None]:
# --- EVALUATION ---
# Run museval over the estimates folder of the currently configured model and
# hand it the matching output folder.
print("\n[✓] Evaluating using museval.eval_mus_dir...")

eval_kwargs = dict(
    dataset=mus,
    estimates_dir=estimates_path,
    output_dir=output_path,
    ext="wav",
)
eval_results = museval.eval_mus_dir(**eval_kwargs)

print("[✓] Evaluation complete.")

## HTDemucs v4 (fine-tuned, `htdemucs_ft`)

In [9]:
# --- SETTINGS ---
model_name = "htdemucs_ft"
targets = ["vocals", "drums", "bass", "other"]
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# --- PATHS ---
# One estimates folder and one evaluation-output folder per model.
estimates_path, output_path = (
    os.path.join(base, model_name)
    for base in (estimates_base_path, output_base_path)
)
for model_dir in (estimates_path, output_path):
    os.makedirs(model_dir, exist_ok=True)

# --- CREATE TRACK SUBDIRECTORIES BEFORE SEPARATION ---
# Layout: <estimates_path>/<subset>/<track name>/
for track in mus:
    track_path = os.path.join(estimates_path, track.subset, track.name)
    os.makedirs(track_path, exist_ok=True)

print(f"[✓] Created subdirectories for all {len(mus)} tracks ({model_name}).")

[✓] Created subdirectories for all 50 tracks (htdemucs_ft).


In [10]:
# --- LOAD MODEL ---
demucs_model = get_model("htdemucs_ft")
demucs_model.to(device)

# Stems written for every track; any model source outside this list is folded
# into "other", and any stem the model does not produce is written as silence.
canonical_targets = ["vocals", "drums", "bass", "other"]

# --- SEPARATION LOOP ---
for track in mus:
    print(f"[→] Separating: {track.name}")

    track_path = os.path.join(estimates_path, track.subset, track.name)
    # Do not rely on an earlier cell having created the folder.
    os.makedirs(track_path, exist_ok=True)

    # Resume support: skip a track only when *every* stem this loop writes is
    # already on disk. Checking just the first stem would permanently skip a
    # track whose run was interrupted after that first file had been written.
    stems_done = all(
        os.path.exists(os.path.join(track_path, f"{tgt}.wav"))
        for tgt in canonical_targets
    )
    if stems_done:
        print("   ↳ Already exists, skipping.")
        continue

    # Prepare audio for demucs: add a leading batch axis -> (1, channels, samples)
    audio = torch.tensor(track.audio.T).float().unsqueeze(0)
    audio = audio.to(device)
    rate = track.rate

    # Convert audio to the model's sample rate / channel count if needed
    audio = convert_audio(
        audio, rate, demucs_model.samplerate, demucs_model.audio_channels
    )

    # NOTE(review): demucs' own separation script appears to standardise the
    # mixture (mean/std of the mono reference) before apply_model and to undo
    # it afterwards; this loop feeds the raw waveform — confirm that is intended.
    with torch.no_grad():
        sources = apply_model(demucs_model, audio, device=device)

    # Drop the batch axis -> (n_sources, channels, samples)
    sources = sources.squeeze(0)
    n_channels, n_samples = sources.shape[1], sources.shape[2]

    # Map demucs sources to canonical targets
    estimates = {}
    for i, source_name in enumerate(demucs_model.sources):
        name = source_name.lower()
        if name in canonical_targets:
            estimates[name] = sources[i]
        elif "other" not in estimates:
            # first unexpected source starts the 'other' accumulator (cloned so
            # the accumulator does not alias this source's slice of `sources`)
            estimates["other"] = sources[i].clone()
        else:
            # further unexpected sources are summed into 'other'
            estimates["other"] += sources[i]

    # Ensure all canonical targets exist (fill missing with zeros of the same shape)
    for tgt in canonical_targets:
        if tgt not in estimates:
            estimates[tgt] = torch.zeros((n_channels, n_samples), device=audio.device)

    # Convert back to the original sample rate if needed and move to CPU
    for key in list(estimates.keys()):
        tensor = estimates[key]  # (channels, samples)
        if demucs_model.samplerate != rate and tensor.numel() > 0:
            tensor = torchaudio.functional.resample(
                tensor, demucs_model.samplerate, rate
            )
        estimates[key] = tensor.detach().cpu()

    # Save separated audio files as (samples, channels) arrays
    for target in canonical_targets:
        audio_tensor = estimates[target]
        if audio_tensor.numel() == 0:
            audio_np = np.zeros((0, n_channels), dtype=np.float32)
        else:
            audio_np = np.squeeze(audio_tensor.numpy()).T

        out_file = f"{target}.wav"
        out_path = os.path.join(track_path, out_file)
        # Write using the track's original rate
        sf.write(out_path, audio_np, rate)

print("\n[✓] Separation finished for HTDemucs.")

[→] Separating: AM Contra - Heart Peripheral
[→] Separating: Al James - Schoolboy Facination
[→] Separating: Angels In Amplifiers - I'm Alright
[→] Separating: Arise - Run Run Run
[→] Separating: BKS - Bulldozer
[→] Separating: BKS - Too Much
[→] Separating: Ben Carrigan - We'll Talk About It All Tonight
[→] Separating: Bobby Nobody - Stitch Up
[→] Separating: Buitraker - Revo X
[→] Separating: Carlos Gonzalez - A Place For Us
[→] Separating: Cristina Vane - So Easy
[→] Separating: Detsky Sad - Walkie Talkie
[→] Separating: Enda Reilly - Cur An Long Ag Seol
[→] Separating: Forkupines - Semantics
[→] Separating: Georgia Wonder - Siren
[→] Separating: Girls Under Glass - We Feel Alright
[→] Separating: Hollow Ground - Ill Fate
[→] Separating: James Elder & Mark M Thompson - The English Actor
[→] Separating: Juliet's Rescue - Heartbeats
[→] Separating: Little Chicago's Finest - My Own
[→] Separating: Louis Cressy Band - Good Time
[→] Separating: Lyndsey Ollard - Catching Up
[→] Separating

In [30]:
# --- EVALUATION ---
# Run museval over the estimates folder of the currently configured model and
# hand it the matching output folder.
print("\n[✓] Evaluating using museval.eval_mus_dir...")

eval_kwargs = dict(
    dataset=mus,
    estimates_dir=estimates_path,
    output_dir=output_path,
    ext="wav",
)
eval_results = museval.eval_mus_dir(**eval_kwargs)

print("[✓] Evaluation complete.")


[✓] Evaluating using museval.eval_mus_dir...
[✓] Evaluation complete.
[✓] Evaluation complete.


## Combination

In [5]:
# --- COMBINE ESTIMATES BY AVERAGING TRANSMITTANCES (magnitude + phase) ---
# For every model the complex transmittance T = S_est / S_mix is computed on the
# STFT grid. Magnitudes and phases of T are averaged over the listed models for
# each target, the averaged T is applied to the mixture STFT, and the result is
# transformed back to a waveform that museval can score.

import math
import glob

# What is combined, and under which name the result is stored.
canonical_targets = ["vocals", "drums", "bass", "other"]
combined_model_name = "combined_transmittance_all"
models_to_combine = [
    "bs_roformer",
    "htdemucs",
    "htdemucs_ft",
    "htdemucs_zf",
    "mel_band_roformer",
    "openunmix",
    "scnet_masked_xl_ihf",
    "scnet_xl",
    "scnet_xl_ihf",
    "XUMXL",
]

# STFT parameters (tunable); the analysis window spans the whole FFT frame.
n_fft = 4096
hop_length = 1024
win_length = n_fft
window = torch.hann_window(n_fft)

# small epsilon to avoid division by zero
EPS = 1e-9

# Estimates and evaluation outputs of the combined model get their own folders.
estimates_combined_path, output_combined_path = (
    os.path.join(base, combined_model_name)
    for base in (estimates_base_path, output_base_path)
)
for combined_dir in (estimates_combined_path, output_combined_path):
    os.makedirs(combined_dir, exist_ok=True)


def stft_per_channel(wave_np, device="cpu"):
    """Compute the complex STFT of every channel of a waveform.

    Uses the notebook-level STFT settings ``n_fft``, ``hop_length``,
    ``win_length`` and ``window``.

    Args:
        wave_np: array-like waveform. A 1-D input is treated as mono. A 2-D
            input with more rows than columns is treated as
            (samples, channels) and transposed; any other 2-D input is used
            as-is, i.e. as (channels, samples).
        device: torch device on which the transform is computed. Defaults to
            "cpu", which matches what existing callers get.

    Returns:
        Complex tensor of shape (channels, freq_bins, frames) on ``device``.
    """
    wave = torch.as_tensor(wave_np, dtype=torch.float32, device=device)

    if wave.ndim == 1:
        wave = wave.unsqueeze(0)  # mono -> (1, samples)
    elif wave.ndim == 2 and wave.shape[0] > wave.shape[1]:
        wave = wave.t()  # (samples, channels) -> (channels, samples)

    # torch.stft treats a 2-D input as a batch of signals, so all channels are
    # transformed in one call; the window has to live on the same device.
    return torch.stft(
        wave.contiguous(),
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=window.to(wave.device),
        center=True,
        return_complex=True,
    )


def istft_per_channel(stft_complex, length=None):
    """Invert a per-channel complex STFT back to a waveform.

    Uses the notebook-level STFT settings ``n_fft``, ``hop_length``,
    ``win_length`` and ``window``.

    Args:
        stft_complex: complex tensor of shape (channels, freq_bins, frames).
        length: forwarded to ``torch.istft`` as the requested output length.

    Returns:
        numpy array of shape (samples, channels).
    """
    channel_waves = [
        torch.istft(
            stft_complex[ch],
            n_fft=n_fft,
            hop_length=hop_length,
            win_length=win_length,
            window=window,
            center=True,
            length=length,
        )
        for ch in range(stft_complex.shape[0])
    ]
    stacked = torch.stack(channel_waves, dim=0)  # (channels, samples)
    return stacked.detach().cpu().numpy().T  # -> (samples, channels)


def _match_length(audio, n_rows):
    """Return `audio` as a 2-D (samples, channels) array with exactly n_rows rows.

    Mono input is promoted to one column *before* trimming/padding, so a short
    mono estimate can be zero-padded without a shape mismatch.
    """
    if audio.ndim == 1:
        audio = audio[:, None]
    if audio.shape[0] > n_rows:
        return audio[:n_rows]
    if audio.shape[0] < n_rows:
        pad = np.zeros((n_rows - audio.shape[0], audio.shape[1]), dtype=audio.dtype)
        return np.vstack([audio, pad])
    return audio


def _find_track_dirs(model_base, track_name):
    """List every path named `track_name` anywhere below `model_base`.

    Both parts are glob-escaped so characters such as '[' or '*' in a folder or
    track name are matched literally.
    """
    pattern = os.path.join(glob.escape(model_base), "**", glob.escape(track_name))
    return glob.glob(pattern, recursive=True)


print(f"[→] Combining models: {models_to_combine} -> {combined_model_name}")

for track in mus:
    print(f"Processing track: {track.name}")
    subset = track.subset if hasattr(track, "subset") else "test"
    track_mix = track.audio  # (samples, channels)
    n_samples = track_mix.shape[0]

    # compute STFT of mixture
    mix_stft = stft_per_channel(track_mix)

    # prepare output directory for this track
    track_out_dir = os.path.join(estimates_combined_path, subset, track.name)
    os.makedirs(track_out_dir, exist_ok=True)

    # Search each model's folder once per track instead of once per target.
    # `src_model` is used as the loop name so the notebook-level `model_name`
    # set by the per-model settings cells is not overwritten.
    track_dirs_by_model = {
        src_model: _find_track_dirs(
            os.path.join(estimates_base_path, src_model), track.name
        )
        for src_model in models_to_combine
    }

    for target in canonical_targets:
        # per-model transmittances T = S_est / S_mix for this target
        Ts = []
        for src_model in models_to_combine:
            # first matching track folder that actually holds this stem
            est_path = None
            for track_dir in track_dirs_by_model[src_model]:
                candidate = os.path.join(track_dir, f"{target}.wav")
                if os.path.exists(candidate):
                    est_path = candidate
                    break

            if est_path is None:
                # model estimate not found for this target
                print(f"   [!] {src_model} missing {target} for {track.name}")
                continue

            est_audio, sr = sf.read(est_path)
            if sr != track.rate:
                # An STFT taken at another sample rate does not line up with
                # the mixture's frequency bins, so the ratio would be meaningless.
                print(
                    f"   [!] {src_model} {target} for {track.name} has sample rate "
                    f"{sr}, expected {track.rate}; skipping."
                )
                continue

            # same sample count as the mixture, always 2-D (samples, channels)
            est_audio = _match_length(est_audio, n_samples)
            est_stft = stft_per_channel(est_audio.T)

            # crop both spectrograms to their common freq/time extent
            n_freq = min(est_stft.shape[1], mix_stft.shape[1])
            n_frames = min(est_stft.shape[2], mix_stft.shape[2])
            est_stft = est_stft[:, :n_freq, :n_frames]
            mix_ref = mix_stft[:, :n_freq, :n_frames]

            Ts.append(est_stft / (mix_ref + EPS))

        if not Ts:
            # write zeros
            print(
                f"   [i] No estimates found for target {target} on track {track.name}; writing zeros."
            )
            zero_wav = np.zeros((n_samples, track_mix.shape[1]), dtype=np.float32)
            sf.write(os.path.join(track_out_dir, f"{target}.wav"), zero_wav, track.rate)
            continue

        # Now average magnitude and phase across Ts
        if len(Ts) == 1:
            T_comb = Ts[0]
        else:
            # align shapes
            n_freq = min(t.shape[1] for t in Ts)
            n_frames = min(t.shape[2] for t in Ts)
            Ts_al = [t[:, :n_freq, :n_frames] for t in Ts]

            # magnitude average
            mag_avg = sum(torch.abs(t) for t in Ts_al) / len(Ts_al)

            # phase average via the angle of the summed unit phasors
            vec_sum = sum(torch.exp(1j * torch.angle(t)) for t in Ts_al)
            phase_avg = torch.angle(vec_sum)

            T_comb = mag_avg * torch.exp(1j * phase_avg)

        # apply combined transmittance to the (equally cropped) mixture; the
        # same EPS offset as in the division is used so the two steps invert
        mix_al = mix_stft[:, : T_comb.shape[1], : T_comb.shape[2]]
        S_comb = T_comb * (mix_al + EPS)

        # ISTFT back to waveform
        wav_np = istft_per_channel(S_comb, length=n_samples)

        # save
        out_path = os.path.join(track_out_dir, f"{target}.wav")
        sf.write(out_path, wav_np, track.rate)

print("\n[✓] Combined estimates saved.")

[→] Combining models: ['bs_roformer', 'htdemucs', 'htdemucs_ft', 'htdemucs_zf', 'mel_band_roformer', 'openunmix', 'scnet_masked_xl_ihf', 'scnet_xl', 'scnet_xl_ihf', 'XUMXL'] -> combined_transmittance_all
Processing track: AM Contra - Heart Peripheral
Processing track: Al James - Schoolboy Facination
Processing track: Angels In Amplifiers - I'm Alright
Processing track: Arise - Run Run Run
Processing track: BKS - Bulldozer
Processing track: BKS - Too Much
Processing track: Ben Carrigan - We'll Talk About It All Tonight
Processing track: Bobby Nobody - Stitch Up
Processing track: Buitraker - Revo X
Processing track: Carlos Gonzalez - A Place For Us
Processing track: Cristina Vane - So Easy
Processing track: Detsky Sad - Walkie Talkie
Processing track: Enda Reilly - Cur An Long Ag Seol
Processing track: Forkupines - Semantics
Processing track: Georgia Wonder - Siren
Processing track: Girls Under Glass - We Feel Alright
Processing track: Hollow Ground - Ill Fate
Processing track: James Eld

In [16]:
# Evaluate combined model: museval reads the combined estimates and is given
# the combined output folder for its results.
print("\n[→] Running museval evaluation on combined estimates...")
combined_eval_kwargs = dict(
    dataset=mus,
    estimates_dir=estimates_combined_path,
    output_dir=output_combined_path,
    ext="wav",
)
eval_results = museval.eval_mus_dir(**combined_eval_kwargs)
print("[✓] Museval evaluation complete. Results saved to:", output_combined_path)


[→] Running museval evaluation on combined estimates...
[✓] Museval evaluation complete. Results saved to: /home/teraflops/Documents/code/music_source_separation/outputs/combined_transmittance


## Comparison

In [6]:
import json
import os
import pandas as pd
import math
import numpy as np
from collections import defaultdict

# Root folder holding one sub-folder of JSON results per model
base_dir = "/home/teraflops/Documents/code/music_source_separation/outputs"

# Models to compare: the two combined variants first, then the individual models.
model_list = [
    "combined_transmittance", "combined_transmittance_all",
    "bs_roformer",
    "htdemucs", "htdemucs_ft", "htdemucs_zf",
    "mel_band_roformer",
    "openunmix",
    "scnet_masked_xl_ihf", "scnet_xl", "scnet_xl_ihf",
    "XUMXL",
]

# Track-level accumulators: model -> stem -> list of per-track {metric: value}.
# Each model gets its own defaultdict so the lists are never shared.
median_over_frames = {name: defaultdict(list) for name in model_list}
mean_over_frames = {name: defaultdict(list) for name in model_list}


def process_json_file(file_path):
    """Aggregate the per-frame scores stored in one track's JSON file.

    The file is expected to hold a top-level "targets" list whose entries have
    a "name" and a list of "frames", each frame carrying a "metrics" dict.

    A frame is discarded when it has no metrics or when any of its metric
    values is missing (JSON ``null`` or NaN). Targets left without a usable
    frame are omitted from the result.

    Args:
        file_path: path of the JSON file to read.

    Returns:
        ``(track_median, track_mean)``: two mappings ``stem -> {metric: value}``
        holding the median and the mean over the kept frames for each of
        SDR, SIR, SAR and ISR present in the file. If the file cannot be read
        or parsed, the error is printed and ``({}, {})`` is returned.
    """
    metric_names = ("SDR", "SIR", "SAR", "ISR")

    def _is_missing(value):
        # JSON null decodes to None, and math.isnan(None) raises TypeError,
        # which would otherwise discard the whole track via the handler below.
        return value is None or math.isnan(value)

    try:
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)

        track_median = defaultdict(dict)
        track_mean = defaultdict(dict)

        for target in data.get("targets", []):
            stem = target["name"]
            frames = target.get("frames", [])

            # Keep only frames whose metrics are all present and finite-or-inf
            valid_metrics = []
            for frame in frames:
                metrics = frame.get("metrics") or {}
                if metrics and not any(_is_missing(v) for v in metrics.values()):
                    valid_metrics.append(metrics)
            if not valid_metrics:
                continue

            df = pd.DataFrame(valid_metrics)

            # Median and mean over frames for each metric the file provides
            for metric in metric_names:
                if metric in df.columns:
                    track_median[stem][metric] = df[metric].median()
                    track_mean[stem][metric] = df[metric].mean()

        return track_median, track_mean
    except Exception as e:
        # Deliberately best-effort: one unreadable track must not abort the
        # whole comparison, so the problem is reported and the track skipped.
        print(f"Error loading file {file_path}: {e}")
        return {}, {}


# Metrics reported for every stem
_metric_names = ["SDR", "SIR", "SAR", "ISR"]

# Collect per-track aggregates from every "<base_dir>/<model>/test/*.json"
for model in model_list:
    model_dir = os.path.join(base_dir, model, "test")
    try:
        json_names = [
            entry for entry in os.listdir(model_dir) if entry.endswith(".json")
        ]
    except FileNotFoundError:
        print(f"Directory not found: {model_dir}")
        continue

    for json_file in json_names:
        file_path = os.path.join(model_dir, json_file)
        track_median, track_mean = process_json_file(file_path)

        for stem, scores in track_median.items():
            median_over_frames[model][stem].append(scores)
        for stem, scores in track_mean.items():
            mean_over_frames[model][stem].append(scores)

# Median over tracks, per model and stem, for both frame aggregations
median_over_tracks_median_over_frames = {}
median_over_tracks_mean_over_frames = {}

for model, per_stem_tracks in median_over_frames.items():
    stem_from_median = {}
    stem_from_mean = {}
    for stem, track_rows in per_stem_tracks.items():
        df_median = pd.DataFrame(track_rows)
        df_mean = pd.DataFrame(mean_over_frames[model][stem])
        stem_from_median[stem] = {
            metric: df_median[metric].median() for metric in _metric_names
        }
        stem_from_mean[stem] = {
            metric: df_mean[metric].median() for metric in _metric_names
        }
    median_over_tracks_median_over_frames[model] = stem_from_median
    median_over_tracks_mean_over_frames[model] = stem_from_mean


def _average_over_stems(per_stem_scores):
    """Average each metric over all stems present in `per_stem_scores`."""
    return {
        metric: np.mean([scores[metric] for scores in per_stem_scores.values()])
        for metric in _metric_names
    }


# Overall values: plain average over the stems of each model
overall_median_median = {
    model: _average_over_stems(median_over_tracks_median_over_frames[model])
    for model in median_over_tracks_median_over_frames
}
overall_median_mean = {
    model: _average_over_stems(median_over_tracks_mean_over_frames[model])
    for model in median_over_tracks_median_over_frames
}

# One table row per (model, stem)
median_median_data = [
    {"Model": model, "Stem": stem, **scores}
    for model, per_stem in median_over_tracks_median_over_frames.items()
    for stem, scores in per_stem.items()
]
median_mean_data = [
    {"Model": model, "Stem": stem, **median_over_tracks_mean_over_frames[model][stem]}
    for model, per_stem in median_over_tracks_median_over_frames.items()
    for stem in per_stem
]

median_median_df = pd.DataFrame(median_median_data)
median_mean_df = pd.DataFrame(median_mean_data)

# One table row per model for the overall scores
overall_median_median_data = [
    {"Model": model, "Metric": "Overall", **scores}
    for model, scores in overall_median_median.items()
]
overall_median_mean_data = [
    {"Model": model, "Metric": "Overall", **overall_median_mean[model]}
    for model in overall_median_median
]

overall_median_median_df = pd.DataFrame(overall_median_median_data)
overall_median_mean_df = pd.DataFrame(overall_median_mean_data)

# Display the DataFrames
for heading, table in (
    ("Aggregated Scores (median over frames, median over tracks):", median_median_df),
    ("\nAggregated Scores (mean over frames, median over tracks):", median_mean_df),
    (
        "\nOverall Aggregated Scores (median over frames, median over tracks):",
        overall_median_median_df,
    ),
    (
        "\nOverall Aggregated Scores (mean over frames, median over tracks):",
        overall_median_mean_df,
    ),
):
    print(heading)
    display(table)

# Save the DataFrames to CSV files (relative to the current working directory)
for table, csv_name in (
    (median_median_df, "median_median_scores.csv"),
    (median_mean_df, "median_mean_scores.csv"),
    (overall_median_median_df, "overall_median_median_scores.csv"),
    (overall_median_mean_df, "overall_median_mean_scores.csv"),
):
    table.to_csv(csv_name, index=False)

Aggregated Scores (median over frames, median over tracks):


Unnamed: 0,Model,Stem,SDR,SIR,SAR,ISR
0,combined_transmittance,vocals,10.74956,16.650982,9.551745,16.933788
1,combined_transmittance,drums,11.492297,17.883713,10.82826,17.478515
2,combined_transmittance,bass,10.187785,16.478967,7.509482,8.473768
3,combined_transmittance,other,7.8206,7.675952,7.222433,14.536465
4,combined_transmittance_all,vocals,9.965483,15.300665,8.986767,15.875892
5,combined_transmittance_all,drums,10.96246,17.095688,10.37999,16.296875
6,combined_transmittance_all,bass,9.479975,15.05557,7.7849,8.616758
7,combined_transmittance_all,other,7.181712,7.506863,6.838673,13.336305
8,bs_roformer,vocals,10.783645,17.552183,9.40347,19.766777
9,bs_roformer,drums,11.681195,18.863063,9.824075,19.063923



Aggregated Scores (mean over frames, median over tracks):


Unnamed: 0,Model,Stem,SDR,SIR,SAR,ISR
0,combined_transmittance,vocals,8.04249,12.058033,8.237939,16.36052
1,combined_transmittance,drums,11.271803,16.59426,10.245366,17.024008
2,combined_transmittance,bass,9.801693,14.594459,6.995798,7.77304
3,combined_transmittance,other,7.903289,8.344181,7.002936,14.364417
4,combined_transmittance_all,vocals,7.188614,10.080268,8.042457,15.111002
5,combined_transmittance_all,drums,10.851897,13.921134,9.876769,15.951941
6,combined_transmittance_all,bass,9.10552,12.699079,7.234027,6.855695
7,combined_transmittance_all,other,7.267335,8.062198,6.615754,13.124293
8,bs_roformer,vocals,9.347908,13.816514,8.389284,18.071669
9,bs_roformer,drums,11.425283,17.980565,9.75113,18.873249



Overall Aggregated Scores (median over frames, median over tracks):


Unnamed: 0,Model,Metric,SDR,SIR,SAR,ISR
0,combined_transmittance,Overall,10.062561,14.672404,8.77798,14.355634
1,combined_transmittance_all,Overall,9.397407,13.739696,8.497583,13.531457
2,bs_roformer,Overall,10.063626,15.710638,7.902784,15.927524
3,htdemucs,Overall,8.7725,14.295657,6.908044,13.579781
4,htdemucs_ft,Overall,9.066902,14.834339,7.16578,13.600759
5,htdemucs_zf,Overall,8.749767,14.127484,7.03331,12.969189
6,mel_band_roformer,Overall,9.164934,14.633258,7.291147,14.288508
7,openunmix,Overall,6.246601,10.672748,5.605714,11.123643
8,scnet_masked_xl_ihf,Overall,7.955321,12.357814,7.305193,12.612414
9,scnet_xl,Overall,9.073537,13.347804,7.370276,13.540144



Overall Aggregated Scores (mean over frames, median over tracks):


Unnamed: 0,Model,Metric,SDR,SIR,SAR,ISR
0,combined_transmittance,Overall,9.254819,12.897733,8.12051,13.880496
1,combined_transmittance_all,Overall,8.603342,11.19067,7.942252,12.760733
2,bs_roformer,Overall,9.649467,14.487252,7.647589,15.291023
3,htdemucs,Overall,8.161914,11.973731,6.588637,12.939043
4,htdemucs_ft,Overall,8.57794,12.48166,6.820523,13.174173
5,htdemucs_zf,Overall,8.139569,11.756625,6.532645,12.391735
6,mel_band_roformer,Overall,8.691874,12.397237,7.118164,13.632419
7,openunmix,Overall,5.810622,8.201712,5.345229,10.391487
8,scnet_masked_xl_ihf,Overall,7.318106,10.461763,6.773893,11.941685
9,scnet_xl,Overall,8.220995,11.33524,7.137635,12.954423
