In [1]:
import os
import sys
import warnings

# Directory that holds the ffmpeg executables. The default is the original
# machine-specific WinGet install location; set the FFMPEG_BIN_PATH environment
# variable to point somewhere else on another machine.
ffmpeg_bin_path = os.environ.get(
    "FFMPEG_BIN_PATH",
    r"C:\Users\arezk\AppData\Local\Microsoft\WinGet\Packages\Gyan.FFmpeg_Microsoft.Winget.Source_8wekyb3d8bbwe\ffmpeg-8.0.1-full_build\bin",
)

# Extend PATH only when the directory really exists and is not listed yet, so
# that re-running this cell does not keep appending duplicate entries and a
# machine without this install does not get a dead PATH entry.
_current_path = os.environ.get("PATH", "")
if os.path.isdir(ffmpeg_bin_path) and ffmpeg_bin_path not in _current_path.split(os.pathsep):
    os.environ["PATH"] = (
        _current_path + os.pathsep + ffmpeg_bin_path if _current_path else ffmpeg_bin_path
    )

# NOTE: this silences every warning for the whole session.
warnings.filterwarnings("ignore")

# Make the parent directory importable (the `src` package is imported from it);
# guarded so repeated runs do not add the same entry again.
if '..' not in sys.path:
    sys.path.append('..')

In [2]:
from src.utils import spectrogram_to_audio
from src.model import Unet
import torch 
import numpy as np
import IPython.display as ipd
import musdb
import librosa
from scipy.io import wavfile


In [3]:
# Load the precomputed arrays for one test track.
# The root directory and the track stem are named once so that the three
# file paths below cannot drift apart when switching to another track.
SPEC_ROOT = '../spectrograms/musdb18/test'
TRACK_STEM = 'Al_James_-_Schoolboy_Facination'

mixture_path = f'{SPEC_ROOT}/mixture/{TRACK_STEM}_spec.npy'
vocal_path = f'{SPEC_ROOT}/vocal/{TRACK_STEM}_spec.npy'
mix_phase_path = f'{SPEC_ROOT}/phase/{TRACK_STEM}_phase.npy'

# Mixture spectrogram, vocal spectrogram and mixture phase (as named by the files).
mix_spec = np.load(mixture_path)
vocal_spec = np.load(vocal_path)
mix_phase = np.load(mix_phase_path)

# Later cells unpack mix_spec.shape as (freq_bins, total_frames); fail early
# with a readable message if the file holds something else.
if mix_spec.ndim != 2:
    raise ValueError(
        f"Expected a 2-D mixture spectrogram in {mixture_path}, got shape {mix_spec.shape}"
    )

In [4]:
# Build the network and restore the trained weights from the checkpoint.
model = Unet()
model_path = '../checkpoints/musdb18_V2/model_last.pth'
model.load(model_path)

# Use the GPU when one is available, otherwise fall back to the CPU.
model_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The inference loop sends its input tensors to `model_device`, so the model
# has to live on the same device; otherwise a CUDA machine hits a device
# mismatch. Assumes Unet is a torch.nn.Module (it is used with .eval() and
# .forward()) — TODO confirm. Assigned rather than left as a bare expression
# so the cell does not print the model repr.
model = model.to(model_device)

Loading model from ../checkpoints/musdb18_V2/model_last.pth
Model loaded successfully!


In [5]:
# Inference parameters: patches of `patch_size` frames, advanced by `hop_size`
# frames, i.e. consecutive patches overlap by half a patch.
patch_size = 128
hop_size = 64  # 50% overlap

# Spectrogram layout is (frequency bins, time frames).
freq_bins, total_frames = mix_spec.shape
print(f"Total frames to process: {total_frames}")

# Per-song normalisation by the peak value of the mixture spectrogram.
# An all-zero (silent) spectrogram would give norm == 0 and turn the division
# into NaN/inf, so fall back to 1.0 in that case (the data stays all zeros).
norm = mix_spec.max()
if norm == 0:
    norm = 1.0
mix_normalized = mix_spec / norm

# Overlap-add accumulators: the sum of the predicted patches and, per frame,
# the number of patches that contributed to it.
vocal_spec_sum = np.zeros_like(mix_spec)
weight_sum = np.zeros(total_frames)

# Sliding-window state consumed by the inference loop.
start = 0
patch_count = 0


Total frames to process: 2137


In [6]:
# Sliding-window inference over the normalised mixture spectrogram.
# The accumulators and the patch counter are reset here so that re-running
# this cell starts from a clean state instead of silently doing nothing
# (or adding a second pass on top of the first one).
vocal_spec_sum.fill(0)
weight_sum.fill(0)
patch_count = 0

model.eval()
with torch.no_grad():
    for start in range(0, total_frames, hop_size):
        end = min(start + patch_size, total_frames)

        # Extract patch
        patch = mix_normalized[:, start:end]
        original_patch_size = patch.shape[1]

        # The last patches can be shorter than patch_size: zero-pad on the
        # time axis so the network always sees a full-width patch.
        if original_patch_size < patch_size:
            padding = patch_size - original_patch_size
            patch = np.pad(patch, ((0, 0), (0, padding)), mode='constant')

        # Drop the first frequency bin before feeding the network (513 → 512)
        patch_512 = patch[1:, :]

        # Add batch and channel axes, convert to a float tensor on the model device
        patch_tensor = torch.from_numpy(patch_512[np.newaxis, np.newaxis, :, :]).float()
        patch_tensor = patch_tensor.to(model_device)

        # Predict the mask. Calling the module (rather than .forward directly)
        # is the standard PyTorch entry point and runs any registered hooks.
        mask = model(patch_tensor)

        # Back to a 2-D numpy array (first batch item, first channel)
        mask_np = mask.cpu().numpy()[0, 0, :, :]

        # Re-insert the dropped frequency bin with a mask value of 0, so that
        # bin is zeroed in the estimate. Uses freq_bins instead of a literal 513.
        mask_full = np.zeros((freq_bins, patch_size))
        mask_full[1:, :] = mask_np

        # Apply mask to patch
        vocal_patch = mask_full * patch

        # Discard the frames that were only padding
        vocal_patch = vocal_patch[:, :original_patch_size]

        # Overlap-add: accumulate the estimate and count the contributions per frame
        vocal_spec_sum[:, start:end] += vocal_patch
        weight_sum[start:end] += 1

        patch_count += 1
        if patch_count % 10 == 0:
            print(f"Processed {patch_count} patches, frames {start}-{end}/{total_frames}")

print(f"\nTotal patches processed: {patch_count}")


Processed 10 patches, frames 576-704/2137
Processed 20 patches, frames 1216-1344/2137
Processed 30 patches, frames 1856-1984/2137

Total patches processed: 34


In [7]:

# Every frame must have been covered by at least one patch; a zero weight
# would turn the averaging below into NaN. This typically means the
# sliding-window inference cell has not been run yet.
if not np.all(weight_sum > 0):
    raise RuntimeError(
        "Some frames have zero overlap weight; run the sliding-window inference cell first."
    )

# Average overlapping regions (sum of patch estimates / number of contributions)
vocal_spec_full = vocal_spec_sum / weight_sum[np.newaxis, :]

# Undo the per-song normalisation
vocal_spec_full = vocal_spec_full * norm

print(f"Final vocal spec shape: {vocal_spec_full.shape}")
print(f"Vocal spec range: [{vocal_spec_full.min():.3f}, {vocal_spec_full.max():.3f}]")

# Reconstruct audio from the estimated vocal spectrogram and the mixture phase
vocal_audio_predicted = spectrogram_to_audio(vocal_spec_full, mix_phase)

print(f"Reconstructed audio shape: {vocal_audio_predicted.shape}")

# Save the audio; create the output directory first so a fresh checkout
# does not fail on a missing folder.
output_path = '../outputs/Al_James_-_Schoolboy_Facination_vocals_predicted.wav'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

SAMPLE_RATE = 44100  # sample rate used when writing the WAV file

# float64 samples would be stored as a 64-bit float WAV, which is not among
# the common formats listed for scipy.io.wavfile.write; write float32 instead.
# The in-memory `vocal_audio_predicted` array is left untouched.
audio_to_write = np.asarray(vocal_audio_predicted)
if audio_to_write.dtype == np.float64:
    audio_to_write = audio_to_write.astype(np.float32)

wavfile.write(output_path, SAMPLE_RATE, audio_to_write)
print(f"Saved predicted vocals to: {output_path}")



Final vocal spec shape: (513, 2137)
Vocal spec range: [0.000, 1.000]
Reconstructed audio shape: (8831025,)
Saved predicted vocals to: ../outputs/Al_James_-_Schoolboy_Facination_vocals_predicted.wav


In [8]:
# Load the ground-truth track that matches the spectrogram used above.
# The track is selected by name rather than by a positional index, so a
# different dataset ordering cannot silently pick another song.
GROUND_TRUTH_TRACK_NAME = 'Al James - Schoolboy Facination'

mus_test = musdb.DB(root='../data/musdb18', is_wav=False, subsets='test')
ground_truth_track = next(
    (track for track in mus_test if track.name == GROUND_TRUTH_TRACK_NAME),
    None,
)
if ground_truth_track is None:
    raise ValueError(
        f"Track {GROUND_TRUTH_TRACK_NAME!r} not found in the musdb test subset"
    )

rate = ground_truth_track.rate
mix_audio = ground_truth_track.audio
vocal_audio = ground_truth_track.targets['vocals'].audio

# Down-mix to mono by averaging over axis 1
# (presumably the channel axis, i.e. audio is (samples, channels) — TODO confirm).
mix_mono = np.mean(mix_audio, axis=1)
vocal_mono = np.mean(vocal_audio, axis=1)

print("name:", ground_truth_track.name)
print("Duree:", ground_truth_track.duration)
print("Shape of mix:", mix_mono.shape)  # mono after the down-mix above
print("Shape vocals:", vocal_mono.shape)
print("Sample rate:", ground_truth_track.rate)


name: Al James - Schoolboy Facination
Duree: 200.327007
Shape of mix: (8835072,)
Shape vocals: (8835072,)
Sample rate: 44100


In [10]:
# Section labels for the three audio clips: ground-truth mix, ground-truth
# vocals and the predicted vocal. The playback widgets themselves are kept
# disabled (commented out) below; only the labels are printed.
for section_label in ("Ground Truth Mix:", "Ground Truth Vocals:", "predicted vocal"):
    print(section_label)

# Disabled playback calls, in the same order as the labels above:
# display(ipd.Audio(ground_truth_track.audio.T, rate=ground_truth_track.rate))
# display(ipd.Audio(ground_truth_track.targets['vocals'].audio.T, rate=ground_truth_track.rate))
# display(ipd.Audio(vocal_audio_predicted, rate=ground_truth_track.rate))


Ground Truth Mix:
Ground Truth Vocals:
predicted vocal
