In [None]:
# Cell to check versions
import torch
import sys

# Environment report: interpreter build, PyTorch build, CUDA visibility and
# the device this notebook targets. Emitted as one print call, one item per line.
print(
    f"Python version: {sys.version}",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",  # Should be False
    f"Device being used: {torch.device('cpu')}",
    sep="\n",
)

In [None]:
# Tensor / audio I/O stack.
import torch
import torchaudio
# NOTE(review): `simulate` is not referenced by any cell visible here, and the
# Open-Unmix documentation describes `openunmix.predict` / `openunmix.utils` as
# the inference helpers. Confirm that `simulate` is importable from the
# installed package; a failing import in this cell breaks Restart & Run All.
from openunmix import simulate
# Audio analysis and plotting.
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
# Rich notebook output (audio player widget).
import IPython.display as ipd
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# CPU device handle. NOTE(review): not referenced by the cells visible here;
# confirm whether the model/tensors are meant to be moved to it explicitly.
device = torch.device('cpu')

In [None]:
def separate_audio_unmix(audio_path):
    """
    Separate an audio file into vocals and accompaniment using Open-Unmix.

    The pretrained ``umxhq`` separator is fetched through ``torch.hub``. The
    input is downmixed to mono, resampled to the separator's sample rate if
    needed, and fed to the model as a batch of one stereo signal (the mono
    channel duplicated, because ``umxhq`` is a two-channel model). The four
    ``umxhq`` targets are aggregated into ``'vocals'`` and ``'accompaniment'``
    (drums + bass + other) and averaged back to a single channel.

    Args:
        audio_path: Path of an audio file readable by ``torchaudio.load``.

    Returns:
        Tuple ``(estimates, sample_rate)``:
            estimates: dict with keys ``'vocals'`` and ``'accompaniment'``;
                each value is a gradient-free tensor of shape
                ``(1, 1, nb_timesteps)``, so ``.squeeze()`` yields a 1-D signal.
            sample_rate: sample rate (Hz) of the returned estimates. This is
                the separator's rate, which differs from the file's rate when
                the input had to be resampled.
    """
    # Load audio
    print("Loading audio...")
    waveform, sample_rate = torchaudio.load(audio_path)

    # Convert to mono if stereo
    if waveform.shape[0] > 1:
        waveform = waveform.mean(0, keepdim=True)

    # Separate
    print("Separating audio (this may take a few minutes)...")
    model = torch.hub.load('sigsep/open-unmix-pytorch', 'umxhq', pretrained=True)
    model.eval()  # inference mode: disables dropout / batch-norm updates

    # The separator only works at the rate it was trained on.
    model_rate = int(model.sample_rate)
    if sample_rate != model_rate:
        waveform = torchaudio.functional.resample(waveform, sample_rate, model_rate)
        sample_rate = model_rate

    # Separator input is (nb_samples, nb_channels, nb_timesteps); umxhq expects
    # two channels, so the mono signal is duplicated.
    mixture = waveform.unsqueeze(0).repeat(1, 2, 1)

    # no_grad keeps memory low and lets callers convert the result to numpy.
    with torch.no_grad():
        stacked = model(mixture)

    # The raw output is a stacked tensor of all targets; turn it into the
    # documented two-entry dictionary.
    estimates = model.to_dict(
        stacked,
        aggregate_dict={
            'vocals': ['vocals'],
            'accompaniment': ['drums', 'bass', 'other'],
        },
    )

    # Collapse the duplicated channels back to mono.
    estimates = {
        target: audio.mean(dim=1, keepdim=True)
        for target, audio in estimates.items()
    }

    return estimates, sample_rate

In [None]:
audio_path = "infer3.mp3"

# Separate audio
estimates, sr = separate_audio_unmix(audio_path)

# Convert to numpy for visualization. detach() drops any autograd graph and
# cpu() moves the data to host memory; Tensor.numpy() raises otherwise.
vocals = estimates['vocals'].detach().cpu().squeeze().numpy()
accompaniment = estimates['accompaniment'].detach().cpu().squeeze().numpy()

# Plot and play each component
components = {
    "Vocals": vocals,
    "Instrumental": accompaniment
}

for name, signal in components.items():
    print(f"\nAnalyzing {name}...")

    # Plots use a mono downmix so the STFT stays 2-D (bins x frames) even when
    # an estimate carries more than one channel; playback keeps the full signal.
    plot_signal = librosa.to_mono(signal)

    fig, (ax_wave, ax_spec) = plt.subplots(2, 1, figsize=(15, 8))

    # Waveform
    librosa.display.waveshow(plot_signal, sr=sr, ax=ax_wave)
    ax_wave.set_title(f"{name} - Waveform")

    # Spectrogram (dB relative to the loudest bin)
    D = librosa.amplitude_to_db(np.abs(librosa.stft(plot_signal)), ref=np.max)
    img = librosa.display.specshow(D, y_axis='log', x_axis='time', sr=sr, ax=ax_spec)
    fig.colorbar(img, ax=ax_spec, format='%+2.0f dB')
    ax_spec.set_title(f"{name} - Spectrogram")

    fig.tight_layout()
    plt.show()
    plt.close(fig)  # release the figure; one is created per component

    print(f"Playing {name}...")
    ipd.display(ipd.Audio(signal, rate=sr))