# StarGANv2-VC Demo (VCTK 20 Speakers)

### Utils

In [None]:
# load packages
import random
import yaml
from munch import Munch
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import torchaudio
import librosa

from Utils.ASR.models import ASRCNN
from Utils.JDC.model import JDCNet
from models import Generator, MappingNetwork, StyleEncoder

%matplotlib inline

In [None]:
# Speaker list sources:
#   http://speech.ee.ntu.edu.tw/~jjery2243542/resource/model/is18/en_speaker_used.txt
#   https://github.com/jjery2243542/voice_conversion

# VCTK speaker IDs. A speaker's position in this list is what the notebook
# passes as the domain label (see the `speakers.index(...)` lookup further down).
speakers = [
    225, 228, 229, 230, 231, 233, 236, 239, 240, 244,
    226, 227, 232, 243, 254, 256, 258, 259, 270, 273,
]

# Mel-spectrogram front end shared by every `preprocess` call.
# NOTE(review): `sample_rate` is left at torchaudio's default although audio in
# this notebook is loaded at 24 kHz — presumably this mirrors the front end the
# checkpoints were trained with; confirm before changing it.
to_mel = torchaudio.transforms.MelSpectrogram(
    n_mels=80,
    n_fft=2048,
    win_length=1200,
    hop_length=300,
)

# Shift and scale applied to log-mel values inside `preprocess`.
mean = -4
std = 4

def preprocess(wave):
    """Turn a NumPy waveform into a standardised log-mel tensor.

    The waveform is cast to a float tensor and passed through the
    module-level ``to_mel`` transform. The result gets a new leading
    dimension of size one, is log-compressed with a ``1e-5`` floor and is
    finally shifted/scaled by the module-level ``mean`` and ``std``.
    """
    mel = to_mel(torch.from_numpy(wave).float())
    log_mel = torch.log(mel.unsqueeze(0) + 1e-5)
    return (log_mel - mean) / std

def build_model(model_params=None):
    """Instantiate the three StarGANv2-VC networks used for inference.

    Args:
        model_params: mapping of hyper-parameters. The keys read here are
            ``dim_in``, ``style_dim``, ``max_conv_dim``, ``w_hpf``,
            ``F0_channel``, ``latent_dim`` and ``num_domains``. ``None``
            (the default) is treated as an empty mapping.

    Returns:
        A ``Munch`` with the entries ``generator``, ``mapping_network`` and
        ``style_encoder`` (freshly constructed, no weights loaded).
    """
    # `None` instead of a shared `{}` default avoids the mutable-default pitfall.
    args = Munch({} if model_params is None else model_params)

    generator = Generator(
        args.dim_in,
        args.style_dim,
        args.max_conv_dim,
        w_hpf=args.w_hpf,
        F0_channel=args.F0_channel,
    )
    mapping_network = MappingNetwork(
        args.latent_dim,
        args.style_dim,
        args.num_domains,
        hidden_dim=args.max_conv_dim,
    )
    style_encoder = StyleEncoder(
        args.dim_in,
        args.style_dim,
        args.num_domains,
        args.max_conv_dim,
    )

    nets_ema = Munch(generator=generator,
                     mapping_network=mapping_network,
                     style_encoder=style_encoder)

    return nets_ema

def compute_style(speaker_dicts):
    """Compute one style embedding per entry of ``speaker_dicts``.

    Args:
        speaker_dicts: mapping ``key -> (path, speaker)``. ``path`` is a
            reference audio file, or ``""`` to sample a style from the
            mapping network instead. ``speaker`` is the integer domain label.

    Returns:
        Dict ``key -> (ref, label)`` where ``ref`` is the style produced by
        the module-level ``starganv2`` networks and ``label`` is the
        ``LongTensor`` label that was fed to them.
    """
    reference_embeddings = {}
    for key, (path, speaker) in speaker_dicts.items():
        # Same device for the label in both branches (it used to be moved to
        # the GPU only when sampling from the mapping network).
        label = torch.LongTensor([speaker]).to('cuda')

        # Inference only: neither branch should record an autograd graph.
        with torch.no_grad():
            if path == "":
                latent_dim = starganv2.mapping_network.shared[0].in_features
                latent = torch.randn(1, latent_dim).to('cuda')
                ref = starganv2.mapping_network(latent, label)
            else:
                # librosa resamples to 24 kHz on load, so no explicit resample
                # step is needed afterwards.
                # NOTE(review): silence trimming used to be computed here but
                # its result was never used; the untrimmed waveform is kept so
                # the embeddings do not change.
                wave, _ = librosa.load(path, sr=24000)
                mel_tensor = preprocess(wave).to('cuda')
                ref = starganv2.style_encoder(mel_tensor.unsqueeze(1), label)

        reference_embeddings[key] = (ref, label)

    return reference_embeddings

### Load models

In [None]:
# load F0 model

F0_model = JDCNet(num_class=1, seq_len=192)
# Deserialize onto the CPU first (as done for the StarGANv2 checkpoint) so the
# load does not depend on the device the checkpoint was saved from; the model
# is moved to the GPU explicitly below.
params = torch.load("Utils/JDC/bst.t7", map_location='cpu')['net']
F0_model.load_state_dict(params)
_ = F0_model.eval()  # `_ =` keeps the notebook from echoing the module repr
F0_model = F0_model.to('cuda')

In [None]:
# load vocoder
import scipy.signal
import scipy.signal.windows as windows

# Expose the Kaiser window under its old `scipy.signal.kaiser` name before
# parallel_wavegan is imported.
# NOTE(review): presumably parallel_wavegan still looks the window up there
# while newer SciPy only provides `scipy.signal.windows.kaiser` — confirm.
scipy.signal.kaiser = windows.kaiser

from parallel_wavegan.utils import load_model

vocoder = load_model("Vocoder/checkpoint-400000steps.pkl")
vocoder = vocoder.to('cuda')
vocoder.remove_weight_norm()
_ = vocoder.eval()  # inference mode; `_ =` suppresses the cell echo

In [None]:
# load starganv2

model_path = 'Models/epoch_00150.pth'

with open('Models/config.yml', encoding='utf-8') as f:
    starganv2_config = yaml.safe_load(f)
starganv2 = build_model(model_params=starganv2_config["model_params"])

# The checkpoint is deserialized on the CPU; only the 'model_ema' weights are used.
params = torch.load(model_path, map_location='cpu')
params = params['model_ema']

# Plain loop instead of throwaway list comprehensions: for every network,
# load its weights, switch to eval mode and move it to the GPU.
for net_name in list(starganv2):
    net = starganv2[net_name]
    net.load_state_dict(params[net_name])
    net.eval()
    starganv2[net_name] = net.to('cuda')

### Conversion

In [None]:
# load input wave
selected_speakers = [230,258,273]
k = random.choice(selected_speakers)
wav_path = "separated/htdemucs/Perfect/vocals.wav"
audio, source_sr = librosa.load(wav_path, sr=24000)

# Peak-normalise to [-1, 1]; a silent or empty clip is left untouched instead
# of being divided by zero.
peak = np.max(np.abs(audio)) if audio.size else 0.0
if peak > 0:
    audio = audio / peak

# Convert (not reinterpret) to float32: assigning to `ndarray.dtype` only
# relabels the underlying bytes and would corrupt any non-float32 array.
audio = audio.astype(np.float32, copy=False)

#### Convert by style encoder

In [None]:
# with reference, using style encoder

# Map 'p<ID>' -> (reference wav path, domain label). The label is the
# speaker's position in the `speakers` list.
speaker_dicts = {}
for s in selected_speakers:
    k = s  # module-level `k` keeps tracking the current speaker, as before
    speaker_dicts[f'p{s}'] = (
        f'Demo/VCTK-corpus/p{k}/p{k}_023.wav',
        speakers.index(s),
    )

reference_embeddings = compute_style(speaker_dicts)

# The example below is kept as real comments: a trailing string literal would
# be echoed by Jupyter as the cell's output.
#
# For a single input audio file in the Demo folder:
# speaker_dicts = {
#     'myaudio': (
#         r"Demo/VCTK-corpus/p500/p500_023.wav",  # replace with your WAV path
#         0  # default speaker label
#     )
# }
# reference_embeddings = compute_style(speaker_dicts)

In [None]:
# conversion
import time
import numpy as np
import torch
import librosa
import soundfile as sf
import IPython.display as ipd
import torch.nn.functional as F


def _to_vocoder_input(frames, min_frames):
    """Pad `frames` to at least `min_frames` rows and reshape for the vocoder.

    `frames` is indexed as [frame, feature]. Short inputs are extended by
    repeating their last frame. The result has shape [1, feature, frame].
    """
    missing = min_frames - frames.shape[0]
    if missing > 0:
        frames = torch.cat([frames, frames[-1:].repeat(missing, 1)], dim=0)
    return frames.unsqueeze(0).transpose(1, 2)


def _run_vocoder(aux):
    """Run the module-level `vocoder` on `aux` and return a flat NumPy waveform.

    The noise input is sized from the output length of `vocoder.upsample_net`
    so that it matches what `vocoder.forward` receives for `aux`.
    """
    with torch.no_grad():
        n_samples = vocoder.upsample_net(aux).shape[-1]
        noise = torch.randn(1, 1, n_samples).to(aux.device)
        wav = vocoder.forward(noise, aux)
    return wav.squeeze(0).transpose(1, 0).contiguous().view(-1).cpu().numpy()


start = time.time()

source = preprocess(audio).to('cuda:0')
keys = []
converted_samples = {}
reconstructed_samples = {}
converted_mels = {}

# --- Patch: Force vocoder.aux_context_window to a fixed tuple to avoid padding errors ---
if hasattr(vocoder, 'aux_context_window'):
    # Force it to (4,4); adjust if needed based on your vocoder's requirements.
    vocoder.aux_context_window = (4, 4)
# ---------------------------------------------------

# Minimum number of frames a chunk is padded up to, taken from the (patched)
# auxiliary context window.
aux_context_window = vocoder.aux_context_window  # a tuple after the patch, e.g. (4, 4)
min_chunk_length = aux_context_window[0]

# Frames vocoded per call: 1000 unless the minimum above is larger.
chunk_size = max(1000, min_chunk_length)

# The F0 features depend only on the source, so compute them once rather than
# once per reference.
with torch.no_grad():
    f0_feat = F0_model.get_feature_GAN(source.unsqueeze(1))

for key, (ref, _) in reference_embeddings.items():
    with torch.no_grad():
        out = starganv2.generator(source.unsqueeze(1), ref, F0=f0_feat)

        # 'out' is the full mel spectrogram from the generator; transpose the
        # last two axes and squeeze so it can be sliced frame by frame.
        c = out.transpose(-1, -2).squeeze().to('cuda')

        print(f"Using chunk size: {chunk_size}, min required: {min_chunk_length}")

        # NOTE(review): chunks go to `vocoder.forward` without extra context
        # padding. If the vocoder's upsample network consumes context frames
        # at each chunk edge, the concatenated audio comes out slightly
        # shorter than the mel — confirm against `vocoder.inference`.
        converted_audio_chunks = []
        for i in range(0, c.shape[0], chunk_size):
            chunk = _to_vocoder_input(c[i:i + chunk_size], min_chunk_length)
            print(f"Processing chunk with shape: {chunk.shape}")
            converted_audio_chunks.append(_run_vocoder(chunk))

        # Concatenate all chunks into one waveform
        y_out = np.concatenate(converted_audio_chunks)

        # --- Reconstruction for Reference ---
        # Vocode the reference recording itself (in one piece) so it can be
        # compared with the converted audio; skipped when there is no file.
        if key not in speaker_dicts or speaker_dicts[key][0] == "":
            recon = None
        else:
            wave, sr = librosa.load(speaker_dicts[key][0], sr=24000)
            mel = preprocess(wave)
            c_recon = mel.transpose(-1, -2).squeeze().to('cuda')
            recon = _run_vocoder(_to_vocoder_input(c_recon, min_chunk_length))

    converted_samples[key] = y_out
    reconstructed_samples[key] = recon
    converted_mels[key] = out
    keys.append(key)

end = time.time()
print('total processing time: %.3f sec' % (end - start) )

for key, wave in converted_samples.items():
    print('Converted: %s' % key)
    ipd.display(ipd.Audio(wave, rate=24000))
    print('Reference (vocoder): %s' % key)
    if reconstructed_samples[key] is not None:
        ipd.display(ipd.Audio(reconstructed_samples[key], rate=24000))
"""
print('Original (vocoder):')
wave, sr = librosa.load(wav_path, sr=24000)
mel = preprocess(wave)
c_orig = mel.transpose(-1, -2).squeeze().to('cuda')
with torch.no_grad():
    c_orig = c_orig.unsqueeze(0).transpose(1, 2)
    if c_orig.shape[2] < min_chunk_length:
        pad_amount = min_chunk_length - c_orig.shape[2]
        last_frame_orig = c_orig[:, :, -1:].clone()
        pad_tensor_orig = last_frame_orig.repeat(1, 1, pad_amount)
        c_orig = torch.cat([c_orig, pad_tensor_orig], dim=2)
    recon = vocoder.inference(c_orig)
    recon = recon.view(-1).cpu().numpy()
ipd.display(ipd.Audio(recon, rate=24000))
print('Original:')
ipd.display(ipd.Audio(wav_path, rate=24000))
"""
print('Original (raw stem):')
ipd.display(ipd.Audio(wav_path, rate=24000))

#### Post-processing: vocal cleanup, EQ and instrumental mix

In [None]:
# --- Vocal Cleanup, EQ & Instrumental Mix in One Cell ---
import numpy as np
import librosa
import soundfile as sf
import scipy.signal as sps
from IPython.display import Audio, display


def _peak_normalize(signal):
    """Scale `signal` so its largest absolute sample is 1.

    Silent or empty input is returned unchanged instead of dividing by zero.
    """
    peak = np.max(np.abs(signal)) if len(signal) else 0.0
    return signal / peak if peak > 0 else signal


sr = 24000

# 1) Pull out the converted waveform (the first entry of `converted_samples`)
if 'converted_samples' not in globals():
    raise NameError("`converted_samples` not found—run your conversion cell first.")
if not converted_samples:
    raise ValueError("`converted_samples` is empty; the conversion cell produced no audio.")
first_key, converted_audio = next(iter(converted_samples.items()))
print("Using converted_samples key:", first_key)

# 2) Noise reduction (optional, requires noisereduce)
try:
    import noisereduce as nr
    noise_clip = converted_audio[:sr]  # first second serves as the noise profile
    denoised_vocals = nr.reduce_noise(
        y=converted_audio,
        y_noise=noise_clip,
        sr=sr,
        stationary=False,
        prop_decrease=0.8,           # try 0.4–0.8; lower = gentler
        time_mask_smooth_ms=200,     # try 50–200 ms
        freq_mask_smooth_hz=500,     # try 50–500 Hz
    )
    print("✅ Noise reduction applied")
except ImportError:
    print("⚠ noisereduce not installed; skipping noise reduction")
    denoised_vocals = converted_audio

# 3) High-pass filter at 80 Hz to remove low rumble
# (4th-order Butterworth, applied forwards and backwards with filtfilt)
b, a = sps.butter(4, 80/(sr/2), btype='highpass')
eq_vocals = sps.filtfilt(b, a, denoised_vocals)
print("✅ High‑pass EQ at 80 Hz applied")

# quick listen to cleaned vocals
display(Audio(eq_vocals, rate=sr))

# 4) Load instrumental & align lengths (both are cut to the shorter one)
inst_path = "separated/htdemucs/Perfect/no_vocals.wav"
inst_audio, _ = librosa.load(inst_path, sr=sr)
min_len = min(len(eq_vocals), len(inst_audio))
vocals_trim = eq_vocals[:min_len]
inst_trim = inst_audio[:min_len]

# 5) Peak-normalize each stem, mix with the vocals weighted 1.2 against 0.5
#    for the instrumental, then rescale the mix to a 0.95 peak
vocals_norm = _peak_normalize(vocals_trim)
inst_norm = _peak_normalize(inst_trim)
mixed = 1.2 * vocals_norm + 0.5 * inst_norm
mixed = _peak_normalize(mixed) * 0.95

# 6) Save & play
sf.write("final_song_mix.wav", mixed.astype(np.float32), sr)
display(Audio(mixed, rate=sr))
