In [1]:
import os
import sys
import warnings
warnings.filterwarnings('ignore')

# Add the seed-vc directory to path
sys.path.append('./seed-vc')

# Set up environment
os.environ['HF_HUB_CACHE'] = './seed-vc/checkpoints/hf_cache'

import numpy as np
import torch
import yaml
import torchaudio
import librosa
import time
from pathlib import Path

# Load the necessary modules from the project
from modules.commons import *
from modules.commons import str2bool
from hf_utils import load_custom_model_from_hf

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"MPS available: {torch.backends.mps.is_available() if hasattr(torch.backends, 'mps') else False}")

# Set device
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS (Apple Silicon)")
else:
    device = torch.device("cpu")
    print("Using CPU")

fp16 = False

PyTorch version: 2.5.1+cu124
CUDA available: True
MPS available: False
Using CUDA


In [29]:
modules

<module 'modules' (<_frozen_importlib_external._NamespaceLoader object at 0x0000023428108C40>)>

In [2]:
def load_voice_conversion_model(f0_condition=False, checkpoint=None, config=None, fp16_enabled=True):
    """
    Load the voice conversion model and all necessary components.
    
    Args:
        f0_condition: Whether to use F0 conditioning
        checkpoint: Path to custom checkpoint file
        config: Path to custom config file
        fp16_enabled: Whether to use FP16 precision
    
    Returns:
        Tuple of loaded models and functions
    """
    global fp16
    fp16 = fp16_enabled
    
    print("Loading voice conversion model...")
    
    # Load model configuration and checkpoint
    if not f0_condition:
        if checkpoint is None:
            dit_checkpoint_path, dit_config_path = load_custom_model_from_hf(
                "Plachta/Seed-VC",
                "DiT_seed_v2_uvit_whisper_small_wavenet_bigvgan_pruned.pth",
                "config_dit_mel_seed_uvit_whisper_small_wavenet.yml"
            )
        else:
            dit_checkpoint_path = checkpoint
            dit_config_path = config
        f0_fn = None
        print("Using model without F0 conditioning")
    else:
        if checkpoint is None:
            dit_checkpoint_path, dit_config_path = load_custom_model_from_hf(
                "Plachta/Seed-VC",
                "DiT_seed_v2_uvit_whisper_base_f0_44k_bigvgan_pruned_ft_ema_v2.pth",
                "config_dit_mel_seed_uvit_whisper_base_f0_44k.yml"
            )
        else:
            dit_checkpoint_path = checkpoint
            dit_config_path = config
        
        # Load F0 extractor
        print("Loading F0 extractor...")
        from modules.rmvpe import RMVPE
        model_path = load_custom_model_from_hf("lj1995/VoiceConversionWebUI", "rmvpe.pt", None)
        f0_extractor = RMVPE(model_path, is_half=False, device=device)
        f0_fn = f0_extractor.infer_from_audio
        print("Using model with F0 conditioning")

    # Load configuration
    config = yaml.safe_load(open(dit_config_path, "r"))
    model_params = recursive_munch(config["model_params"])
    model_params.dit_type = 'DiT'
    
    # Build and load the main model
    print("Building DiT model...")
    model = build_model(model_params, stage="DiT")
    hop_length = config["preprocess_params"]["spect_params"]["hop_length"]
    sr = config["preprocess_params"]["sr"]

    # Load checkpoints
    print("Loading model checkpoints...")
    model, _, _, _ = load_checkpoint(
        model,
        None,
        dit_checkpoint_path,
        load_only_params=True,
        ignore_modules=[],
        is_distributed=False,
    )
    
    for key in model:
        model[key].eval()
        model[key].to(device)
    
    model.cfm.estimator.setup_caches(max_batch_size=1, max_seq_length=8192)
    print("Main model loaded successfully!")

    return model, f0_fn, config, model_params

# Load the model
model, f0_fn, config, model_params = load_voice_conversion_model(f0_condition=False, fp16_enabled=True)

Loading voice conversion model...
Using model without F0 conditioning
Building DiT model...
Loading model checkpoints...
cfm loaded
length_regulator loaded
Main model loaded successfully!


In [3]:
model

Munch({'cfm': CFM(
  (criterion): L1Loss()
  (estimator): DiT(
    (transformer): Transformer(
      (layers): ModuleList(
        (0-12): 13 x TransformerBlock(
          (attention): Attention(
            (wqkv): Linear(in_features=512, out_features=1536, bias=False)
            (wo): Linear(in_features=512, out_features=512, bias=False)
          )
          (feed_forward): FeedForward(
            (w1): Linear(in_features=512, out_features=1536, bias=False)
            (w3): Linear(in_features=512, out_features=1536, bias=False)
            (w2): Linear(in_features=1536, out_features=512, bias=False)
          )
          (ffn_norm): AdaptiveLayerNorm(
            (project_layer): Linear(in_features=512, out_features=1024, bias=True)
            (norm): RMSNorm()
          )
          (attention_norm): AdaptiveLayerNorm(
            (project_layer): Linear(in_features=512, out_features=1024, bias=True)
            (norm): RMSNorm()
          )
          (skip_in_linear): Linear(in

In [4]:
def load_additional_models(model_params, config):
    """Load additional models required for voice conversion."""
    
    # Load CAMPPlus speaker encoder
    print("Loading CAMPPlus speaker encoder...")
    from modules.campplus.DTDNN import CAMPPlus
    campplus_ckpt_path = load_custom_model_from_hf(
        "funasr/campplus", "campplus_cn_common.bin", config_filename=None
    )
    campplus_model = CAMPPlus(feat_dim=80, embedding_size=192)
    campplus_model.load_state_dict(torch.load(campplus_ckpt_path, map_location="cpu"))
    campplus_model.eval()
    campplus_model.to(device)
    print("CAMPPlus loaded successfully!")

    # Load vocoder
    vocoder_type = model_params.vocoder.type
    print(f"Loading {vocoder_type} vocoder...")
    
    if vocoder_type == 'bigvgan':
        from modules.bigvgan import bigvgan
        bigvgan_name = model_params.vocoder.name
        bigvgan_model = bigvgan.BigVGAN.from_pretrained(bigvgan_name, use_cuda_kernel=False)
        bigvgan_model.remove_weight_norm()
        bigvgan_model = bigvgan_model.eval().to(device)
        vocoder_fn = bigvgan_model
        print("BigVGAN vocoder loaded successfully!")
    elif vocoder_type == 'hifigan':
        from modules.hifigan.generator import HiFTGenerator
        from modules.hifigan.f0_predictor import ConvRNNF0Predictor
        hift_config = yaml.safe_load(open('./seed-vc/configs/hifigan.yml', 'r'))
        hift_gen = HiFTGenerator(**hift_config['hift'], f0_predictor=ConvRNNF0Predictor(**hift_config['f0_predictor']))
        hift_path = load_custom_model_from_hf("FunAudioLLM/CosyVoice-300M", 'hift.pt', None)
        hift_gen.load_state_dict(torch.load(hift_path, map_location='cpu'))
        hift_gen.eval()
        hift_gen.to(device)
        vocoder_fn = hift_gen
        print("HiFiGAN vocoder loaded successfully!")
    else:
        raise ValueError(f"Unknown vocoder type: {vocoder_type}")

    # Load speech tokenizer (Whisper)
    speech_tokenizer_type = model_params.speech_tokenizer.type
    print(f"Loading {speech_tokenizer_type} speech tokenizer...")
    
    if speech_tokenizer_type == 'whisper':
        from transformers import AutoFeatureExtractor, WhisperModel
        whisper_name = model_params.speech_tokenizer.name
        whisper_model = WhisperModel.from_pretrained(whisper_name, torch_dtype=torch.float16).to(device)
        del whisper_model.decoder
        whisper_feature_extractor = AutoFeatureExtractor.from_pretrained(whisper_name)

        def semantic_fn(waves_16k):
            ori_inputs = whisper_feature_extractor([waves_16k.squeeze(0).cpu().numpy()],
                                                   return_tensors="pt",
                                                   return_attention_mask=True)
            ori_input_features = whisper_model._mask_input_features(
                ori_inputs.input_features, attention_mask=ori_inputs.attention_mask).to(device)
            with torch.no_grad():
                ori_outputs = whisper_model.encoder(
                    ori_input_features.to(whisper_model.encoder.dtype),
                    head_mask=None,
                    output_attentions=False,
                    output_hidden_states=False,
                    return_dict=True,
                )
            S_ori = ori_outputs.last_hidden_state.to(torch.float32)
            S_ori = S_ori[:, :waves_16k.size(-1) // 320 + 1]
            return S_ori
        print("Whisper speech tokenizer loaded successfully!")
    else:
        raise ValueError(f"Unknown speech tokenizer type: {speech_tokenizer_type}")

    # Setup mel spectrogram function
    mel_fn_args = {
        "n_fft": config['preprocess_params']['spect_params']['n_fft'],
        "win_size": config['preprocess_params']['spect_params']['win_length'],
        "hop_size": config['preprocess_params']['spect_params']['hop_length'],
        "num_mels": config['preprocess_params']['spect_params']['n_mels'],
        "sampling_rate": config['preprocess_params']['sr'],
        "fmin": config['preprocess_params']['spect_params'].get('fmin', 0),
        "fmax": None if config['preprocess_params']['spect_params'].get('fmax', "None") == "None" else 8000,
        "center": False
    }
    from modules.audio import mel_spectrogram
    to_mel = lambda x: mel_spectrogram(x, **mel_fn_args)

    return semantic_fn, vocoder_fn, campplus_model, to_mel, mel_fn_args

# Load additional models
semantic_fn, vocoder_fn, campplus_model, to_mel, mel_fn_args = load_additional_models(model_params, config)
print("All models loaded successfully!")

Loading CAMPPlus speaker encoder...
CAMPPlus loaded successfully!
Loading bigvgan vocoder...
Loading weights from nvidia/bigvgan_v2_22khz_80band_256x
Removing weight norm...
BigVGAN vocoder loaded successfully!
Loading whisper speech tokenizer...
Whisper speech tokenizer loaded successfully!
All models loaded successfully!


In [23]:
from modules.audio import mel_spectrogram


In [30]:
model.length_regulator

InterpolateRegulator(
  (model): Sequential(
    (0): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): GroupNorm(1, 512, eps=1e-05, affine=True)
    (2): Mish()
    (3): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,))
    (4): GroupNorm(1, 512, eps=1e-05, affine=True)
    (5): Mish()
    (6): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,))
    (7): GroupNorm(1, 512, eps=1e-05, affine=True)
    (8): Mish()
    (9): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(1,))
    (10): GroupNorm(1, 512, eps=1e-05, affine=True)
    (11): Mish()
    (12): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
  )
  (embedding): Embedding(2048, 512)
  (content_in_proj): Linear(in_features=768, out_features=512, bias=True)
)

In [5]:
def perform_voice_conversion(source_path, target_path, diffusion_steps=30, length_adjust=1.0, inference_cfg_rate=0.7):
    """
    Perform voice conversion from source audio to target speaker style.
    
    Args:
        source_path: Path to source audio file
        target_path: Path to target/reference audio file
        diffusion_steps: Number of diffusion steps for generation
        length_adjust: Length adjustment factor
        inference_cfg_rate: CFG rate for inference
    
    Returns:
        Generated audio tensor and sampling rate
    """
    sr = mel_fn_args['sampling_rate']
    hop_length = mel_fn_args['hop_size']
    max_context_window = sr // hop_length * 30
    overlap_frame_len = 16
    overlap_wave_len = overlap_frame_len * hop_length
    
    print(f"Loading source audio: {source_path}")
    print(f"Loading target audio: {target_path}")
    
    # Load audio files
    source_audio = librosa.load(source_path, sr=sr)[0]
    ref_audio = librosa.load(target_path, sr=sr)[0]
    
    # Convert to tensors
    source_audio = torch.tensor(source_audio).unsqueeze(0).float().to(device)
    ref_audio = torch.tensor(ref_audio[:sr * 25]).unsqueeze(0).float().to(device)  # Limit ref audio to 25 seconds
    
    print(f"Source audio shape: {source_audio.shape}")
    print(f"Reference audio shape: {ref_audio.shape}")
    
    start_time = time.time()
    
    # Resample to 16kHz for semantic processing
    converted_waves_16k = torchaudio.functional.resample(source_audio, sr, 16000)
    ori_waves_16k = torchaudio.functional.resample(ref_audio, sr, 16000)
    
    # Extract semantic features
    print("Extracting semantic features...")
    S_alt = semantic_fn(converted_waves_16k)
    S_ori = semantic_fn(ori_waves_16k)
    
    # Extract mel spectrograms
    print("Extracting mel spectrograms...")
    mel = to_mel(source_audio.to(device).float())
    mel2 = to_mel(ref_audio.to(device).float())
    
    target_lengths = torch.LongTensor([int(mel.size(2) * length_adjust)]).to(mel.device)
    target2_lengths = torch.LongTensor([mel2.size(2)]).to(mel2.device)
    
    # Extract speaker embedding
    print("Extracting speaker embedding...")
    feat2 = torchaudio.compliance.kaldi.fbank(ori_waves_16k,
                                              num_mel_bins=80,
                                              dither=0,
                                              sample_frequency=16000)
    feat2 = feat2 - feat2.mean(dim=0, keepdim=True)
    style2 = campplus_model(feat2.unsqueeze(0))
    
    # Length regulation
    print("Performing length regulation...")
    cond, _, codes, commitment_loss, codebook_loss = model.length_regulator(S_alt, ylens=target_lengths,
                                                                           n_quantizers=3, f0=None)
    prompt_condition, _, codes, commitment_loss, codebook_loss = model.length_regulator(S_ori,
                                                                                       ylens=target2_lengths,
                                                                                       n_quantizers=3, f0=None)
    
    # Generate audio chunks
    print("Generating converted audio...")
    max_source_window = max_context_window - mel2.size(2)
    processed_frames = 0
    generated_wave_chunks = []
    
    def crossfade(chunk1, chunk2, overlap):
        fade_out = np.cos(np.linspace(0, np.pi / 2, overlap)) ** 2
        fade_in = np.cos(np.linspace(np.pi / 2, 0, overlap)) ** 2
        if len(chunk2) < overlap:
            chunk2[:overlap] = chunk2[:overlap] * fade_in[:len(chunk2)] + (chunk1[-overlap:] * fade_out)[:len(chunk2)]
        else:
            chunk2[:overlap] = chunk2[:overlap] * fade_in + chunk1[-overlap:] * fade_out
        return chunk2
    
    while processed_frames < cond.size(1):
        chunk_cond = cond[:, processed_frames:processed_frames + max_source_window]
        is_last_chunk = processed_frames + max_source_window >= cond.size(1)
        cat_condition = torch.cat([prompt_condition, chunk_cond], dim=1)
        
        with torch.autocast(device_type=device.type, dtype=torch.float16 if fp16 else torch.float32):
            # Voice Conversion
            vc_target = model.cfm.inference(cat_condition,
                                           torch.LongTensor([cat_condition.size(1)]).to(mel2.device),
                                           mel2, style2, None, diffusion_steps,
                                           inference_cfg_rate=inference_cfg_rate)
            vc_target = vc_target[:, :, mel2.size(-1):]
        
        with torch.inference_mode():
            vc_wave = vocoder_fn(vc_target.float()).squeeze()
        vc_wave = vc_wave[None, :]
        
        if processed_frames == 0:
            if is_last_chunk:
                output_wave = vc_wave[0].cpu().numpy()
                generated_wave_chunks.append(output_wave)
                break
            output_wave = vc_wave[0, :-overlap_wave_len].cpu().numpy()
            generated_wave_chunks.append(output_wave)
            previous_chunk = vc_wave[0, -overlap_wave_len:]
            processed_frames += vc_target.size(2) - overlap_frame_len
        elif is_last_chunk:
            output_wave = crossfade(previous_chunk.cpu().numpy(), vc_wave[0].cpu().numpy(), overlap_wave_len)
            generated_wave_chunks.append(output_wave)
            processed_frames += vc_target.size(2) - overlap_frame_len
            break
        else:
            output_wave = crossfade(previous_chunk.cpu().numpy(), vc_wave[0, :-overlap_wave_len].cpu().numpy(),
                                  overlap_wave_len)
            generated_wave_chunks.append(output_wave)
            previous_chunk = vc_wave[0, -overlap_wave_len:]
            processed_frames += vc_target.size(2) - overlap_frame_len
    
    vc_wave = torch.tensor(np.concatenate(generated_wave_chunks))[None, :].float()
    
    end_time = time.time()
    rtf = (end_time - start_time) / vc_wave.size(-1) * sr
    print(f"Voice conversion completed!")
    print(f"Processing time: {end_time - start_time:.2f} seconds")
    print(f"RTF (Real-Time Factor): {rtf:.4f}")
    print(f"Generated audio length: {vc_wave.size(-1) / sr:.2f} seconds")
    
    return vc_wave, sr

print("Voice conversion function ready!")

Voice conversion function ready!


In [11]:
# Record your own audio files for voice conversion
import sounddevice as sd
import soundfile as sf
import IPython.display as ipd
from IPython.display import display, HTML, clear_output
import ipywidgets as widgets
from threading import Thread
import queue
import time

# Audio recording parameters
SAMPLE_RATE = 22050  # Match the model's expected sample rate
CHANNELS = 1  # Mono audio

# Automatically select USB microphone if available
def setup_audio_device():
    """Setup audio device, preferring USB microphone if available"""
    try:
        devices = sd.query_devices()
        input_devices = [d for d in devices if d['max_input_channels'] > 0]
        
        # Look for USB microphone specifically
        usb_mic_device = None
        for i, device in enumerate(input_devices):
            if 'USB mic' in device['name'] or 'Microphone (USB mic)' in device['name']:
                usb_mic_device = i
                print(f"Found USB microphone: {device['name']}")
                break
        
        if usb_mic_device is not None:
            # Set the USB microphone as default input device
            sd.default.device[0] = usb_mic_device  # Input device
            print(f"Set default input device to: {devices[usb_mic_device]['name']}")
        else:
            print("USB microphone not found, using system default")
            
        return True
    except Exception as e:
        print(f"Error setting up audio device: {e}")
        return False

# Setup audio device
setup_audio_device()

class AudioRecorder:
    def __init__(self, sample_rate=SAMPLE_RATE, channels=CHANNELS):
        self.sample_rate = sample_rate
        self.channels = channels
        self.recording = False
        self.audio_data = []
        self.stream = None
        
    def start_recording(self):
        """Start recording audio"""
        self.recording = True
        self.audio_data = []
        
        def audio_callback(indata, frames, time_info, status):
            if self.recording:
                self.audio_data.append(indata.copy())
        
        self.stream = sd.InputStream(
            callback=audio_callback,
            channels=self.channels,
            samplerate=self.sample_rate,
            dtype='float32'
        )
        self.stream.start()
        
    def stop_recording(self):
        """Stop recording and return audio data"""
        self.recording = False
        if self.stream:
            self.stream.stop()
            self.stream.close()
        
        if self.audio_data:
            import numpy as np
            audio = np.concatenate(self.audio_data, axis=0)
            return audio.flatten()
        return None
    
    def is_recording(self):
        return self.recording

# Create recorder instance
recorder = AudioRecorder()

def record_audio(duration=5, label="Audio"):
    """Record audio for specified duration"""
    print(f"Recording {label} for {duration} seconds...")
    print("Recording started! Speak into your microphone...")
    
    recorder.start_recording()
    
    # Show countdown
    for i in range(duration, 0, -1):
        print(f"\r{i} seconds remaining...", end="", flush=True)
        time.sleep(1)
    
    print(f"\rRecording complete!                    ")
    
    audio_data = recorder.stop_recording()
    
    if audio_data is not None:
        print(f"Recorded {len(audio_data) / SAMPLE_RATE:.2f} seconds of audio")
        return audio_data
    else:
        print("No audio recorded!")
        return None

def save_and_play_audio(audio_data, filename, label="Audio"):
    """Save audio to file and display player"""
    if audio_data is not None:
        # Ensure output directory exists
        output_dir = "./recorded_audio"
        os.makedirs(output_dir, exist_ok=True)
        
        filepath = os.path.join(output_dir, filename)
        
        # Save audio file
        sf.write(filepath, audio_data, SAMPLE_RATE)
        print(f"{label} saved to: {filepath}")
        
        # Display audio player
        print(f"Playback {label}:")
        display(ipd.Audio(filepath))
        
        return filepath
    return None

# Interactive recording interface
print("=== Audio Recording Interface ===")
print("This will help you record source and target audio for voice conversion")
print(f"Recording parameters: {SAMPLE_RATE} Hz, {CHANNELS} channel(s)")
print()

# Check if audio devices are available
try:
    devices = sd.query_devices()
    input_devices = [d for d in devices if d['max_input_channels'] > 0]
    if input_devices:
        print("Available input devices:")
        current_input = sd.default.device[0] if isinstance(sd.default.device, (list, tuple)) else sd.default.device
        
        for i, device in enumerate(input_devices):
            marker = " ✓ (selected)" if i == current_input else ""
            usb_marker = " [USB MIC]" if ('USB mic' in device['name'] or 'Microphone (USB mic)' in device['name']) else ""
            print(f"  {i}: {device['name']}{usb_marker}{marker}")
        print()
        
        # Show current default device
        if current_input is not None:
            try:
                current_device = devices[current_input]
                print(f"Current input device: {current_device['name']}")
            except:
                print("Current input device: System default")
        else:
            print("Current input device: System default")
        print()
    else:
        print("Warning: No input devices found!")
except Exception as e:
    print(f"Error checking audio devices: {e}")

# Recording controls
print("Instructions:")
print("1. Run the cells below to record source audio (the voice you want to convert)")
print("2. Then record target audio (the voice style you want to convert to)")
print("3. The recorded files will be used in the voice conversion process")
print()
print("Tip: For best results:")
print("- Record in a quiet environment")
print("- Speak clearly and at normal volume")
print("- Keep recordings between 3-10 seconds")
print("- Target audio should be good quality with clear speech")

Found USB microphone: Microphone (USB mic)
Set default input device to: Microphone (USB mic)
=== Audio Recording Interface ===
This will help you record source and target audio for voice conversion
Recording parameters: 22050 Hz, 1 channel(s)

Available input devices:
  0: Microsoft Sound Mapper - Input
  1: Microphone (USB mic) [USB MIC]
  2: Microphone (Realtek High Defini
  3: Stereo Mix (Realtek HD Audio Stereo input)
  4: Microphone (Realtek HD Audio Mic input)
  5: Microphone (USB mic) [USB MIC]
  6: Input ()

Current input device: System default

Instructions:
1. Run the cells below to record source audio (the voice you want to convert)
2. Then record target audio (the voice style you want to convert to)
3. The recorded files will be used in the voice conversion process

Tip: For best results:
- Record in a quiet environment
- Speak clearly and at normal volume
- Keep recordings between 3-10 seconds
- Target audio should be good quality with clear speech


In [12]:
# Manual Audio Device Selection (Optional)
# Run this cell only if you want to manually select a different audio device

def list_and_select_device():
    """List all input devices and allow manual selection"""
    try:
        devices = sd.query_devices()
        input_devices = [(i, d) for i, d in enumerate(devices) if d['max_input_channels'] > 0]
        
        if not input_devices:
            print("No input devices found!")
            return
            
        print("Available input devices:")
        for i, (device_id, device) in enumerate(input_devices):
            current_marker = " ✓ (current)" if device_id == sd.default.device[0] else ""
            usb_marker = " [USB MIC]" if ('USB mic' in device['name'] or 'Microphone (USB mic)' in device['name']) else ""
            print(f"  {i}: {device['name']}{usb_marker}{current_marker}")
        
        # Uncomment the lines below to manually select a device
        # device_choice = int(input("Enter device number: "))
        # if 0 <= device_choice < len(input_devices):
        #     selected_device_id = input_devices[device_choice][0]
        #     sd.default.device[0] = selected_device_id
        #     print(f"Selected device: {input_devices[device_choice][1]['name']}")
        # else:
        #     print("Invalid device number!")
            
    except Exception as e:
        print(f"Error: {e}")

def select_device_by_name(device_name):
    """Select audio device by name"""
    try:
        devices = sd.query_devices()
        for i, device in enumerate(devices):
            if device['max_input_channels'] > 0 and device_name.lower() in device['name'].lower():
                sd.default.device[0] = i
                print(f"Selected device: {device['name']}")
                return True
        print(f"Device containing '{device_name}' not found!")
        return False
    except Exception as e:
        print(f"Error selecting device: {e}")
        return False

# List current devices
list_and_select_device()

# Example: Uncomment to select a specific device by name
# select_device_by_name("USB mic")  # This will select any device with "USB mic" in the name
# select_device_by_name("Microphone")  # This will select any device with "Microphone" in the name

print("\nCurrent audio setup complete. You can now proceed with recording.")

Available input devices:
  0: Microsoft Sound Mapper - Input
  1: Microphone (USB mic) [USB MIC] ✓ (current)
  2: Microphone (Realtek High Defini
  3: Stereo Mix (Realtek HD Audio Stereo input)
  4: Microphone (Realtek HD Audio Mic input)
  5: Microphone (USB mic) [USB MIC]
  6: Input ()

Current audio setup complete. You can now proceed with recording.


In [17]:
# Record Source Audio (the voice you want to convert)
print("=== Recording Source Audio ===")
print("This will be the voice that gets converted to the target style")
print("Speak naturally - this could be your voice or any speech you want to convert")
print()

# Record source audio
source_duration = 10  # seconds - adjust as needed
source_audio_data = record_audio(duration=source_duration, label="Source Audio")

# Save and play source audio
if source_audio_data is not None:
    recorded_source_path = save_and_play_audio(
        source_audio_data, 
        "recorded_source.wav", 
        "Source Audio"
    )
    print(f"✓ Source audio ready: {recorded_source_path}")
else:
    print("❌ Failed to record source audio")
    recorded_source_path = None

=== Recording Source Audio ===
This will be the voice that gets converted to the target style
Speak naturally - this could be your voice or any speech you want to convert

Recording Source Audio for 10 seconds...
Recording started! Speak into your microphone...
Recording complete!                    
Recorded 9.85 seconds of audio
Source Audio saved to: ./recorded_audio\recorded_source.wav
Playback Source Audio:
Recording complete!                    
Recorded 9.85 seconds of audio
Source Audio saved to: ./recorded_audio\recorded_source.wav
Playback Source Audio:


✓ Source audio ready: ./recorded_audio\recorded_source.wav


In [19]:
# Record Target Audio (the voice style you want to achieve)
print("=== Recording Target Audio ===")
print("This will be the reference voice style for conversion")
print("Speak in the voice/style you want the source to sound like")
print("For best results, use clear, high-quality speech")
print()

# Record target audio
target_duration = 10  # seconds - adjust as needed  
target_audio_data = record_audio(duration=target_duration, label="Target Audio")

# Save and play target audio
if target_audio_data is not None:
    recorded_target_path = save_and_play_audio(
        target_audio_data, 
        "recorded_target.wav", 
        "Target Audio"
    )
    print(f"✓ Target audio ready: {recorded_target_path}")
else:
    print("❌ Failed to record target audio")
    recorded_target_path = None

# Check if both recordings are ready
print("\n=== Recording Status ===")
if recorded_source_path and recorded_target_path:
    print("✓ Both audio files recorded successfully!")
    print(f"Source: {recorded_source_path}")
    print(f"Target: {recorded_target_path}")
    print("Ready for voice conversion!")
else:
    print("❌ Missing audio recordings:")
    if not recorded_source_path:
        print("  - Source audio not recorded")
    if not recorded_target_path:
        print("  - Target audio not recorded")
    print("Please re-run the recording cells above.")

=== Recording Target Audio ===
This will be the reference voice style for conversion
Speak in the voice/style you want the source to sound like
For best results, use clear, high-quality speech

Recording Target Audio for 10 seconds...
Recording started! Speak into your microphone...
Recording complete!                    
Recorded 9.82 seconds of audio
Target Audio saved to: ./recorded_audio\recorded_target.wav
Playback Target Audio:
Recording complete!                    
Recorded 9.82 seconds of audio
Target Audio saved to: ./recorded_audio\recorded_target.wav
Playback Target Audio:


✓ Target audio ready: ./recorded_audio\recorded_target.wav

=== Recording Status ===
✓ Both audio files recorded successfully!
Source: ./recorded_audio\recorded_source.wav
Target: ./recorded_audio\recorded_target.wav
Ready for voice conversion!


In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Voice Conversion with Recorded Audio
print("=== Voice Conversion with Recorded Audio ===")

# Check if we have recorded audio files
if 'recorded_source_path' in locals() and 'recorded_target_path' in locals():
    if recorded_source_path and recorded_target_path and os.path.exists(recorded_source_path) and os.path.exists(recorded_target_path):
        
        # Use recorded audio files
        source_path = recorded_source_path
        target_path = recorded_target_path
        
        print(f"Using recorded audio files:")
        print(f"Source: {source_path}")
        print(f"Target: {target_path}")
        
    else:
        print("❌ Recorded audio files not found!")
        print("Using default sample files as fallback...")
        source_path = "./seed-vc/examples/source/source_s1.wav"
        target_path = "./seed-vc/examples/reference/s1p1.wav"
else:
    print("⚠️ No recorded audio detected!")
    print("Using default sample files...")
    source_path = "./seed-vc/examples/source/source_s1.wav"
    target_path = "./seed-vc/examples/reference/s1p1.wav"

# Voice conversion parameters
diffusion_steps = 25
length_adjust = 1.0
inference_cfg_rate = 0.7

print(f"\nVoice Conversion Parameters:")
print(f"Diffusion steps: {diffusion_steps}")
print(f"Length adjust: {length_adjust}")
print(f"CFG rate: {inference_cfg_rate}")

# Perform voice conversion using the function
print(f"\n=== Starting Voice Conversion ===")
try:
    converted_audio, output_sr = perform_voice_conversion(
        source_path, 
        target_path,
        diffusion_steps=diffusion_steps,
        length_adjust=length_adjust,
        inference_cfg_rate=inference_cfg_rate
    )
    
    # Save converted audio
    output_dir = "./output"
    os.makedirs(output_dir, exist_ok=True)
    
    # Create filename based on whether we used recorded or sample audio
    if 'recorded_source_path' in locals() and source_path == recorded_source_path:
        output_filename = "converted_recorded_audio.wav"
    else:
        output_filename = "converted_sample_audio.wav"
    
    output_path = os.path.join(output_dir, output_filename)
    torchaudio.save(output_path, converted_audio.cpu(), output_sr)
    print(f"Converted audio saved to: {output_path}")
    
    # Display audio comparison
    print(f"\n=== Audio Comparison ===")
    print("Original Source:")
    display(ipd.Audio(source_path))
    
    print("Target Reference:")
    display(ipd.Audio(target_path))
    
    print("Converted Result:")
    display(ipd.Audio(output_path))
    
    print(f"\n✓ Voice conversion completed successfully!")
    
except Exception as e:
    print(f"❌ Error during voice conversion: {e}")
    import traceback
    traceback.print_exc()

=== Voice Conversion with Recorded Audio ===
Using recorded audio files:
Source: ./recorded_audio\recorded_source.wav
Target: ./recorded_audio\recorded_target.wav

Voice Conversion Parameters:
Diffusion steps: 25
Length adjust: 1.0
CFG rate: 0.7

=== Starting Voice Conversion ===
Loading source audio: ./recorded_audio\recorded_source.wav
Loading target audio: ./recorded_audio\recorded_target.wav


It is strongly recommended to pass the `sampling_rate` argument to `WhisperFeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.


Source audio shape: torch.Size([1, 217152])
Reference audio shape: torch.Size([1, 216576])
Extracting semantic features...


It is strongly recommended to pass the `sampling_rate` argument to `WhisperFeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.


Extracting mel spectrograms...
Extracting speaker embedding...
Performing length regulation...
Generating converted audio...
Generating converted audio...


100%|██████████| 25/25 [01:00<00:00,  2.40s/it]



Voice conversion completed!
Processing time: 77.46 seconds
RTF (Real-Time Factor): 7.8677
Generated audio length: 9.85 seconds
Converted audio saved to: ./output\converted_recorded_audio.wav

=== Audio Comparison ===
Original Source:


Target Reference:


Converted Result:



✓ Voice conversion completed successfully!


In [21]:
to_mel

<function __main__.load_additional_models.<locals>.<lambda>(x)>

In [None]:
# Direct voice conversion execution (without function wrapper)
# This performs the same operations as perform_voice_conversion but directly in the cell

# Configuration parameters - Use recorded audio if available
if 'recorded_source_path' in locals() and 'recorded_target_path' in locals():
    if recorded_source_path and recorded_target_path and os.path.exists(recorded_source_path) and os.path.exists(recorded_target_path):
        source_path = recorded_source_path
        target_path = recorded_target_path
        print("Using recorded audio files for direct conversion")
    else:
        source_path = "./seed-vc/examples/source/source_s1.wav"
        target_path = "./seed-vc/examples/reference/s1p1.wav"
        print("Using sample audio files (recorded audio not available)")
else:
    source_path = "./seed-vc/examples/source/source_s1.wav"
    target_path = "./seed-vc/examples/reference/s1p1.wav"
    print("Using sample audio files (no recorded audio detected)")

diffusion_steps = 25
length_adjust = 1.0
inference_cfg_rate = 0.7

print(f"=== Direct Voice Conversion Execution ===")
print(f"Source: {source_path}")
print(f"Target: {target_path}")
print(f"Diffusion steps: {diffusion_steps}")
print(f"Length adjust: {length_adjust}")
print(f"CFG rate: {inference_cfg_rate}")

# Get parameters from mel function
sr = mel_fn_args['sampling_rate']
hop_length = mel_fn_args['hop_size']
max_context_window = sr // hop_length * 30
overlap_frame_len = 16
overlap_wave_len = overlap_frame_len * hop_length

print(f"Sample rate: {sr} Hz")
print(f"Hop length: {hop_length}")

# Load audio files
print("\nLoading audio files...")
source_audio = librosa.load(source_path, sr=sr)[0]
ref_audio = librosa.load(target_path, sr=sr)[0]

# Convert to tensors
source_audio = torch.tensor(source_audio).unsqueeze(0).float().to(device)
ref_audio = torch.tensor(ref_audio[:sr * 25]).unsqueeze(0).float().to(device)  # Limit ref audio to 25 seconds

print(f"Source audio shape: {source_audio.shape}")
print(f"Reference audio shape: {ref_audio.shape}")
print(f"Source duration: {source_audio.shape[-1] / sr:.2f} seconds")
print(f"Reference duration: {ref_audio.shape[-1] / sr:.2f} seconds")

# Start timing
start_time = time.time()

# Resample to 16kHz for semantic processing
print("\nResampling audio for semantic processing...")
converted_waves_16k = torchaudio.functional.resample(source_audio, sr, 16000)
ori_waves_16k = torchaudio.functional.resample(ref_audio, sr, 16000)

print(f"Resampled source shape: {converted_waves_16k.shape}")
print(f"Resampled reference shape: {ori_waves_16k.shape}")

# Extract semantic features
print("\nExtracting semantic features...")
S_alt = semantic_fn(converted_waves_16k)
S_ori = semantic_fn(ori_waves_16k)

print(f"Source semantic features shape: {S_alt.shape}")
print(f"Reference semantic features shape: {S_ori.shape}")

# Extract mel spectrograms
print("\nExtracting mel spectrograms...")
mel = to_mel(source_audio.to(device).float())
mel2 = to_mel(ref_audio.to(device).float())

print(f"Source mel shape: {mel.shape}")
print(f"Reference mel shape: {mel2.shape}")

target_lengths = torch.LongTensor([int(mel.size(2) * length_adjust)]).to(mel.device)
target2_lengths = torch.LongTensor([mel2.size(2)]).to(mel2.device)

print(f"Target lengths: {target_lengths}")
print(f"Reference lengths: {target2_lengths}")

# Extract speaker embedding
print("\nExtracting speaker embedding...")
feat2 = torchaudio.compliance.kaldi.fbank(ori_waves_16k,
                                          num_mel_bins=80,
                                          dither=0,
                                          sample_frequency=16000)
feat2 = feat2 - feat2.mean(dim=0, keepdim=True)
style2 = campplus_model(feat2.unsqueeze(0))

print(f"Speaker embedding shape: {style2.shape}")

# Length regulation
print("\nPerforming length regulation...")
cond, _, codes, commitment_loss, codebook_loss = model.length_regulator(S_alt, ylens=target_lengths,
                                                                       n_quantizers=3, f0=None)
prompt_condition, _, codes, commitment_loss, codebook_loss = model.length_regulator(S_ori,
                                                                                   ylens=target2_lengths,
                                                                                   n_quantizers=3, f0=None)

print(f"Source condition shape: {cond.shape}")
print(f"Prompt condition shape: {prompt_condition.shape}")

# Generate audio chunks
print("\nGenerating converted audio...")
max_source_window = max_context_window - mel2.size(2)
processed_frames = 0
generated_wave_chunks = []

print(f"Max source window: {max_source_window}")
print(f"Total frames to process: {cond.size(1)}")

# Crossfade function (defined inline)
def crossfade_inline(chunk1, chunk2, overlap):
    fade_out = np.cos(np.linspace(0, np.pi / 2, overlap)) ** 2
    fade_in = np.cos(np.linspace(np.pi / 2, 0, overlap)) ** 2
    if len(chunk2) < overlap:
        chunk2[:overlap] = chunk2[:overlap] * fade_in[:len(chunk2)] + (chunk1[-overlap:] * fade_out)[:len(chunk2)]
    else:
        chunk2[:overlap] = chunk2[:overlap] * fade_in + chunk1[-overlap:] * fade_out
    return chunk2

chunk_count = 0
while processed_frames < cond.size(1):
    chunk_count += 1
    print(f"Processing chunk {chunk_count}, frames {processed_frames}/{cond.size(1)}")
    
    chunk_cond = cond[:, processed_frames:processed_frames + max_source_window]
    is_last_chunk = processed_frames + max_source_window >= cond.size(1)
    cat_condition = torch.cat([prompt_condition, chunk_cond], dim=1)
    
    print(f"  Chunk condition shape: {chunk_cond.shape}")
    print(f"  Concatenated condition shape: {cat_condition.shape}")
    print(f"  Is last chunk: {is_last_chunk}")
    
    with torch.autocast(device_type=device.type, dtype=torch.float16 if fp16 else torch.float32):
        # Voice Conversion
        vc_target = model.cfm.inference(cat_condition,
                                       torch.LongTensor([cat_condition.size(1)]).to(mel2.device),
                                       mel2, style2, None, diffusion_steps,
                                       inference_cfg_rate=inference_cfg_rate)
        vc_target = vc_target[:, :, mel2.size(-1):]
    
    print(f"  Generated target shape: {vc_target.shape}")
    
    with torch.inference_mode():
        vc_wave = vocoder_fn(vc_target.float()).squeeze()
    vc_wave = vc_wave[None, :]
    
    print(f"  Vocoded wave shape: {vc_wave.shape}")
    
    if processed_frames == 0:
        if is_last_chunk:
            output_wave = vc_wave[0].cpu().numpy()
            generated_wave_chunks.append(output_wave)
            print(f"  Single chunk output: {output_wave.shape}")
            break
        output_wave = vc_wave[0, :-overlap_wave_len].cpu().numpy()
        generated_wave_chunks.append(output_wave)
        previous_chunk = vc_wave[0, -overlap_wave_len:]
        processed_frames += vc_target.size(2) - overlap_frame_len
        print(f"  First chunk output: {output_wave.shape}")
    elif is_last_chunk:
        output_wave = crossfade_inline(previous_chunk.cpu().numpy(), vc_wave[0].cpu().numpy(), overlap_wave_len)
        generated_wave_chunks.append(output_wave)
        processed_frames += vc_target.size(2) - overlap_frame_len
        print(f"  Last chunk output: {output_wave.shape}")
        break
    else:
        output_wave = crossfade_inline(previous_chunk.cpu().numpy(), vc_wave[0, :-overlap_wave_len].cpu().numpy(),
                              overlap_wave_len)
        generated_wave_chunks.append(output_wave)
        previous_chunk = vc_wave[0, -overlap_wave_len:]
        processed_frames += vc_target.size(2) - overlap_frame_len
        print(f"  Middle chunk output: {output_wave.shape}")

# Concatenate all chunks
print(f"\nConcatenating {len(generated_wave_chunks)} chunks...")
vc_wave = torch.tensor(np.concatenate(generated_wave_chunks))[None, :].float()

end_time = time.time()
rtf = (end_time - start_time) / vc_wave.size(-1) * sr

print(f"\n=== Voice Conversion Completed! ===")
print(f"Final output shape: {vc_wave.shape}")
print(f"Processing time: {end_time - start_time:.2f} seconds")
print(f"RTF (Real-Time Factor): {rtf:.4f}")
print(f"Generated audio length: {vc_wave.size(-1) / sr:.2f} seconds")

# Save the converted audio
output_dir = "./output"
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "direct_converted_audio.wav")
torchaudio.save(output_path, vc_wave.cpu(), sr)
print(f"Converted audio saved to: {output_path}")

# Display audio players for comparison
print(f"\n=== Audio Comparison ===")
print("Original Source:")
ipd.display(ipd.Audio(source_path))

print("Target Reference:")
ipd.display(ipd.Audio(target_path))

print("Converted Result:")
ipd.display(ipd.Audio(output_path))

Using recorded audio files for direct conversion
=== Direct Voice Conversion Execution ===
Source: ./recorded_audio\recorded_source.wav
Target: ./recorded_audio\recorded_target.wav
Diffusion steps: 25
Length adjust: 1.0
CFG rate: 0.7
Sample rate: 22050 Hz
Hop length: 256

Loading audio files...


TypeError: to() received an invalid combination of arguments - got (dict), but expected one of:
 * (torch.device device = None, torch.dtype dtype = None, bool non_blocking = False, bool copy = False, *, torch.memory_format memory_format = None)
 * (torch.dtype dtype, bool non_blocking = False, bool copy = False, *, torch.memory_format memory_format = None)
 * (Tensor tensor, bool non_blocking = False, bool copy = False, *, torch.memory_format memory_format = None)


In [None]:
device

{'name': 'Input ()',
 'index': 9,
 'hostapi': 1,
 'max_input_channels': 8,
 'max_output_channels': 0,
 'default_low_input_latency': 0.01,
 'default_low_output_latency': 0.01,
 'default_high_input_latency': 0.08533333333333333,
 'default_high_output_latency': 0.08533333333333333,
 'default_samplerate': 44100.0}

In [None]:
torch.tensor(source_audio)

torch.Size([110592])