In [44]:
import sys
import os
import numpy as np
import soundfile as sf
import librosa
from pydub import AudioSegment
from pydub.playback import play

# Assuming rtvc is installed and accessible
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder

def load_models():
    """Loads the synthesizer model.

    Returns:
        Synthesizer: The loaded synthesizer model.
    """
    synthesizer_model = Synthesizer('rtvc/saved_models/synthesizer/synthesizer.pt')
    return synthesizer_model


def convert_to_wav(input_file, output_file):
    """Converts an audio file to WAV format.

    Args:
        input_file (str): Path to the input audio file.
        output_file (str): Path to the output WAV file.
    """
    audio = AudioSegment.from_file(input_file)
    audio.export(output_file, format="wav")


def preprocess_wav(wav, source_sr, target_sr):
    """Preprocesses the WAV audio.

    Args:
        wav (np.ndarray): The audio data as a NumPy array.
        source_sr (int): The sampling rate of the source audio.
        target_sr (int): The target sampling rate.

    Returns:
        np.ndarray: The preprocessed audio data.
    """
    # Resample the wav if needed
    if source_sr is not None and source_sr != target_sr:
        wav = librosa.resample(y=wav, orig_sr=source_sr, target_sr=target_sr)
    return wav


def clone_voice_and_generate_text(synthesizer, audio_file, text):
    """Clones a voice and generates text using the cloned voice.

    Args:
        synthesizer (Synthesizer): The loaded synthesizer model.
        audio_file (str): Path to the audio file containing the reference voice.
        text (str): The text to be spoken with the cloned voice.
    """
    # Convert audio file to WAV format
    converted_audio_file = 'converted_audio.wav'
    convert_to_wav(audio_file, converted_audio_file)

    # Load and preprocess the reference audio file
    try:
        original_wav, sampling_rate = sf.read(converted_audio_file)
        preprocessed_wav = preprocess_wav(original_wav, sampling_rate, synthesizer.sample_rate)
    except Exception as e:
        print(f"Error in loading or preprocessing audio file: {e}")
        return

    # Create the embedding for the voice
    try:
        mel_spectrogram = librosa.feature.melspectrogram(y=preprocessed_wav, sr=synthesizer.sample_rate)
        embed = encoder.embed_utterance(mel_spectrogram)
    except Exception as e:
        print(f"Error in embedding utterance: {e}")
        return

    # Generate the speech from text using the cloned voice
    try:
        spectrogram = synthesizer.synthesize_spectrograms([text], [embed])[0]
        generated_wav = vocoder.infer_waveform(spectrogram)
    except Exception as e:
        print(f"Error in generating speech: {e}")
        return

    # Save or play the generated speech
    output_file = "output.wav"
    try:
        sf.write(output_file, generated_wav, synthesizer.sample_rate)
        print(f"Generated speech saved as {output_file}")
    except Exception as e:
        print(f"Error in saving audio file: {e}")
        return

    try:
        generated_wav_audio = AudioSegment(
            generated_wav.tobytes(),
            frame_rate=synthesizer.sample_rate,
            sample_width=generated_wav.dtype.itemsize,
            channels=1
        )
        play(generated_wav_audio)
    except Exception as e:
        print(f"Error in playing audio: {e}")


# Example usage
if __name__ == "__main__":
    synthesizer = load_models()
    audio_file = 'data_training/audio_1724919320.2932997.wav'
    if os.path.exists(audio_file):
        text = 'Hello, this is a cloned voice speaking!'
        clone_voice_and_generate_text(synthesizer, audio_file, text)
    else:
        print(f"Audio file does not exist: {audio_file}")

Synthesizer using device: cpu
Error in embedding utterance: melspectrogram() takes 0 positional arguments but 2 positional arguments (and 2 keyword-only arguments) were given


In [36]:
import os
from pydub import AudioSegment
from pydub.utils import which

# Set the path to ffmpeg and ffprobe
ffmpeg_path = r"C:\Users\Amaan M k\anaconda3\pkgs\ffmpeg-4.2.2-he774522_0\Library\bin\ffmpeg.exe"
ffprobe_path = r"C:\Users\Amaan M k\anaconda3\pkgs\ffmpeg-4.2.2-he774522_0\Library\bin\ffprobe.exe"

# Update the paths in AudioSegment
AudioSegment.converter = ffmpeg_path
AudioSegment.ffprobe = ffprobe_path

def convert_to_wav(input_file, output_file):
    # Verify the existence of the input file
    if not os.path.exists(input_file):
        print(f"Input file does not exist: {input_file}")
        return

    # Check if the output file already exists
    if os.path.exists(output_file):
        print(f"Output file already exists: {output_file}")
        return

    try:
        audio = AudioSegment.from_file(input_file)
        audio.export(output_file, format="wav")
        print(f"Conversion successful: {output_file}")
    except Exception as e:
        print(f"Error converting to WAV: {e}")

# Example usage
audio_file = 'data_training/audio_1724919320.9108188.wav'
converted_audio_file = 'converted_audio.wav'
convert_to_wav(audio_file, converted_audio_file)


Output file already exists: converted_audio.wav


In [37]:
import sys
import os
import numpy as np
import soundfile as sf
import librosa
from pydub import AudioSegment
from pydub.playback import play 


# Assuming rtvc is installed and accessible
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder 


def load_models(rtvc_path="rtvc"):
  """Loads the rtvc models from the specified path.

  Args:
      rtvc_path (str, optional): Path to the rtvc directory. Defaults to "rtvc".

  Returns:
      tuple: A tuple containing the loaded models (encoder, synthesizer, vocoder).
  """

  if rtvc_path not in sys.path:
    sys.path.append(rtvc_path)

  try:
    encoder_model = encoder.load_model(os.path.join(rtvc_path, "saved_models/encoder/encoder.pt"))
  except Exception as e:
    print(f"Error loading encoder model: {e}")
    return None, None, None  # Return None for all models if loading fails

  try:
    synthesizer_model = Synthesizer(os.path.join(rtvc_path, "saved_models/synthesizer/synthesizer.pt"))
  except Exception as e:
    print(f"Error loading synthesizer model: {e}")
    return None, None, None  # Return None for all models if loading fails

  try:
    vocoder_model = vocoder.load_model(os.path.join(rtvc_path, "saved_models/vocoder/vocoder.pt"))
  except Exception as e:
    print(f"Error loading vocoder model: {e}")
    return None, None, None  # Return None for all models if loading fails

  return encoder_model, synthesizer_model, vocoder_model


def clone_voice(audio_path, text="Hello there"):
  """Clones a voice from the input audio and speaks the specified text.

  Args:
      audio_path (str): Path to the audio file containing the voice to be cloned.
      text (str, optional): The text to be spoken with the cloned voice. Defaults to "Hello there".

  Returns:
      AudioSegment: The generated audio with the cloned voice speaking the text,
                     or None if there's an error.
  """

  # Load and preprocess the input audio
  audio, sr = librosa.load(audio_path)
  mel_spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr)

  # Load rtvc models
  try:
    encoder_model, synthesizer_model, vocoder_model = load_models()
  except Exception as e:  # Catch potential errors during model loading
    print(f"Error loading models: {e}")
    return None

  # Check if encoder model is loaded correctly (removed unnecessary message)
  if encoder_model is None:
    # Provide more specific error message about encoder loading failure
    print("Failed to load encoder model. Please check the model file path or its integrity.")
    return None

  # Encode the mel spectrogram
  encoded_features = encoder_model.infer(mel_spectrogram)

  # Text-to-speech with cloned voice
  mel_predicted = synthesizer_model.infer_text(text)

  # Generate audio from the predicted mel spectrogram
  generated_audio = vocoder_model.infer(mel_predicted)

  # Convert generated audio data to a pydub AudioSegment
  generated_audio = generated_audio.squeeze()  # Remove channel dimension if present
  dub_audio = AudioSegment(np.float32(generated_audio) * 32767, frame_rate=sr, channels=1, sample_width=2)

  return dub_audio


if __name__ == "__main__":
  audio_path = "data_training/audio_1724919320.2932997.wav"  # Replace with your audio file path

  try:
    cloned_audio = clone_voice(audio_path)
    if cloned_audio is not None:
      print("Text-to-speech with cloned voice generated successfully!")
      play(cloned_audio)  # Play the generated audio
    else:
      print("Error cloning voice. Please check model loading and audio file.")

  except Exception as e:
    print(f"Error cloning voice: {e}")

Loaded encoder "rtvc\saved_models/encoder/encoder.pt" trained to step 1564501
Synthesizer using device: cpu
Building Wave-RNN
Trainable Parameters: 4.481M
Loading model weights at rtvc\saved_models/vocoder/vocoder.pt
Failed to load encoder model. Please check the model file path or its integrity.
Error cloning voice. Please check model loading and audio file.
