In [5]:
import sys
import os
import numpy as np
import librosa
import soundfile as sf

rtvc_path = os.path.abspath('final-project/rtvc')
if rtvc_path not in sys.path:
    sys.path.append(rtvc_path)

# Import necessary modules
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder

def load_models():
    encoder.load_model('final-project/rtvc/encoder/saved_model.pt')
    synthesizer_model = Synthesizer('final-project/rtvc/synthesizer/saved_model.pt')
    vocoder.load_model('final-project/rtvc/vocoder/saved_model.pt')
    return synthesizer_model

def clone_voice_and_generate_text(synthesizer, audio_file, text):
    # Load and preprocess the reference audio file
    original_wav, sampling_rate = librosa.load(audio_file, sr=None)
    preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
    
    # Create the embedding for the voice
    embed = encoder.embed_utterance(preprocessed_wav)
    
    # Generate the speech from text using the cloned voice
    spectrogram = synthesizer.synthesize_spectrograms([text], [embed])[0]
    generated_wav = vocoder.infer_waveform(spectrogram)
    
    # Normalize volume and convert to float32
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
    generated_wav = encoder.preprocess_wav(generated_wav)
    
    # Save or play the generated speech
    output_file = "output.wav"
    sf.write(output_file, generated_wav, synthesizer.sample_rate)
    print(f"Generated speech saved as {output_file}")

# Example usage
if __name__ == "__main__":
    synthesizer = load_models()
    audio_file = 'data_training/audio_1724919320.2932997.wav'
    text = 'Hello, this is a cloned voice speaking!'
    clone_voice_and_generate_text(synthesizer, audio_file, text)


ModuleNotFoundError: No module named 'unidecode'

In [7]:
import os

# List contents of rtvc directory
rtvc_dir = os.path.abspath('rtvc')
print("Listing contents of rtvc directory:")
for root, dirs, files in os.walk(rtvc_dir):
    for name in files:
        print(os.path.join(root, name))


Listing contents of rtvc directory:


In [None]:
# Change directory to Real-Time-Voice-Cloning
os.chdir('rtvc')

# Try import directly
try:
    from synthesizer.inference import Synthesizer
    from encoder import inference as encoder
    from vocoder import inference as vocoder
    print("Modules imported successfully!")
except ModuleNotFoundError as e:
    print(f"ModuleNotFoundError: {e}")
except Exception as e:
    print(f"An error occurred: {e}")


FileNotFoundError: [WinError 2] The system cannot find the file specified: 'rtvc'

In [None]:
import tensorflow as tf
import librosa
import numpy as np

# Data collection and feature extraction
def collect_data(target_voice_path):
  audio, sr = librosa.load(target_voice_path)
  mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)
  return mfcc

# Model training
def train_model(training_data):
  # Create a Variational Autoencoder (VAE) model
  model = tf.keras.models.Sequential([
      tf.keras.layers.Input(shape=(40, None)),  # Input shape (MFCCs)
      tf.keras.layers.LSTM(256, return_sequences=True),
      tf.keras.layers.LSTM(256, return_sequences=True),
      tf.keras.layers.Dense(64),
      tf.keras.layers.Lambda(lambda x: tf.keras.layers.experimental.preprocessing.Reshape(target_shape=(1, -1))(x)),
      tf.keras.layers.Dense(40)
  ])

  model.compile(loss='mse', optimizer='adam')
  model.fit(training_data, training_data, epochs=100)
  return model

# Text-to-speech synthesis
def synthesize_voice(model, text):
  # Convert text to phonemes or characters
  phonemes = convert_text_to_phonemes(text)

  # Generate features using the trained model
  features = model.predict(np.array([phonemes]))

  # Synthesize audio using a vocoder or neural vocoder
  audio = synthesize_audio_from_features(features)
  return audio

# Helper functions (implementations omitted for brevity)
def convert_text_to_phonemes(text):
  # ...
  return phonemes

def synthesize_audio_from_features(features):
  # ...
  return audio

# Example usage
target_voice_path = "target_voice.wav"
training_data = collect_data(target_voice_path)
model = train_model(training_data)

text_to_clone = "Hello, world!"
cloned_voice = synthesize_voice(model, text_to_clone)

ModuleNotFoundError: No module named 'tensorflow'