In [10]:
from encoder.params_model import model_embedding_size as speaker_embedding_size
from utils.argutils import print_args
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder
from pathlib import Path
import numpy as np
import librosa
import argparse
import torch
import sys
import sounddevice as sd


if __name__ == '__main__':
    # Voice-cloning demo: derive a speaker embedding from the audio file at
    # ``audioPath``, synthesize a mel spectrogram of ``text`` conditioned on
    # that embedding, turn it into a waveform with the vocoder, then play the
    # result and save it as a numbered wav file in the working directory.

    # Set path to audio file for style embedding, change this and rerun to get a new voice
    audioPath = Path("data/rand1.flac")
    text = "My name is Jared Levy and I'm a massive bitch made hoe"

    # Locations of the three pretrained models used below.
    args = {
        "enc_model_fpath": Path("encoder/saved_models/pretrained.pt"),
        "syn_model_dir": Path("synthesizer/saved_models/logs-pretrained/"),
        "voc_model_fpath": Path("vocoder/saved_models/pretrained/pretrained.pt"),
    }

    # Running counter used to number the output files. It was previously read
    # without ever being defined here, so a fresh run raised NameError just
    # before saving (and the handler below swallowed it). Reuse the value left
    # by an earlier run of this code in the same session, else start at 0.
    num_generated = globals().get("num_generated", 0)

    ## Print some environment information (for debugging purposes)
    print("Running a test of your configuration...\n")
    if not torch.cuda.is_available():
        print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
              "for deep learning, ensure that the drivers are properly installed, and that your "
              "CUDA version matches your PyTorch installation. CPU-only inference is currently "
              "not supported.",)
        # sys.exit is the documented way to terminate a program; quit() is an
        # interactive-shell helper injected by the site module.
        sys.exit(-1)
    device_id = torch.cuda.current_device()
    gpu_properties = torch.cuda.get_device_properties(device_id)
    print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
          "%.1fGb total memory.\n" %
          (torch.cuda.device_count(),
           device_id,
           gpu_properties.name,
           gpu_properties.major,
           gpu_properties.minor,
           gpu_properties.total_memory / 1e9))

    ## Load the models one by one.
    print("Preparing the encoder, the synthesizer and the vocoder...")
    encoder.load_model(args["enc_model_fpath"])
    synthesizer = Synthesizer(args["syn_model_dir"].joinpath("taco_pretrained"), low_mem=False)
    vocoder.load_model(args["voc_model_fpath"])

    try:
        ## Computing the embedding
        # Load the wav and run it through the preprocessing function that the
        # speaker encoder provides. This is important: there is preprocessing
        # that must be applied.
        # (The file used to be preprocessed a second time straight from its
        # path, with that result immediately overwritten; only the call whose
        # result was actually used is kept.)
        original_wav, sampling_rate = librosa.load(audioPath)
        preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
        print("Loaded file succesfully")

        # Then we derive the embedding. There are many functions and parameters that the
        # speaker encoder interfaces. These are mostly for in-depth research. You will typically
        # only use this function (with its default parameters):
        embed = encoder.embed_utterance(preprocessed_wav)
        print("Created the embedding")

        # The synthesizer works in batch, so you need to put your data in a list or numpy array
        texts = [text]
        embeds = [embed]
        # If you know what the attention layer alignments are, you can retrieve them here by
        # passing return_alignments=True
        specs = synthesizer.synthesize_spectrograms(texts, embeds)
        spec = specs[0]
        print("Created the mel spectrogram")

        ## Generating the waveform
        print("Synthesizing the waveform:")
        # Synthesizing the waveform is fairly straightforward. Remember that the longer the
        # spectrogram, the more time-efficient the vocoder.
        generated_wav = vocoder.infer_waveform(spec)

        ## Post-generation
        # There's a bug with sounddevice that makes the audio cut one second earlier, so we
        # pad it. Note that the padded signal is also what gets written to disk.
        generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")

        # Play the audio (non-blocking): stop any playback still running from
        # a previous run, then start the new one and carry on without waiting.
        sd.stop()
        sd.play(generated_wav, synthesizer.sample_rate)

        # Save it on the disk
        fpath = "demo_output_%02d.wav" % num_generated
        print(generated_wav.dtype)
        # The waveform is cast to float32 before it is written.
        # NOTE(review): librosa.output was removed in librosa 0.8.0, so this
        # call needs an older librosa - confirm the pinned version.
        librosa.output.write_wav(fpath, generated_wav.astype(np.float32),
                                 synthesizer.sample_rate)
        # Only advance the counter once the file has actually been written.
        num_generated += 1
        print("\nSaved output as %s\n\n" % fpath)

    except Exception as e:
        # Best-effort demo: report the failure instead of crashing the session.
        print("Caught exception: %s" % repr(e))
        print("Restarting\n")


Running a test of your configuration...

Found 1 GPUs available. Using GPU 0 (GeForce GTX 960M) of compute capability 5.0 with 4.2Gb total memory.

Preparing the encoder, the synthesizer and the vocoder...
Loaded encoder "pretrained.pt" trained to step 1564501
Found synthesizer "pretrained" trained to step 278000
Building Wave-RNN
Trainable Parameters: 4.481M
Loading model weights at vocoder/saved_models/pretrained/pretrained.pt
Loaded file succesfully
Created the embedding
Constructing model: Tacotron








initialisation done /gpu:0
Initialized Tacotron model. Dimensions (? = dynamic shape): 
  Train mode:               False
  Eval mode:                False
  GTA mode:                 False
  Synthesis mode:           True
  Input:                    (?, ?)
  device:                   0
  embedding:                (?, ?, 512)
  enc conv out:             (?, ?, 512)
  encoder out (cond):       (?, ?, 768)
  decoder out:              (?, ?, 80)
  residual out:             (?, ?, 512)
  projected residual out:   (?, ?, 80)
  mel out:                  (?, ?, 80)
  <stop_token> out:         (?, ?)
  Tacotron Parameters       28.439 Million.
Loading checkpoint: synthesizer/saved_models/logs-pretrained/taco_pretrained/tacotron_model.ckpt-278000


INFO:tensorflow:Restoring parameters from synthesizer/saved_models/logs-pretrained/taco_pretrained/tacotron_model.ckpt-278000
Created the mel spectrogram
Synthesizing the waveform:
{| ████████████████ 66500/67200 | Batch Size: 7 | Gen Rate: 9.8kHz | }float64

Saved output as demo_output_02.wav


