In [1]:
from encoder.params_model import model_embedding_size as speaker_embedding_size
from utils.argutils import print_args
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder
from contentEncoder import model as ContentEncoder
from pathlib import Path
import numpy as np
import librosa
import argparse
import torch
import sys
import sounddevice as sd




Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit


In [2]:
# Locations of the pretrained checkpoints and of the two input recordings.
# The first four entries are pathlib.Path objects; the last two are plain strings.
args = dict(
    # Speaker-encoder, synthesizer and vocoder checkpoints.
    enc_model_fpath=Path("encoder/saved_models/pretrained.pt"),
    syn_model_dir=Path("synthesizer/saved_models/logs-pretrained/"),
    voc_model_fpath=Path("vocoder/saved_models/pretrained/pretrained.pt"),
    # Recording passed to the speaker encoder (the "style" voice).
    styleAudio=Path("data/styleAudio/rand1.flac"),
    # Recording passed to the content encoder, and that encoder's checkpoint.
    contentAudio="./data/contentAudio/121-121726-0010.flac",
    content_model_fpath="contentEncoder/saved_models/deepspeech4.pt",
)


## Print some environment information (for debugging purposes)
print("Running a test of your configuration...\n")
if not torch.cuda.is_available():
    # Raise instead of calling quit(-1): an exception stops this cell (and a
    # "Run All") right here, before the CUDA queries below are attempted, and the
    # reason is shown in the traceback instead of being printed to stdout.
    raise RuntimeError(
        "Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
        "for deep learning, ensure that the drivers are properly installed, and that your "
        "CUDA version matches your PyTorch installation. CPU-only inference is currently "
        "not supported."
    )

# Report the device PyTorch currently selects: index, name, compute capability
# and total memory (bytes divided by 1e9).
device_id = torch.cuda.current_device()
gpu_properties = torch.cuda.get_device_properties(device_id)
print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
      "%.1fGb total memory.\n" %
      (torch.cuda.device_count(),
       device_id,
       gpu_properties.name,
       gpu_properties.major,
       gpu_properties.minor,
       gpu_properties.total_memory / 1e9))


## Load the models one by one: content encoder, speaker encoder, synthesizer, vocoder.
print("Preparing the encoder, the synthesizer and the vocoder...")

# Both encoders are loaded from the checkpoint paths configured in `args`.
ContentEncoder.load_model(args["content_model_fpath"])
encoder.load_model(args["enc_model_fpath"])

# The synthesizer is constructed on the "taco_pretrained" sub-directory of the
# configured model directory and then loaded explicitly.
synthesizer = Synthesizer(args["syn_model_dir"] / "taco_pretrained", low_mem=False)
synthesizer.load()

# Finally the vocoder weights.
vocoder.load_model(args["voc_model_fpath"])

Running a test of your configuration...

Found 1 GPUs available. Using GPU 0 (GeForce GTX 960M) of compute capability 5.0 with 4.2Gb total memory.

Preparing the encoder, the synthesizer and the vocoder...
SpeechRecognitionModel(
  (cnn): Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (rescnn_layers): Sequential(
    (0): ResidualCNN(
      (cnn1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (cnn2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
      (layer_norm1): CNNLayerNorm(
        (layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      )
      (layer_norm2): CNNLayerNorm(
        (layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      )
    )
    (1): ResidualCNN(
      (cnn1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (cnn2): Conv2d(32, 32, kernel_size=(3

Instructions for updating:
Use keras.layers.dropout instead.


Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Use keras.layers.dense instead.






initialisation done /gpu:0

Initialized Tacotron model. Dimensions (? = dynamic shape): 
  Train mode:               False
  Eval mode:                False
  GTA mode:                 False
  Synthesis mode:           True
  Input:                    (?, ?)
  device:                   0
  embedding:                (?, ?, 512)
  enc conv out:             (?, ?, 512)
  encoder out (cond):       (?, ?, 768)
  decoder out:              (?, ?, 80)
  residual out:             (?, ?, 512)
  projected residual out:   (?, ?, 80)
  mel out:                  (?, ?, 80)
  <stop_token> out:         (?, ?)
  Tacotron Parameters       28.439 Million.
Loading checkpoint: synthesizer/saved_models/logs-pretrained/taco_pretrained/tacotron_model.ckpt-278000



Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from synthesizer/saved_models/logs-pretrained/taco_pretrained/tacotron_model.ckpt-278000
Building Wave-RNN
Trainable Parameters: 4.481M
Loading model weights at vocoder/saved_models/pretrained/pretrained.pt


In [4]:
## Computing the embedding
# First, we load the style recording using the function that the speaker encoder
# provides. This is important: there is preprocessing that must be applied.

# The following two methods are presented as equivalent; the result of the first is
# overwritten by the second, so only the second one feeds the rest of the cell.
# - Directly load from the filepath:
preprocessed_wav = encoder.preprocess_wav(args['styleAudio'])
# - If the wav is already loaded (librosa.load returns the samples and their rate):
original_wav, sampling_rate = librosa.load(args['styleAudio'])
preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
print("Loaded file succesfully")

# Then we derive the speaker embedding from the preprocessed style recording. There
# are many functions and parameters that the speaker encoder interfaces. These are
# mostly for in-depth research. You will typically only use this function (with its
# default parameters):
embed = encoder.embed_utterance(preprocessed_wav)
print("Created the embedding")

# The synthesizer works in batch, so the content encoding and the embedding are each
# wrapped in a one-element list.
# NOTE(review): the recorded run prints a tensor size, a transcript string and
# <class 'str'> around this step; presumably ContentEncoder.encode returns the
# transcript as text - confirm against the contentEncoder module.
encoding = [ContentEncoder.encode(args['contentAudio'])]
embeds = [embed]
# If you know what the attention layer alignments are, you can retrieve them here by
# passing return_alignments=True
specs = synthesizer.synthesize_spectrograms(encoding, embeds)
# Only one item was submitted, so take the first (and only) spectrogram of the batch.
spec = specs[0]
print("Created the mel spectrogram")


## Generating the waveform
print("Synthesizing the waveform:")
# Synthesizing the waveform is fairly straightforward. Remember that the longer the
# spectrogram, the more time-efficient the vocoder.
generated_wav = vocoder.infer_waveform(spec)


## Post-generation
# There's a bug with sounddevice that makes the audio cut one second earlier, so we
# pad the end with one second of zeros (synthesizer.sample_rate samples). Note that
# the padded signal is also what gets saved to disk below.
generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")

# Play the audio (non-blocking), stopping any playback that is still running.
sd.stop()
sd.play(generated_wav, synthesizer.sample_rate)

# Save it on the disk.
# librosa.output.write_wav was removed in librosa 0.8.0, so the file is written with
# scipy.io.wavfile.write instead. Mind the argument order: (filename, rate, data).
# The import is kept local to this step so the notebook's import cell is untouched.
from scipy.io import wavfile

fpath = "demo_output.wav"
print(generated_wav.dtype)
# Cast to float32 before writing, as the original save call did.
wavfile.write(fpath, synthesizer.sample_rate, generated_wav.astype(np.float32))

print("\nSaved output as %s\n\n" % fpath)

Loaded file succesfully
Created the embedding
torch.Size([1, 393, 29])
['howsecleaning ay de mestick u hebiledthat makes it easy for the government to inlist all the soldiers it neds']
<class 'str'>
Created the mel spectrogram
Synthesizing the waveform:
{| ████████████████ 85500/86400 | Batch Size: 9 | Gen Rate: 10.9kHz | }float64

Saved output as demo_output.wav


