In [14]:
from encoder.params_model import model_embedding_size as speaker_embedding_size
from utils.argutils import print_args
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder
from contentEncoder import model as ContentEncoder
from pathlib import Path
import numpy as np
import librosa
import argparse
import torch
import sys
import sounddevice as sd

In [15]:
# Set paths and load the models
args={"enc_model_fpath": Path("encoder/saved_models/pretrained.pt"),
      "syn_model_dir": Path("synthesizer/saved_models/logs-pretrained/"),
      "voc_model_fpath": Path("vocoder/saved_models/pretrained/pretrained.pt"),
      
      "styleAudio": Path("data/styleAudio/rand1.flac"),
      "contentAudio":  "./data/contentAudio/40-222-0030.flac",
      "content_model_fpath": "contentEncoder/saved_models/deepspeech4.pt",}


## Print some environment information (for debugging purposes)
print("Running a test of your configuration...\n")
if not torch.cuda.is_available():
    print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
          "for deep learning, ensure that the drivers are properly installed, and that your "
          "CUDA version matches your PyTorch installation. CPU-only inference is currently "
          "not supported.",)# file=sys.stderr)
    quit(-1)
device_id = torch.cuda.current_device()
gpu_properties = torch.cuda.get_device_properties(device_id)
print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
      "%.1fGb total memory.\n" % 
      (torch.cuda.device_count(),
       device_id,
       gpu_properties.name,
       gpu_properties.major,
       gpu_properties.minor,
       gpu_properties.total_memory / 1e9))


## Load the models one by one.
print("Preparing the encoder, the synthesizer and the vocoder...")
ContentEncoder.load_model(args['content_model_fpath'])
encoder.load_model(args["enc_model_fpath"])
synthesizer = Synthesizer(args["syn_model_dir"].joinpath("taco_pretrained"), low_mem=False)
synthesizer.load()
vocoder.load_model(args["voc_model_fpath"])

Running a test of your configuration...

Found 1 GPUs available. Using GPU 0 (GeForce GTX 960M) of compute capability 5.0 with 4.2Gb total memory.

Preparing the encoder, the synthesizer and the vocoder...
SpeechRecognitionModel(
  (cnn): Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (rescnn_layers): Sequential(
    (0): ResidualCNN(
      (cnn1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (cnn2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
      (layer_norm1): CNNLayerNorm(
        (layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      )
      (layer_norm2): CNNLayerNorm(
        (layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      )
    )
    (1): ResidualCNN(
      (cnn1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (cnn2): Conv2d(32, 32, kernel_size=(3









initialisation done /gpu:0
Initialized Tacotron model. Dimensions (? = dynamic shape): 
  Train mode:               False
  Eval mode:                False
  GTA mode:                 False
  Synthesis mode:           True
  Input:                    (?, ?)
  device:                   0
  embedding:                (?, ?, 512)
  enc conv out:             (?, ?, 512)
  encoder out (cond):       (?, ?, 768)
  decoder out:              (?, ?, 80)
  residual out:             (?, ?, 512)
  projected residual out:   (?, ?, 80)
  mel out:                  (?, ?, 80)
  <stop_token> out:         (?, ?)
  Tacotron Parameters       28.439 Million.
Loading checkpoint: synthesizer/saved_models/logs-pretrained/taco_pretrained/tacotron_model.ckpt-278000
INFO:tensorflow:Restoring parameters from synthesizer/saved_models/logs-pretrained/taco_pretrained/tacotron_model.ckpt-278000
Building Wave-RNN
Trainable Parameters: 4.481M
Loading model weights at vocoder/saved_models/pretrained/pretrained.pt


In [16]:
## Computing the embedding
# First, we load the wav using the function that the speaker encoder provides. This is 
# important: there is preprocessing that must be applied.

# The following two methods are equivalent:
# - Directly load from the filepath:
preprocessed_wav = encoder.preprocess_wav(args['styleAudio'])
# - If the wav is already loaded:
original_wav, sampling_rate = librosa.load(args['styleAudio'])
preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
print("Loaded file succesfully")

# Then we derive the embedding. There are many functions and parameters that the 
# speaker encoder interfaces. These are mostly for in-depth research. You will typically
# only use this function (with its default parameters):
embed = encoder.embed_utterance(preprocessed_wav)
print("Created the embedding")

# The synthesizer works in batch, so you need to put your data in a list or numpy array
text = ContentEncoder.generate_text(args['contentAudio'])
print(text)
texts = [text]
embeds = [embed]
print(embed.shape)
# If you know what the attention layer alignments are, you can retrieve them here by
# passing return_alignments=True
specs = synthesizer.synthesize_spectrograms(texts, embeds)
spec = specs[0]
print("Created the mel spectrogram")


## Generating the waveform
print("Synthesizing the waveform:")
# Synthesizing the waveform is fairly straightforward. Remember that the longer the
# spectrogram, the more time-efficient the vocoder.
generated_wav = vocoder.infer_waveform(spec)


## Post-generation
# There's a bug with sounddevice that makes the audio cut one second earlier, so we
# pad it.
generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")

# # Play the audio (non-blocking)
# sd.stop()
# sd.play(generated_wav, synthesizer.sample_rate)

# Save it on the disk
fpath = "demo_output.wav"
print(generated_wav.dtype)
librosa.output.write_wav(fpath, generated_wav.astype(np.float32), 
                         synthesizer.sample_rate)

print("\nSaved output as %s\n\n" % fpath)

Loaded file succesfully
Created the embedding
['till the very last if i reasoned with her she declared herself as much attached to me as ever']
(256,)
Created the mel spectrogram
Synthesizing the waveform:
{| ████████████████ 76000/76800 | Batch Size: 8 | Gen Rate: 8.5kHz | }float64

Saved output as demo_output.wav




In [17]:
# Play the Content Audio
!play data/contentAudio/40-222-0030.flac


data/contentAudio/40-222-0030.flac:

 File Size: 129k      Bit Rate: 160k
  Encoding: FLAC          
  Channels: 1 @ 16-bit   
Samplerate: 16000Hz      
Replaygain: off         
  Duration: 00:00:06.47  

In:100%  00:00:06.47 [00:00:00.00] Out:104k  [      |      ]        Clip:0    
Done.


In [18]:
# Play the Style Audio
!play data/styleAudio/rand1.flac


data/styleAudio/rand1.flac:

 File Size: 243k      Bit Rate: 133k
  Encoding: FLAC          
  Channels: 1 @ 16-bit   
Samplerate: 16000Hz      
Replaygain: off         
  Duration: 00:00:14.65  

In:100%  00:00:14.65 [00:00:00.00] Out:234k  [      |      ] Hd:4.6 Clip:0    
Done.


In [19]:
# Play the Generated
!play demo_output.wav


demo_output.wav:

 File Size: 348k      Bit Rate: 512k
  Encoding: F.P. PCM      
  Channels: 1 @ 25-bit   
Samplerate: 16000Hz      
Replaygain: off         
  Duration: 00:00:05.44  

In:100%  00:00:05.44 [00:00:00.00] Out:87.0k [      |      ]        Clip:2.78k
play WARN sox: `demo_output.wav' input clipped 2777 samples
Done.
