In [14]:
# coding: utf-8
"""
Synthesis waveform from trained model.

usage: synthesis.py [options] <checkpoint> <text_list_file> <dst_dir>

options:
    --hparams=<parmas>                Hyper parameters [default: ].
    --checkpoint-seq2seq=<path>       Load seq2seq model from checkpoint path.
    --checkpoint-postnet=<path>       Load postnet model from checkpoint path.
    --file-name-suffix=<s>            File name suffix [default: ].
    --max-decoder-steps=<N>           Max decoder steps [default: 500].
    --replace_pronunciation_prob=<N>  Prob [default: 0.0].
    --speaker_id=<id>                 Speaker ID (for multi-speaker model).
    --output-html                     Output html for blog post.
    -h, --help               Show help message.
"""
from docopt import docopt
import sys
import os
from os.path import dirname, join, basename, splitext
import audio
import torch
from torch.autograd import Variable
import numpy as np
import nltk
# The deepvoice3 model
from deepvoice3_pytorch import frontend
from hparams import hparams
from tqdm import tqdm
import IPython.display as ipd


use_cuda = torch.cuda.is_available()
_frontend = None  # to be set later


In [15]:

def tts(model, text, p=0, speaker_id=None, fast=False):
    """Convert text to speech waveform given a deepvoice3 model.

    Args:
        text (str) : Input text to be synthesized
        p (float) : Replace word to pronounciation if p > 0. Default is 0.
    """
    if use_cuda:
        model = model.cuda()
    model.eval()
    if fast:
        model.make_generation_fast_()

    sequence = np.array(_frontend.text_to_sequence(text, p=p))
    sequence = Variable(torch.from_numpy(sequence)).unsqueeze(0)
    text_positions = torch.arange(1, sequence.size(-1) + 1).unsqueeze(0).long()
    text_positions = Variable(text_positions)
    speaker_ids = None if speaker_id is None else Variable(torch.LongTensor([speaker_id]))
    if use_cuda:
        sequence = sequence.cuda()
        text_positions = text_positions.cuda()
        speaker_ids = None if speaker_ids is None else speaker_ids.cuda()

    # Greedy decoding
    mel_outputs, linear_outputs, alignments, done = model(
        sequence, text_positions=text_positions, speaker_ids=speaker_ids)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio._denormalize(linear_output)
    alignment = alignments[0].cpu().data.numpy()
    mel = mel_outputs[0].cpu().data.numpy()
    mel = audio._denormalize(mel)

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram, mel



In [16]:
import pickle
with open(f'./argsdict.pickle', 'rb') as file2:
    args = pickle.load(file2)
#print(args)
#args['<checkpoint>']= './checkpoints/20210503_cei_ljspeech_vctk_p225_10min_checkpoint_step000010000.pth'
args['<checkpoint>']= './checkpoints/20210503_cei_ljspeech_vctk_p226_10min_checkpoint_step000010000.pth'
#args['<checkpoint>']= './checkpoints/20210503_cei_ljspeech_vctk_p227_10min_checkpoint_step000010000.pth'
print(args)

{'--checkpoint-postnet': None,
 '--checkpoint-seq2seq': None,
 '--file-name-suffix': '',
 '--help': False,
 '--hparams': 'builder=deepvoice3,preset=deepvoice3_ljspeech',
 '--max-decoder-steps': '500',
 '--output-html': False,
 '--replace_pronunciation_prob': '0.0',
 '--speaker_id': None,
 '<checkpoint>': './checkpoints/20210503_cei_ljspeech_vctk_p226_10min_checkpoint_step000010000.pth',
 '<dst_dir>': './checkpoints',
 '<text_list_file>': './demostring.txt'}


In [17]:
def sec2numpy(sec):
    return np.zeros(int(24000*sec))

In [27]:
! cat ./demostring.txt

We are in the business of collaborative innovation and making those ideas happen. 
Building an innovative product can be challenging. 
Compal knows how to work with clients across industries to push forward through this process, 
down to the last detail. 
Let Compal's forward-thinking mindset, 
rich experience and 43,000 professionals worldwide power your journey to delivering innovative results quickly, 
with our highest regard for quality.


In [19]:


print("Command line args:\n", args)
checkpoint_path = args["<checkpoint>"]
text_list_file_path = args["<text_list_file>"]
dst_dir = args["<dst_dir>"]
checkpoint_seq2seq_path = args["--checkpoint-seq2seq"]
checkpoint_postnet_path = args["--checkpoint-postnet"]
max_decoder_steps = int(args["--max-decoder-steps"])
file_name_suffix = args["--file-name-suffix"]
replace_pronunciation_prob = float(args["--replace_pronunciation_prob"])
#output_html = args["--output-html"]
speaker_id = args["--speaker_id"]

# Override hyper parameters
hparams.parse(args["--hparams"])
assert hparams.name == "deepvoice3"

# Presets
if hparams.preset is not None and hparams.preset != "":
    preset = hparams.presets[hparams.preset]
    import json
    hparams.parse_json(json.dumps(preset))
    print("Override hyper parameters with preset \"{}\": {}".format(
        hparams.preset, json.dumps(preset, indent=4)))

_frontend = getattr(frontend, hparams.frontend)
import train
train._frontend = _frontend
from train import plot_alignment, build_model

# Model
model = build_model()

# Load checkpoints separately
if checkpoint_postnet_path is not None and checkpoint_seq2seq_path is not None:
    checkpoint = torch.load(checkpoint_seq2seq_path)
    model.seq2seq.load_state_dict(checkpoint["state_dict"])
    checkpoint = torch.load(checkpoint_postnet_path)
    model.postnet.load_state_dict(checkpoint["state_dict"])
    checkpoint_name = splitext(basename(checkpoint_seq2seq_path))[0]
else:
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint["state_dict"])
    checkpoint_name = splitext(basename(checkpoint_path))[0]

model.seq2seq.decoder.max_decoder_steps = max_decoder_steps

os.makedirs(dst_dir, exist_ok=True)
with open(text_list_file_path, "rb") as f:
    lines = f.readlines()
    au1 = np.array([])
    for idx, line in enumerate(lines):
        text = line.decode("utf-8")[:-1]
        words = nltk.word_tokenize(text)
        waveform, alignment, _, _ = tts(model, text, p=replace_pronunciation_prob, speaker_id=speaker_id, fast=True)
        dst_wav_path = join(dst_dir, "{}_{}{}.wav".format(idx, checkpoint_name, file_name_suffix))
        dst_alignment_path = join(dst_dir, "{}_{}{}_alignment.png".format(idx, checkpoint_name,file_name_suffix))
        plot_alignment(alignment.T, dst_alignment_path,
                                                    info="{}, {}".format(hparams.builder, basename(checkpoint_path)))
        '''
        audio. (waveform, dst_wav_path)
        from os.path import basename, splitext
        name = splitext(basename(text_list_file_path))[0]
        print(name)
        '''
        au1 = waveform  if idx == 0 else np.concatenate((au1,sec2numpy(0.1), waveform),axis=0)
    ipd.Audio(au1, rate=hparams.sample_rate)        


Command line args:
 {'--checkpoint-postnet': None,
 '--checkpoint-seq2seq': None,
 '--file-name-suffix': '',
 '--help': False,
 '--hparams': 'builder=deepvoice3,preset=deepvoice3_ljspeech',
 '--max-decoder-steps': '500',
 '--output-html': False,
 '--replace_pronunciation_prob': '0.0',
 '--speaker_id': None,
 '<checkpoint>': './checkpoints/20210503_cei_ljspeech_vctk_p226_10min_checkpoint_step000010000.pth',
 '<dst_dir>': './checkpoints',
 '<text_list_file>': './demostring.txt'}
Override hyper parameters with preset "deepvoice3_ljspeech": {
    "n_speakers": 1,
    "downsample_step": 4,
    "outputs_per_step": 1,
    "embedding_weight_std": 0.1,
    "dropout": 0.050000000000000044,
    "kernel_size": 3,
    "text_embed_dim": 256,
    "encoder_channels": 512,
    "decoder_channels": 256,
    "converter_channels": 256,
    "use_guided_attention": true,
    "guided_attention_sigma": 0.2,
    "binary_divergence_weight": 0.1,
    "use_decoder_state_for_postnet_input": true,
    "max_positions

In [20]:
au1.shape

(527168,)

In [21]:
 ipd.Audio(au1, rate=hparams.sample_rate)   

In [24]:
!ls ./eval_output

p225_001.wav  p225_003.wav  p226_002.wav  p227_001.wav	p227_003.wav
p225_002.wav  p226_001.wav  p226_003.wav  p227_002.wav


In [31]:
tar='p227'
au1 = audio.load_wav('./eval_output/'+tar+'_001.wav')
ipd.Audio(au1, rate=hparams.sample_rate)  

In [32]:
au1 = audio.load_wav('./eval_output/'+tar+'_002.wav')
ipd.Audio(au1, rate=hparams.sample_rate) 

In [33]:
au1 = audio.load_wav('./eval_output/'+tar+'_003.wav')
ipd.Audio(au1, rate=hparams.sample_rate) 