# Setup

In [1]:
# imports
import argparse
import os
import sys

import numpy as np
from scipy import signal
import soundfile as sf
import librosa
import torch

from encoder.params_model import model_embedding_size as speaker_embedding_size
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder 

# maybe
# import sounddevice as sd

  warn("Unable to import 'webrtcvad'. This package enables noise removal and is recommended.")


In [3]:
# Checkpoint locations, mirroring the arguments of demo_cli.py.
# Each pretrained .pt file sits under <project_root>/pretrained/<component>/saved_models.
project_root = os.getcwd()
enc_model_fpath, syn_model_fpath, voc_model_fpath = (
    os.path.join(project_root, 'pretrained', component, 'saved_models', *tail)
    for component, tail in (
        ('encoder', ('pretrained.pt',)),
        ('synthesizer', ('pretrained', 'pretrained.pt')),
        ('vocoder', ('pretrained', 'pretrained.pt')),
    )
)

In [4]:
# Report whether PyTorch can see a CUDA device.
cuda_available = torch.cuda.is_available()
print(cuda_available)

False


In [5]:
# Load the three pretrained models from the checkpoint paths defined earlier.
# The encoder and vocoder are loaded through their modules' load_model();
# the synthesizer is an object kept in `synthesizer` for the later cells.
encoder.load_model(enc_model_fpath)
synthesizer = Synthesizer(syn_model_fpath)
vocoder.load_model(voc_model_fpath)

Synthesizer using device: cpu
Building Wave-RNN
Trainable Parameters: 4.481M
Loading model weights at /home/amar/projects/audino/pretrained/vocoder/saved_models/pretrained/pretrained.pt


# Test

In [6]:
# Smoke test: push dummy data through the encoder, synthesizer and vocoder.

# Encoder: embed an all-zero signal that is encoder.sampling_rate samples long.
encoder.embed_utterance(np.zeros(encoder.sampling_rate))

# Create a random stand-in embedding and scale it to unit L2 norm,
# because embeddings are L2-normalized.
embed = np.random.rand(speaker_embedding_size)
embed /= np.linalg.norm(embed)

# The synthesizer handles several inputs per batch, so pair the random
# embedding with an all-zero one and give each its own text.
texts = ['test_1', 'test_2']
embeds = [embed, np.zeros(speaker_embedding_size)]

mels = synthesizer.synthesize_spectrograms(texts, embeds)

# Join the mel spectrograms along axis 1, because the vocoder is more
# efficient with longer ones.
mel = np.concatenate(mels, axis=1)


# The vocoder reports generation progress through a callback; this one
# accepts any arguments and does nothing, which hides the progress display.
def no_action(*args):
    return None


vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)

Trainable Parameters: 30.870M
+----------+---+
| Tacotron | r |
+----------+---+
|   295k   | 2 |
+----------+---+
 

| Generating 1/1


Done.



array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
       -3.73963522e-07, -1.87383031e-07, -0.00000000e+00])

# Interactive generation loop

## Computing the embedding

In [6]:
# Path of the input wav file, located under the project root
in_wav_parts = ('example_wavs', 'originals', 'p240_00000.wav')
in_wav_fpath = os.path.join(project_root, *in_wav_parts)

In [7]:
# Prepare the input for the embedding; two ways of obtaining the preprocessed wav are shown.
# Both assign to `preprocessed_wav`, so the result of the second call is the one that is kept.

# Method 1: pass the file path directly
preprocessed_wav = encoder.preprocess_wav(in_wav_fpath)

# Method 2: load the audio first, then pass the samples together with their sampling rate
# NOTE(review): librosa.load is called without `sr`, so librosa's default resampling
# applies — confirm whether the file's native rate (sr=None) was intended.
original_wav, sampling_rate = librosa.load(str(in_wav_fpath))
preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)

In [8]:
# Embed the preprocessed wav with the encoder and report the embedding's shape.
embed = encoder.embed_utterance(preprocessed_wav)
embed_shape = embed.shape
print(embed_shape)

(256,)


## Generating the spectrogram

In [60]:
# Short sample text; a placeholder while the real text is still pending
text = (
    'It is a truth universally acknowledged, '
    'that a single man in possession of a good fortune, '
    'must be in want of a wife.'
)

In [9]:
# Longer multi-sentence sample text, written one sentence per literal.
text_2 = (
    'Audio-books are a great alternative to reading books and online courses. '
    'MOOCs are great as they provide relevant information in a short period of time but they lack the in-depth coverage and comprehensiveness of the topic that books can provide. '
    'Books are the holy grail of information but in today’s fast-paced world they are cumbersome. '
    'So why not use your smartphone and make books accessible to you anytime you want? '
    'The objective of our project is to make books accessible on the fly and easy to understand. '
    'We use NLP techniques to convert a book to audio format, we also summarize the book if the user wants to try out a book before actually committing to it, or if they just want to get the gist of it.'
)

In [10]:
# The synthesizer works on batches, so the text and the embedding are each
# wrapped in a one-element list. That suits this project, since the summary
# will supply many texts.
texts, embeds = [text_2], [embed]

In [11]:
# Run the synthesizer on the batch and keep the first returned spectrogram.
# NOTE(review): the original note says attention-layer alignments can be retrieved
# from this call, but nothing here requests or stores them — confirm how, if needed.
specs = synthesizer.synthesize_spectrograms(texts, embeds)
spec = specs[0]

Trainable Parameters: 30.870M
+----------+---+
| Tacotron | r |
+----------+---+
|   295k   | 2 |
+----------+---+
 

| Generating 1/1


Done.



## Generating the waveform

In [12]:
# Run the vocoder on the synthesized spectrogram and keep the result as `generated_wav`.
generated_wav = vocoder.infer_waveform(spec)

{| ████████████████ 437000/441600 | Batch Size: 46 | Gen Rate: 8.4kHz | }

In [13]:
# Post-generation step: extend the end of the waveform with
# synthesizer.sample_rate constant-padded samples; nothing is added at the start.
pad_widths = (0, synthesizer.sample_rate)
generated_wav = np.pad(generated_wav, pad_widths, mode='constant')

In [14]:
# Pass the padded waveform through the encoder's preprocess_wav and keep the result.
# NOTE(review): this cell reassigns generated_wav from itself, so re-running it
# applies the preprocessing again — run it once per generated waveform.
generated_wav = encoder.preprocess_wav(generated_wav)

In [17]:
# Smooth the generated audio with a first-order Butterworth band-pass filter
# (band edges 0.1 and 0.9), applied forwards and backwards by filtfilt.
filter_order = 1
pass_band = (0.1, 0.9)
B, A = signal.butter(filter_order, pass_band, btype='bandpass', output='ba')
smoothed_wav = signal.filtfilt(B, A, generated_wav)

In [18]:
# Write the smoothed audio to disk at the synthesizer's sample rate.
out_filename = 'example_wavs/generated/training_voices_smoothed_01-09.wav'

# Create the target folder if it does not exist yet, so the write below does
# not fail on a fresh checkout just because the directory is missing.
os.makedirs(os.path.dirname(out_filename), exist_ok=True)

# Show the dtype of generated_wav; the smoothed samples are cast to float32 for writing.
print(generated_wav.dtype)
sf.write(out_filename, smoothed_wav.astype(np.float32), synthesizer.sample_rate)

float64
