In [260]:
# Sets CUDA_VISIBLE_DEVICES to 1 for this kernel process. Kept as the first cell so the
# variable is in place before any later cell calls `.cuda()`.
# NOTE(review): the GPU index is machine-specific — adjust to the host you run on.
%env CUDA_VISIBLE_DEVICES=1

env: CUDA_VISIBLE_DEVICES=1


# Inference process of WaveGrad

In [261]:
import sys
sys.path.insert(0, '..')

import json
import IPython.display as ipd

import torch

from tqdm import tqdm

import utils
import benchmark
from model import WaveGrad
from data import AudioDataset, MelSpectrogramFixed

**Load configuration**

In [262]:
CONFIG_PATH = '../configs/default.json'

# Parse the JSON file first, then wrap the resulting dict for attribute-style access.
with open(CONFIG_PATH) as config_file:
    raw_config = json.load(config_file)
config = utils.ConfigWrapper(**raw_config)

# Prefix every path stored in the training config with '../' (the same parent
# directory that is put on sys.path in the imports cell).
training_config = config.training_config
for path_field in ('logdir', 'train_filelist_path', 'test_filelist_path'):
    setattr(training_config, path_field, f'../{getattr(training_config, path_field)}')

# Last expression: display the resolved configuration.
config

{'model_config': {'factors': [5, 5, 3, 2, 2], 'upsampling_preconv_out_channels': 768, 'upsampling_out_channels': [512, 512, 256, 128, 128], 'upsampling_dilations': [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 4, 8], [1, 2, 4, 8], [1, 2, 4, 8]], 'downsampling_preconv_out_channels': 32, 'downsampling_out_channels': [128, 128, 256, 512], 'downsampling_dilations': [[1, 2, 4], [1, 2, 4], [1, 2, 4], [1, 2, 4]]}, 'data_config': {'sample_rate': 22050, 'n_fft': 1024, 'win_length': 1024, 'hop_length': 300, 'f_min': 80.0, 'f_max': 8000, 'n_mels': 80}, 'training_config': {'logdir': '../logs/default', 'continue_training': False, 'train_filelist_path': '../filelists/train.txt', 'test_filelist_path': '../filelists/test.txt', 'batch_size': 96, 'segment_length': 7200, 'lr': 0.001, 'grad_clip_threshold': 1, 'scheduler_step_size': 1, 'scheduler_gamma': 0.9, 'n_epoch': 100000000, 'n_samples_to_test': 4, 'test_interval': 1, 'training_noise_schedule': {'n_iter': 1000, 'betas_range': [1e-06, 0.01]}, 'test_noise_sched

**Initialize the model**

In [263]:
# Build the model from the loaded config and move it to the GPU.
model = WaveGrad(config).cuda()
print(f'Number of parameters: {model.nparams}')

# Alternative: resume from the newest checkpoint in the log directory instead:
# model, _, _ = utils.load_latest_checkpoint(config.training_config.logdir, model)

# Load the pretrained weights.
# * map_location='cpu': without it, torch.load restores each tensor to the device it
#   was saved from, which raises if that CUDA index is not visible to this kernel.
#   load_state_dict then copies the values into the model's existing parameters.
# * strict=False: keys that the model does not declare are skipped and reported in
#   the returned `_IncompatibleKeys` (displayed as this cell's output) — check that
#   `missing_keys` is empty.
model.load_state_dict(
    torch.load('../logs/pretrained_ljspeech.pt', map_location='cpu')['model'],
    strict=False
)

Number of parameters: 15810401


_IncompatibleKeys(missing_keys=[], unexpected_keys=['betas', 'alphas', 'alphas_cumprod', 'alphas_cumprod_prev', 'sqrt_alphas_cumprod', 'sqrt_recip_alphas_cumprod', 'sqrt_recipm1_alphas_cumprod', 'posterior_log_variance_clipped', 'posterior_mean_coef1', 'posterior_mean_coef2'])

**Initialize the dataset**

In [264]:
# Dataset constructed from the loaded config with training=False.
dataset = AudioDataset(config, training=False)

# Mel-spectrogram transform: every numeric setting is read from `config.data_config`,
# the window function is fixed to a Hann window, and the transform is moved to the GPU.
data_config = config.data_config
mel_fn = MelSpectrogramFixed(
    window_fn=torch.hann_window,
    **{
        field: getattr(data_config, field)
        for field in (
            'sample_rate', 'n_fft', 'win_length', 'hop_length',
            'f_min', 'f_max', 'n_mels',
        )
    }
)
mel_fn = mel_fn.cuda()

In [265]:
TEST_BATCH_SIZE = 2

# Sample a test batch of TEST_BATCH_SIZE items from the dataset.
test_batch = dataset.sample_test_batch(TEST_BATCH_SIZE)

# Play back each sampled waveform. The playback rate comes from the config (not a
# hard-coded 22050) so the audio is rendered at the correct speed for any config,
# matching how the generated audio is played further down.
for test_sample in test_batch:
    ipd.display(ipd.Audio(test_sample.squeeze(), rate=config.data_config.sample_rate))

**Grid search for the best schedule (optional; otherwise set the betas by hand in the next section)**

Note: the lower the `step` argument, the more accurate the search.

In [266]:
# Toggle: set to True to run the (optional) schedule grid search.
PERFORM_GRID_SEARCH = False

if PERFORM_GRID_SEARCH:
    n_iter = 25
    path_to_store_schedule = f'../schedules/default/{n_iter}iters.pt'

    # All tunable search settings gathered in one place.
    grid_search_options = dict(
        n_iter=n_iter,
        betas_range=(1e-06, 0.01),
        test_batch_size=1,
        step=1,
        path_to_store_schedule=path_to_store_schedule,
        save_stats_for_grid=True,
        verbose=True,
        n_jobs=4,
    )
    iters_best_schedule, stats = benchmark.iters_schedule_grid_search(
        model, config, **grid_search_options
    )

**Set noise schedule**

Note: `init_kwargs` should always contain the key `steps`.

In [267]:
# Stored noise schedules, keyed by their number of iterations.
SCHEDULE_PATHS = {
    6: '../schedules/pretrained/6iters.pt',
    7: '../schedules/pretrained/7iters.pt',
    12: '../schedules/pretrained/12iters.pt',
    25: '../schedules/pretrained/25iters.pt',
    50: '../schedules/pretrained/50iters.pt',
    100: '../schedules/pretrained/100iters.pt',
    1000: '../schedules/pretrained/1000iters.pt',
}


def _load_schedule(**kwargs):
    """Load the object stored at ``kwargs['path']`` and return it as a ``torch.FloatTensor``.

    Any other keyword (e.g. ``steps``) is accepted and ignored.
    """
    stored_schedule = torch.load(kwargs['path'])
    return torch.FloatTensor(stored_schedule)


def _schedule_entry(n_steps, schedule_path):
    """Build the ``{'init', 'init_kwargs'}`` record for one stored schedule."""
    return {
        'init': _load_schedule,
        'init_kwargs': {'steps': n_steps, 'path': schedule_path},
    }


# One record per stored schedule, keyed like SCHEDULE_PATHS.
SCHEDULES = {
    n_steps: _schedule_entry(n_steps, schedule_path)
    for n_steps, schedule_path in SCHEDULE_PATHS.items()
}

In [268]:
# Which entry of SCHEDULES to install on the model.
SCHEDULE_TYPE_TO_SET = 25

# A SCHEDULES entry holds exactly the keys 'init' and 'init_kwargs', which are the
# keyword arguments passed to `set_new_noise_schedule`, so it is unpacked directly.
model.set_new_noise_schedule(**SCHEDULES[SCHEDULE_TYPE_TO_SET])
# To inspect the stored schedule itself: torch.load(SCHEDULE_PATHS[SCHEDULE_TYPE_TO_SET])

**Inference**

In [269]:
def _generate_from_sample(sample):
    """Compute the mel-spectrogram of one test sample and run the model on it.

    The sample gets a leading axis (``sample[None]``) and is moved to the GPU
    before the mel transform; intermediate model states are not stored.
    """
    mel = mel_fn(sample[None].cuda())
    return model.forward(mel, store_intermediate_states=False)


# One model output per test sample, with a progress bar over the batch.
test_preds = [_generate_from_sample(test_sample) for test_sample in tqdm(test_batch)]

100%|██████████| 2/2 [00:02<00:00,  1.49s/it]


In [270]:
# Render an audio player for every model output (moved to CPU for playback),
# using the sample rate from the data config.
sample_rate = config.data_config.sample_rate
for generated in test_preds:
    ipd.display(ipd.Audio(generated.squeeze().cpu(), rate=sample_rate))

**Compute real-time factor (RTF)**

In [16]:
# Estimate the average real-time factor over the test filelist.
# The filelist path is read from the config (already prefixed with '../' in the
# configuration cell) instead of repeating the literal, so this cell always
# benchmarks the filelist that belongs to the loaded config.
rtf_stats = benchmark.estimate_average_rtf_on_filelist(
    config.training_config.test_filelist_path, config, model, verbose=True
)

100%|██████████| 100/100 [00:39<00:00,  2.51it/s]

DEVICE: cuda:0. average_rtf=0.05847837846552753, std=0.0043890874577761346



