In [1]:
%matplotlib inline
import IPython.display as ipd

import torch
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence

from scipy.io.wavfile import write as write_wav


def get_text(text, hps):
    """Turn a text string into a tensor of symbol ids for the synthesizer.

    Args:
        text: the string to synthesize.
        hps: hyperparameters; ``hps.data.text_cleaners`` is forwarded to
            ``text_to_sequence`` and ``hps.data.add_blank`` decides whether
            the id sequence is interspersed with 0.

    Returns:
        A ``torch.LongTensor`` built from the resulting id sequence.
    """
    sequence = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        # Same call the original makes: intersperse the ids with the value 0.
        sequence = commons.intersperse(sequence, 0)
    return torch.LongTensor(sequence)

# Hyperparameters, loaded from the 12-19 JSON config file.
# `hps` is read by get_text and by the cells below through its
# `data`, `train` and `model` sections.
hps = utils.get_hparams_from_file("local/12-19.config.json")

## Single-speaker speech

#### Load checkpoint

In [2]:
# Positional constructor arguments, derived from the loaded hyperparameters.
n_symbols = len(symbols)
# presumably the one-sided FFT bin count for `filter_length` — TODO confirm
n_spec_bins = hps.data.filter_length // 2 + 1
frames_per_segment = hps.train.segment_size // hps.data.hop_length

# Build the generator on the GPU, then put it in evaluation mode.
net_g = SynthesizerTrn(
    n_symbols, n_spec_bins, frames_per_segment, **hps.model
).cuda()
_ = net_g.eval()

# Load the saved generator state into net_g. The third argument is passed
# as None (presumably the optimizer slot — verify against utils.load_checkpoint).
_ = utils.load_checkpoint("logs/12-19/G_3500.pth", net_g, None)

INFO:root:Loaded checkpoint 'logs/12-19/G_3500.pth' (iteration 270)


#### Run inference

In [19]:
# Encode the prompt as symbol ids (see get_text).
stn_tst = get_text("Example text!", hps)

with torch.no_grad():
    # Add a leading axis of size one and move the ids to the GPU.
    x_tst = stn_tst.unsqueeze(0).cuda()
    x_tst_lengths = torch.LongTensor([stn_tst.shape[0]]).cuda()
    # Take element [0, 0] of the first output and copy it to the host
    # as a float32 NumPy array.
    audio = net_g.infer(
        x_tst,
        x_tst_lengths,
        noise_scale=0.667,
        noise_scale_w=0.8,
        length_scale=1,
    )[0][0, 0].data.cpu().float().numpy()

sample_rate = hps.data.sampling_rate
# normalize=False: the samples are handed to the player without rescaling.
ipd.display(ipd.Audio(audio, rate=sample_rate, normalize=False))
# Also keep a copy of the result on disk.
write_wav("local/12-19-output.wav", sample_rate, audio)

#### Prepare for voice conversion

In [None]:
# Validation files listed in the hyperparameters, served one item per batch
# and without shuffling.
dataset = TextAudioLoader(hps.data.validation_files, hps.data)
collate_fn = TextAudioCollate()
loader_options = dict(
    batch_size=1,
    shuffle=False,
    drop_last=True,
    num_workers=8,
    pin_memory=True,
    collate_fn=collate_fn,
)
loader = DataLoader(dataset, **loader_options)

# Materialise every batch up front so later cells can index into data_list.
data_list = [batch for batch in loader]

#### Run conversion

In [None]:
# TODO: modify this! Choose the batch (index into data_list) and the target
# speaker ids below.
# NOTE(review): this cell unpacks a speaker id from every batch and calls
# net_g.voice_conversion, while the cells above sit under a single-speaker
# heading and load data through TextAudioLoader/TextAudioCollate. Confirm
# that the loaded model and loader really provide speaker ids before running.

# Speaker ids the source utterance is converted to, in display order.
TARGET_SPEAKER_IDS = (1, 2, 4)


def convert_voice(model, spec, spec_lengths, sid_src, target_sid):
    """Convert one batch to the voice of ``target_sid``.

    Args:
        model: network exposing ``voice_conversion`` (``net_g`` in this notebook).
        spec: the ``spec`` entry of a loader batch, already on the GPU.
        spec_lengths: the ``spec_lengths`` entry of the same batch.
        sid_src: tensor holding the source speaker id.
        target_sid: target speaker id as a plain int.

    Returns:
        Element ``[0, 0]`` of the first output of ``model.voice_conversion``,
        copied to the host as a float32 NumPy array.
    """
    with torch.no_grad():
        sid_tgt = torch.LongTensor([target_sid]).cuda()
        return (
            model.voice_conversion(
                spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt
            )[0][0, 0]
            .data.cpu()
            .float()
            .numpy()
        )


# Move every tensor of the first batch to the GPU.
x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [
    tensor.cuda() for tensor in data_list[0]
]

# One (target id, converted audio) pair per requested speaker.
conversions = [
    (target_sid, convert_voice(net_g, spec, spec_lengths, sid_src, target_sid))
    for target_sid in TARGET_SPEAKER_IDS
]

print("Original SID: %d" % sid_src.item())
ipd.display(ipd.Audio(y[0].cpu().numpy(), rate=hps.data.sampling_rate, normalize=False))
for target_sid, converted in conversions:
    print("Converted SID: %d" % target_sid)
    ipd.display(ipd.Audio(converted, rate=hps.data.sampling_rate, normalize=False))