In [None]:

import os
import torch
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate, CustomLoader, CustomCollate
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence, _id_to_symbol

from scipy.io.wavfile import write


def get_text(text, hps):
    """Convert a raw text string into a ``torch.LongTensor`` of symbol ids.

    Args:
        text: Input string handed to ``text_to_sequence``.
        hps: Hyper-parameter object; ``hps.data.text_cleaners`` and
            ``hps.data.add_blank`` are read.

    Returns:
        A ``torch.LongTensor`` built from the resulting id sequence.
    """
    symbol_ids = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        # The id sequence is passed through commons.intersperse with 0 as the second argument.
        symbol_ids = commons.intersperse(symbol_ids, 0)
    return torch.LongTensor(symbol_ids)

In [9]:
# Model definition: read the experiment config, build the generator on the GPU
# and restore its weights from a checkpoint.
from scipy.io import wavfile

# NOTE: `iter` and `dir` shadow Python builtins; the names are kept because
# other cells read them when building paths.
iter = 1895       # checkpoint step in thousands -> ./logs/{dir}/G_{iter}000.pth
speakers = 4
dir = 'tokyom'    # experiment name used for the config, log and output paths
os.makedirs(f'./inference/{dir}/{iter}k', exist_ok=True)

hps = utils.get_hparams_from_file(f"./configs/{dir}.json")

# Build the synthesizer, move it to the GPU and switch it to evaluation mode.
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model,
).cuda().eval()

# Third argument (None) is passed where the original call passed None as well.
_ = utils.load_checkpoint(f"./logs/{dir}/G_{iter}000.pth", net_g, None)



INFO:root:Loaded checkpoint './logs/tokyom/G_1895000.pth' (iteration 11485)


In [10]:
# Plain inference: synthesize every non-empty line of ./inference.txt once per speaker
# and write one wav file per (line, speaker) pair.

# Read inside a context manager so the file handle is closed right away.
with open('./inference.txt', 'r') as text_file:
    texts = text_file.readlines()

speakers = [0, 1, 2, 3]
with torch.no_grad():
    for j, line in enumerate(texts):
        # Strip the line terminator so it is neither fed to the model nor
        # embedded in the output file name through text[:5].
        text = line.strip()
        if not text:
            # Skip blank lines; j stays the original line index so names remain stable.
            continue

        stn_tst = get_text(text, hps)
        x_tst = stn_tst.cuda().unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
        for i in speakers:
            sid = torch.LongTensor([i]).cuda()
            # Only the first returned value (the audio) is needed here.
            audio, *_ = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8, length_scale=1.0)
            audio = audio[0, 0].data.cpu().float().numpy()

            wavfile.write(f'./inference/{dir}/{iter}k/{j}_{text[:5]}_{dir}_{iter}k_{i}.wav', hps.data.sampling_rate, audio)

In [53]:
# Batched inference: synthesize all lines of ./inference.txt for a single speaker,
# several lines per forward pass, and time the whole run.
import time

# Read inside a context manager so the file handle is closed right away.
with open('./inference.txt', 'r') as text_file:
    texts = text_file.readlines()

speaker = 0
dataset = CustomLoader(texts, speaker)
collate_fn = CustomCollate()
loader = DataLoader(dataset, num_workers=1, shuffle=False,
        batch_size=4, pin_memory=True,
        drop_last=False, collate_fn=collate_fn, prefetch_factor=4)

i = 0  # running utterance index used in the output file names
생성시간 = 0  # accumulated generation time in seconds
# perf_counter is a monotonic, high-resolution clock meant for measuring intervals.
start = time.perf_counter()
with torch.no_grad():
    # The third batch element is named `sids` so the global `speakers` list is not overwritten.
    for x, x_lengths, sids in loader:
        x, x_lengths = x.cuda(0), x_lengths.cuda(0)
        sids = sids.cuda(0)
        audios, attn, mask, *_ = net_g.infer(x, x_lengths, sid=sids, noise_scale=.667, noise_scale_w=0.8, length_scale=1)
        # Per-item sample count: mask summed over dims 1 and 2, times the hop length.
        # Moved to the host once per batch instead of once per item.
        lengths = (mask.sum([1, 2]).long() * hps.data.hop_length).tolist()
        for audio, length in zip(audios, lengths):
            # Keep only the first `length` samples of each item (presumably drops batch padding).
            audio = audio[0].data.cpu().float().numpy()[:length]
            wavfile.write(f'./inference/{dir}/{iter}k/{i}_{dir}_{iter}k_{speaker}.wav', hps.data.sampling_rate, audio)
            i += 1
end = time.perf_counter()
생성시간 += end-start



In [3]:
# Create a TextGrid file
import textgrid

def make_textgrid(path, phones, durations, frame_rate=86.1328):
    """Write a TextGrid file containing a single interval tier named 'phones'.

    Args:
        path: Output path without extension; '.TextGrid' is appended.
        phones: Sequence of symbol ids; each one is looked up in ``_id_to_symbol``
            to obtain the interval label.
        durations: Per-phone durations in frames. Each item only needs to be
            convertible with ``float()`` (Python numbers, NumPy scalars and
            0-d tensors all work).
        frame_rate: Frames per second used to convert frame counts to seconds.
            Defaults to 86.1328, the constant that used to be hard-coded here
            (presumably sampling_rate / hop_length, e.g. 22050 / 256 -- confirm
            against the config in use).

    Raises:
        IndexError: If ``phones`` has more entries than ``durations``.
    """
    # Cumulative interval boundaries in seconds, starting at 0.
    boundaries = [0]
    elapsed = 0
    for duration in durations:
        elapsed += float(duration) / frame_rate
        boundaries.append(round(elapsed, 5))

    tg = textgrid.TextGrid(minTime=boundaries[0], maxTime=boundaries[-1])
    tier = textgrid.IntervalTier(name='phones')
    tg.tiers.append(tier)
    for i, phone in enumerate(phones):
        begin = boundaries[i]
        finish = boundaries[i + 1]
        if finish <= begin:
            # A zero-frame phone gives an empty interval. The textgrid package is
            # understood to reject minTime >= maxTime, so such phones are skipped
            # instead of aborting the whole export.
            continue
        tier.add(minTime=begin, maxTime=finish, mark=_id_to_symbol[phone])
    tg.write(path+'.TextGrid')

In [33]:
# Duration control, step 1: synthesize the first line of ./inference.txt. If a
# "<path>.txt" duration file exists, its per-symbol durations are passed to the
# model as w_target. The audio and a TextGrid are written next to it.

with open('./inference.txt', 'r') as text_file:
    # Strip the line terminator so it is neither fed to the model nor embedded
    # in the file names built from text[:5] (here and in the step-2 cell).
    text = text_file.readline().strip()
stn_tst = get_text(text, hps)
speaker = 1
# Common prefix for the .txt (durations), .wav and .TextGrid files.
path = f'./inference/{dir}/{iter}k/{text[:5]}_{dir}_{iter}k_{speaker}'
if os.path.isfile(path+'.txt'):
    with open(path+'.txt', 'r') as duration_file:
        # The value after the last '| ' on each line is a duration in seconds;
        # it is converted to a whole number of frames (86.1328 frames per second).
        # Blank lines are ignored so a trailing empty line does not break float().
        w_target = [
            round(float(line.split('| ')[-1])*86.1328)
            for line in duration_file
            if line.strip()
        ]
    w_target = torch.FloatTensor(w_target).unsqueeze(0).unsqueeze(0).cuda()
else:
    w_target = None

with torch.no_grad():
    x_tst = stn_tst.cuda().unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    sid = torch.LongTensor([speaker]).cuda()
    audio, attn, mask, w_ceil, *_ = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.8, length_scale=1.0, w_target=w_target)
    audio = audio[0, 0].data.cpu().float().numpy()
    # w_ceil and stn_tst are kept as NumPy int arrays; the step-2 cell reads both.
    w_ceil = w_ceil.data.cpu().int().numpy()[0][0]
    stn_tst = stn_tst.data.cpu().int().numpy()

wavfile.write(path+'.wav', hps.data.sampling_rate, audio)
make_textgrid(path, stn_tst, w_ceil)

In [31]:
# Duration control, step 2 [create the duration file]
# Writes one line per symbol in the form "<symbol> : <seconds> | <seconds>".
# The step-1 cell reads the value after the last '| ' back in.

path = f'./inference/{dir}/{iter}k/{text[:5]}_{dir}_{iter}k_{speaker}'

with open(f'{path}.txt', 'w') as f:
    # Walk both arrays in lockstep, stopping at the shorter one (same as zip).
    for idx in range(min(len(stn_tst), len(w_ceil))):
        p = _id_to_symbol[stn_tst[idx]]
        # Frame count -> seconds, rounded to three decimals.
        dur = round(w_ceil[idx]/86.1328, 3)
        f.write(f'{p} : {dur} | {dur}\n')
        

### Voice Conversion

In [None]:
# Build a loader over the validation files, one utterance per batch, in file order.
dataset = TextAudioSpeakerLoader(hps.data.validation_files, hps.data)
collate_fn = TextAudioSpeakerCollate()
loader = DataLoader(
    dataset,
    batch_size=1,
    shuffle=False,
    drop_last=True,
    num_workers=8,
    pin_memory=True,
    collate_fn=collate_fn,
)
# Materialize every batch up front so individual batches can be indexed later.
data_list = [batch for batch in loader]

In [None]:
# Voice conversion demo: convert the first validation utterance to several target
# speakers and play the original followed by each converted version.

# `ipd` was used below without being imported anywhere in the visible notebook.
import IPython.display as ipd

with torch.no_grad():
    x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [t.cuda() for t in data_list[0]]
    # NOTE(review): target id 4 may be out of range if the model only has speakers
    # 0-3 (other cells iterate over [0, 1, 2, 3]) -- confirm against hps.data.n_speakers.
    target_sids = [torch.LongTensor([k]).cuda() for k in (1, 2, 4)]
    converted = [
        net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][0, 0].data.cpu().float().numpy()
        for sid_tgt in target_sids
    ]

print("Original SID: %d" % sid_src.item())
ipd.display(ipd.Audio(y[0].cpu().numpy(), rate=hps.data.sampling_rate, normalize=False))
for sid_tgt, audio in zip(target_sids, converted):
    print("Converted SID: %d" % sid_tgt.item())
    ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))