# Собираем датасет

In [None]:
%%bash
# Environment bootstrap: installs Python packages, downloads the LJSpeech
# corpus plus several Google Drive / GitHub artifacts, and copies helper code
# out of the xcmyz/FastSpeech repository into the working directory.

# Install libraries.
pip install torchaudio
pip install wandb
pip install gdown

# Download LJSpeech. `-o /dev/null` discards wget's log output only; the
# archive itself is still written to the current directory.
wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 -o /dev/null
mkdir data
tar -xvf LJSpeech-1.1.tar.bz2 >> /dev/null
mv LJSpeech-1.1 data/LJSpeech-1.1

# Google Drive download; the following `mv` expects it to produce train.txt.
gdown https://drive.google.com/u/0/uc?id=1-EdH0t0loc6vPiuVtXdhsDtzygWNSNZx
mv train.txt data/

# Download the WaveGlow checkpoint and store it under the name
# waveglow/pretrained_model/waveglow_256channels.pt.
gdown https://drive.google.com/u/0/uc?id=1WsibBTsuRg_SF2Z6L6NFRTT-NjEy1oTx
mkdir -p waveglow/pretrained_model/
mv waveglow_256channels_ljs_v2.pt waveglow/pretrained_model/waveglow_256channels.pt

# Google Drive download expected to produce mel.tar.gz; after extraction the
# number of entries in ./mels is printed as a quick sanity check.
gdown https://drive.google.com/u/0/uc?id=1cJKJTmYd905a-9GFoo5gKjzhKjUVj83j
tar -xvf mel.tar.gz
echo $(ls mels | wc -l)

# Download alignments.
wget https://github.com/xcmyz/FastSpeech/raw/master/alignments.zip
unzip alignments.zip >> /dev/null

# We will use waveglow code, data and audio preprocessing from this repo.
git clone https://github.com/xcmyz/FastSpeech.git
mv FastSpeech/text .
mv FastSpeech/audio .
mv FastSpeech/waveglow/* waveglow/
mv FastSpeech/utils.py .
mv FastSpeech/glow.py .
mv FastSpeech/hparams.py .

In [None]:
# Assemble the dataset directory: text, pitch, energy, duration, mel_spectrogram.
# This cell copies the text, alignment (duration) and mel files; the pitch and
# energy sub-directories are created and filled by later cells.

!mkdir ./fastspeech2_dataset/
!cp ./data/train.txt ./fastspeech2_dataset/
!cp -r ./alignments ./fastspeech2_dataset/ # ./alignments/0.npy 0 ... 13099
!cp -r ./mels ./fastspeech2_dataset/ # ./mels/ljspeech-mel-00001.npy 1 ... 13100

In [None]:
# Output directories for the per-utterance pitch and energy .npy files.
!mkdir ./fastspeech2_dataset/pitch
!mkdir ./fastspeech2_dataset/energy

In [None]:
# pyworld is imported as `pw` by the pitch-extraction cell (pw.dio / pw.stonemask).
!pip install pyworld

In [None]:
# pitch

import os
import soundfile as sf
import pyworld as pw
import numpy as np
import pandas as pd
from scipy.interpolate import interp1d

from tqdm import tqdm

frame_size = 1024
hop_size = 256
sample_rate = 22050

# Frame period in milliseconds: one F0 estimate per `hop_size` samples at `sample_rate`.
frame_period_ms = hop_size / sample_rate * 1000

# Relative path, consistent with the wav paths built in the loop below, so the
# cell does not depend on the Colab-specific '/content' working directory.
data = pd.read_csv('./data/LJSpeech-1.1/metadata.csv', sep='|', header=None)

for i in tqdm(range(len(data[0]))):
    filename = "./data/LJSpeech-1.1/wavs/" + data[0][i] + ".wav"
    audio, sr = sf.read(filename)

    # pyworld is fed float64 samples; convert once and reuse for both calls.
    audio_f64 = np.array(audio, dtype=np.float64)
    f0, temp_pos = pw.dio(audio_f64, sr, frame_period=frame_period_ms)
    pitch = pw.stonemask(audio_f64, f0, temp_pos, sr)

    # Truncate the contour to the total number of frames given by the alignment.
    duration_name = "./fastspeech2_dataset/alignments/" + str(i) + ".npy"
    duration = np.load(duration_name)
    length = np.sum(duration)
    pitch = pitch[:length]

    # Interpolate over zero-valued frames; frames before the first / after the
    # last non-zero value are filled with that edge value.
    non_zero_ids = np.arange(len(pitch))[pitch != 0]
    if len(non_zero_ids) == 0:
        # Without this guard the edge lookups below fail with a bare IndexError.
        raise ValueError("no non-zero pitch frames found in " + filename)
    interpolation = interp1d(non_zero_ids, pitch[non_zero_ids],
                             bounds_error=False,
                             fill_value=(pitch[non_zero_ids[0]], pitch[non_zero_ids[-1]]))
    pitch = interpolation(np.arange(len(pitch)))

    # Log scale.
    pitch = np.log(pitch)

    # Per-utterance normalization; the epsilon guards against a zero std.
    mean = pitch.mean()
    std = pitch.std()
    pitch = (pitch - mean) / (std + 1e-8)

    # File layout: [mean, std, normalized_pitch_0, normalized_pitch_1, ...].
    new_filename = "./fastspeech2_dataset/pitch/" + str(i) + ".npy"
    np.save(new_filename, np.array([mean, std] + pitch.tolist()))

100%|██████████| 13100/13100 [35:25<00:00,  6.16it/s]


In [None]:
# Sanity check: the alignment values of every utterance must sum to the number
# of rows of its mel array. Alignment files are indexed from 0 and mel files
# from 1, hence `i+1` in the mel filename.
for i in tqdm(range(0, 13100)):
    align = np.load("./fastspeech2_dataset/alignments/" + str(i) + ".npy")
    mel = np.load("./fastspeech2_dataset/mels/ljspeech-mel-" + ("%05d" % (i+1)) + ".npy")
    assert align.sum() == mel.shape[0]

100%|██████████| 13100/13100 [00:38<00:00, 337.19it/s]


In [None]:
# Sanity check: each stored pitch contour must have one value per mel row.
# The first two entries of a pitch file are skipped with `[2:]`; the
# pitch-extraction cell writes mean and std into those two slots.
for i in tqdm(range(0, 13100)):
    pitch = np.load("./fastspeech2_dataset/pitch/" + str(i) + ".npy")[2:]
    mel = np.load("./fastspeech2_dataset/mels/ljspeech-mel-" + ("%05d" % (i+1)) + ".npy")
    assert pitch.shape[0] == mel.shape[0]

100%|██████████| 13100/13100 [00:45<00:00, 290.86it/s]


In [None]:
# Energy: for every utterance, take the L2 norm of the loaded mel array along
# its last axis and save the result as ./fastspeech2_dataset/energy/<i>.npy.
# Mel files are indexed from 1, the output files from 0.

for i in tqdm(range(len(data[0]))):
    filename = "./fastspeech2_dataset/mels/ljspeech-mel-" + ("%05d" % (i + 1)) + ".npy"
    mel_spec = np.load(filename)
    energy = ((mel_spec ** 2).sum(-1))**0.5

    new_filename = "./fastspeech2_dataset/energy/" + str(i) + ".npy"
    np.save(new_filename, energy)

100%|██████████| 13100/13100 [00:41<00:00, 319.19it/s]


In [None]:
# Sanity check: each stored energy array must have one value per mel row.
for i in tqdm(range(0, 13100)):
    energy = np.load("./fastspeech2_dataset/energy/" + str(i) + ".npy")
    mel = np.load("./fastspeech2_dataset/mels/ljspeech-mel-" + ("%05d" % (i+1)) + ".npy")
    assert energy.shape[0] == mel.shape[0]

100%|██████████| 13100/13100 [00:41<00:00, 317.85it/s]


In [None]:
# Dataset-wide statistics: global min/max of the stored (normalized) pitch
# values and of the energy values. The mean/std header of each pitch file is
# skipped with `[2:]`. The tuple on the last line is the cell's displayed output.

# Sentinels chosen so that the first real value always replaces them.
max_pitch = -1e10
min_pitch = 1e10
max_energy = -1
min_energy = 1e20

for i in tqdm(range(0, 13100)):
    pitch = np.load("./fastspeech2_dataset/pitch/" + str(i) + ".npy")[2:]
    energy = np.load("./fastspeech2_dataset/energy/" + str(i) + ".npy")
    max_pitch = max(max_pitch, pitch.max())
    min_pitch = min(min_pitch, pitch.min())
    max_energy = max(max_energy, energy.max())
    min_energy = min(min_energy, energy.min())

max_pitch, min_pitch, max_energy, min_energy

100%|██████████| 13100/13100 [00:16<00:00, 815.76it/s] 


(7.497155011278676, -6.635588278343256, 91.4197, 15.023643)

In [None]:
# !zip -r fastspeech2_dataset.zip ./fastspeech2_dataset
# !mkdir -p ./drive/MyDrive/dla_hw3/dataset/
# !cp fastspeech2_dataset.zip ./drive/MyDrive/dla_hw3/dataset/


# !gdown https://drive.google.com/u/0/uc?id=1-4cIK7IXOlpQYNqFoyF3RLMiy14JufGn
# !unzip fastspeech2_dataset.zip
# !rm -rf fastspeech2_dataset.zip

# Моделька (черновик)

In [None]:
import pathlib
import random
import itertools
from tqdm import tqdm_notebook

from IPython import display
from dataclasses import dataclass

import torch
import torch.nn.functional as F
from torch import distributions
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader

import torchaudio
from torchaudio.transforms import MelSpectrogram
import math
import time
import os
import librosa
import pandas as pd
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from dataclasses import dataclass
from collections import OrderedDict

import seaborn as sns
sns.set()

import sys
sys.path.append('.')

In [None]:
@dataclass
class MelSpectrogramConfig:
    """Mel-spectrogram settings shared by the model.

    The attribute is deliberately left un-annotated: without an annotation
    ``@dataclass`` does not turn it into a field, so it stays a plain class
    attribute.
    """
    # Output width of the model's final mel projection layer.
    num_mels = 80

@dataclass
class FastSpeech2Config:
    """Hyper-parameters of the FastSpeech2 model defined in this notebook.

    All attributes are un-annotated, so ``@dataclass`` treats them as plain
    class attributes rather than dataclass fields.
    """
    # Size of the token embedding table and of the positional table
    # (the positional table gets max_seq_len + 1 rows).
    vocab_size = 300
    max_seq_len = 3000

    # Encoder FFT stack: model width, number of blocks, attention heads and
    # hidden width of the convolutional feed-forward part.
    encoder_dim = 256
    encoder_n_layer = 4
    encoder_head = 2
    encoder_conv1d_filter_size = 1024

    # Decoder FFT stack, same meaning as the encoder_* settings.
    decoder_dim = 256
    decoder_n_layer = 4
    decoder_head = 2
    decoder_conv1d_filter_size = 1024

    # (first conv, second conv) kernel sizes and paddings of the
    # position-wise feed-forward module.
    fft_conv1d_kernel = (9, 1)
    fft_conv1d_padding = (4, 0)

    # Variance predictors: conv channel count and kernel size.
    duration_predictor_filter_size = 256
    duration_predictor_kernel_size = 3
    pitch_predictor_filter_size = 256
    pitch_predictor_kernel_size = 3
    energy_predictor_filter_size = 256
    energy_predictor_kernel_size = 3
    dropout = 0.1

    # Value ranges and embedding-table sizes used to build the pitch / energy
    # bucket boundaries. The ranges are the dataset statistics printed by the
    # stats cell (7.497, -6.636, 91.42, 15.02) rounded outwards.
    max_pitch = 7.5
    min_pitch = -6.7
    pitch_n_emb = 256
    max_energy = 91.5
    min_energy = 15.0
    energy_n_emb = 256

    # Special token ids; PAD is also used as padding_idx of the embeddings
    # and as the padding marker by the mask helpers.
    PAD = 0
    UNK = 1
    BOS = 2
    EOS = 3

    # String forms of the special tokens.
    PAD_WORD = '<blank>'
    UNK_WORD = '<unk>'
    BOS_WORD = '<s>'
    EOS_WORD = '</s>'


@dataclass
class TrainConfig:
    """Training settings: dataset locations, logging, device and optimizer values.

    All attributes are un-annotated, so ``@dataclass`` treats them as plain
    class attributes rather than dataclass fields.
    """
    # Output locations.
    checkpoint_path = "./model_new"
    logger_path = "./logger"

    # Dataset locations (produced by the dataset-preparation cells).
    text_path = "./fastspeech2_dataset/train.txt"
    mel_path = "./fastspeech2_dataset/mels"
    alignment_path = "./fastspeech2_dataset/alignments"
    pitch_path = "./fastspeech2_dataset/pitch"
    energy_path = "./fastspeech2_dataset/energy"

    wandb_project = 'fastspeech2_example'

    # Cleaner names handed to text_to_sequence.
    text_cleaners = ['english_cleaners']

    # Use the first GPU when CUDA is available, otherwise fall back to the CPU.
    # Previously an auto-detected device was immediately overwritten with a
    # hard-coded 'cuda:0', which made the config unusable on CPU-only machines.
    # The value stays a plain string, as the effective value was before.
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

    batch_size = 16
    epochs = 2000
    n_warm_up_step = 4000

    learning_rate = 1e-3
    weight_decay = 1e-6
    grad_clip_thresh = 1.0
    decay_step = [500000, 1000000, 2000000]

    save_step = 3000
    log_step = 5
    clear_Time = 20

    # The DataLoader fetches batch_size * batch_expand_size samples at once and
    # the collate function splits them into batch_expand_size real batches.
    batch_expand_size = 32


# Module-level config instances; the model and data helpers defined in later
# cells read `model_config`, `mel_config` and `train_config` as globals.
mel_config = MelSpectrogramConfig()
model_config = FastSpeech2Config()
train_config = TrainConfig()

In [None]:
# NOTE(review): presumably required by the `text` package imported in the next cell; confirm.
!pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.3.7-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.7


In [None]:
from text import text_to_sequence


def pad_1D(inputs, PAD=0):
    """Right-pad 1-D numpy arrays to a common length and stack them.

    Args:
        inputs: sequence of 1-D arrays (each must support ``len`` and ``.shape``).
        PAD: constant used to fill the padded tail.

    Returns:
        A 2-D array with one row per input, as wide as the longest input.
    """
    target_len = max(len(item) for item in inputs)

    rows = []
    for item in inputs:
        missing = target_len - item.shape[0]
        rows.append(np.pad(item, (0, missing),
                           mode='constant',
                           constant_values=PAD))

    return np.stack(rows)


def pad_1D_tensor(inputs, PAD=0):
    """Right-pad 1-D tensors to a common length and stack them.

    Args:
        inputs: sequence of 1-D tensors.
        PAD: value written into the padded tail. It used to be accepted but
            silently ignored (the tail was always zero); it is now honoured.
            The default of 0 keeps the previous result.

    Returns:
        A 2-D tensor with one row per input, as wide as the longest input.
    """

    def pad_data(x, length, PAD):
        # Pad only on the right of the single dimension.
        return F.pad(x, (0, length - x.shape[0]), value=PAD)

    max_len = max((len(x) for x in inputs))
    padded = torch.stack([pad_data(x, max_len, PAD) for x in inputs])

    return padded


def pad_2D(inputs, maxlen=None):
    """Zero-pad 2-D numpy arrays along their first axis and stack them.

    Args:
        inputs: sequence of 2-D arrays sharing the same second dimension.
        maxlen: target number of rows; when falsy, the largest row count
            among ``inputs`` is used.

    Returns:
        A 3-D array of stacked, padded inputs.

    Raises:
        ValueError: if an input has more rows than the target length.
    """

    def _pad_rows(arr, target_rows):
        n_rows = np.shape(arr)[0]
        if n_rows > target_rows:
            raise ValueError("not max_len")

        n_cols = np.shape(arr)[1]
        # np.pad with a single (before, after) pair pads every axis, so the
        # extra columns are cut off again right after.
        grown = np.pad(arr, (0, target_rows - n_rows),
                       mode='constant',
                       constant_values=0)
        return grown[:, :n_cols]

    target = maxlen if maxlen else max(np.shape(x)[0] for x in inputs)
    return np.stack([_pad_rows(x, target) for x in inputs])


def pad_2D_tensor(inputs, maxlen=None):
    """Zero-pad 2-D tensors along their first dimension and stack them.

    Args:
        inputs: sequence of 2-D tensors sharing the same second dimension.
        maxlen: target number of rows; when falsy, the largest row count
            among ``inputs`` is used.

    Returns:
        A 3-D tensor of stacked, padded inputs.

    Raises:
        ValueError: if an input has more rows than the target length.
    """

    def _pad_rows(tensor, target_rows):
        n_rows = tensor.size(0)
        if n_rows > target_rows:
            raise ValueError("not max_len")

        n_cols = tensor.size(1)
        # (0, 0) leaves the last dim alone, (0, k) appends k rows of zeros.
        grown = F.pad(tensor, (0, 0, 0, target_rows - n_rows))
        return grown[:, :n_cols]

    target = maxlen if maxlen else max(t.size(0) for t in inputs)
    return torch.stack([_pad_rows(t, target) for t in inputs])


def process_text(train_text_path):
    """Read a UTF-8 text file and return its lines as a list.

    Line terminators are kept on every line that has one.
    """
    with open(train_text_path, "r", encoding="utf-8") as handle:
        return list(handle)


def get_data_to_buffer(train_config):
    """Load every training sample into memory as a list of tensor dicts.

    For line ``i`` of the text file the matching files are
    ``ljspeech-mel-%05d.npy`` (index ``i + 1``) under ``mel_path`` and
    ``<i>.npy`` under ``alignment_path``, ``pitch_path`` and ``energy_path``.
    The first two values of a pitch file are read as mean and std, the rest
    as the pitch contour.

    Args:
        train_config: object providing ``text_path``, ``mel_path``,
            ``alignment_path``, ``pitch_path``, ``energy_path`` and
            ``text_cleaners``.

    Returns:
        A list with one dict per sample holding the keys ``text``,
        ``duration``, ``pitch``, ``pitch_mean``, ``pitch_std``, ``energy``
        and ``mel_target``.
    """
    buffer = list()
    text = process_text(train_config.text_path)

    start = time.perf_counter()
    for i in tqdm(range(len(text))):

        mel_gt_name = os.path.join(
            train_config.mel_path, "ljspeech-mel-%05d.npy" % (i+1))
        mel_gt_target = np.load(mel_gt_name)
        duration = np.load(os.path.join(
            train_config.alignment_path, str(i)+".npy"))
        pitch = np.load(os.path.join(
            train_config.pitch_path, str(i)+".npy"))
        # Header of the pitch file: [mean, std], kept as 1-element arrays.
        pitch_mean = np.array([pitch[0]])
        pitch_std = np.array([pitch[1]])
        pitch = pitch[2:]
        energy = np.load(os.path.join(
            train_config.energy_path, str(i)+".npy"))

        # Drop only the line terminator. Blindly cutting the last character
        # would remove a real character from a final line that has no
        # trailing newline.
        character = text[i].rstrip("\n")
        character = np.array(
            text_to_sequence(character, train_config.text_cleaners))

        character = torch.from_numpy(character)
        duration = torch.from_numpy(duration)
        pitch = torch.from_numpy(pitch)
        pitch_mean = torch.from_numpy(pitch_mean)
        pitch_std = torch.from_numpy(pitch_std)
        energy = torch.from_numpy(energy)
        mel_gt_target = torch.from_numpy(mel_gt_target)

        buffer.append({"text": character,
                       "duration": duration,
                       "pitch": pitch,
                       "pitch_mean": pitch_mean,
                       "pitch_std": pitch_std,
                       "energy": energy,
                       "mel_target": mel_gt_target})

    end = time.perf_counter()
    print("cost {:.2f}s to load all data into buffer.".format(end-start))

    return buffer


class BufferDataset(Dataset):
    """Dataset view over an in-memory list of already prepared samples."""

    def __init__(self, buffer):
        # The length is captured once, at construction time.
        self.buffer = buffer
        self.length_dataset = len(buffer)

    def __len__(self):
        """Return the number of samples recorded at construction."""
        return self.length_dataset

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        sample = self.buffer[idx]
        return sample


def reprocess_tensor(batch, cut_list):
    """Build one padded training batch from selected samples.

    Args:
        batch: indexable collection of sample dicts (keys ``text``,
            ``mel_target``, ``duration``, ``pitch``, ``pitch_mean``,
            ``pitch_std``, ``energy``).
        cut_list: indices into ``batch`` selecting the samples to combine.

    Returns:
        A dict with the padded tensors, 1-based position indices for the
        text (``src_pos``) and mel (``mel_pos``) sequences with 0 in padded
        slots, and the longest mel length (``mel_max_len``).
    """

    def _field(key):
        return [batch[ind][key] for ind in cut_list]

    def _position_ids(sequences):
        # One row per sequence: 1..length followed by zeros up to the longest.
        lengths = [int(seq.size(0)) for seq in sequences]
        longest = int(max(lengths))
        rows = [np.pad(list(range(1, n + 1)), (0, longest - n), 'constant')
                for n in lengths]
        return torch.from_numpy(np.array(rows)), longest

    texts = _field("text")
    mel_targets = _field("mel_target")
    durations = _field("duration")
    pitches = _field("pitch")
    pitch_means = _field("pitch_mean")
    pitch_stds = _field("pitch_std")
    energies = _field("energy")

    src_pos, _ = _position_ids(texts)
    mel_pos, max_mel_len = _position_ids(mel_targets)

    return {"text": pad_1D_tensor(texts),
            "mel_target": pad_2D_tensor(mel_targets),
            "duration": pad_1D_tensor(durations),
            "pitch": pad_1D_tensor(pitches),
            "pitch_mean": torch.tensor(pitch_means),
            "pitch_std": torch.tensor(pitch_stds),
            "energy": pad_1D_tensor(energies),
            "mel_pos": mel_pos,
            "src_pos": src_pos,
            "mel_max_len": max_mel_len}


def collate_fn_tensor(batch):
    """Split an oversized loader batch into several length-sorted real batches.

    Samples are ordered by descending text length and cut into
    ``train_config.batch_expand_size`` consecutive groups of equal size;
    each group is turned into a padded batch by ``reprocess_tensor``.

    Returns:
        A list with ``train_config.batch_expand_size`` batch dicts.
    """
    text_lengths = np.array([sample["text"].size(0) for sample in batch])
    longest_first = np.argsort(-text_lengths)

    n_groups = train_config.batch_expand_size
    group_size = len(batch) // n_groups

    groups = [longest_first[g * group_size:(g + 1) * group_size]
              for g in range(n_groups)]

    return [reprocess_tensor(batch, group) for group in groups]

  return s in _symbol_to_id and s is not '_' and s is not '~'
  return s in _symbol_to_id and s is not '_' and s is not '~'


In [None]:
# Load the whole dataset into memory and wrap it in a Dataset.
buffer = get_data_to_buffer(train_config)

dataset = BufferDataset(buffer)

# The loader fetches batch_expand_size * batch_size samples per step;
# collate_fn_tensor then sorts them by text length and splits them into
# batch_expand_size real batches, so each loader item is a list of batches.
# drop_last=True discards a final incomplete chunk.
training_loader = DataLoader(
    dataset,
    batch_size=train_config.batch_expand_size * train_config.batch_size,
    shuffle=True,
    collate_fn=collate_fn_tensor,
    drop_last=True,
    num_workers=0
)

100%|██████████| 13100/13100 [01:00<00:00, 217.43it/s]

cost 60.26s to load all data into buffer.





In [None]:
# Take the first real batch out of the first loader item (a list of batches)
# and display the shape of its padded text tensor.
batch = next(iter(training_loader))[0]
batch['text'].shape

torch.Size([16, 182])

In [None]:
class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention.

    Args:
        temperature: divisor applied to the raw ``q @ k^T`` scores.
        attn_dropout: dropout probability applied to the attention weights
            before they are multiplied with ``v``.
    """

    def __init__(self, temperature, attn_dropout=0.1):
        super().__init__()
        self.temperature = temperature
        self.dropout = nn.Dropout(attn_dropout)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, q, k, v, mask=None):
        """Compute attention.

        Args:
            q, k, v: tensors of shape
                [(batch_size * n_heads) x seq_len x hidden_size].
            mask: optional boolean tensor broadcastable to the score matrix;
                ``True`` marks key positions that must not be attended to.

        Returns:
            ``(output, attn)``: the attended values and the attention weights
            (weights are returned before dropout).
        """
        scores = (q @ k.transpose(-1, -2)) / self.temperature

        if mask is not None:
            # Mask BEFORE the softmax so the remaining weights of every row
            # still sum to one. Multiplying by ~mask after the softmax merely
            # zeroed entries and left the row un-normalized.
            # A large finite negative is used instead of -inf so that a row
            # with every key masked cannot produce NaNs.
            scores = scores.masked_fill(mask, torch.finfo(scores.dtype).min)

        attn = self.softmax(scores)
        # attn: [ (batch_size * n_heads) x seq_len x seq_len ]

        if mask is not None:
            # Guarantees exact zeros on masked keys, including the degenerate
            # fully-masked row (which therefore stays all-zero as before).
            attn = attn.masked_fill(mask, 0.0)

        output = self.dropout(attn) @ v

        # output: [ (batch_size * n_heads) x seq_len x hidden_size ]
        return output, attn


class MultiHeadAttention(nn.Module):
    """Multi-head attention with a residual connection and layer normalization.

    Args:
        n_head: number of attention heads.
        d_model: width of the input and output features.
        d_k: per-head width of queries and keys.
        d_v: per-head width of values.
        dropout: dropout probability applied to the output projection.
    """

    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super().__init__()

        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v
        self.d_model = d_model

        # Query / key / value projections for all heads at once.
        self.w_qs = nn.Linear(d_model, n_head * d_k)
        self.w_ks = nn.Linear(d_model, n_head * d_k)
        self.w_vs = nn.Linear(d_model, n_head * d_v)

        # Scores are divided by sqrt(d_k).
        self.attention = ScaledDotProductAttention(
            temperature=d_k**0.5)
        self.layer_norm = nn.LayerNorm(d_model)

        # Output projection back to d_model.
        self.fc = nn.Linear(n_head * d_v, d_model)
        nn.init.xavier_normal_(self.fc.weight)

        self.dropout = nn.Dropout(dropout)

        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialize the q/k/v projection weights from a normal distribution."""
        # Normal initialization is used here instead of PyTorch's default
        # (kaiming) initialization of nn.Linear.
        nn.init.normal_(self.w_qs.weight, mean=0,
                        std=np.sqrt(2.0 / (self.d_model + self.d_k)))
        nn.init.normal_(self.w_ks.weight, mean=0,
                        std=np.sqrt(2.0 / (self.d_model + self.d_k)))
        nn.init.normal_(self.w_vs.weight, mean=0,
                        std=np.sqrt(2.0 / (self.d_model + self.d_v)))

    def forward(self, q, k, v, mask=None):
        """Apply multi-head attention.

        Args:
            q, k, v: 3-D tensors (batch x length x d_model).
            mask: optional 3-D mask; it is repeated ``n_head`` times along
                the first dimension and handed to the attention module.

        Returns:
            ``(output, attn)``: the normalized output (batch x len_q x
            d_model) and the attention weights returned by the attention
            module.
        """
        d_k, d_v, n_head = self.d_k, self.d_v, self.n_head

        sz_b, len_q, _ = q.size()
        sz_b, len_k, _ = k.size()
        sz_b, len_v, _ = v.size()

        residual = q

        # Project and split the feature dimension into heads.
        q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
        k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
        v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)

        # Fold the head dimension into the batch dimension (head-major order).
        q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_q, d_k)  # (n*b) x lq x dk
        k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_k, d_k)  # (n*b) x lk x dk
        v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_v, d_v)  # (n*b) x lv x dv

        if mask is not None:
            mask = mask.repeat(n_head, 1, 1)  # (n*b) x .. x ..
        output, attn = self.attention(q, k, v, mask=mask)

        # Undo the head folding and concatenate the heads feature-wise.
        output = output.view(n_head, sz_b, len_q, d_v)
        output = output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_q, -1)  # b x lq x (n*dv)

        output = self.dropout(self.fc(output))
        output = self.layer_norm(output + residual)

        return output, attn


class PositionwiseFeedForward(nn.Module):
    """Two-layer convolutional feed-forward module with residual and LayerNorm.

    Kernel sizes and paddings of the two Conv1d layers are read from the
    global ``model_config`` (``fft_conv1d_kernel`` / ``fft_conv1d_padding``).

    Args:
        d_in: input and output feature width.
        d_hid: hidden feature width between the two convolutions.
        dropout: dropout probability applied before the residual sum.
    """

    def __init__(self, d_in, d_hid, dropout=0.1):
        super().__init__()

        kernels = model_config.fft_conv1d_kernel
        paddings = model_config.fft_conv1d_padding

        # Expansion convolution followed by the projection back to d_in.
        self.w_1 = nn.Conv1d(d_in, d_hid,
                             kernel_size=kernels[0], padding=paddings[0])
        self.w_2 = nn.Conv1d(d_hid, d_in,
                             kernel_size=kernels[1], padding=paddings[1])

        self.layer_norm = nn.LayerNorm(d_in)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply conv -> ReLU -> conv, dropout, residual add and LayerNorm."""
        # Conv1d works on the second-to-last axis, so swap dims 1 and 2
        # before the convolutions and swap them back afterwards.
        hidden = F.relu(self.w_1(x.transpose(1, 2)))
        projected = self.w_2(hidden).transpose(1, 2)
        return self.layer_norm(self.dropout(projected) + x)


class FFTBlock(torch.nn.Module):
    """Self-attention followed by a position-wise feed-forward module.

    Args:
        d_model: feature width of the block.
        d_inner: hidden width of the feed-forward module.
        n_head: number of attention heads.
        d_k: per-head query/key width.
        d_v: per-head value width.
        dropout: dropout probability passed to both sub-modules.
    """

    def __init__(self,
                 d_model,
                 d_inner,
                 n_head,
                 d_k,
                 d_v,
                 dropout=0.1):
        super().__init__()
        self.slf_attn = MultiHeadAttention(
            n_head, d_model, d_k, d_v, dropout=dropout)
        self.pos_ffn = PositionwiseFeedForward(
            d_model, d_inner, dropout=dropout)

    def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
        """Run the block.

        Returns:
            ``(output, attention_weights)``. When ``non_pad_mask`` is given,
            the result of each sub-module is multiplied by it in place.
        """
        use_mask = non_pad_mask is not None

        attended, attn_weights = self.slf_attn(
            enc_input, enc_input, enc_input, mask=slf_attn_mask)
        if use_mask:
            attended.mul_(non_pad_mask)

        transformed = self.pos_ffn(attended)
        if use_mask:
            transformed.mul_(non_pad_mask)

        return transformed, attn_weights

In [None]:
def create_alignment(base_mat, duration_predictor_output):
    """Fill ``base_mat`` with a 0/1 token-to-frame expansion matrix, in place.

    For every item ``i`` the tokens are laid out one after another along the
    second axis: token ``j`` with duration ``d`` sets
    ``base_mat[i][start:start + d][j]`` to 1, where ``start`` is the sum of
    the durations of the preceding tokens.

    Args:
        base_mat: array indexed as [item][frame][token]; modified in place.
        duration_predictor_output: 2-D integer array [item][token].

    Returns:
        The same ``base_mat`` object.
    """
    n_items, n_tokens = duration_predictor_output.shape
    for item in range(n_items):
        frame = 0
        for token in range(n_tokens):
            span = duration_predictor_output[item][token]
            for offset in range(span):
                base_mat[item][frame + offset][token] = 1
            frame = frame + span
    return base_mat


class Transpose(nn.Module):
    """Module wrapper that swaps two dimensions of its input.

    Lets a transpose be placed inside ``nn.Sequential``.
    """

    def __init__(self, dim_1, dim_2):
        super().__init__()
        self.dim_1, self.dim_2 = dim_1, dim_2

    def forward(self, x):
        """Return ``x`` with ``dim_1`` and ``dim_2`` exchanged."""
        return torch.transpose(x, self.dim_1, self.dim_2)


class DurationPredictor(nn.Module):
    """Duration predictor: two Conv1d + ReLU + LayerNorm + Dropout stages
    followed by a linear layer that produces one value per position.

    Args:
        model_config: configuration providing ``encoder_dim``,
            ``duration_predictor_filter_size``,
            ``duration_predictor_kernel_size`` and ``dropout``.
    """

    def __init__(self, model_config: FastSpeech2Config):
        super(DurationPredictor, self).__init__()

        self.input_size = model_config.encoder_dim
        self.filter_size = model_config.duration_predictor_filter_size
        self.kernel = model_config.duration_predictor_kernel_size
        self.conv_output_size = model_config.duration_predictor_filter_size
        self.dropout = model_config.dropout

        # Conv1d expects channels on the second-to-last axis while LayerNorm
        # normalizes the last axis, hence the Transpose around each conv.
        self.conv_net = nn.Sequential(
            Transpose(-1, -2),
            nn.Conv1d(
                self.input_size, self.filter_size,
                kernel_size=self.kernel, padding=1
            ),
            Transpose(-1, -2),
            nn.ReLU(),
            nn.LayerNorm(self.filter_size),
            nn.Dropout(self.dropout),
            Transpose(-1, -2),
            nn.Conv1d(
                self.filter_size, self.filter_size,
                kernel_size=self.kernel, padding=1
            ),
            Transpose(-1, -2),
            nn.ReLU(),
            nn.LayerNorm(self.filter_size),
            nn.Dropout(self.dropout)
        )

        self.linear_layer = nn.Linear(self.conv_output_size, 1)

    def forward(self, encoder_output):
        """Predict one value per input position.

        Args:
            encoder_output: tensor of shape (batch, length, encoder_dim).

        Returns:
            Tensor of shape (batch, length).
        """
        encoder_output = self.conv_net(encoder_output)

        out = self.linear_layer(encoder_output)
        # Remove only the trailing singleton feature axis. A bare squeeze()
        # also collapsed the batch or length axis whenever it had size 1, and
        # the compensating eval-only unsqueeze(0) produced a (1, batch, length)
        # tensor for eval batches larger than one.
        return out.squeeze(-1)


class LengthRegulator(nn.Module):
    """Length regulator: repeats each input position according to a duration
    and owns the duration predictor that estimates those durations."""

    def __init__(self, model_config):
        super(LengthRegulator, self).__init__()
        self.duration_predictor = DurationPredictor(model_config)

    def LR(self, x, duration_predictor_output, mel_max_length=None):
        """Expand ``x`` along its length axis according to integer durations.

        Args:
            x: tensor multiplied by the alignment matrix on the left
                (assumed (batch, length, dim) — TODO confirm against callers).
            duration_predictor_output: integer tensor (batch, length).
            mel_max_length: if truthy, the expanded output is zero-padded
                along dim 1 up to this length.

        Returns:
            The expanded tensor ``alignment @ x``.
        """
        # Longest expanded length in the batch.
        expand_max_len = torch.max(
            torch.sum(duration_predictor_output, -1), -1)[0]
        # The 0/1 alignment matrix is built on the CPU in numpy and moved to
        # the device of x afterwards.
        alignment = torch.zeros(duration_predictor_output.size(0),
                                expand_max_len,
                                duration_predictor_output.size(1)).numpy()
        alignment = create_alignment(alignment,
                                     duration_predictor_output.cpu().numpy())
        alignment = torch.from_numpy(alignment).to(x.device)

        output = alignment @ x
        if mel_max_length:
            # Pad only the end of dim 1 (the expanded length axis).
            output = F.pad(
                output, (0, 0, 0, mel_max_length-output.size(1), 0, 0))
        return output

    def forward(self, x, alpha=1.0, target=None, mel_max_length=None):
        """Predict durations and expand ``x``.

        Args:
            x: input passed to the duration predictor and to ``LR``.
            alpha: multiplier applied to the predicted durations and to
                ``target`` when it is given.
            target: optional durations used for the expansion instead of the
                predicted ones.
            mel_max_length: forwarded to ``LR``.

        Returns:
            ``(output, duration_predictor_output, durations)``: the expanded
            tensor, the rounded integer durations used for the expansion, and
            ``exp(predictor output) * alpha``.
        """
        # The exponential is applied to the raw predictor output.
        durations = torch.exp(self.duration_predictor(x)) * alpha
        if target is not None:
            # Right-pad the target when its dim-1 size differs from the
            # predictor's dim-1 size.
            if target.shape[1] != durations.shape[1]:
                new_target = F.pad(target, (0, durations.shape[-1] - target.shape[-1])) * alpha
            else:
                new_target = target * alpha
            duration_predictor_output = torch.round(new_target).int()
        else:
            duration_predictor_output = torch.round(durations).int()
        output = self.LR(x, duration_predictor_output, mel_max_length)
        return output, duration_predictor_output, durations

In [None]:
def get_non_pad_mask(seq):
    """Return a float mask (1.0 = not padding) with a trailing singleton axis.

    ``seq`` must be 2-D; positions equal to ``model_config.PAD`` map to 0.0.
    """
    assert seq.dim() == 2
    is_real = seq.ne(model_config.PAD)
    return is_real.type(torch.float).unsqueeze(-1)

def get_attn_key_pad_mask(seq_k, seq_q):
    """Build the boolean mask that hides padded key positions.

    Positions of ``seq_k`` equal to ``model_config.PAD`` are marked ``True``;
    the row is then repeated for every query position.

    Returns:
        Boolean tensor of shape b x lq x lk.
    """
    n_queries = seq_q.size(1)
    key_is_pad = seq_k.eq(model_config.PAD)
    return key_is_pad.unsqueeze(1).expand(-1, n_queries, -1)  # b x lq x lk


class Encoder(nn.Module):
    """Token encoder: token + position embeddings followed by FFT blocks.

    Args:
        model_config: configuration providing ``vocab_size``, ``max_seq_len``,
            ``PAD``, ``dropout`` and the ``encoder_*`` settings.
    """

    def __init__(self, model_config):
        super().__init__()

        width = model_config.encoder_dim
        head_width = width // model_config.encoder_head

        self.src_word_emb = nn.Embedding(
            model_config.vocab_size, width, padding_idx=model_config.PAD)

        # One extra row because position 0 is reserved for padding.
        self.position_enc = nn.Embedding(
            model_config.max_seq_len + 1, width, padding_idx=model_config.PAD)

        blocks = []
        for _ in range(model_config.encoder_n_layer):
            blocks.append(FFTBlock(
                width,
                model_config.encoder_conv1d_filter_size,
                model_config.encoder_head,
                head_width,
                head_width,
                dropout=model_config.dropout))
        self.layer_stack = nn.ModuleList(blocks)

    def forward(self, src_seq, src_pos, return_attns=False):
        """Encode a batch of token ids.

        Args:
            src_seq: 2-D tensor of token ids (PAD marks padding).
            src_pos: position indices matching ``src_seq``.
            return_attns: when true, attention maps are collected locally;
                they are not part of the return value.

        Returns:
            ``(enc_output, non_pad_mask)``.
        """
        attn_maps = []

        key_pad_mask = get_attn_key_pad_mask(seq_k=src_seq, seq_q=src_seq)
        keep_mask = get_non_pad_mask(src_seq)

        hidden = self.src_word_emb(src_seq) + self.position_enc(src_pos)

        for block in self.layer_stack:
            hidden, attn = block(hidden,
                                 non_pad_mask=keep_mask,
                                 slf_attn_mask=key_pad_mask)
            if return_attns:
                attn_maps.append(attn)

        return hidden, keep_mask


class Decoder(nn.Module):
    """Decoder: adds position embeddings to its input and applies FFT blocks.

    Args:
        model_config: configuration providing ``max_seq_len``, ``PAD``,
            ``dropout``, ``encoder_dim`` and the ``decoder_*`` settings.
    """

    def __init__(self, model_config):

        super(Decoder, self).__init__()

        len_max_seq=model_config.max_seq_len
        n_position = len_max_seq + 1
        n_layers = model_config.decoder_n_layer

        # The position embedding is added to the incoming sequence, so its
        # width follows the width of that input (encoder_dim).
        self.position_enc = nn.Embedding(
            n_position,
            model_config.encoder_dim,
            padding_idx=model_config.PAD,
        )

        # The FFT stack is configured from the decoder_* settings. It used to
        # read the encoder_* values (copy-paste from Encoder), which silently
        # ignored any decoder-specific configuration. With the current config
        # both sets are identical, so existing behavior is unchanged.
        self.layer_stack = nn.ModuleList([FFTBlock(
            model_config.decoder_dim,
            model_config.decoder_conv1d_filter_size,
            model_config.decoder_head,
            model_config.decoder_dim // model_config.decoder_head,
            model_config.decoder_dim // model_config.decoder_head,
            dropout=model_config.dropout
        ) for _ in range(n_layers)])

    def forward(self, enc_seq, enc_pos, return_attns=False):
        """Decode a batch of expanded features.

        Args:
            enc_seq: input features.
            enc_pos: 2-D position indices (PAD marks padding); both masks are
                derived from it.
            return_attns: when true, attention maps are collected locally;
                they are not part of the return value.

        Returns:
            The output of the last FFT block.
        """

        dec_slf_attn_list = []

        # -- Prepare masks
        slf_attn_mask = get_attn_key_pad_mask(seq_k=enc_pos, seq_q=enc_pos)
        non_pad_mask = get_non_pad_mask(enc_pos)

        # -- Forward
        dec_output = enc_seq + self.position_enc(enc_pos)

        for dec_layer in self.layer_stack:
            dec_output, dec_slf_attn = dec_layer(
                dec_output,
                non_pad_mask=non_pad_mask,
                slf_attn_mask=slf_attn_mask)
            if return_attns:
                dec_slf_attn_list += [dec_slf_attn]

        return dec_output

In [None]:
class PitchPredictor(nn.Module):
    """Pitch predictor: a two-stage conv network with three linear heads —
    one per-position value plus a per-sequence mean and std.

    Args:
        model_config: configuration providing ``encoder_dim``,
            ``pitch_predictor_filter_size``, ``pitch_predictor_kernel_size``
            and ``dropout``.
    """

    def __init__(self, model_config: FastSpeech2Config):
        super(PitchPredictor, self).__init__()

        self.input_size = model_config.encoder_dim
        self.filter_size = model_config.pitch_predictor_filter_size
        self.kernel = model_config.pitch_predictor_kernel_size
        self.conv_output_size = model_config.pitch_predictor_filter_size
        self.dropout = model_config.dropout

        # Conv1d expects channels on the second-to-last axis while LayerNorm
        # normalizes the last axis, hence the Transpose around each conv.
        self.conv_net = nn.Sequential(
            Transpose(-1, -2),
            nn.Conv1d(
                self.input_size, self.filter_size,
                kernel_size=self.kernel, padding=1
            ),
            Transpose(-1, -2),
            nn.ReLU(),
            nn.LayerNorm(self.filter_size),
            nn.Dropout(self.dropout),
            Transpose(-1, -2),
            nn.Conv1d(
                self.filter_size, self.filter_size,
                kernel_size=self.kernel, padding=1
            ),
            Transpose(-1, -2),
            nn.ReLU(),
            nn.LayerNorm(self.filter_size),
            nn.Dropout(self.dropout)
        )

        self.linear_layer = nn.Linear(self.conv_output_size, 1)
        self.mean_pred = nn.Linear(self.conv_output_size, 1)
        self.std_pred = nn.Linear(self.conv_output_size, 1)

    def forward(self, encoder_output):
        """Predict the per-position pitch and the per-sequence mean and std.

        Args:
            encoder_output: tensor of shape (batch, length, encoder_dim).

        Returns:
            ``(out, mean, std)`` with shapes (batch, length), (batch, 1)
            and (batch, 1).
        """
        features = self.conv_net(encoder_output)

        # Remove only the trailing singleton feature axis. A bare squeeze()
        # also collapsed the batch or length axis whenever it had size 1, and
        # the compensating eval-only unsqueeze(0) produced a (1, batch, length)
        # tensor for eval batches larger than one.
        out = self.linear_layer(features).squeeze(-1)

        # Average over the length axis once and share it between both heads.
        pooled = features.mean(1)
        mean = self.mean_pred(pooled)
        std = self.std_pred(pooled)
        return out, mean, std


class EnergyPredictor(nn.Module):
    """Energy predictor: two Conv1d + ReLU + LayerNorm + Dropout stages
    followed by a linear layer that produces one value per position.

    Args:
        model_config: configuration providing ``encoder_dim``,
            ``energy_predictor_filter_size``, ``energy_predictor_kernel_size``
            and ``dropout``.
    """

    def __init__(self, model_config: FastSpeech2Config):
        super(EnergyPredictor, self).__init__()

        # Read the energy_* settings. The pitch_* settings were used here
        # before (copy-paste from PitchPredictor), so the dedicated energy
        # configuration had no effect. Both sets hold the same values in the
        # current config, so existing behavior is unchanged.
        self.input_size = model_config.encoder_dim
        self.filter_size = model_config.energy_predictor_filter_size
        self.kernel = model_config.energy_predictor_kernel_size
        self.conv_output_size = model_config.energy_predictor_filter_size
        self.dropout = model_config.dropout

        # Conv1d expects channels on the second-to-last axis while LayerNorm
        # normalizes the last axis, hence the Transpose around each conv.
        self.conv_net = nn.Sequential(
            Transpose(-1, -2),
            nn.Conv1d(
                self.input_size, self.filter_size,
                kernel_size=self.kernel, padding=1
            ),
            Transpose(-1, -2),
            nn.ReLU(),
            nn.LayerNorm(self.filter_size),
            nn.Dropout(self.dropout),
            Transpose(-1, -2),
            nn.Conv1d(
                self.filter_size, self.filter_size,
                kernel_size=self.kernel, padding=1
            ),
            Transpose(-1, -2),
            nn.ReLU(),
            nn.LayerNorm(self.filter_size),
            nn.Dropout(self.dropout)
        )

        self.linear_layer = nn.Linear(self.conv_output_size, 1)

    def forward(self, encoder_output):
        """Predict one energy value per input position.

        Args:
            encoder_output: tensor of shape (batch, length, encoder_dim).

        Returns:
            Tensor of shape (batch, length).
        """
        encoder_output = self.conv_net(encoder_output)

        out = self.linear_layer(encoder_output)
        # Remove only the trailing singleton feature axis. A bare squeeze()
        # also collapsed the batch or length axis whenever it had size 1, and
        # the compensating eval-only unsqueeze(0) produced a (1, batch, length)
        # tensor for eval batches larger than one.
        return out.squeeze(-1)

In [None]:
def get_mask_from_lengths(lengths, max_len=None):
    """Build a boolean mask that is True for positions below each length.

    Args:
        lengths: 1-D tensor of sequence lengths.
        max_len: width of the mask; defaults to the largest length.

    Returns:
        Boolean tensor of shape (len(lengths), max_len), created on the
        device of ``lengths``.
    """
    if max_len is None:
        max_len = torch.max(lengths).item()

    positions = torch.arange(0, max_len, 1, device=lengths.device)
    return (positions < lengths.unsqueeze(1)).bool()


class FastSpeech2(nn.Module):
    """FastSpeech2-style acoustic model.

    Pipeline, as wired in ``forward``: encoder -> length regulator -> pitch
    and energy embeddings added to the expanded sequence -> decoder -> linear
    projection to mel bins.
    """

    def __init__(self, model_config):
        """Create all sub-modules and the pitch/energy quantisation edges.

        Args:
            model_config: configuration object; besides being forwarded to the
                sub-modules, the fields read here are ``encoder_dim``,
                ``decoder_dim``, ``pitch_n_emb``, ``min_pitch``, ``max_pitch``,
                ``energy_n_emb``, ``min_energy`` and ``max_energy``.
        """
        super(FastSpeech2, self).__init__()

        self.encoder = Encoder(model_config)

        self.length_regulator = LengthRegulator(model_config)

        self.pitch_predictor = PitchPredictor(model_config)
        self.pitch_embed = nn.Embedding(model_config.pitch_n_emb,
                                        model_config.encoder_dim)
        # n_emb + 1 evenly spaced edges delimit n_emb buckets.  Registered as
        # non-persistent buffers so they follow model.to(device) while the
        # state_dict layout (and therefore existing checkpoints) is unchanged.
        self.register_buffer(
            "pitch_boundaries",
            torch.linspace(model_config.min_pitch,
                           model_config.max_pitch,
                           model_config.pitch_n_emb + 1).unsqueeze(0).unsqueeze(0),
            persistent=False)

        self.energy_predictor = EnergyPredictor(model_config)
        self.energy_embed = nn.Embedding(model_config.energy_n_emb,
                                         model_config.encoder_dim)
        self.register_buffer(
            "energy_boundaries",
            torch.linspace(model_config.min_energy,
                           model_config.max_energy,
                           model_config.energy_n_emb + 1).unsqueeze(0).unsqueeze(0),
            persistent=False)
        self.decoder = Decoder(model_config)

        # NOTE(review): the number of mel bins comes from the notebook-level
        # global `mel_config`, not from `model_config` -- the class cannot be
        # built before that global exists.
        self.mel_linear = nn.Linear(model_config.decoder_dim, mel_config.num_mels)

    @staticmethod
    def _bucketize(values, boundaries):
        """Map every value to the index of the bucket that contains it.

        Args:
            values: tensor of real values to quantise.
            boundaries: ascending bucket edges of shape ``(1, 1, n + 1)``
                describing ``n`` buckets.

        Returns:
            Long tensor of bucket indices in ``[0, n - 1]``; values below the
            first edge go to bucket 0 and values at or above the last edge go
            to bucket ``n - 1``.
        """
        boundaries = boundaries.to(values.device)
        n_buckets = boundaries.shape[-1] - 1
        # Number of edges that are not greater than the value, minus one.
        ids = (~(values.unsqueeze(-1) < boundaries)).sum(-1) - 1
        # Clamp on both sides: without the upper bound a value >= the last
        # edge would produce index n, which is out of range for the embedding.
        return ids.clamp(min=0, max=n_buckets - 1).long()

    @staticmethod
    def _frame_positions(lengths, max_len):
        """Return 1-based positions for the first ``lengths[b]`` frames, 0 after.

        Args:
            lengths: 1-D integer tensor with the frame count of every item.
            max_len: width of the returned position matrix.

        Returns:
            Long tensor of shape ``(len(lengths), max_len)``.
        """
        steps = torch.arange(1, max_len + 1, device=lengths.device)
        valid = steps <= lengths.unsqueeze(1)
        return valid.long() * steps

    def mask_tensor(self, mel_output, position, mel_max_length):
        """Zero every frame of ``mel_output`` that lies beyond its item's length.

        The length of each item is taken as the largest value in its row of
        ``position``.
        """
        lengths = torch.max(position, -1)[0]
        mask = ~get_mask_from_lengths(lengths, max_len=mel_max_length)
        mask = mask.unsqueeze(-1).expand(-1, -1, mel_output.size(-1))
        return mel_output.masked_fill(mask, 0.)

    def forward(self, src_seq, src_pos,
                mel_pos=None, mel_max_length=None,
                target_duration=None,
                target_pitch=None,
                target_pitch_mean=None,
                target_pitch_std=None,
                target_energy=None,
                alpha=1.0):
        """Run the model; ground-truth targets, when given, replace predictions.

        Args:
            src_seq: input token ids, forwarded to the encoder.
            src_pos: input positions, forwarded to the encoder.
            mel_pos: target frame positions; used to mask the output when
                ``target_duration`` is given.
            mel_max_length: frame-axis length used for masking and forwarded
                to the length regulator.
            target_duration: ground-truth durations; its presence marks a
                training call.
            target_pitch, target_pitch_mean, target_pitch_std: normalised
                ground-truth pitch and its statistics; when ``target_pitch``
                is given they drive the pitch embedding.
            target_energy: ground-truth energy; when given it drives the
                energy embedding.
            alpha: forwarded to the length regulator.

        Returns:
            Tuple ``(dec_output, durations, pitch, mean, std, energy)``: the
            masked mel prediction, the durations reported by the length
            regulator, and the raw pitch / pitch-mean / pitch-std / energy
            predictions.
        """
        enc_output, non_pad_mask = self.encoder(src_seq, src_pos)

        lr_output, duration_predictor_output, durations = self.length_regulator(
            enc_output, alpha, target_duration, mel_max_length)

        # Positions 1..n for the n expanded frames of every item, 0 for the
        # padding, built in one vectorised op instead of a per-item numpy loop.
        frame_counts = duration_predictor_output.sum(-1).long().to(lr_output.device)
        new_enc_pos = self._frame_positions(frame_counts, lr_output.shape[1])

        pitch, mean, std = self.pitch_predictor(lr_output)
        if target_pitch is not None:
            pitch_values = target_pitch * target_pitch_std.unsqueeze(-1) + \
                target_pitch_mean.unsqueeze(-1)
        else:
            pitch_values = pitch * std.unsqueeze(-1) + mean.unsqueeze(-1)
        pitch_ids = self._bucketize(pitch_values, self.pitch_boundaries)
        pitch_output = self.pitch_embed(pitch_ids)
        pitch_output = self.mask_tensor(pitch_output, new_enc_pos, mel_max_length)

        energy = self.energy_predictor(lr_output)
        # The original code read the non-existent `self._boundaries` on the
        # target branch, raising AttributeError whenever target_energy was set.
        energy_values = target_energy if target_energy is not None else energy
        energy_ids = self._bucketize(energy_values, self.energy_boundaries)
        energy_output = self.energy_embed(energy_ids)
        energy_output = self.mask_tensor(energy_output, new_enc_pos, mel_max_length)

        output = lr_output + pitch_output + energy_output

        dec_output = self.mel_linear(self.decoder(output, new_enc_pos))
        if target_duration is not None: # training
            dec_output = self.mask_tensor(dec_output, mel_pos, mel_max_length)
        else:
            dec_output = self.mask_tensor(dec_output, new_enc_pos, mel_max_length)
        return dec_output, durations, pitch, mean, std, energy

In [None]:
import torch
import torch.nn as nn


class FastSpeech2Loss(nn.Module):
    """Compute the six training losses of the model.

    The mel spectrogram uses an L1 loss; duration, pitch, pitch mean, pitch
    std and energy use MSE.  The terms are returned separately so the caller
    can log and weight them.
    """

    def __init__(self):
        super().__init__()
        self.mse_loss = nn.MSELoss()
        self.l1_loss = nn.L1Loss()

    @staticmethod
    def _align(predicted, target):
        """Give ``predicted`` the shape of ``target`` if only size-1 axes differ.

        ``nn.MSELoss`` broadcasts mismatched shapes, so a ``(B, 1)``
        prediction against a ``(B,)`` target would silently be averaged over a
        ``B x B`` matrix.  Any other mismatch is left untouched so that it
        still surfaces as a warning or an error.
        """
        if predicted.shape != target.shape:
            predicted_core = [size for size in predicted.shape if size != 1]
            target_core = [size for size in target.shape if size != 1]
            if predicted_core == target_core:
                return predicted.reshape(target.shape)
        return predicted

    def forward(self, mel, duration_predicted,
                pitch_predicted, pitch_mean_predicted, pitch_std_predicted,
                energy_predicted,
                mel_target, duration_target,
                pitch_target, pitch_mean_target, pitch_std_target,
                energy_target):
        """Return ``(mel, duration, pitch, pitch_mean, pitch_std, energy)`` losses.

        Each ``*_predicted`` tensor is compared with the matching ``*_target``
        tensor; ``duration_target`` is cast to float first.
        """
        duration_target = duration_target.float()

        mel_loss = self.l1_loss(mel, mel_target)
        pitch_loss = self.mse_loss(
            self._align(pitch_predicted, pitch_target), pitch_target)
        pitch_mean_loss = self.mse_loss(
            self._align(pitch_mean_predicted, pitch_mean_target), pitch_mean_target)
        pitch_std_loss = self.mse_loss(
            self._align(pitch_std_predicted, pitch_std_target), pitch_std_target)
        energy_loss = self.mse_loss(
            self._align(energy_predicted, energy_target), energy_target)
        duration_loss = self.mse_loss(
            self._align(duration_predicted, duration_target), duration_target)

        return mel_loss, duration_loss, pitch_loss, pitch_mean_loss, pitch_std_loss, energy_loss

In [None]:
# Install the Weights & Biases client; presumably required by
# wandb_writer.WanDBWriter imported in the next cell -- TODO confirm.
!pip install wandb

In [None]:
from torch.optim.lr_scheduler  import OneCycleLR
from wandb_writer import WanDBWriter

In [None]:
# Experiment logger configured from train_config; the training loop below
# calls its set_step() and add_scalar() methods on every step.
logger = WanDBWriter(train_config)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33makhmatovaanna2002[0m ([33mteam-from-wonderland[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Model (moved to the training device) and the criterion that yields the
# individual loss terms.
model = FastSpeech2(model_config).to(train_config.device)
fastspeech_loss = FastSpeech2Loss()

# Global optimisation-step counter; the training loop increments it.
current_step = 0

optimizer = torch.optim.AdamW(model.parameters(),
                              lr=train_config.learning_rate,
                              betas=(0.9, 0.98),
                              eps=1e-9)

# One-cycle schedule: cosine annealing with the first 10% of all steps spent
# warming up to max_lr.  Every loader item is expanded into
# batch_expand_size optimisation steps, hence the product below.
scheduler = OneCycleLR(
    optimizer,
    max_lr=train_config.learning_rate,
    epochs=train_config.epochs,
    steps_per_epoch=len(training_loader) * train_config.batch_expand_size,
    pct_start=0.1,
    anneal_strategy="cos",
)

In [None]:
# -p keeps the cell idempotent: re-running the notebook must not fail when
# the directory already exists.
!mkdir -p ./model_new

In [None]:
# Make sure the checkpoint directory exists up front: otherwise the first
# torch.save would fail only after save_step optimisation steps were spent.
if train_config.checkpoint_path:
    os.makedirs(train_config.checkpoint_path, exist_ok=True)

# The predictors branch on `self.training`, so do not rely on whatever mode an
# earlier cell left the model in.
model.train()

tqdm_bar = tqdm(total=train_config.epochs * len(training_loader) * train_config.batch_expand_size - current_step)


for epoch in range(train_config.epochs):
    for i, batchs in enumerate(training_loader):
        # Each loader item is a list of real batches; every one of them is a
        # separate optimisation step.
        for j, db in enumerate(batchs):
            current_step += 1
            tqdm_bar.update(1)

            logger.set_step(current_step)

            # Get Data
            character = db["text"].long().to(train_config.device)
            mel_target = db["mel_target"].float().to(train_config.device)
            duration = db["duration"].long().to(train_config.device)
            pitch = db["pitch"].float().to(train_config.device)
            pitch_mean = db["pitch_mean"].float().to(train_config.device)
            pitch_std = db["pitch_std"].float().to(train_config.device)
            energy = db["energy"].float().to(train_config.device)
            mel_pos = db["mel_pos"].long().to(train_config.device)
            src_pos = db["src_pos"].long().to(train_config.device)
            max_mel_len = db["mel_max_len"]

            # Forward (teacher forcing: every ground-truth target is passed in)
            preds = model(character,
                          src_pos,
                          mel_pos=mel_pos,
                          mel_max_length=max_mel_len,
                          target_duration=duration,
                          target_pitch=pitch,
                          target_pitch_mean=pitch_mean,
                          target_pitch_std=pitch_std,
                          target_energy=energy,
                          alpha=1.0)

            mel_pred, durations_pred, pitch_pred, pitch_mean_pred, pitch_std_pred, energy_pred = preds

            # Calc Loss
            losses = fastspeech_loss(mel_pred,
                                     durations_pred,
                                     pitch_pred,
                                     pitch_mean_pred,
                                     pitch_std_pred,
                                     energy_pred,
                                     mel_target,
                                     duration,
                                     pitch,
                                     pitch_mean,
                                     pitch_std,
                                     energy)

            mel_loss, duration_loss, pitch_loss, pitch_mean_loss, pitch_std_loss, energy_loss = losses

            # All terms are summed with equal weight.
            total_loss = mel_loss + duration_loss + pitch_loss + pitch_mean_loss + pitch_std_loss + energy_loss

            # Logger
            t_l = total_loss.detach().cpu().numpy()
            m_l = mel_loss.detach().cpu().numpy()
            d_l = duration_loss.detach().cpu().numpy()
            p_l = pitch_loss.detach().cpu().numpy()
            pm_l = pitch_mean_loss.detach().cpu().numpy()
            ps_l = pitch_std_loss.detach().cpu().numpy()
            e_l = energy_loss.detach().cpu().numpy()

            logger.add_scalar("duration_loss", d_l)
            logger.add_scalar("mel_loss", m_l)
            logger.add_scalar("total_loss", t_l)
            logger.add_scalar("pitch_loss", p_l)
            logger.add_scalar("pitch_mean_loss", pm_l)
            logger.add_scalar("pitch_std_loss", ps_l)
            logger.add_scalar("energy_loss", e_l)

            # Backward
            total_loss.backward()

            # Clipping gradients to avoid gradient explosion
            nn.utils.clip_grad_norm_(
                model.parameters(), train_config.grad_clip_thresh)

            optimizer.step()
            optimizer.zero_grad(set_to_none=True)
            # The one-cycle schedule is stepped once per optimisation step.
            scheduler.step()

            if current_step % train_config.save_step == 0:
                torch.save({'model': model.state_dict(), 'optimizer': optimizer.state_dict(
                )}, os.path.join(train_config.checkpoint_path, 'checkpoint_%d.pth.tar' % current_step))
                print("save model at step %d ..." % current_step)

# Release the progress bar so later cells do not render nested bars.
tqdm_bar.close()



  0%|          | 1/1600000 [04:44<126376:14:01, 284.35s/it]

  return F.mse_loss(input, target, reduction=self.reduction)

  0%|          | 2/1600000 [00:01<487:37:14,  1.10s/it][A
  0%|          | 3/1600000 [00:02<351:37:23,  1.26it/s][A
  0%|          | 4/1600000 [00:02<289:26:08,  1.54it/s][A
  0%|          | 5/1600000 [00:03<251:36:11,  1.77it/s][A
  0%|          | 6/1600000 [00:03<229:44:32,  1.93it/s][A
  0%|          | 7/1600000 [00:04<214:29:03,  2.07it/s][A
  0%|          | 8/1600000 [00:04<205:06:57,  2.17it/s][A
  0%|          | 9/1600000 [00:04<195:00:28,  2.28it/s][A
  0%|          | 10/1600000 [00:05<187:53:35,  2.37it/s][A
  0%|          | 11/1600000 [00:05<184:07:31,  2.41it/s][A
  0%|          | 12/1600000 [00:06<179:08:34,  2.48it/s][A
  0%|          | 13/1600000 [00:06<176:29:39,  2.52it/s][A
  0%|          | 14/1600000 [00:06<173:03:39,  2.57it/s][A
  0%|          | 15/1600000 [00:07<165:59:57,  2.68it/s][A
  0%|          | 16/1600000 [00:07<165:18:47

save model at step 3000 ...



  0%|          | 3002/1600000 [17:38<209:58:45,  2.11it/s][A
  0%|          | 3003/1600000 [17:38<186:37:57,  2.38it/s][A
  0%|          | 3004/1600000 [17:38<160:36:01,  2.76it/s][A
  0%|          | 3005/1600000 [17:38<137:54:30,  3.22it/s][A
  0%|          | 3006/1600000 [17:38<122:51:30,  3.61it/s][A
  0%|          | 3007/1600000 [17:39<110:29:18,  4.01it/s][A
  0%|          | 3008/1600000 [17:39<103:32:16,  4.28it/s][A
  0%|          | 3009/1600000 [17:39<114:07:28,  3.89it/s][A
  0%|          | 3010/1600000 [17:40<136:26:25,  3.25it/s][A
  0%|          | 3011/1600000 [17:40<156:12:49,  2.84it/s][A
  0%|          | 3012/1600000 [17:40<170:39:01,  2.60it/s][A
  0%|          | 3013/1600000 [17:41<179:19:44,  2.47it/s][A
  0%|          | 3014/1600000 [17:41<184:05:00,  2.41it/s][A
  0%|          | 3015/1600000 [17:42<185:38:48,  2.39it/s][A
  0%|          | 3016/1600000 [17:42<186:36:19,  2.38it/s][A
  0%|          | 3017/1600000 [17:43<187:19:17,  2.37it/s][A
  0%|  

save model at step 6000 ...



  0%|          | 6002/1600000 [35:40<271:49:23,  1.63it/s][A
  0%|          | 6003/1600000 [35:40<236:07:15,  1.88it/s][A
  0%|          | 6004/1600000 [35:41<214:35:50,  2.06it/s][A
  0%|          | 6005/1600000 [35:41<193:53:00,  2.28it/s][A
  0%|          | 6006/1600000 [35:41<178:52:35,  2.48it/s][A
  0%|          | 6007/1600000 [35:42<166:39:23,  2.66it/s][A
  0%|          | 6008/1600000 [35:42<156:53:46,  2.82it/s][A
  0%|          | 6009/1600000 [35:42<149:31:11,  2.96it/s][A
  0%|          | 6010/1600000 [35:42<145:51:27,  3.04it/s][A
  0%|          | 6011/1600000 [35:43<137:23:57,  3.22it/s][A
  0%|          | 6012/1600000 [35:43<132:04:24,  3.35it/s][A
  0%|          | 6013/1600000 [35:43<123:57:07,  3.57it/s][A
  0%|          | 6014/1600000 [35:43<114:19:58,  3.87it/s][A
  0%|          | 6015/1600000 [35:44<105:50:46,  4.18it/s][A
  0%|          | 6016/1600000 [35:44<97:05:20,  4.56it/s] [A
  0%|          | 6017/1600000 [35:44<109:16:35,  4.05it/s][A
  0%|  

save model at step 9000 ...



  1%|          | 9002/1600000 [53:48<249:10:37,  1.77it/s][A
  1%|          | 9003/1600000 [53:49<228:44:01,  1.93it/s][A
  1%|          | 9004/1600000 [53:49<209:27:59,  2.11it/s][A
  1%|          | 9005/1600000 [53:50<198:31:03,  2.23it/s][A
  1%|          | 9006/1600000 [53:50<189:50:30,  2.33it/s][A
  1%|          | 9007/1600000 [53:50<178:45:05,  2.47it/s][A
  1%|          | 9008/1600000 [53:51<174:43:38,  2.53it/s][A
  1%|          | 9009/1600000 [53:51<176:40:40,  2.50it/s][A
  1%|          | 9010/1600000 [53:51<168:52:33,  2.62it/s][A
  1%|          | 9011/1600000 [53:52<163:22:11,  2.71it/s][A
  1%|          | 9012/1600000 [53:52<156:30:27,  2.82it/s][A
  1%|          | 9013/1600000 [53:52<149:02:02,  2.97it/s][A
  1%|          | 9014/1600000 [53:53<143:05:28,  3.09it/s][A
  1%|          | 9015/1600000 [53:53<141:03:24,  3.13it/s][A
  1%|          | 9016/1600000 [53:53<137:24:25,  3.22it/s][A
  1%|          | 9017/1600000 [53:54<131:14:34,  3.37it/s][A
  1%|  

save model at step 12000 ...



  1%|          | 12001/1600000 [1:12:01<275:04:14,  1.60it/s][A
  1%|          | 12002/1600000 [1:12:01<252:53:04,  1.74it/s][A
  1%|          | 12003/1600000 [1:12:02<242:50:51,  1.82it/s][A
  1%|          | 12004/1600000 [1:12:02<233:53:21,  1.89it/s][A
  1%|          | 12005/1600000 [1:12:03<225:17:47,  1.96it/s][A
  1%|          | 12006/1600000 [1:12:03<219:48:04,  2.01it/s][A
  1%|          | 12007/1600000 [1:12:04<217:03:11,  2.03it/s][A
  1%|          | 12008/1600000 [1:12:04<211:02:30,  2.09it/s][A
  1%|          | 12009/1600000 [1:12:05<204:11:44,  2.16it/s][A
  1%|          | 12010/1600000 [1:12:05<201:20:57,  2.19it/s][A
  1%|          | 12011/1600000 [1:12:05<196:04:51,  2.25it/s][A
  1%|          | 12012/1600000 [1:12:06<189:45:38,  2.32it/s][A
  1%|          | 12013/1600000 [1:12:06<184:09:13,  2.40it/s][A
  1%|          | 12014/1600000 [1:12:07<184:36:29,  2.39it/s][A
  1%|          | 12015/1600000 [1:12:07<177:26:01,  2.49it/s][A
  1%|          | 12016/1

save model at step 15000 ...



  1%|          | 15002/1600000 [1:30:20<226:47:00,  1.94it/s][A
  1%|          | 15003/1600000 [1:30:20<195:26:48,  2.25it/s][A
  1%|          | 15004/1600000 [1:30:20<168:58:17,  2.61it/s][A
  1%|          | 15005/1600000 [1:30:21<149:07:30,  2.95it/s][A
  1%|          | 15006/1600000 [1:30:21<133:03:58,  3.31it/s][A
  1%|          | 15007/1600000 [1:30:21<119:22:41,  3.69it/s][A
  1%|          | 15008/1600000 [1:30:21<110:49:00,  3.97it/s][A
  1%|          | 15009/1600000 [1:30:22<142:25:35,  3.09it/s][A
  1%|          | 15010/1600000 [1:30:22<161:24:34,  2.73it/s][A
  1%|          | 15011/1600000 [1:30:23<177:26:36,  2.48it/s][A
  1%|          | 15012/1600000 [1:30:23<187:24:58,  2.35it/s][A
  1%|          | 15013/1600000 [1:30:24<191:14:39,  2.30it/s][A
  1%|          | 15014/1600000 [1:30:24<195:07:38,  2.26it/s][A
  1%|          | 15015/1600000 [1:30:25<200:30:33,  2.20it/s][A
  1%|          | 15016/1600000 [1:30:25<201:53:37,  2.18it/s][A
  1%|          | 15017/1

save model at step 18000 ...



  1%|          | 18002/1600000 [1:48:43<299:05:49,  1.47it/s][A
  1%|          | 18003/1600000 [1:48:43<267:08:09,  1.65it/s][A
  1%|          | 18004/1600000 [1:48:44<234:52:25,  1.87it/s][A
  1%|          | 18005/1600000 [1:48:44<219:08:32,  2.01it/s][A
  1%|          | 18006/1600000 [1:48:44<202:51:32,  2.17it/s][A
  1%|          | 18007/1600000 [1:48:45<191:27:15,  2.30it/s][A
  1%|          | 18008/1600000 [1:48:45<178:48:40,  2.46it/s][A
  1%|          | 18009/1600000 [1:48:45<170:13:25,  2.58it/s][A
  1%|          | 18010/1600000 [1:48:46<160:46:54,  2.73it/s][A
  1%|          | 18011/1600000 [1:48:46<150:24:58,  2.92it/s][A
  1%|          | 18012/1600000 [1:48:46<142:39:37,  3.08it/s][A
  1%|          | 18013/1600000 [1:48:47<134:45:26,  3.26it/s][A
  1%|          | 18014/1600000 [1:48:47<134:18:04,  3.27it/s][A
  1%|          | 18015/1600000 [1:48:47<127:26:43,  3.45it/s][A
  1%|          | 18016/1600000 [1:48:47<117:55:11,  3.73it/s][A
  1%|          | 18017/1

save model at step 21000 ...



  1%|▏         | 21002/1600000 [2:07:00<291:02:07,  1.51it/s][A
  1%|▏         | 21003/1600000 [2:07:01<262:35:40,  1.67it/s][A
  1%|▏         | 21004/1600000 [2:07:01<245:09:06,  1.79it/s][A
  1%|▏         | 21005/1600000 [2:07:02<225:35:12,  1.94it/s][A
  1%|▏         | 21006/1600000 [2:07:02<209:44:33,  2.09it/s][A
  1%|▏         | 21007/1600000 [2:07:02<204:08:02,  2.15it/s][A
  1%|▏         | 21008/1600000 [2:07:03<192:19:52,  2.28it/s][A
  1%|▏         | 21009/1600000 [2:07:03<186:40:21,  2.35it/s][A
  1%|▏         | 21010/1600000 [2:07:04<187:16:14,  2.34it/s][A
  1%|▏         | 21011/1600000 [2:07:04<181:57:53,  2.41it/s][A
  1%|▏         | 21012/1600000 [2:07:04<177:58:55,  2.46it/s][A
  1%|▏         | 21013/1600000 [2:07:05<168:35:30,  2.60it/s][A
  1%|▏         | 21014/1600000 [2:07:05<172:54:43,  2.54it/s][A
  1%|▏         | 21015/1600000 [2:07:06<171:03:38,  2.56it/s][A
  1%|▏         | 21016/1600000 [2:07:06<166:59:29,  2.63it/s][A
  1%|▏         | 21017/1

save model at step 24000 ...



  2%|▏         | 24001/1600000 [2:25:17<341:51:47,  1.28it/s][A
  2%|▏         | 24002/1600000 [2:25:17<313:51:55,  1.39it/s][A
  2%|▏         | 24003/1600000 [2:25:18<287:35:10,  1.52it/s][A
  2%|▏         | 24004/1600000 [2:25:18<267:21:09,  1.64it/s][A
  2%|▏         | 24005/1600000 [2:25:19<246:18:39,  1.78it/s][A
  2%|▏         | 24006/1600000 [2:25:19<233:30:51,  1.87it/s][A
  2%|▏         | 24007/1600000 [2:25:20<220:48:50,  1.98it/s][A
  2%|▏         | 24008/1600000 [2:25:20<216:25:34,  2.02it/s][A
  2%|▏         | 24009/1600000 [2:25:21<210:09:39,  2.08it/s][A
  2%|▏         | 24010/1600000 [2:25:21<198:56:23,  2.20it/s][A
  2%|▏         | 24011/1600000 [2:25:22<193:07:01,  2.27it/s][A
  2%|▏         | 24012/1600000 [2:25:22<188:17:23,  2.33it/s][A
  2%|▏         | 24013/1600000 [2:25:22<184:24:58,  2.37it/s][A
  2%|▏         | 24014/1600000 [2:25:23<178:25:50,  2.45it/s][A
  2%|▏         | 24015/1600000 [2:25:23<173:04:38,  2.53it/s][A
  2%|▏         | 24016/1

save model at step 27000 ...



  2%|▏         | 27002/1600000 [2:43:34<236:49:04,  1.85it/s][A
  2%|▏         | 27003/1600000 [2:43:34<199:36:23,  2.19it/s][A
  2%|▏         | 27004/1600000 [2:43:34<176:14:11,  2.48it/s][A
  2%|▏         | 27005/1600000 [2:43:34<160:22:38,  2.72it/s][A
  2%|▏         | 27006/1600000 [2:43:35<145:02:40,  3.01it/s][A
  2%|▏         | 27007/1600000 [2:43:35<133:18:53,  3.28it/s][A
  2%|▏         | 27008/1600000 [2:43:35<119:10:43,  3.67it/s][A
  2%|▏         | 27009/1600000 [2:43:35<142:44:42,  3.06it/s][A
  2%|▏         | 27010/1600000 [2:43:36<160:53:55,  2.72it/s][A
  2%|▏         | 27011/1600000 [2:43:36<172:19:16,  2.54it/s][A
  2%|▏         | 27012/1600000 [2:43:37<180:09:23,  2.43it/s][A
  2%|▏         | 27013/1600000 [2:43:37<188:00:07,  2.32it/s][A
  2%|▏         | 27014/1600000 [2:43:38<193:07:12,  2.26it/s][A
  2%|▏         | 27015/1600000 [2:43:38<193:43:14,  2.26it/s][A
  2%|▏         | 27016/1600000 [2:43:39<194:42:05,  2.24it/s][A
  2%|▏         | 27017/1

save model at step 30000 ...



  2%|▏         | 30002/1600000 [3:01:56<257:56:01,  1.69it/s][A
  2%|▏         | 30003/1600000 [3:01:56<232:20:36,  1.88it/s][A
  2%|▏         | 30004/1600000 [3:01:57<209:59:32,  2.08it/s][A
  2%|▏         | 30005/1600000 [3:01:57<191:45:23,  2.27it/s][A
  2%|▏         | 30006/1600000 [3:01:57<177:26:30,  2.46it/s][A
  2%|▏         | 30007/1600000 [3:01:58<168:21:10,  2.59it/s][A
  2%|▏         | 30008/1600000 [3:01:58<159:35:17,  2.73it/s][A
  2%|▏         | 30009/1600000 [3:01:58<151:49:52,  2.87it/s][A
  2%|▏         | 30010/1600000 [3:01:59<141:36:31,  3.08it/s][A
  2%|▏         | 30011/1600000 [3:01:59<135:11:03,  3.23it/s][A
  2%|▏         | 30012/1600000 [3:01:59<127:53:47,  3.41it/s][A
  2%|▏         | 30013/1600000 [3:01:59<121:25:39,  3.59it/s][A
  2%|▏         | 30014/1600000 [3:02:00<114:21:30,  3.81it/s][A
  2%|▏         | 30015/1600000 [3:02:00<107:09:17,  4.07it/s][A
  2%|▏         | 30016/1600000 [3:02:00<102:10:41,  4.27it/s][A
  2%|▏         | 30017/1

save model at step 33000 ...



  2%|▏         | 33002/1600000 [3:20:16<250:25:37,  1.74it/s][A
  2%|▏         | 33003/1600000 [3:20:16<228:57:32,  1.90it/s][A
  2%|▏         | 33004/1600000 [3:20:17<216:36:41,  2.01it/s][A
  2%|▏         | 33005/1600000 [3:20:17<206:20:50,  2.11it/s][A
  2%|▏         | 33006/1600000 [3:20:17<194:56:10,  2.23it/s][A
  2%|▏         | 33007/1600000 [3:20:18<185:30:31,  2.35it/s][A
  2%|▏         | 33008/1600000 [3:20:18<177:33:09,  2.45it/s][A
  2%|▏         | 33009/1600000 [3:20:18<171:54:49,  2.53it/s][A
  2%|▏         | 33010/1600000 [3:20:19<166:56:39,  2.61it/s][A
  2%|▏         | 33011/1600000 [3:20:19<168:53:22,  2.58it/s][A
  2%|▏         | 33012/1600000 [3:20:20<165:04:26,  2.64it/s][A
  2%|▏         | 33013/1600000 [3:20:20<159:30:06,  2.73it/s][A
  2%|▏         | 33014/1600000 [3:20:20<155:17:11,  2.80it/s][A
  2%|▏         | 33015/1600000 [3:20:21<146:26:34,  2.97it/s][A
  2%|▏         | 33016/1600000 [3:20:21<139:42:10,  3.12it/s][A
  2%|▏         | 33017/1

save model at step 36000 ...



  2%|▏         | 36001/1600000 [3:38:38<280:16:28,  1.55it/s][A
  2%|▏         | 36002/1600000 [3:38:38<258:27:57,  1.68it/s][A
  2%|▏         | 36003/1600000 [3:38:39<242:16:37,  1.79it/s][A
  2%|▏         | 36004/1600000 [3:38:39<230:10:51,  1.89it/s][A
  2%|▏         | 36005/1600000 [3:38:40<222:53:45,  1.95it/s][A
  2%|▏         | 36006/1600000 [3:38:40<219:06:58,  1.98it/s][A
  2%|▏         | 36007/1600000 [3:38:41<213:40:07,  2.03it/s][A
  2%|▏         | 36008/1600000 [3:38:41<206:11:02,  2.11it/s][A
  2%|▏         | 36009/1600000 [3:38:42<203:09:02,  2.14it/s][A
  2%|▏         | 36010/1600000 [3:38:42<198:19:11,  2.19it/s][A
  2%|▏         | 36011/1600000 [3:38:42<193:06:43,  2.25it/s][A
  2%|▏         | 36012/1600000 [3:38:43<186:23:27,  2.33it/s][A
  2%|▏         | 36013/1600000 [3:38:43<180:55:29,  2.40it/s][A
  2%|▏         | 36014/1600000 [3:38:44<182:07:31,  2.39it/s][A
  2%|▏         | 36015/1600000 [3:38:44<176:20:47,  2.46it/s][A
  2%|▏         | 36016/1

save model at step 39000 ...



  2%|▏         | 39002/1600000 [3:57:09<209:19:27,  2.07it/s][A
  2%|▏         | 39003/1600000 [3:57:09<186:02:17,  2.33it/s][A
  2%|▏         | 39004/1600000 [3:57:10<161:37:23,  2.68it/s][A
  2%|▏         | 39005/1600000 [3:57:10<147:18:34,  2.94it/s][A
  2%|▏         | 39006/1600000 [3:57:10<133:05:24,  3.26it/s][A
  2%|▏         | 39007/1600000 [3:57:10<122:33:01,  3.54it/s][A
  2%|▏         | 39008/1600000 [3:57:11<112:30:54,  3.85it/s][A
  2%|▏         | 39009/1600000 [3:57:11<125:26:24,  3.46it/s][A
  2%|▏         | 39010/1600000 [3:57:11<145:53:43,  2.97it/s][A
  2%|▏         | 39011/1600000 [3:57:12<161:28:30,  2.69it/s][A
  2%|▏         | 39012/1600000 [3:57:12<172:41:30,  2.51it/s][A
  2%|▏         | 39013/1600000 [3:57:13<180:35:28,  2.40it/s][A
  2%|▏         | 39014/1600000 [3:57:13<183:56:32,  2.36it/s][A
  2%|▏         | 39015/1600000 [3:57:14<184:45:43,  2.35it/s][A
  2%|▏         | 39016/1600000 [3:57:14<182:19:12,  2.38it/s][A
  2%|▏         | 39017/1

save model at step 42000 ...



  3%|▎         | 42002/1600000 [4:15:36<291:48:43,  1.48it/s][A
  3%|▎         | 42003/1600000 [4:15:37<253:27:05,  1.71it/s][A
  3%|▎         | 42004/1600000 [4:15:37<223:25:08,  1.94it/s][A
  3%|▎         | 42005/1600000 [4:15:37<198:40:35,  2.18it/s][A
  3%|▎         | 42006/1600000 [4:15:38<182:47:43,  2.37it/s][A
  3%|▎         | 42007/1600000 [4:15:38<170:03:08,  2.54it/s][A
  3%|▎         | 42008/1600000 [4:15:38<170:13:11,  2.54it/s][A
  3%|▎         | 42009/1600000 [4:15:39<156:34:17,  2.76it/s][A
  3%|▎         | 42010/1600000 [4:15:39<146:20:10,  2.96it/s][A
  3%|▎         | 42011/1600000 [4:15:39<135:56:42,  3.18it/s][A
  3%|▎         | 42012/1600000 [4:15:39<130:38:50,  3.31it/s][A
  3%|▎         | 42013/1600000 [4:15:40<123:30:29,  3.50it/s][A
  3%|▎         | 42014/1600000 [4:15:40<118:12:29,  3.66it/s][A
  3%|▎         | 42015/1600000 [4:15:40<112:59:16,  3.83it/s][A
  3%|▎         | 42016/1600000 [4:15:40<107:12:00,  4.04it/s][A
  3%|▎         | 42017/1

save model at step 45000 ...



  3%|▎         | 45002/1600000 [4:34:01<252:12:32,  1.71it/s][A
  3%|▎         | 45003/1600000 [4:34:02<228:07:45,  1.89it/s][A
  3%|▎         | 45004/1600000 [4:34:02<216:07:43,  2.00it/s][A
  3%|▎         | 45005/1600000 [4:34:03<202:47:58,  2.13it/s][A
  3%|▎         | 45006/1600000 [4:34:03<191:31:06,  2.26it/s][A
  3%|▎         | 45007/1600000 [4:34:03<186:37:20,  2.31it/s][A
  3%|▎         | 45008/1600000 [4:34:04<177:28:45,  2.43it/s][A
  3%|▎         | 45009/1600000 [4:34:04<171:44:56,  2.51it/s][A
  3%|▎         | 45010/1600000 [4:34:04<163:32:26,  2.64it/s][A
  3%|▎         | 45011/1600000 [4:34:05<155:49:16,  2.77it/s][A
  3%|▎         | 45012/1600000 [4:34:05<153:40:25,  2.81it/s][A
  3%|▎         | 45013/1600000 [4:34:05<150:33:55,  2.87it/s][A
  3%|▎         | 45014/1600000 [4:34:06<145:14:03,  2.97it/s][A
  3%|▎         | 45015/1600000 [4:34:06<140:34:41,  3.07it/s][A
  3%|▎         | 45016/1600000 [4:34:06<142:56:06,  3.02it/s][A
  3%|▎         | 45017/1

save model at step 48000 ...



  3%|▎         | 48001/1600000 [4:52:23<248:24:46,  1.74it/s][A
  3%|▎         | 48002/1600000 [4:52:23<232:52:53,  1.85it/s][A
  3%|▎         | 48003/1600000 [4:52:23<225:38:46,  1.91it/s][A
  3%|▎         | 48004/1600000 [4:52:24<215:32:54,  2.00it/s][A
  3%|▎         | 48005/1600000 [4:52:24<212:23:45,  2.03it/s][A
  3%|▎         | 48006/1600000 [4:52:25<205:40:32,  2.10it/s][A
  3%|▎         | 48007/1600000 [4:52:25<203:19:04,  2.12it/s][A
  3%|▎         | 48008/1600000 [4:52:26<196:55:10,  2.19it/s][A
  3%|▎         | 48009/1600000 [4:52:26<192:40:45,  2.24it/s][A
  3%|▎         | 48010/1600000 [4:52:27<191:08:56,  2.26it/s][A
  3%|▎         | 48011/1600000 [4:52:27<192:19:57,  2.24it/s][A
  3%|▎         | 48012/1600000 [4:52:27<193:25:26,  2.23it/s][A
  3%|▎         | 48013/1600000 [4:52:28<192:19:32,  2.24it/s][A
  3%|▎         | 48014/1600000 [4:52:28<195:51:34,  2.20it/s][A
  3%|▎         | 48015/1600000 [4:52:29<192:25:29,  2.24it/s][A
  3%|▎         | 48016/1

save model at step 51000 ...



  3%|▎         | 51002/1600000 [5:10:46<191:03:09,  2.25it/s][A
  3%|▎         | 51003/1600000 [5:10:46<165:40:02,  2.60it/s][A
  3%|▎         | 51004/1600000 [5:10:47<146:25:47,  2.94it/s][A
  3%|▎         | 51005/1600000 [5:10:47<130:57:02,  3.29it/s][A
  3%|▎         | 51006/1600000 [5:10:47<121:41:21,  3.54it/s][A
  3%|▎         | 51007/1600000 [5:10:47<110:27:51,  3.90it/s][A
  3%|▎         | 51008/1600000 [5:10:48<102:49:28,  4.18it/s][A
  3%|▎         | 51009/1600000 [5:10:48<115:01:07,  3.74it/s][A
  3%|▎         | 51010/1600000 [5:10:48<137:15:33,  3.13it/s][A
  3%|▎         | 51011/1600000 [5:10:49<155:21:45,  2.77it/s][A
  3%|▎         | 51012/1600000 [5:10:49<167:01:18,  2.58it/s][A
  3%|▎         | 51013/1600000 [5:10:50<174:52:30,  2.46it/s][A
  3%|▎         | 51014/1600000 [5:10:50<180:46:33,  2.38it/s][A
  3%|▎         | 51015/1600000 [5:10:51<184:06:26,  2.34it/s][A
  3%|▎         | 51016/1600000 [5:10:51<186:54:10,  2.30it/s][A
  3%|▎         | 51017/1

save model at step 54000 ...



  3%|▎         | 54002/1600000 [5:29:17<282:15:09,  1.52it/s][A
  3%|▎         | 54003/1600000 [5:29:18<249:44:12,  1.72it/s][A
  3%|▎         | 54004/1600000 [5:29:18<224:03:37,  1.92it/s][A
  3%|▎         | 54005/1600000 [5:29:19<204:49:19,  2.10it/s][A
  3%|▎         | 54006/1600000 [5:29:19<187:52:58,  2.29it/s][A
  3%|▎         | 54007/1600000 [5:29:19<172:23:15,  2.49it/s][A
  3%|▎         | 54008/1600000 [5:29:20<163:55:01,  2.62it/s][A
  3%|▎         | 54009/1600000 [5:29:20<157:57:05,  2.72it/s][A
  3%|▎         | 54010/1600000 [5:29:20<148:42:38,  2.89it/s][A
  3%|▎         | 54011/1600000 [5:29:20<139:48:30,  3.07it/s][A
  3%|▎         | 54012/1600000 [5:29:21<134:36:43,  3.19it/s][A
  3%|▎         | 54013/1600000 [5:29:21<129:10:27,  3.32it/s][A
  3%|▎         | 54014/1600000 [5:29:21<118:41:35,  3.62it/s][A
  3%|▎         | 54015/1600000 [5:29:22<112:41:10,  3.81it/s][A
  3%|▎         | 54016/1600000 [5:29:22<110:07:18,  3.90it/s][A
  3%|▎         | 54017/1

# Установка

In [None]:
%%bash

# Install the project's requirements and the package itself.
pip install -r requirements.txt
pip install .

# Download and unpack the prepared fastspeech2_dataset archive.
# The URLs are quoted so the shell never treats `?` as a glob character.
gdown "https://drive.google.com/u/0/uc?id=1-4cIK7IXOlpQYNqFoyF3RLMiy14JufGn"
unzip fastspeech2_dataset.zip
rm -rf fastspeech2_dataset.zip

# Download the WaveGlow checkpoint and store it under the name used in
# waveglow/pretrained_model/.
gdown "https://drive.google.com/u/0/uc?id=1WsibBTsuRg_SF2Z6L6NFRTT-NjEy1oTx"
mkdir -p waveglow/pretrained_model/
mv waveglow_256channels_ljs_v2.pt waveglow/pretrained_model/waveglow_256channels.pt

# Take the text, audio and waveglow code plus helper modules from the
# FastSpeech repository, then remove the clone.
git clone https://github.com/xcmyz/FastSpeech.git
mv ./FastSpeech/text .
mv ./FastSpeech/audio .
mv ./FastSpeech/waveglow/* waveglow/
mv ./FastSpeech/utils.py .
mv ./FastSpeech/glow.py .
mv ./FastSpeech/hparams.py .
rm -rf ./FastSpeech/