# 准备数据 

In [1]:
!git clone https://github.com/Anti-Entrophic/tacotron2-multispeaker-streaming-PPG.git tacotron2
!git submodule init
!git submodule update

Cloning into 'tacotron2'...
remote: Enumerating objects: 147, done.[K
remote: Counting objects: 100% (90/90), done.[K
remote: Compressing objects: 100% (84/84), done.[K
remote: Total 147 (delta 46), reused 0 (delta 0), pack-reused 57[K
Receiving objects: 100% (147/147), 1.74 MiB | 11.08 MiB/s, done.
Resolving deltas: 100% (55/55), done.
fatal: not a git repository (or any of the parent directories): .git
fatal: not a git repository (or any of the parent directories): .git


## 下载LibriSpeech


In [2]:
# Download the LibriSpeech train-clean-100 archive (about 5.9 GB) into /content.
# `-r` makes wget recreate the URL's host/path layout locally, so the archive ends up
# in ./us.openslr.org/resources/12/ (the extraction cell below relies on that path).
# NOTE(review): --no-check-certificate turns off TLS certificate verification —
# confirm it is still needed before keeping it.
%cd /content
!wget --no-check-certificate -r https://us.openslr.org/resources/12/train-clean-100.tar.gz

/content
--2023-02-12 15:30:13--  https://us.openslr.org/resources/12/train-clean-100.tar.gz
Resolving us.openslr.org (us.openslr.org)... 46.101.158.64
Connecting to us.openslr.org (us.openslr.org)|46.101.158.64|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6387309499 (5.9G) [application/x-gzip]
Saving to: ‘us.openslr.org/resources/12/train-clean-100.tar.gz’


2023-02-12 15:34:01 (26.8 MB/s) - ‘us.openslr.org/resources/12/train-clean-100.tar.gz’ saved [6387309499/6387309499]

FINISHED --2023-02-12 15:34:01--
Total wall clock time: 3m 48s
Downloaded: 1 files, 5.9G in 3m 47s (26.8 MB/s)


In [None]:
# 解压LibriSpeech
%cd /content/us.openslr.org/resources/12
import tarfile
filename = "train-clean-100.tar.gz"
tf = tarfile.open(filename)
tf.extractall('/content')

# 安装依赖

In [None]:
!pip install -U tensorflow==1.15.2
!pip install -q unidecode tensorboardX
!pip install librosa==0.8.0
!pip install pysoundfile==0.9.0.post1
!pip install inflect==5.6.2
!pip install janome==0.4.2
!pip install resemblyzer

In [4]:
# 为了下面能够直接下载预训练模型
!pip install --upgrade --no-cache-dir gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gdown
  Downloading gdown-4.6.2-py3-none-any.whl (14 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.4.0
    Uninstalling gdown-4.4.0:
      Successfully uninstalled gdown-4.4.0
Successfully installed gdown-4.6.2


## 创建文件夹 & 下载预训练模型

In [6]:
import os
if os.getcwd() != '/content/tacotron2':
    os.chdir('/content/tacotron2')
# 但是这个预训练模型,之后应该是不能用的.
! gdown --id 1c5ZTuT7J08wLUoVZ2KkUs_VdZuJ86ZqA
if not os.path.isdir("wavs"):
    os.mkdir('wavs')
if not os.path.isdir("filelists"):
    os.mkdir('filelists')
if not os.path.isdir("outdir"):
    os.mkdir("outdir")

Downloading...
From: https://drive.google.com/uc?id=1c5ZTuT7J08wLUoVZ2KkUs_VdZuJ86ZqA
To: /content/tacotron2/tacotron2_statedict.pt
100% 113M/113M [00:00<00:00, 175MB/s]


# 整理数据

In [None]:
# 需要手动做的事情:

# 准备训练

## 准备日志模块, 输出调试信息

In [10]:
%cd /content/tacotron2
if not os.path.isdir("log"):
    os.mkdir('log')
import logging
logging.basicConfig(level=logging.DEBUG #设置日志输出格式
                ,filename="log/experiment1.log" #log日志输出的文件位置和文件名
                ,format="%(asctime)s-%(levelname)s: %(message)s" #日志输出的格式
                  # -8表示占位符，让输出左对齐，输出长度都为8位
                ,datefmt="%Y-%m-%d %H:%M:%S" #时间输出的格式
                ,force=True
                )
# 使用logging.debug就可以输出调试信息了，直接print到终端其它输出多了可能不好找
logging.debug("debug!")

%cd /content/tacotron2/tacotron2

/content/tacotron2
/content/tacotron2/tacotron2


In [None]:
# 训练模型的代码
%matplotlib inline
import os
import time
import argparse
import math
from numpy import finfo

import torch
from distributed import apply_gradient_allreduce
import torch.distributed as dist
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data import DataLoader

from model import Tacotron2
from data_utils import TextMelLoader, TextMelCollate, PPG_MelLoader
from loss_function import Tacotron2Loss
from logger import Tacotron2Logger
from hparams import create_hparams
 
import random
import numpy as np

import layers
from utils import load_wav_to_torch, load_filepaths_and_text
from text import text_to_sequence # 在我导入的时候，是不是会自动调用text文件夹中的__init__.py？
from math import e
#from tqdm import tqdm # Terminal
#from tqdm import tqdm_notebook as tqdm # Legacy Notebook TQDM
from tqdm.notebook import tqdm # Modern Notebook TQDM # tqdm可以实现进度条的显示
from distutils.dir_util import copy_tree
import matplotlib.pylab as plt

def create_mels():
    """Compute a mel spectrogram for every ``wavs/*.wav`` file and save it with ``np.save``.

    Reads the STFT settings and ``max_wav_value`` from the module-level ``hparams``.
    Each spectrogram is written next to its source file, using the wav path with
    ``.wav`` removed as the target name.

    Raises:
        ValueError: if a file's sampling rate differs from the STFT's sampling rate.
    """
    print("Generating Mels")
    stft = layers.TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
        hparams.mel_fmax)

    def save_mel(filename):
        """Convert one wav file to a mel spectrogram and store it on disk."""
        audio, sampling_rate = load_wav_to_torch(filename)
        if sampling_rate != stft.sampling_rate:
            raise ValueError("{} {} SR doesn't match target {} SR".format(filename, 
                sampling_rate, stft.sampling_rate))
        # Scale by max_wav_value and add a leading axis before the STFT call.
        normalized = (audio / hparams.max_wav_value).unsqueeze(0)
        normalized = torch.autograd.Variable(normalized, requires_grad=False)
        mel = torch.squeeze(stft.mel_spectrogram(normalized), 0).cpu().numpy()
        np.save(filename.replace('.wav', ''), mel)

    import glob
    for wav_path in tqdm(glob.glob('wavs/*.wav')):
        save_mel(wav_path)


def reduce_tensor(tensor, n_gpus):
    """Return the sum of ``tensor`` over all processes divided by ``n_gpus``.

    The input is cloned first, so the caller's tensor is not modified. Requires an
    initialised ``torch.distributed`` process group.

    Args:
        tensor: tensor to reduce.
        n_gpus: divisor applied to the summed tensor.

    Returns:
        A new tensor holding the summed values divided by ``n_gpus``.
    """
    rt = tensor.clone()
    # dist.ReduceOp replaces the deprecated dist.reduce_op alias.
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= n_gpus
    return rt


def init_distributed(hparams, n_gpus, rank, group_name):
    """Select this process's CUDA device and join the distributed process group.

    Args:
        hparams: provides ``dist_backend`` and ``dist_url``.
        n_gpus: world size passed to ``init_process_group``.
        rank: rank of this process; also used to pick the CUDA device.
        group_name: forwarded unchanged to ``init_process_group``.

    Raises:
        AssertionError: if CUDA is not available.
    """
    assert torch.cuda.is_available(), "Distributed mode requires CUDA."
    print("Initializing Distributed")

    # Pin the process to one GPU so all later work lands on the right device.
    device_index = rank % torch.cuda.device_count()
    torch.cuda.set_device(device_index)

    # Set up distributed communication.
    process_group_options = dict(
        backend=hparams.dist_backend,
        init_method=hparams.dist_url,
        world_size=n_gpus,
        rank=rank,
        group_name=group_name,
    )
    dist.init_process_group(**process_group_options)

    print("Done initializing distributed")


def prepare_dataloaders(hparams):
    """Build the training DataLoader and the validation dataset.

    Args:
        hparams: provides ``training_files``, ``validation_files``,
            ``n_frames_per_step``, ``distributed_run`` and ``batch_size``.

    Returns:
        Tuple ``(train_loader, valset, collate_fn)``.
    """
    # The datasets come from PPG_MelLoader, the class imported from data_utils.
    # (The earlier code referred to an undefined name `PPGMelLoader`, and first built
    # TextMelLoader datasets that were overwritten straight away.)
    trainset = PPG_MelLoader(hparams.training_files, hparams)
    valset = PPG_MelLoader(hparams.validation_files, hparams)

    # Collate function used to assemble batches.
    # TODO(original author): not sure whether PPG input needs this collate step.
    collate_fn = TextMelCollate(hparams.n_frames_per_step)

    if hparams.distributed_run:
        # The sampler decides the order in distributed runs, so DataLoader shuffling is off.
        train_sampler = DistributedSampler(trainset)
        shuffle = False
    else:
        train_sampler = None
        shuffle = True

    train_loader = DataLoader(trainset, num_workers=1,  # number of loader worker processes
                shuffle=shuffle, sampler=train_sampler,
                batch_size=hparams.batch_size, pin_memory=False, drop_last=True, collate_fn=collate_fn)

    return train_loader, valset, collate_fn


def prepare_directories_and_logger(output_directory, log_directory, rank):
    """Create the output directory if needed and return the training logger.

    Only rank 0 gets a logger; every other rank returns ``None`` without touching
    the file system.

    Args:
        output_directory: directory for checkpoints; created with mode 0o775 when missing.
        log_directory: joined onto ``output_directory`` to form the logger's directory.
        rank: rank of the current process.

    Returns:
        A ``Tacotron2Logger`` (defined in logger.py) for rank 0, otherwise ``None``.
    """
    if rank != 0:
        logging.debug("在prepare_directories_and_logger中，rank!=0")
        return None

    logging.debug("在prepare_directories_and_logger中，rank=0")
    output_dir_missing = not os.path.isdir(output_directory)
    if output_dir_missing:
        logging.debug("没有output_directory，创建一个")
        os.makedirs(output_directory)
        # rwxrwxr-x: owner and group may write, everyone may read and traverse.
        os.chmod(output_directory, 0o775)
    return Tacotron2Logger(os.path.join(output_directory, log_directory))


def load_model(hparams):
    """Instantiate ``Tacotron2`` on the GPU and apply the fp16 / distributed options.

    Args:
        hparams: passed to the ``Tacotron2`` constructor; ``fp16_run`` and
            ``distributed_run`` select the optional steps below.

    Returns:
        The model, wrapped by ``apply_gradient_allreduce`` when ``distributed_run`` is set.
    """
    net = Tacotron2(hparams).cuda()

    use_fp16 = hparams.fp16_run
    if use_fp16:
        logging.debug("在load_model()中，启用fp16_run")
        # Use the smallest float16 value as the attention score mask value.
        net.decoder.attention_layer.score_mask_value = finfo('float16').min

    use_distributed = hparams.distributed_run
    if use_distributed:
        logging.debug("在load_model()中，启用distributed_run")
        net = apply_gradient_allreduce(net)

    return net


def warm_start_model(checkpoint_path, model, ignore_layers):
    """Load weights from a checkpoint into ``model``, optionally skipping some entries.

    Args:
        checkpoint_path: file written by ``torch.save`` containing a ``'state_dict'`` entry.
        model: module that receives the weights.
        ignore_layers: state-dict keys to leave at the model's current values.

    Returns:
        The same ``model`` object after loading.

    Raises:
        AssertionError: if ``checkpoint_path`` is not an existing file.
    """
    assert os.path.isfile(checkpoint_path)
    print("Warm starting model from checkpoint '{}'".format(checkpoint_path))
    saved_weights = torch.load(checkpoint_path, map_location='cpu')['state_dict']
    if len(ignore_layers) > 0:
        # Start from the model's own weights and overwrite everything not ignored.
        merged = model.state_dict()
        for key, value in saved_weights.items():
            if key not in ignore_layers:
                merged[key] = value
        saved_weights = merged
    model.load_state_dict(saved_weights)
    return model


def load_checkpoint(checkpoint_path, model, optimizer):
    """Restore model weights, optimizer state, learning rate and iteration from a checkpoint.

    The checkpoint is the dictionary written by ``save_checkpoint`` with the keys
    ``'state_dict'``, ``'optimizer'``, ``'learning_rate'`` and ``'iteration'``.

    Args:
        checkpoint_path: path of the checkpoint file.
        model: module whose weights are restored.
        optimizer: optimizer whose state is restored.

    Returns:
        Tuple ``(model, optimizer, learning_rate, iteration)``.

    Raises:
        AssertionError: if ``checkpoint_path`` is not an existing file.
    """
    assert os.path.isfile(checkpoint_path)
    print("Loading checkpoint '{}'".format(checkpoint_path))

    # torch.load reads back what torch.save wrote; tensors are mapped to the CPU.
    checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')

    saved_weights = checkpoint_dict['state_dict']
    logging.debug(type(saved_weights))
    model.load_state_dict(saved_weights)
    optimizer.load_state_dict(checkpoint_dict['optimizer'])

    learning_rate = checkpoint_dict['learning_rate']
    # Iteration at which the checkpoint was saved.
    iteration = checkpoint_dict['iteration']

    print("Loaded checkpoint '{}' from iteration {}" .format(
        checkpoint_path, iteration))
    return model, optimizer, learning_rate, iteration


def save_checkpoint(model, optimizer, learning_rate, iteration, filepath):
    """Write model and optimizer state plus training progress to ``filepath``.

    The saved dictionary has the keys ``'iteration'``, ``'state_dict'``,
    ``'optimizer'`` and ``'learning_rate'``. If a ``KeyboardInterrupt`` arrives
    during the save, the save is attempted once more.

    Args:
        model: module whose ``state_dict()`` is stored.
        optimizer: optimizer whose ``state_dict()`` is stored.
        learning_rate: learning rate recorded in the checkpoint.
        iteration: iteration counter recorded in the checkpoint.
        filepath: destination passed to ``torch.save``.
    """
    print("Saving model and optimizer state at iteration {} to {}".format(
        iteration, filepath))

    def _snapshot():
        """Collect everything that goes into the checkpoint file."""
        return {
            'iteration': iteration,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'learning_rate': learning_rate,
        }

    try:
        torch.save(_snapshot(), filepath)
    except KeyboardInterrupt:
        # Retry once so an interrupt does not leave a half-written checkpoint behind.
        print("interrupt received while saving, waiting for save to complete.")
        torch.save(_snapshot(), filepath)
    print("Model Saved")

def plot_alignment(alignment, info=None):
    """Display an attention alignment matrix as a heat map.

    The figure size is taken from the module-level ``alignment_graph_width`` and
    ``alignment_graph_height`` (divided by 100 and truncated to integers).

    Args:
        alignment: 2-D array passed to ``imshow``; the vertical axis is labelled
            "Encoder timestep" and the horizontal axis "Decoder timestep".
        info: optional text appended below the x-axis label.
    """
    # The inline backend is already selected once at the top of the cell, so the
    # `%matplotlib inline` magic that used to sit here is not repeated.
    fig, ax = plt.subplots(figsize=(int(alignment_graph_width/100), int(alignment_graph_height/100)))
    im = ax.imshow(alignment, cmap='inferno', aspect='auto', origin='lower',
                   interpolation='none')
    ax.autoscale(enable=True, axis="y", tight=True)
    fig.colorbar(im, ax=ax)
    xlabel = 'Decoder timestep'
    if info is not None:
        xlabel += '\n\n' + info
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Encoder timestep')
    fig.tight_layout()
    fig.canvas.draw()
    plt.show()
    # This is called after every validation pass; close the figure so figures do
    # not pile up in memory over a long training run.
    plt.close(fig)

def validate(model, criterion, valset, iteration, batch_size, n_gpus,
             collate_fn, logger, distributed_run, rank, epoch, start_eposh, learning_rate):
    """Compute the mean validation loss and report it on rank 0.

    The model is switched to eval mode for the pass and back to train mode
    afterwards. On rank 0 the result is printed, sent to ``logger`` and, when the
    module-level ``hparams.show_alignments`` is set, one randomly chosen alignment
    from the last batch is plotted.

    Args:
        model: network providing ``parse_batch`` and a forward call.
        criterion: loss callable taking ``(y_pred, y)``.
        valset: validation dataset.
        iteration: current training iteration (used for reporting).
        batch_size: validation batch size.
        n_gpus: divisor for the reduced loss in distributed runs.
        collate_fn: collate function for the DataLoader.
        logger: object with ``log_validation``; only used on rank 0.
        distributed_run: whether to use a DistributedSampler and reduce the loss.
        rank: rank of the current process.
        epoch: current epoch (used for reporting).
        start_eposh: ``time.perf_counter()`` value taken at the start of the epoch.
        learning_rate: current learning rate (used for reporting).
    """
    model.eval()
    with torch.no_grad():
        val_sampler = DistributedSampler(valset) if distributed_run else None
        val_loader = DataLoader(valset, sampler=val_sampler, num_workers=1,
                                shuffle=False, batch_size=batch_size,
                                pin_memory=False, collate_fn=collate_fn)

        val_loss = 0.0
        num_batches = 0
        y = y_pred = None
        for batch in val_loader:
            x, y = model.parse_batch(batch)
            y_pred = model(x)
            loss = criterion(y_pred, y)
            if distributed_run:
                reduced_val_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_val_loss = loss.item()
            val_loss += reduced_val_loss
            num_batches += 1

    model.train()

    # An empty validation set used to fail with an UnboundLocalError when averaging;
    # report it and skip the logging instead.
    if num_batches == 0:
        print("[WARNING] validation set produced no batches; skipping validation at iteration {}".format(iteration))
        return
    val_loss = val_loss / num_batches

    if rank == 0:
        print("Epoch: {} Validation loss {}: {:9f}  Time: {:.1f}m LR: {:.6f}".format(epoch, iteration, val_loss,(time.perf_counter()-start_eposh)/60, learning_rate))
        logger.log_validation(val_loss, model, y, y_pred, iteration)
        if hparams.show_alignments:
            # y_pred of the last batch; its fourth element holds the alignments.
            _, _, _, alignments = y_pred
            idx = random.randint(0, alignments.size(0) - 1)
            plot_alignment(alignments[idx].data.cpu().numpy().T)

def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
          rank, group_name, hparams, log_directory2):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string) directory to save tensorboard logs
    checkpoint_path(string): checkpoint to resume from; also the path the model
        is saved to at the end of every epoch
    warm_start (bool): when a checkpoint exists, load only its weights
        (via warm_start_model) instead of the full training state
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    group_name (string): forwarded to init_distributed
    hparams (object): hyperparameter object read through attribute access
    log_directory2 (string or None): if not None, log_directory is copied there
        after every epoch

    Raises
    ------
    ValueError: if the training DataLoader yields no batches
    FileNotFoundError: if no checkpoint exists at checkpoint_path and the default
        pretrained model "tacotron2_statedict.pt" is missing too
    """
    logging.debug("----------------")
    logging.debug("start trainning")
    logging.debug("checkpoint_path = %s", checkpoint_path)
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    logging.debug('调用load_model函数')
    model = load_model(hparams)  # Tacotron2 instance from model.py

    learning_rate = hparams.learning_rate
    # %s instead of %d: an integer format would log every learning rate below 1 as 0.
    logging.debug('learning_rate = %s', learning_rate)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                                 weight_decay=hparams.weight_decay)

    # The fp16 branches of the loop below need apex's `amp`. Import and initialise
    # it only when fp16 is requested, so the default path has no apex dependency.
    # (Gradient all-reduce for distributed runs is already applied in load_model.)
    amp = None
    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(
            model, optimizer, opt_level='O2')

    criterion = Tacotron2Loss()  # loss module from loss_function.py

    # Output directory plus tensorboard logger (None on ranks other than 0).
    logger = prepare_directories_and_logger(output_directory, log_directory, rank)

    train_loader, valset, collate_fn = prepare_dataloaders(hparams)
    if len(train_loader) == 0:
        # The loader drops incomplete batches, so too few samples means no batches
        # at all and nothing would ever be trained.
        raise ValueError(
            "Training DataLoader is empty: the training set must contain at least "
            "hparams.batch_size ({}) samples.".format(hparams.batch_size))

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    logging.debug('尝试读取checkpoint')
    logging.debug('checkpoint_path = %s', checkpoint_path)
    if checkpoint_path is not None and os.path.isfile(checkpoint_path):
        logging.debug("读取了checkpoint")
        if warm_start:
            logging.debug("启动warm_start")
            model = warm_start_model(
                checkpoint_path, model, hparams.ignore_layers)
        else:
            logging.debug("没有启动warm_start")
            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                logging.debug("选择use_saved_learning_rate")
                learning_rate = _learning_rate
            iteration += 1  # continue with the iteration after the saved one
            # One epoch is len(train_loader) iterations.
            epoch_offset = max(0, int(iteration / len(train_loader)))
    else:
        # No checkpoint at checkpoint_path: warm start from the default pretrained
        # model. Check for the file explicitly so a missing download is reported clearly.
        default_checkpoint = "tacotron2_statedict.pt"
        if not os.path.isfile(default_checkpoint):
            raise FileNotFoundError(
                "No checkpoint at {!r} and the default pretrained model {!r} was not "
                "found in {!r}.".format(checkpoint_path, default_checkpoint, os.getcwd()))
        model = warm_start_model(default_checkpoint, model, hparams.ignore_layers)

    # NOTE: the learning rate is recomputed from the decay schedule on every
    # iteration below, so the value chosen above only seeds the optimizer.
    model.train()
    is_overflow = False
    # ================ MAIN TRAINNIG LOOP! ===================
    for epoch in tqdm(range(epoch_offset, hparams.epochs)):
        print("\nStarting Epoch: {} Iteration: {}".format(epoch, iteration))
        start_eposh = time.perf_counter() # eposh is russian, not a typo

        for i, batch in tqdm(enumerate(train_loader), total=len(train_loader)):
            start = time.perf_counter()

            # Learning-rate schedule: constant A_ until decay_start, afterwards
            # A_ * exp(-(iteration - decay_start) / B_) + C_, never below min_learning_rate.
            if iteration < hparams.decay_start:
                learning_rate = hparams.A_
            else:
                iteration_adjusted = iteration - hparams.decay_start
                learning_rate = (hparams.A_*(e**(-iteration_adjusted/hparams.B_))) + hparams.C_
            learning_rate = max(hparams.min_learning_rate, learning_rate)
            logging.debug("epoch = %d", epoch)
            logging.debug("batch_num = %d", i)
            logging.debug("learning_rate = %f", learning_rate)

            # Push the scheduled learning rate into the optimizer.
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

            model.zero_grad()
            x, y = model.parse_batch(batch)
            y_pred = model(x)

            # Progress markers go to the log file only; printing them on every
            # iteration floods the cell output and breaks the progress bar.
            logging.debug("断点1")

            loss = criterion(y_pred, y)
            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()
            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if hparams.fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), hparams.grad_clip_thresh)
                is_overflow = math.isnan(grad_norm)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hparams.grad_clip_thresh)

            optimizer.step()
            logging.debug("断点2")
            if not is_overflow and rank == 0:
                duration = time.perf_counter() - start
                logger.log_training(
                    reduced_loss, grad_norm, learning_rate, duration, iteration)

            iteration += 1

        logging.debug("断点3")
        validate(model, criterion, valset, iteration,
                 hparams.batch_size, n_gpus, collate_fn, logger,
                 hparams.distributed_run, rank, epoch, start_eposh, learning_rate)
        save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path)
        if log_directory2 is not None:
            copy_tree(log_directory, log_directory2)


def check_dataset(hparams):
    """Print warnings about suspicious entries in the training and validation filelists.

    Each filelist entry is a list of fields, with the audio/mel path first and the
    text second. Nothing is raised for bad entries; every finding is printed.

    Args:
        hparams: provides ``training_files``, ``validation_files`` and
            ``load_mel_from_disk``.
    """
    from utils import load_filepaths_and_text
    import os
    import numpy as np

    def check_arr(filelist_arr):
        """Run all checks on one parsed filelist."""
        for file in filelist_arr:
            if len(file) < 2:
                # No text field at all (for example a blank line): report it and move
                # on instead of failing with an IndexError on file[1].
                print("|".join(file), "\n[WARNING] has no '|' separated text field.")
                continue
            path, text = file[0], file[1]

            if len(file) > 2:
                print("|".join(file), "\nhas multiple '|', this may not be an error.")
            if hparams.load_mel_from_disk and '.wav' in path:
                print("[WARNING]", path, " in filelist while expecting .npy .")
            elif not hparams.load_mel_from_disk and '.npy' in path:
                print("[WARNING]", path, " in filelist while expecting .wav .")

            exists = os.path.exists(path)
            if not exists:
                print("|".join(file), "\n[WARNING] does not exist.")
            if len(text) < 3:
                print("|".join(file), "\n[info] has no/very little text.")
            stripped = text.strip()
            # Empty text is already reported above; indexing it would raise IndexError.
            if stripped and stripped[-1] not in r"!?,.;:":
                print("|".join(file), "\n[info] has no ending punctuation.")

            mel_length = 1
            # Only open mel files that exist; a missing one was reported above.
            if exists and hparams.load_mel_from_disk and '.npy' in path:
                # NOTE(review): allow_pickle=True can execute code from a crafted file;
                # only use it on .npy files generated locally.
                melspec = torch.from_numpy(np.load(path, allow_pickle=True))
                mel_length = melspec.shape[1]
            if mel_length == 0:
                print("|".join(file), "\n[WARNING] has 0 duration.")

    print("Checking Training Files")
    audiopaths_and_text = load_filepaths_and_text(hparams.training_files) # get split lines from training_files text file.
    check_arr(audiopaths_and_text)
    print("Checking Validation Files")
    audiopaths_and_text = load_filepaths_and_text(hparams.validation_files) # get split lines from validation_files text file.
    check_arr(audiopaths_and_text)
    print("Finished Checking")

# ---- Run settings passed to train() ----
warm_start = False  # sorry about that
n_gpus = 1
rank = 0
group_name = None

# ---- Hyperparameters: start from the defaults, then override for this run ----
hparams = create_hparams()
model_filename = 'current_model'

# Filelists
hparams.training_files = "filelists/clipper_train_filelist.txt"
hparams.validation_files = "filelists/clipper_val_filelist.txt"

# Options not used in this notebook: use_mmi=True, use_gaf=True, max_gaf=0.5,
# drop_frame_rate=0.2

# Dropout
hparams.p_attention_dropout = 0.1
hparams.p_decoder_dropout = 0.1

# Learning-rate schedule used by train(): A_ until decay_start, then
# A_ * exp(-(iteration - decay_start) / B_) + C_, never below min_learning_rate.
hparams.decay_start = 15000
hparams.A_ = 5e-4
hparams.B_ = 8000
hparams.C_ = 0
hparams.min_learning_rate = 1e-5

# Data and training length
generate_mels = True
hparams.batch_size = 32
hparams.load_mel_from_disk = True
hparams.ignore_layers = []
hparams.epochs = 10000

# Alignment plots shown during validation (size in pixels, divided by 100 for figsize)
hparams.show_alignments = True
alignment_graph_height = 600
alignment_graph_width = 1000

# cuDNN switches
torch.backends.cudnn.enabled = hparams.cudnn_enabled
torch.backends.cudnn.benchmark = hparams.cudnn_benchmark

# Paths
output_directory = '/content/drive/MyDrive/colab/outdir'  # where checkpoints are saved
log_directory = '/content/tacotron2/logs'                 # local log directory
log_directory2 = '/content/drive/MyDrive/colab/logs'      # logs are copied here once per epoch to limit I/O
checkpoint_path = f"{output_directory}/{model_filename}"
