In [1]:
import os, sys, glob, shutil
import torch
import torch.nn as nn
import numpy as np
import argparse
import yaml

In [2]:
%cd "D:\Schoolwork\TERM 3\WORK\visual_prosody"

D:\Schoolwork\TERM 3\WORK\visual_prosody


In [3]:
from transformer.SubLayers import MultiHeadAttention, MultiHeadAttention_VariableQuery, PositionwiseFeedForward

from transformer.Layers import FFTBlock, VisualFFTBlock

from transformer.Models import VisualEncoder

from utils.tools import get_mask_from_lengths

In [4]:
parser = argparse.ArgumentParser()
parser.add_argument("--restore_step", type=int, default=0)
parser.add_argument(
    "-p",
    "--preprocess_config",
    type=str,
    required=True,
    help="path to preprocess.yaml",
)
parser.add_argument(
    "-m", "--model_config", type=str, required=True, help="path to model.yaml"
)
parser.add_argument(
    "-t", "--train_config", type=str, required=True, help="path to train.yaml"
)

argString = '-p ./config/Ego4D_final_v6/0726b_preprocess.yaml -m ./config/Ego4D_final_v6/0726b_model.yaml -t ./config/Ego4D_final_v6/0726b_train.yaml'
# args = parser.parse_args()
args = parser.parse_args(argString.split())

In [5]:
from utils.model import get_model, get_vocoder, get_param_num
from utils.tools import to_device, log, synth_one_sample
from model import FastSpeech2Loss
from dataset import Dataset, VideoDataset
from utils.auto_tqdm import tqdm
from evaluate import evaluate

from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
config = yaml.load(open(args.preprocess_config, "r"), Loader=yaml.FullLoader)

# Read Config
preprocess_config = yaml.load(
    open(args.preprocess_config, "r"), Loader=yaml.FullLoader
)
model_config = yaml.load(open(args.model_config, "r"), Loader=yaml.FullLoader)
train_config = yaml.load(open(args.train_config, "r"), Loader=yaml.FullLoader)
configs = (preprocess_config, model_config, train_config)

In [7]:
print("Prepare training ...")

preprocess_config, model_config, train_config = configs
# Get dataset
dataset = VideoDataset(
    "train.txt", 'train', preprocess_config, train_config, sort=True, drop_last=True
)

Prepare training ...


In [8]:
len(dataset)

27292

In [9]:
dataset.batch_size

24

In [10]:
batch_size = train_config["optimizer"]["batch_size"]
group_size = 4  # Set this larger than 1 to enable sorting in Dataset
assert batch_size * group_size < len(dataset)
loader = DataLoader(
    dataset,
    batch_size=batch_size * group_size,
    shuffle=True,
    collate_fn=dataset.collate_fn,
)

In [11]:
# return (
#     ids,
#     raw_texts,
#     speakers,
#     texts,
#     text_lens,
#     max(text_lens),
#     mels,
#     mel_lens,
#     max(mel_lens),
#     pitches,
#     energies,
#     durations,
#     speaker_embeddings,
#     video_embeddings,
#     vid_lens,
# )

In [12]:
# Prepare model
model, optimizer = get_model(args, configs, device, train=True)
model = nn.DataParallel(model)
num_param = get_param_num(model)
Loss = FastSpeech2Loss(preprocess_config, model_config).to(device)
print("Number of FastSpeech2 Parameters:", num_param)

# Load vocoder
vocoder = get_vocoder(model_config, device)

=> Using speaker embeddings.
=> Using VarianceAdaptorWithSpeaker.
=> Using VisualEncoder.
Using prosody vector for visual encoder.
Successfully loaded from ./output/LibriTTS/LibriTTS_800000.pth.tar
Number of FastSpeech2 Parameters: 55461233
Removing weight norm...


In [13]:
def log_gpu_usage():
    max_memory_allocated = torch.cuda.max_memory_allocated()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_properties = torch.cuda.get_device_properties(device)
    available_memory = device_properties.total_memory - torch.cuda.max_memory_allocated()
    message = ""
    message += f"Maximum GPU memory allocated by PyTorch: {max_memory_allocated / 1024**3:.2f} GB\n"
    message += f"Available GPU memory: {available_memory / 1024**3:.2f} GB\n"
    print(message)
    return message

In [14]:
# Training
step = args.restore_step + 1
epoch = 1
grad_acc_step = train_config["optimizer"]["grad_acc_step"]
grad_clip_thresh = train_config["optimizer"]["grad_clip_thresh"]
total_step = train_config["step"]["total_step"]
log_step = train_config["step"]["log_step"]
save_step = train_config["step"]["save_step"]
synth_step = train_config["step"]["synth_step"]
val_step = train_config["step"]["val_step"]

In [15]:
for data in loader:
    break
item = data[0]

In [16]:
pitch_target = item[9]

In [17]:
energy_target = item[10]

In [18]:
duration_target = item[11]

In [19]:
duration_target.shape

(24, 116)

In [20]:
def expand(values, durations):
    out = list()
    for value, d in zip(values, durations):
        out += [value] * max(0, int(d))
    return np.array(out)

In [21]:
duration_target

array([[ 9, 15, 10, ...,  2, 48, 12],
       [ 3,  3,  2, ...,  0,  0,  0],
       [ 9,  9,  4, ...,  0,  0,  0],
       ...,
       [ 5, 25,  9, ...,  0,  0,  0],
       [ 3,  6,  6, ...,  0,  0,  0],
       [ 9, 30,  8, ...,  0,  0,  0]], dtype=int64)

In [22]:
# pitch_extended = expand(pitch_target, duration_target)

In [23]:
# pitch_target energy_target duration_target的长度为max text len

In [24]:
text_lens = item[4]

In [25]:
text_lens

array([116,  98,  90,  75,  72,  64,  49,  48,  41,  36,  35,  35,  34,
        34,  33,  32,  32,  31,  30,  29,  28,  27,  27,  26])

In [26]:
(pitch_target!=0).sum(axis=1)

array([116,  98,  90,  75,  72,  64,  49,  48,  41,  36,  35,  35,  34,
        34,  33,  32,  32,  31,  30,  29,  28,  27,  27,  26])

In [27]:
pitch_target.shape

(24, 116)

In [28]:
# lstm_input = nn.utils.rnn.pack_padded_sequence(
#     p_predictions, 
#     lengths=text_lens, 
#     batch_first=True
# )

In [29]:
for batch_idx, batchs in enumerate(loader):
    for batch in batchs:
        batch = to_device(batch, device)
        temp_batch = (batch[2:])
        output = model(*(batch[2:]))
    if batch_idx >= 0:
        break

delta
delta
delta
delta


In [30]:
(
    output,
    postnet_output,
    p_predictions,
    e_predictions,
    log_d_predictions,
    d_rounded,
    src_masks,
    mel_masks,
    src_lens,
    mel_lens,
) = output

In [31]:
src_lens

tensor([5, 5, 5, 5, 5, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2],
       device='cuda:0', dtype=torch.int32)

In [32]:
p_predictions.shape

torch.Size([24, 5])

In [33]:
e_predictions.shape

torch.Size([24, 5])

In [34]:
pitch = p_predictions[0, :src_len].detach().cpu().numpy()

NameError: name 'src_len' is not defined

In [None]:
src_lens

In [None]:
lstm_input = nn.utils.rnn.pack_padded_sequence(
    p_predictions, 
    lengths=src_lens.cpu(), 
    batch_first=True
)

In [None]:
p_predictions.shape

In [None]:
src_lens

In [None]:
log_d_predictions.shape

In [None]:
log_d_predictions

In [None]:
d_rounded

In [None]:
_ = log_gpu_usage()