In [1]:
import os, sys, glob, shutil
import torch
import torch.nn as nn
import numpy as np
import argparse
import yaml

In [2]:
%cd "D:\Schoolwork\TERM 3\WORK\visual_prosody"

D:\Schoolwork\TERM 3\WORK\visual_prosody


In [3]:
from transformer.SubLayers import MultiHeadAttention, MultiHeadAttention_VariableQuery, PositionwiseFeedForward

from transformer.Layers import FFTBlock, VisualFFTBlock

from transformer.Models import VisualEncoder

In [4]:
# padded video_embeddings.shape: (batch_size, batch_max_seq_len, 1536)

In [5]:
from utils.tools import get_mask_from_lengths

In [6]:
parser = argparse.ArgumentParser()
parser.add_argument("--restore_step", type=int, default=0)
parser.add_argument(
    "-p",
    "--preprocess_config",
    type=str,
    required=True,
    help="path to preprocess.yaml",
)
parser.add_argument(
    "-m", "--model_config", type=str, required=True, help="path to model.yaml"
)
parser.add_argument(
    "-t", "--train_config", type=str, required=True, help="path to train.yaml"
)

argString = '-p ./config/Ego4D_final_v6/0719a_preprocess.yaml -m ./config/Ego4D_final_v6/0719a_model.yaml -t ./config/Ego4D_final_v6/0719a_train.yaml'
# args = parser.parse_args()
args = parser.parse_args(argString.split())

In [7]:
from utils.model import get_model, get_vocoder, get_param_num
from utils.tools import to_device, log, synth_one_sample
from model import FastSpeech2Loss
from dataset import Dataset, VideoDataset
from utils.auto_tqdm import tqdm
from evaluate import evaluate

from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
config = yaml.load(open(args.preprocess_config, "r"), Loader=yaml.FullLoader)

# Read Config
preprocess_config = yaml.load(
    open(args.preprocess_config, "r"), Loader=yaml.FullLoader
)
model_config = yaml.load(open(args.model_config, "r"), Loader=yaml.FullLoader)
train_config = yaml.load(open(args.train_config, "r"), Loader=yaml.FullLoader)
configs = (preprocess_config, model_config, train_config)

In [9]:
print("Prepare training ...")

preprocess_config, model_config, train_config = configs
# Get dataset
dataset = VideoDataset(
    "train.txt", 'train', preprocess_config, train_config, sort=True, drop_last=True
)

Prepare training ...


In [10]:
len(dataset)

27292

In [11]:
dataset.batch_size

24

In [12]:
batch_size = train_config["optimizer"]["batch_size"]
group_size = 4  # Set this larger than 1 to enable sorting in Dataset
assert batch_size * group_size < len(dataset)
loader = DataLoader(
    dataset,
    batch_size=batch_size * group_size,
    shuffle=True,
    collate_fn=dataset.collate_fn,
)

In [13]:
# return (
#     ids,
#     raw_texts,
#     speakers,
#     texts,
#     text_lens,
#     max(text_lens),
#     mels,
#     mel_lens,
#     max(mel_lens),
#     pitches,
#     energies,
#     durations,
#     speaker_embeddings,
#     video_embeddings,
#     vid_lens,
# )

In [14]:
for data in loader:
    break

In [15]:
item = data[0]

In [16]:
item[-2].shape

(24, 27, 1536)

In [17]:
# Prepare model
model, optimizer = get_model(args, configs, device, train=True)
model = nn.DataParallel(model)
num_param = get_param_num(model)
Loss = FastSpeech2Loss(preprocess_config, model_config).to(device)
print("Number of FastSpeech2 Parameters:", num_param)

# Load vocoder
vocoder = get_vocoder(model_config, device)

=> Using speaker embeddings.
=> Using VarianceAdaptorWithSpeaker.
=> Using VisualEncoder.
Using prosody vector for visual encoder.
Successfully loaded from ./output/LibriTTS/LibriTTS_800000.pth.tar
Number of FastSpeech2 Parameters: 56828529
Removing weight norm...


In [18]:
# Training
step = args.restore_step + 1
epoch = 1
grad_acc_step = train_config["optimizer"]["grad_acc_step"]
grad_clip_thresh = train_config["optimizer"]["grad_clip_thresh"]
total_step = train_config["step"]["total_step"]
log_step = train_config["step"]["log_step"]
save_step = train_config["step"]["save_step"]
synth_step = train_config["step"]["synth_step"]
val_step = train_config["step"]["val_step"]

In [19]:
for batch_idx, batchs in enumerate(loader):
    for batch in batchs:
        batch = to_device(batch, device)
        temp_batch = (batch[2:])
        output = model(*(batch[2:]))
    if batch_idx >=2:
        break

In [20]:
def log_gpu_usage():
    max_memory_allocated = torch.cuda.max_memory_allocated()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_properties = torch.cuda.get_device_properties(device)
    available_memory = device_properties.total_memory - torch.cuda.max_memory_allocated()
    message = ""
    message += f"Maximum GPU memory allocated by PyTorch: {max_memory_allocated / 1024**3:.2f} GB\n"
    message += f"Available GPU memory: {available_memory / 1024**3:.2f} GB\n"
    print(message)
    return message

In [21]:
_ = log_gpu_usage()

Maximum GPU memory allocated by PyTorch: 4.98 GB
Available GPU memory: 1.02 GB



In [20]:
visual_out = torch.randn([4, 256])

In [21]:
torch.unsqueeze(visual_out, dim=1).shape

torch.Size([4, 1, 256])

In [22]:
visual_out = torch.randn([4, 36, 1536])

In [23]:
vid_lens = torch.tensor([4, 36, 10, 2])

In [24]:
avg_pool_x = visual_out.sum(dim=1) / vid_lens.reshape(-1, 1)

In [25]:
max_pool_x, _ = visual_out.max(dim=1)

In [26]:
avg_pool_x.shape

torch.Size([4, 1536])

In [27]:
max_pool_x.shape

torch.Size([4, 1536])

In [28]:
avgmax_x = torch.cat([avg_pool_x, max_pool_x], dim=1)

In [29]:
avgmax_x.shape

torch.Size([4, 3072])

In [30]:
print(avgmax_x.mean(dim=1))
print(avgmax_x.std(dim=1))
print(avgmax_x.max(dim=1)[0])
print(avgmax_x.min(dim=1)[0])

tensor([1.0737, 1.0542, 1.0609, 1.0389])
tensor([1.5047, 1.1141, 1.2054, 2.3611])
tensor([5.1919, 3.8772, 4.3998, 8.7978])
tensor([ -4.6305,  -0.5015,  -2.3392, -11.1474])


In [31]:
ln = nn.LayerNorm(2*model_config["transformer"]["visual_encoder_hidden"])

In [32]:
avgmax_x = ln(avgmax_x)

In [33]:
avgmax_x.shape

torch.Size([4, 3072])

In [34]:
print(avgmax_x.mean(dim=1))
print(avgmax_x.std(dim=1))
print(avgmax_x.max(dim=1)[0])
print(avgmax_x.min(dim=1)[0])

tensor([ 4.5324e-08, -5.4017e-08,  2.0800e-08,  2.2973e-08],
       grad_fn=<MeanBackward1>)
tensor([1.0002, 1.0002, 1.0002, 1.0002], grad_fn=<StdBackward0>)
tensor([2.7373, 2.5342, 2.7703, 3.2867], grad_fn=<MaxBackward0>)
tensor([-3.7915, -1.3967, -2.8211, -5.1621], grad_fn=<MinBackward0>)


In [35]:
avgmax_x.shape

torch.Size([4, 3072])

In [38]:
vid_lens = torch.tensor([4, 8, 10, 2])
vid_max_len = vid_lens.max().item()
vid_mask = get_mask_from_lengths(vid_lens.to("cuda"), max_len=vid_max_len)
vid_mask = vid_mask.to('cpu')

In [39]:
BATCH_SIZE = 4
BATCH_SEQ_LEN = vid_max_len
EMBED_DIM = 1536
QUERY_DIM = 128

In [40]:
vid_embeddings = torch.randn(BATCH_SIZE, BATCH_SEQ_LEN, EMBED_DIM)

In [41]:
vid_embeddings.shape

torch.Size([4, 10, 1536])

In [42]:
vid_mask.shape

torch.Size([4, 10])

In [43]:
n_head = 2
d_model = EMBED_DIM
d_k = d_v = EMBED_DIM // n_head
dropout = 0.2


slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)

In [44]:
slf_attn_mask = vid_mask.unsqueeze(1).expand(-1, vid_max_len, -1)

In [45]:
enc_output, enc_slf_attn = slf_attn(
    q=vid_embeddings, 
    k=vid_embeddings, 
    v=vid_embeddings, 
    mask=slf_attn_mask,
)

In [46]:
vid_attn = MultiHeadAttention_VariableQuery(
    n_head=n_head, 
    d_model=d_model, 
    d_k=d_k, 
    d_v=d_v, 
    q_input_dim=QUERY_DIM,
    dropout=dropout)

In [47]:
vid_query = torch.randn(BATCH_SIZE, 1, QUERY_DIM)

In [48]:
vid_query.shape

torch.Size([4, 1, 128])

In [49]:
vid_query.repeat(1, 10, 1).shape

torch.Size([4, 10, 128])

In [50]:
vid_query.expand(-1, 10, -1).shape

torch.Size([4, 10, 128])

In [51]:
vid_query_repeated = vid_query.repeat(1, vid_max_len, 1)

In [52]:
vid_query_expanded = vid_query.expand(-1, 10, -1)

In [53]:
slf_attn_mask = vid_mask.unsqueeze(1).expand(-1, vid_max_len, -1)

enc_output, enc_slf_attn = vid_attn(
    q=vid_query_expanded, 
    k=vid_embeddings, 
    v=vid_embeddings, 
    mask=slf_attn_mask,
)

In [54]:
d_inner = 512
kernel_size = [5, 1]

In [55]:
fft_block = FFTBlock(
    n_head=n_head, 
    d_model=d_model, 
    d_k=d_k, 
    d_v=d_v, 
    # q_input_dim=QUERY_DIM,
    d_inner=d_inner, 
    kernel_size=kernel_size, 
    dropout=dropout,
)

In [56]:
out, _ = fft_block(vid_embeddings, mask=vid_mask, slf_attn_mask=slf_attn_mask)

In [57]:
out.shape

torch.Size([4, 10, 1536])

In [58]:
vid_block = VisualFFTBlock(
    n_head=n_head, 
    d_model=d_model, 
    d_k=d_k, 
    d_v=d_v, 
    d_inner=d_inner, 
    kernel_size=kernel_size, 
    q_input_dim=QUERY_DIM,
    dropout=dropout,
)

In [59]:
out, _ = vid_block(vid_embeddings, 
                   q_vec_expanded=vid_query_expanded,
                   mask=vid_mask, 
                   slf_attn_mask=slf_attn_mask)

In [60]:
visual_encoder = VisualEncoder(model_config)

Using self attention for visual encoder.


In [61]:
vid_query.shape

torch.Size([4, 1, 128])

In [62]:
q_vecs = torch.randn(BATCH_SIZE, QUERY_DIM)

In [63]:
# out = visual_encoder(vid_embeddings, vid_mask, q_vecs)