In [2]:
import os 
import torch
import numpy as np
os.chdir("/workspace/")
from torch.utils.tensorboard import SummaryWriter
from os.path import join as pjoin
from torch.distributions import Categorical
import json
import clip
import random
# import options.option_transformer as option_trans
import models.vqvae as vqvae
import utils.utils_model as utils_model
import utils.eval_trans as eval_trans
from dataset import dataset_TM_train
from dataset import dataset_TM_eval
from dataset import dataset_tokenize
import models.t2m_trans as trans
from options.get_eval_option import get_opt
from models.evaluator_wrapper import EvaluatorModelWrapper
import warnings
warnings.filterwarnings('ignore')


In [3]:
import easydict

# Single configuration object shared by every cell below (attribute access via EasyDict).
# Group comments say where each key is read in the visible cells; keys marked
# "not referenced" are presumably consumed inside project modules that receive `args`
# (e.g. the VQ-VAE constructor) -- TODO confirm.
args = easydict.EasyDict({
    # --- data ---
    # 'kit' selects the KIT-ML paths; any other value selects the HumanML3D / t2m paths.
    'dataname': 't2m',
    'batch_size': 64,   # passed to the training DATALoader
    'fps': [20],        # not referenced in the visible cells
    'seq_len': 64,      # not referenced in the visible cells
    # --- optimisation ---
    'total_iter': 300000,      # training loop stops once nb_iter reaches this value
    'warm_up_iter': 1000,      # not referenced in the visible cells
    'lr': 0.0001,
    'lr_scheduler': [150000],  # MultiStepLR milestones (scheduler is stepped once per iteration)
    'gamma': 0.05,             # MultiStepLR decay factor
    'weight_decay': 1e-6,
    'decay_option': 'all',     # forwarded to utils_model.initial_optim
    'optimizer': 'adamw',      # forwarded to utils_model.initial_optim
    # --- VQ-VAE architecture (passed to vqvae.HumanVQVAE) ---
    'code_dim': 512,
    'nb_code': 512,            # also the exclusive upper bound for random replacement tokens
    'mu': 0.99,                # not referenced in the visible cells
    'down_t': 2,               # unit_length for the loaders is 2**down_t
    'stride_t': 2,
    'width': 512,
    'depth': 3,
    'dilation_growth_rate': 3,
    'output_emb_width': 512,
    'vq_act': 'relu',          # not referenced in the visible cells
    # --- transformer architecture (passed to trans.Text2Motion_Transformer) ---
    'block_size': 51,
    'embed_dim_gpt': 1024,
    'clip_dim': 512,
    'num_layers': 12,
    'n_head_gpt': 16,
    'ff_rate': 4,
    'drop_out_rate': 0.1,
    # --- quantizer (not referenced in the visible cells) ---
    'quantizer': 'ema_reset',
    'quantbeta': 1.0,
    # --- checkpoints and output locations ---
    'resume_pth': "output_vqfinal/VQ-VAE/eval/net_last.pth",  # VQ-VAE weights, loaded strictly
    'resume_trans': None,      # optional transformer checkpoint; None means train from scratch
    'out_dir': 'output_GPT_Final/',
    'exp_name': 'pkeep_12layer_0.5/',  # appended to out_dir by a later cell
    'vq_name': 'VQ-VAE',       # sub-directory of the dataset root where token files are written
    # --- logging / evaluation cadence (in iterations) ---
    'print_iter': 200,
    'eval_iter': 10000,
    # --- reproducibility and sampling ---
    'seed': 123,
    'if_maxtest': False,       # True: argmax for the training-accuracy estimate; False: sample
    # Probability of keeping each ground-truth input token; -1 draws a fresh
    # keep-probability uniformly at random on every iteration.
    'pkeep': 0.5
})

In [4]:
# Seed every RNG this notebook can draw from, not only torch's:
#   - `np.random` is used by the training loop when args.pkeep == -1,
#   - `random` is imported above and may be used by the dataset code (not visible here).
# torch.manual_seed is kept as the last expression so the cell still displays the
# torch Generator, exactly as before.
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)

<torch._C.Generator at 0x7fa9aa5f2e10>

In [5]:
# Resolve the run-specific output directory and the directory that will hold the
# tokenised motions, then make sure both exist.
# NOTE: args.out_dir is overwritten in place, so running this cell twice without
# re-running the config cell appends exp_name a second time.
args.out_dir = pjoin(args.out_dir, str(args.exp_name))
if args.dataname == 'kit':
    args.vq_dir = pjoin("./dataset/KIT-ML", str(args.vq_name))
else:
    args.vq_dir = pjoin("./dataset/HumanML3D", str(args.vq_name))
os.makedirs(args.vq_dir, exist_ok=True) if os.path.isdir(args.out_dir) else os.makedirs(args.out_dir, exist_ok=True)
os.makedirs(args.out_dir, exist_ok=True)
os.makedirs(args.vq_dir, exist_ok=True)

In [6]:
# Create the run logger and the TensorBoard writer; both are given the run's
# output directory.
logger = utils_model.get_logger(args.out_dir)
writer = SummaryWriter(args.out_dir)
# Record the complete configuration (keys sorted, pretty-printed) at the top of the log
# so the run can be reconstructed from the log alone.
logger.info(json.dumps(vars(args), indent=4, sort_keys=True))

2023-07-18 12:42:05,500 INFO {
    "batch_size": 64,
    "block_size": 51,
    "clip_dim": 512,
    "code_dim": 512,
    "dataname": "t2m",
    "decay_option": "all",
    "depth": 3,
    "dilation_growth_rate": 3,
    "down_t": 2,
    "drop_out_rate": 0.1,
    "embed_dim_gpt": 1024,
    "eval_iter": 10000,
    "exp_name": "pkeep_12layer_0.5/",
    "ff_rate": 4,
    "fps": [
        20
    ],
    "gamma": 0.05,
    "if_maxtest": false,
    "lr": 0.0001,
    "lr_scheduler": [
        150000
    ],
    "mu": 0.99,
    "n_head_gpt": 16,
    "nb_code": 512,
    "num_layers": 12,
    "optimizer": "adamw",
    "out_dir": "output_GPT_Final/pkeep_12layer_0.5/",
    "output_emb_width": 512,
    "pkeep": 0.5,
    "print_iter": 200,
    "quantbeta": 1.0,
    "quantizer": "ema_reset",
    "resume_pth": "output_vqfinal/VQ-VAE/eval/net_last.pth",
    "resume_trans": null,
    "seed": 123,
    "seq_len": 64,
    "stride_t": 2,
    "total_iter": 300000,
    "vq_act": "relu",
    "vq_dir": "./dataset/Hu

In [7]:
# Loader consumed only by the tokenisation pass further down, which saves one file
# per batch under name[0]. The second positional argument (1) is presumably the
# batch size -- TODO confirm against dataset_tokenize.DATALoader.
# unit_length is 2**down_t, i.e. 4 with the current config.
train_loader_token = dataset_tokenize.DATALoader(args.dataname, 1, unit_length=2**args.down_t)

100%|██████████| 23384/23384 [00:16<00:00, 1422.87it/s]


In [8]:
from utils.word_vectorizer import WordVectorizer
# Word vectorizer built from the './glove' directory with the 'our_vab' prefix;
# it is handed to the evaluation loader below.
w_vectorizer = WordVectorizer('./glove', 'our_vab')
# Validation loader passed to eval_trans.evaluation_transformer during training.
# The positional arguments `False` and `32` are presumably an is-test flag and the
# batch size -- TODO confirm against dataset_TM_eval.DATALoader.
val_loader = dataset_TM_eval.DATALoader(args.dataname, False, 32, w_vectorizer)

100%|██████████| 1460/1460 [00:01<00:00, 1349.12it/s]

Pointer Pointing at 0





In [9]:
# Pick the evaluator option file that matches the dataset: the KIT-ML one for
# 'kit', the t2m one for every other dataname.
if args.dataname == 'kit':
    dataset_opt_path = 'checkpoints/kit/Comp_v6_KLD005/opt.txt'
else:
    dataset_opt_path = 'checkpoints/t2m/Comp_v6_KLD005/opt.txt'

In [10]:
# Parse the evaluator's option file for the CUDA device, then build the evaluation
# model wrapper from it. `eval_wrapper` is passed to every
# eval_trans.evaluation_transformer call in the training cell.
wrapper_opt = get_opt(dataset_opt_path, torch.device('cuda'))
eval_wrapper = EvaluatorModelWrapper(wrapper_opt)

Reading checkpoints/t2m/Comp_v6_KLD005/opt.txt
Loading Evaluation Model Wrapper (Epoch 28) Completed!!


In [11]:
# Load the CLIP ViT-B/32 model on the GPU. The original author notes that jit must
# be disabled for training.
clip_model, clip_preprocess = clip.load(
    "ViT-B/32",
    device=torch.device('cuda'),
    jit=False,
)
# Per the original author's note this conversion is redundant because CLIP is
# already in float16 by default; it is kept so behaviour is unchanged.
clip.model.convert_weights(clip_model)

# CLIP is used as a fixed text encoder: switch to eval mode and freeze every parameter.
clip_model.eval()
for param in clip_model.parameters():
    param.requires_grad = False

In [12]:
# Build the motion VQ-VAE. Its weights are loaded from args.resume_pth in a later cell.
net = vqvae.HumanVQVAE(
    args,  # whole config is forwarded so each quantizer variant can read its own parameters
    args.nb_code,
    args.code_dim,
    args.output_emb_width,
    args.down_t,
    args.stride_t,
    args.width,
    args.depth,
    args.dilation_growth_rate,
)


In [13]:
# Text-conditioned transformer that is trained in this notebook: it receives motion
# code indices plus a CLIP text feature and outputs per-position class scores.
trans_encoder = trans.Text2Motion_Transformer(
    num_vq=args.nb_code,
    embed_dim=args.embed_dim_gpt,
    clip_dim=args.clip_dim,
    block_size=args.block_size,
    num_layers=args.num_layers,
    n_head=args.n_head_gpt,
    drop_out_rate=args.drop_out_rate,
    fc_rate=args.ff_rate,
)


In [14]:
# Restore the VQ-VAE weights and put the model on the GPU in eval mode; it is used
# for encoding and evaluation only, and the optimizer below is built from
# trans_encoder alone.
print ('loading checkpoint from {}'.format(args.resume_pth))
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a trusted source.
# The checkpoint is mapped to CPU first; the model is moved to the GPU afterwards.
ckpt = torch.load(args.resume_pth, map_location='cpu')
# strict=True: any missing or unexpected key in the checkpoint raises instead of being ignored.
net.load_state_dict(ckpt['net'], strict=True)
net.eval()
# Last expression of the cell, so the notebook displays the module structure.
net.cuda()

loading checkpoint from output_vqfinal/VQ-VAE/eval/net_last.pth


HumanVQVAE(
  (vqvae): VQVAE_251(
    (encoder): Encoder(
      (model): Sequential(
        (0): Conv1d(263, 512, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): ReLU()
        (2): Sequential(
          (0): Conv1d(512, 512, kernel_size=(4,), stride=(2,), padding=(1,))
          (1): Resnet1D(
            (model): Sequential(
              (0): ResConv1DBlock(
                (norm1): Identity()
                (norm2): Identity()
                (activation1): ReLU()
                (activation2): ReLU()
                (conv1): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,))
                (conv2): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
              )
              (1): ResConv1DBlock(
                (norm1): Identity()
                (norm2): Identity()
                (activation1): ReLU()
                (activation2): ReLU()
                (conv1): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
 

In [15]:
# Optionally resume the transformer from a checkpoint; with resume_trans=None (the
# current config) the freshly initialised weights are kept.
if args.resume_trans is not None:
    print ('loading transformer checkpoint from {}'.format(args.resume_trans))
    # NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
    # This rebinds the global `ckpt` that earlier held the VQ-VAE checkpoint.
    ckpt = torch.load(args.resume_trans, map_location='cpu')
    trans_encoder.load_state_dict(ckpt['trans'], strict=True)
# Training mode on the GPU in either case.
trans_encoder.train()
# Last expression of the cell, so the notebook displays the module structure.
trans_encoder.cuda()

Text2Motion_Transformer(
  (trans_base): CrossCondTransBase(
    (tok_emb): Embedding(514, 1024)
    (cond_emb): Linear(in_features=512, out_features=1024, bias=True)
    (pos_embedding): Embedding(51, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (blocks): Sequential(
      (0): Block(
        (ln1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (ln2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): CausalCrossConditionalSelfAttention(
          (key): Linear(in_features=1024, out_features=1024, bias=True)
          (query): Linear(in_features=1024, out_features=1024, bias=True)
          (value): Linear(in_features=1024, out_features=1024, bias=True)
          (attn_drop): Dropout(p=0.1, inplace=False)
          (resid_drop): Dropout(p=0.1, inplace=False)
          (proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (mlp): Sequential(
          (0): Linear(in_features=1024, out_features=4096, bias=True)
      

In [16]:
# Build the optimizer for the transformer only (the VQ-VAE and CLIP are not passed in).
# How decay_option selects the weight-decayed parameters is decided inside
# utils_model.initial_optim, which is not visible here.
optimizer = utils_model.initial_optim(args.decay_option, args.lr, args.weight_decay, trans_encoder, args.optimizer)
# Multiply the learning rate by args.gamma at each milestone. The training loop calls
# scheduler.step() once per iteration, so the milestones are iteration counts.
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lr_scheduler, gamma=args.gamma)

In [17]:
##### ---- Optimization goals ---- #####
# Cross-entropy over the transformer's per-position code predictions.
loss_ce = torch.nn.CrossEntropyLoss()

# Running statistics for the training loop. The loss sum, the correct-prediction
# count and the token count are reset every args.print_iter iterations; nb_iter is not.
nb_iter = 0
avg_loss_cls = 0.
avg_acc = 0.
right_num, nb_sample_train = 0, 0


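Get motion codes: encode every training motion with the VQ-VAE and save the resulting codes as one .npy file per motion in args.vq_dir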

In [17]:
# Tokenisation pass: encode every training motion with the VQ-VAE and cache the
# result as <args.vq_dir>/<name>.npy.
#
# Changes from the original cell:
#   - runs under torch.no_grad(), since this is pure inference and no autograd
#     state needs to be recorded while encoding;
#   - the unused locals `bs` and `seq` are no longer assigned.
#
# Only name[0] is used for the file name, so this relies on the loader yielding one
# motion per batch (it is constructed with `1` as its second argument -- presumably
# the batch size; confirm against dataset_tokenize.DATALoader).
with torch.no_grad():
    for pose, name in train_loader_token:
        pose = pose.cuda().float()
        target = net.encode(pose)
        target = target.cpu().numpy()
        np.save(pjoin(args.vq_dir, name[0] + '.npy'), target)




In [18]:
# Training loader. It is given vq_name, the sub-directory the tokenisation pass wrote
# to, so it presumably reads those cached .npy token files -- TODO confirm against
# dataset_TM_train.DATALoader. Each batch unpacks to (text, motion tokens, token
# lengths) in the training cell.
train_loader = dataset_TM_train.DATALoader(args.dataname, args.batch_size, args.nb_code, args.vq_name, unit_length=2**args.down_t)
# Wrap the loader so the training loop can pull batches with next(); presumably it
# restarts the loader when it is exhausted -- TODO confirm against dataset_TM_train.cycle.
train_loader_iter = dataset_TM_train.cycle(train_loader)

100%|██████████| 23384/23384 [00:08<00:00, 2878.80it/s]


In [19]:
# Baseline evaluation at iteration 0, before any training step. The "best so far"
# trackers are seeded with sentinel values (large for FID / diversity / matching,
# zero for the top-k scores); every later evaluation call receives and returns them.
best_fid, best_iter, best_div, best_top1, best_top2, best_top3, best_matching, writer, logger = eval_trans.evaluation_transformer(args.out_dir, val_loader, net, trans_encoder, logger, writer, 0, best_fid=1000, best_iter=0, best_div=100, best_top1=0, best_top2=0, best_top3=0, best_matching=100, clip_model=clip_model, eval_wrapper=eval_wrapper, savegif=False)
# Main optimisation loop. nb_iter and the running statistics are initialised in an
# earlier cell, so re-running only this cell continues from their current values.
# The loop is left through the `break` at the bottom once nb_iter == total_iter.
while nb_iter <= args.total_iter:
    
    batch = next(train_loader_iter)

    # batch = (text, motion tokens, token lengths); author's shape note for the
    # tokens is (bs, 51). The original notes use bs = 128 while the current config
    # sets batch_size = 64.
    clip_text, m_tokens, m_tokens_len = batch
    m_tokens, m_tokens_len = m_tokens.cuda(), m_tokens_len.cuda()


    bs = m_tokens.shape[0] # batch size
    target = m_tokens      # (bs, 51) per the author's note
    target = target.cuda()
    
    # Tokenise the text for CLIP; truncate=True clips over-long text instead of raising.
    text = clip.tokenize(clip_text, truncate=True).cuda()  # (bs, 77) per the author's note
    
    # CLIP text feature, cast to float32 for the transformer.
    feat_clip_text = clip_model.encode_text(text).float()  # (bs, 512) per the author's note

    # Transformer input: every token except the last one.
    input_index = target[:,:-1] # (bs, 50) per the author's note

    # Input corruption: each input token is kept with some probability and otherwise
    # replaced by a random code index.
    if args.pkeep == -1:
        # pkeep == -1: draw a fresh keep-probability uniformly from [0, 1) each iteration.
        proba = np.random.rand(1)[0]
        mask = torch.bernoulli(proba * torch.ones(input_index.shape,
                                                         device=input_index.device))
    else:
        # Draw a 0/1 mask from a Bernoulli distribution:
        # each position keeps its correct code index with probability pkeep.
        # Mask shape matches input_index.
        mask = torch.bernoulli(args.pkeep * torch.ones(input_index.shape,
                                                         device=input_index.device))
    mask = mask.round().to(dtype=torch.int64)

    # Random replacement indices in [0, nb_code), same shape as input_index.
    r_indices = torch.randint_like(input_index, args.nb_code)

    # Where mask is 1 keep the value from input_index, where it is 0 use r_indices
    # -> this has the effect of injecting noise into the input.
    a_indices = mask*input_index+(1-mask)*r_indices

    cls_pred = trans_encoder(a_indices, feat_clip_text)
    cls_pred = cls_pred.contiguous() # (bs, 51, 513) per the author's note

    # Per-sample loss and accuracy over the first m_tokens_len[i] + 1 positions only.
    # NOTE(review): the +1 presumably covers an end-of-motion token -- TODO confirm.
    loss_cls = 0.0
    for i in range(bs):
        # Cross-entropy between the sliced logits and the sliced targets for sample i;
        # dividing by bs makes loss_cls the mean of the per-sample losses.
        loss_cls += loss_ce(cls_pred[i][:m_tokens_len[i] + 1], target[i][:m_tokens_len[i] + 1]) / bs

        # Accuracy
        probs = torch.softmax(cls_pred[i][:m_tokens_len[i] + 1], dim=-1)
        # probs: probability of each class at every position

        if args.if_maxtest:
            # Deterministic estimate: take the most likely class.
            _, cls_pred_index = torch.max(probs, dim=-1)
            

        else:
            # Sample from the predicted distribution instead of taking the maximum,
            # so the reported training accuracy is itself stochastic.
            dist = Categorical(probs)
            cls_pred_index = dist.sample()
        right_num += (cls_pred_index.flatten(0) == target[i][:m_tokens_len[i] + 1].flatten(0)).sum().item()

    # Optimisation step on the batch loss; the LR scheduler is stepped every iteration.
    optimizer.zero_grad()
    loss_cls.backward()
    optimizer.step()
    scheduler.step()

    # Accumulate statistics for the next log line.
    avg_loss_cls = avg_loss_cls + loss_cls.item()
    nb_sample_train = nb_sample_train + (m_tokens_len + 1).sum().item()

    nb_iter += 1
    # Periodic logging: mean loss over the window and accuracy in percent over all
    # tokens scored in the window; the window accumulators are then reset.
    if nb_iter % args.print_iter ==  0 :
        avg_loss_cls = avg_loss_cls / args.print_iter
        avg_acc = right_num * 100 / nb_sample_train
        writer.add_scalar('./Loss/train', avg_loss_cls, nb_iter)
        writer.add_scalar('./ACC/train', avg_acc, nb_iter)
        msg = f"Train. Iter {nb_iter} : Loss. {avg_loss_cls:.5f}, ACC. {avg_acc:.4f}"
        logger.info(msg)
        avg_loss_cls = 0.
        right_num = 0
        nb_sample_train = 0

    # Periodic evaluation; the call returns the updated best-metric trackers.
    if nb_iter % args.eval_iter ==  0:
        best_fid, best_iter, best_div, best_top1, best_top2, best_top3, best_matching, writer, logger = eval_trans.evaluation_transformer(args.out_dir, val_loader, net, trans_encoder, logger, writer, nb_iter, best_fid, best_iter, best_div, best_top1, best_top2, best_top3, best_matching, clip_model=clip_model, eval_wrapper=eval_wrapper)

    # Final summary: log the best metrics seen (with the iteration they came from) and stop.
    if nb_iter == args.total_iter: 
        msg_final = f"Train. Iter {best_iter} : FID. {best_fid:.5f}, Diversity. {best_div:.4f}, TOP1. {best_top1:.4f}, TOP2. {best_top2:.4f}, TOP3. {best_top3:.4f}"
        logger.info(msg_final)
        break

2023-07-18 12:49:18,103 INFO --> 	 Eva. Iter 0 :, FID. 104.9955, Diversity Real. 9.5495, Diversity. 0.6463, R_precision_real. [0.52327128 0.71742021 0.80452128], R_precision. [0.03257979 0.06117021 0.08909574], matching_score_real. 2.8916438538977443, matching_score_pred. 10.524096083133779
moviepy is installed, but can't import moviepy.editor. Some packages could be missing [imageio, requests]
moviepy is installed, but can't import moviepy.editor. Some packages could be missing [imageio, requests]
moviepy is installed, but can't import moviepy.editor. Some packages could be missing [imageio, requests]
moviepy is installed, but can't import moviepy.editor. Some packages could be missing [imageio, requests]
moviepy is installed, but can't import moviepy.editor. Some packages could be missing [imageio, requests]
moviepy is installed, but can't import moviepy.editor. Some packages could be missing [imageio, requests]
moviepy is installed, but can't import moviepy.editor. Some packages cou