In [1]:
# Switch to the project root before anything else: the checkpoint, glove and
# output locations used in later cells are all given as relative paths.
# NOTE(review): "/workspace/" is a hard-coded absolute path — adjust it (or make
# it configurable) when running outside that environment. Presumably the chdir
# is also what lets the project-local packages below be imported — confirm.
import os 
os.chdir("/workspace/")
import torch
import numpy as np
from torch.utils.tensorboard import SummaryWriter
import json
import clip

# Project-local modules (VQ-VAE, transformer, evaluation helpers, dataset).
# import options.option_transformer as option_trans
import models.vqvae as vqvae
import utils.utils_model as utils_model
import utils.eval_trans as eval_trans
from dataset import dataset_TM_eval
import models.t2m_trans as trans
from options.get_eval_option import get_opt
from models.evaluator_wrapper import EvaluatorModelWrapper
import warnings
import random
# Suppress every Python warning for the rest of the session to keep cell
# outputs short; remove this line when debugging.
warnings.filterwarnings('ignore')


In [2]:
import easydict

# Evaluation configuration, stored as an attribute-accessible dict so the
# later cells can read `args.<name>`. Keys, values and key order are unchanged.
args = easydict.EasyDict(dict(
    # -- dataset / optimisation options; among these only `dataname` is read
    #    by the cells visible in this notebook
    dataname='t2m',
    batch_size=128,
    fps=[20],
    seq_len=64,
    total_iter=300000,
    warm_up_iter=1000,
    lr=0.0001,
    lr_scheduler=[150000],
    gamma=0.05,
    weight_decay=1e-6,
    decay_option='all',
    optimizer='adamw',
    # -- VQ-VAE options: several are passed explicitly to vqvae.HumanVQVAE in
    #    a later cell, which also receives this whole namespace
    code_dim=512,
    nb_code=512,
    mu=0.99,
    down_t=2,
    stride_t=2,
    width=512,
    depth=3,
    dilation_growth_rate=3,
    output_emb_width=512,
    vq_act='relu',
    # -- transformer options handed to trans.Text2Motion_Transformer
    block_size=51,
    embed_dim_gpt=1024,
    clip_dim=512,
    num_layers=12,
    n_head_gpt=16,
    ff_rate=4,
    drop_out_rate=0.1,
    # -- quantizer options (not read directly in the visible cells)
    quantizer='ema_reset',
    quantbeta=1.0,
    # -- checkpoint / output locations, relative to the working directory
    resume_pth="output_vqfinal/VQ-VAE/eval/net_last.pth",
    out_dir='output_GPT_Final/',
    exp_name='t_t_12layer_pkeep_0.5',
    vq_name='VQ-VAE',
    # -- misc run options; `seed` is used when the RNG is seeded below
    print_iter=200,
    eval_iter=10000,
    seed=123,
    if_maxtest=False,
    pkeep=0.5,
    resume_trans='output_GPT_Final/pkeep_12layer_0.5/net_best_fid.pth',
))

In [3]:
# Sanity check: display the transformer checkpoint path that a later cell loads.
args.resume_trans

'output_GPT_Final/pkeep_12layer_0.5/net_best_fid.pth'

In [4]:
##### ---- Exp dirs ---- #####
# Seed all three RNG sources that are imported in this notebook, not only
# torch, so that any sampling done through numpy or `random` in the dataset /
# evaluation code is repeatable as well.
torch.manual_seed(args.seed)
np.random.seed(args.seed)
random.seed(args.seed)

# Append the experiment name only once. Without this guard every re-run of the
# cell nests the directory one level deeper
# (out_dir/exp_name/exp_name/...), because `args.out_dir` is updated in place.
if os.path.basename(os.path.normpath(args.out_dir)) != args.exp_name:
    args.out_dir = os.path.join(args.out_dir, f'{args.exp_name}')
os.makedirs(args.out_dir, exist_ok = True)

In [5]:
# Sanity check: display the experiment output directory after the experiment
# name has been appended to it.
args.out_dir

'output_GPT_Final/t_t_12layer_pkeep_0.5'

In [6]:
##### ---- Logger ---- #####
# Both the project logger and the TensorBoard writer are given the experiment
# output directory; they are later passed to (and returned from) the
# evaluation helper.
logger = utils_model.get_logger(args.out_dir)
writer = SummaryWriter(log_dir=args.out_dir)


In [7]:
# Build the word vectorizer and the evaluation data loader.
# NOTE(review): this import sits outside the main import cell at the top.
from utils.word_vectorizer import WordVectorizer
# WordVectorizer is pointed at ./glove with the name 'our_vab'
# (presumably the GloVe vocabulary files — TODO confirm).
w_vectorizer = WordVectorizer('./glove', 'our_vab')
# Positional arguments: dataset name, True, 32 and the vectorizer.
# NOTE(review): True / 32 look like an "is test split" flag and the batch size
# (note that args.batch_size is not used here) — confirm against
# dataset_TM_eval.DATALoader.
val_loader = dataset_TM_eval.DATALoader(args.dataname, True, 32, w_vectorizer)

100%|██████████| 4384/4384 [00:03<00:00, 1241.25it/s]

Pointer Pointing at 0





In [8]:
# Pick the evaluator option file that matches the dataset: the KIT one for
# 'kit', the t2m one for every other dataset name.
if args.dataname == 'kit':
    dataset_opt_path = 'checkpoints/kit/Comp_v6_KLD005/opt.txt'
else:
    dataset_opt_path = 'checkpoints/t2m/Comp_v6_KLD005/opt.txt'

# Parse the options for the CUDA device and build the evaluator wrapper that
# is handed to the evaluation helper later on.
wrapper_opt = get_opt(dataset_opt_path, torch.device('cuda'))
eval_wrapper = EvaluatorModelWrapper(wrapper_opt)

Reading checkpoints/t2m/Comp_v6_KLD005/opt.txt
Loading Evaluation Model Wrapper (Epoch 28) Completed!!


In [9]:
## Load the CLIP model that is later passed to the evaluation helper
clip_model, clip_preprocess = clip.load(
    "ViT-B/32",
    device=torch.device('cuda'),
    jit=False,  # Must set jit=False for training
)
# Actually this call is unnecessary since clip is already float16 by default.
clip.model.convert_weights(clip_model)
clip_model.eval()
# Freeze CLIP: none of its parameters should receive gradients.
for param in clip_model.parameters():
    param.requires_grad_(False)


In [10]:
# VQ-VAE: receives the full option namespace first (the original comment notes
# it is used to configure the different quantizers), followed by the
# architecture sizes as positional arguments.
net = vqvae.HumanVQVAE(
    args,
    args.nb_code,
    args.code_dim,
    args.output_emb_width,
    args.down_t,
    args.stride_t,
    args.width,
    args.depth,
    args.dilation_growth_rate,
)

# Text-to-motion transformer: gather its keyword arguments in one place and
# unpack them into the constructor.
transformer_kwargs = dict(
    num_vq=args.nb_code,
    embed_dim=args.embed_dim_gpt,
    clip_dim=args.clip_dim,
    block_size=args.block_size,
    num_layers=args.num_layers,
    n_head=args.n_head_gpt,
    drop_out_rate=args.drop_out_rate,
    fc_rate=args.ff_rate,
)
trans_encoder = trans.Text2Motion_Transformer(**transformer_kwargs)


In [11]:
# Restore the VQ-VAE weights. The checkpoint is read onto the CPU first and
# the model is moved to the GPU only after its weights are in place.
print(f'loading checkpoint from {args.resume_pth}')
ckpt = torch.load(args.resume_pth, map_location='cpu')
# strict=True: the keys under ckpt['net'] must match the model exactly.
net.load_state_dict(ckpt['net'], strict=True)
# Inference mode for the VQ-VAE.
net.eval()
# Last expression of the cell, so the notebook also displays the module.
net.cuda()

loading checkpoint from output_vqfinal/VQ-VAE/eval/net_last.pth


HumanVQVAE(
  (vqvae): VQVAE_251(
    (encoder): Encoder(
      (model): Sequential(
        (0): Conv1d(263, 512, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): ReLU()
        (2): Sequential(
          (0): Conv1d(512, 512, kernel_size=(4,), stride=(2,), padding=(1,))
          (1): Resnet1D(
            (model): Sequential(
              (0): ResConv1DBlock(
                (norm1): Identity()
                (norm2): Identity()
                (activation1): ReLU()
                (activation2): ReLU()
                (conv1): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(9,), dilation=(9,))
                (conv2): Conv1d(512, 512, kernel_size=(1,), stride=(1,))
              )
              (1): ResConv1DBlock(
                (norm1): Identity()
                (norm2): Identity()
                (activation1): ReLU()
                (activation2): ReLU()
                (conv1): Conv1d(512, 512, kernel_size=(3,), stride=(1,), padding=(3,), dilation=(3,))
 

In [12]:
# Restore the trained transformer weights when a checkpoint path is configured.
# The checkpoint is read onto the CPU; the model is moved to the GPU afterwards.
if args.resume_trans is not None:
    print ('loading transformer checkpoint from {}'.format(args.resume_trans))
    ckpt = torch.load(args.resume_trans, map_location='cpu')
    # strict=True: the keys under ckpt['trans'] must match the model exactly.
    trans_encoder.load_state_dict(ckpt['trans'], strict=True)
# This notebook only evaluates the model, so put it in eval mode rather than
# train mode: the transformer contains Dropout layers (p=0.1) that would
# otherwise stay active unless the evaluation helper switches the mode itself.
trans_encoder.eval()
# Last expression of the cell, so the notebook also displays the module.
trans_encoder.cuda()

loading transformer checkpoint from output_GPT_Final/pkeep_12layer_0.5/net_best_fid.pth


Text2Motion_Transformer(
  (trans_base): CrossCondTransBase(
    (tok_emb): Embedding(514, 1024)
    (cond_emb): Linear(in_features=512, out_features=1024, bias=True)
    (pos_embedding): Embedding(51, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (blocks): Sequential(
      (0): Block(
        (ln1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (ln2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): CausalCrossConditionalSelfAttention(
          (key): Linear(in_features=1024, out_features=1024, bias=True)
          (query): Linear(in_features=1024, out_features=1024, bias=True)
          (value): Linear(in_features=1024, out_features=1024, bias=True)
          (attn_drop): Dropout(p=0.1, inplace=False)
          (resid_drop): Dropout(p=0.1, inplace=False)
          (proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (mlp): Sequential(
          (0): Linear(in_features=1024, out_features=4096, bias=True)
      

In [13]:
# Repeated evaluation of the transformer followed by a mean / 95% confidence
# summary of every metric.
#
# Fix over the previous version: the statistics are now computed over the
# number of runs that were actually executed. Before, the loop skipped the
# first two indices but the averages still divided by `repeat_time` and the
# confidence intervals used sqrt(repeat_time), which understated the means
# and narrowed the intervals.
fid = []
div = []
top1 = []
top2 = []
top3 = []
matching = []
multi = []
repeat_time = 10
# Run indices below this value are skipped, exactly as the former `if i>=2`
# guard did. Set it to 0 to execute all `repeat_time` runs.
first_run = 2

for i in range(first_run, repeat_time):
    print(i)
    # Every run starts from the same "worst case" best_* values, so the values
    # returned are taken as the metrics of this single run.
    (best_fid, best_iter, best_div, best_top1, best_top2, best_top3,
     best_matching, best_multi, writer, logger) = eval_trans.evaluation_transformer_test(
        args.out_dir, val_loader, net, trans_encoder, logger, writer, 0,
        best_fid=1000, best_iter=0, best_div=100,
        best_top1=0, best_top2=0, best_top3=0,
        best_matching=100, best_multi=0,
        clip_model=clip_model, eval_wrapper=eval_wrapper,
        draw=False, savegif=False, save=False, savenpy=False)
    fid.append(best_fid)
    div.append(best_div)
    top1.append(best_top1)
    top2.append(best_top2)
    top3.append(best_top3)
    matching.append(best_matching)
    multi.append(best_multi)

# Number of runs that really contributed a sample.
n_runs = len(fid)
if n_runs == 0:
    raise RuntimeError('no evaluation runs were executed; nothing to summarise')

print('final result:')
print('fid: ', sum(fid)/n_runs)
print('div: ', sum(div)/n_runs)
print('top1: ', sum(top1)/n_runs)
print('top2: ', sum(top2)/n_runs)
print('top3: ', sum(top3)/n_runs)
print('matching: ', sum(matching)/n_runs)
print('multi: ', sum(multi)/n_runs)

fid = np.array(fid)
div = np.array(div)
top1 = np.array(top1)
top2 = np.array(top2)
top3 = np.array(top3)
matching = np.array(matching)
multi = np.array(multi)

# One "<label>. mean, conf. half-width" entry per metric, in the same order
# and format as before; the half-width is 1.96 * std / sqrt(n_runs).
labelled_metrics = [
    ('FID', fid),
    ('Diversity', div),
    ('TOP1', top1),
    ('TOP2', top2),
    ('TOP3', top3),
    ('Matching', matching),
    ('Multi', multi),
]
msg_final = ', '.join(
    f"{label}. {np.mean(values):.3f}, conf. {np.std(values)*1.96/np.sqrt(n_runs):.3f}"
    for label, values in labelled_metrics
)
logger.info(msg_final)

2
2023-07-20 19:20:29,113 INFO --> 	 Eva. Iter 0 :, FID. 0.1684, Diversity Real. 9.4626, Diversity. 10.0246, R_precision_real. [0.52025862 0.70387931 0.79719828], R_precision. [0.49051724 0.66831897 0.7700431 ], matching_score_real. 2.9668814675561315, matching_score_pred. 3.143495191376785, multimodality. 2.0688
3
2023-07-21 01:20:47,325 INFO --> 	 Eva. Iter 0 :, FID. 0.1969, Diversity Real. 9.3390, Diversity. 9.6946, R_precision_real. [0.50948276 0.70560345 0.79612069], R_precision. [0.46939655 0.66206897 0.76336207], matching_score_real. 3.000256005648909, matching_score_pred. 3.177171958726028, multimodality. 2.0835
4
