In [14]:
import sys
import os
# sys.argv = ['GPT_eval_multi.py']
os.chdir("/workspace/")
# import options.option_transformer as option_trans
# args = option_trans.get_args_parser()

import clip
import torch
import numpy as np
import models.vqvae as vqvae
import models.t2m_trans as trans
import warnings
import pandas as pd
warnings.filterwarnings('ignore')
from lion_pytorch import Lion

In [15]:
import easydict

args = easydict.EasyDict({
    'dataname': 't2m',
    'batch_size': 64,
    'fps': [20],
    'seq_len': 64,
    'total_iter': 300000,
    'warm_up_iter': 1000,
    'lr': 0.0001,
    'lr_scheduler': [150000],
    'gamma': 0.05,
    'weight_decay': 1e-6,
    'decay_option': 'all',
    'optimizer': 'adamw',
    'code_dim': 512,
    'nb_code': 512,
    'mu': 0.99,
    'down_t': 2,
    'stride_t': 2,
    'width': 512,
    'depth': 3,
    'dilation_growth_rate': 3,
    'output_emb_width': 512,
    'vq_act': 'relu',
    'block_size': 51,
    'embed_dim_gpt': 1024,
    'clip_dim': 512,
    'num_layers': 9,
    'n_head_gpt': 16,
    'ff_rate': 4,
    'drop_out_rate': 0.1,
    'quantizer': 'ema_reset',
    'quantbeta': 1.0,
    'resume_pth': "output_vqfinal/VQ-VAE/eval/net_last.pth",
    'resume_trans': "output_GPT_Final/pkeep_0.5/net_best_fid.pth",
    'resume_trans2': "output_GPT_Final/pkeep_12layer_0.5/net_best_fid.pth",
    'out_dir': 'output_GPT_Final/',
    'exp_name': 'pkeep_0.5/',
    'vq_name': 'VQ-VAE',
    'print_iter': 200,
    'eval_iter': 10000,
    'seed': 123,
    'if_maxtest': False,
    'pkeep': 0.5
})

In [16]:
## load clip model and datasets
clip_model, clip_preprocess = clip.load("ViT-B/32", device=torch.device('cuda'), jit=False, download_root='./')  # Must set jit=False for training
clip.model.convert_weights(clip_model)  # Actually this line is unnecessary since clip by default already on float16
clip_model.eval()
for p in clip_model.parameters():
    p.requires_grad = False


In [17]:
net = vqvae.HumanVQVAE(args, ## use args to define different parameters in different quantizers
                       args.nb_code,
                       args.code_dim,
                       args.output_emb_width,
                       args.down_t,
                       args.stride_t,
                       args.width,
                       args.depth,
                       args.dilation_growth_rate)


trans_encoder = trans.Text2Motion_Transformer(num_vq=args.nb_code, 
                                embed_dim=1024, 
                                clip_dim=args.clip_dim, 
                                block_size=args.block_size, 
                                num_layers=9, 
                                n_head=16, 
                                drop_out_rate=args.drop_out_rate, 
                                fc_rate=args.ff_rate)


print ('loading checkpoint from {}'.format(args.resume_pth))
ckpt = torch.load(args.resume_pth, map_location='cpu')
net.load_state_dict(ckpt['net'], strict=True)
net.eval()
net.cuda()

print ('loading transformer checkpoint from {}'.format(args.resume_trans))
ckpt = torch.load(args.resume_trans, map_location='cpu')
trans_encoder.load_state_dict(ckpt['trans'], strict=True)
trans_encoder.eval()
trans_encoder.cuda()

mean = torch.from_numpy(np.load('./checkpoints/t2m/VQVAEV3_CB1024_CMT_H1024_NRES3/meta/mean.npy')).cuda()
std = torch.from_numpy(np.load('./checkpoints/t2m/VQVAEV3_CB1024_CMT_H1024_NRES3/meta/std.npy')).cuda()


loading checkpoint from output_vqfinal/VQ-VAE/eval/net_last.pth
loading transformer checkpoint from output_GPT_Final/pkeep_0.5/net_best_fid.pth


In [18]:
trans_encoder2 = trans.Text2Motion_Transformer(num_vq=args.nb_code, 
                                embed_dim=1024, 
                                clip_dim=args.clip_dim, 
                                block_size=args.block_size, 
                                num_layers=12, 
                                n_head=16, 
                                drop_out_rate=args.drop_out_rate, 
                                fc_rate=args.ff_rate)

print ('loading transformer checkpoint from {}'.format(args.resume_trans2))
ckpt1 = torch.load(args.resume_trans2, map_location='cpu')
trans_encoder2.load_state_dict(ckpt1['trans'], strict=True)
trans_encoder2.eval()
trans_encoder2.cuda()

loading transformer checkpoint from output_GPT_Final/pkeep_12layer_0.5/net_best_fid.pth


Text2Motion_Transformer(
  (trans_base): CrossCondTransBase(
    (tok_emb): Embedding(514, 1024)
    (cond_emb): Linear(in_features=512, out_features=1024, bias=True)
    (pos_embedding): Embedding(51, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (blocks): Sequential(
      (0): Block(
        (ln1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (ln2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): CausalCrossConditionalSelfAttention(
          (key): Linear(in_features=1024, out_features=1024, bias=True)
          (query): Linear(in_features=1024, out_features=1024, bias=True)
          (value): Linear(in_features=1024, out_features=1024, bias=True)
          (attn_drop): Dropout(p=0.1, inplace=False)
          (resid_drop): Dropout(p=0.1, inplace=False)
          (proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (mlp): Sequential(
          (0): Linear(in_features=1024, out_features=4096, bias=True)
      

In [19]:
from utils.motion_process import recover_from_ric
import visualization.plot_3d_global as plot_3d


def generate_motion(clip_text, transdecoder, num_layer, i = None, path = None):
    
    if not os.path.isdir(path):
        os.makedirs(path)
     
    text = clip.tokenize(clip_text, truncate=True).cuda()
    feat_clip_text = clip_model.encode_text(text).float()
    index_motion = transdecoder.sample(feat_clip_text[0:1], False)
    pred_pose = net.forward_decoder(index_motion)
    pred_xyz = recover_from_ric((pred_pose*std+mean).float(), 22)
    xyz = pred_xyz.reshape(1, -1, 22, 3)
    if i is None:
        if num_layer == 9:
            # np.save(f'./test_9layer/motion_{clip_text}.npy', xyz.detach().cpu().numpy())
            pose_vis = plot_3d.draw_to_batch(xyz.detach().cpu().numpy(),clip_text, [f'{num_layer}_{clip_text}.gif'], path)
        elif num_layer == 12:
            # np.save(f'./test_12layer/motion_{clip_text}.npy', xyz.detach().cpu().numpy())
            pose_vis = plot_3d.draw_to_batch(xyz.detach().cpu().numpy(),clip_text, [f'{num_layer}_{clip_text}.gif'], path)
    else:
        if num_layer == 9:
            # np.save(f'./test_9layer/motion_{clip_text}.npy', xyz.detach().cpu().numpy())
            pose_vis = plot_3d.draw_to_batch(xyz.detach().cpu().numpy(),clip_text, [f'{num_layer}_{clip_text}_{i}.gif'], path)
        elif num_layer == 12:
            # np.save(f'./test_12layer/motion_{clip_text}.npy', xyz.detach().cpu().numpy())
            pose_vis = plot_3d.draw_to_batch(xyz.detach().cpu().numpy(),clip_text, [f'{num_layer}_{clip_text}_{i}.gif'], path)

def  gt_motion(name, path = None):
    if not os.path.isdir(path):
        os.makedirs(path)
        
    gt = np.load(f"./dataset/HumanML3D/new_joints/{name}.npy")
    gt = gt.reshape(1, -1, 22, 3)
    data = pd.read_csv("test_gt.csv")
    data.drop(columns="Unnamed: 0")
    text = data[data['name'] == str(name)].text
    clip_text = list(text)
    pose_vis = plot_3d.draw_to_batch(gt, clip_text, [f'{name}_{clip_text}_gt.gif'], path)
    

생성 

In [20]:
import pandas as pd 

df = pd.read_csv("test_gt.csv")

In [27]:
clip_text= ['a person raises both hands to their chest, rotates their torso, squats in this position then lowers their left hand and rotates their torso again. now lowering their hand']
dir = "need"
generate_motion(clip_text, trans_encoder, 9, None,path= "./test_9layer/{dir}/")
# gt_motion("010167", f"./test_gt/{dir}")

In [30]:
clip_text= ['a person steps forward and shakes both hands together in a begging manner, and then steps back. the person steps forward again and shakes both hands in a begging manner but more aggressively.']
name = df[df['text']== clip_text[0]]["name"]
print(name)
dir = "motondiffuse"
generate_motion(clip_text, trans_encoder, 9, None,path= f"./test_9layer/{dir}/")
# generate_motion(clip_text, trans_encoder2, 12,None,path="./test_12layer/arm/")
gt_motion(str(list(name)[0]), f"./test_gt/{dir}")

133    003812
Name: name, dtype: object


같은말이지만 다양한 의미인 동작

In [9]:
clip_text= [["person takes a step forward with left leg and then raise the right leg."],
            ["person takes a step forward with left leg and then put up the right leg."]]

for i, clip_text in enumerate(clip_text):
    print(clip_text)
    generate_motion(clip_text, trans_encoder, 9, i,path="./test_9layer/a/")
    # generate_motion(clip_text, trans_encoder2, 12,i,path="./test_12layer/a/")

['person takes a step forward with left leg and then raise the right leg.']
['person takes a step forward with left leg and then put up the right leg.']


이상함
"M004545"
a person jumping while rasing both hands, and  move apart legs
person does squat thrusts three times with legs going outwards and arms going above head
a person jumps with legs open while clapping with hands over head simultaneously.


In [None]:
from IPython.display import HTML
import base64
from utils.motion_process import recover_from_ric

# change the text here
clip_text = ["a man flying"]


text = clip.tokenize(clip_text, truncate=True).cuda()
feat_clip_text = clip_model.encode_text(text).float()
index_motion = trans_encoder.sample(feat_clip_text[0:1], False)
index_motion2 = trans_encoder2.sample(feat_clip_text[0:1], False)
pred_pose = net.forward_decoder(index_motion)
pred_pose2 = net.forward_decoder(index_motion2)

pred_xyz = recover_from_ric((pred_pose*std+mean).float(), 22)
pred_xyz2 = recover_from_ric((pred_pose2*std+mean).float(), 22)
xyz = pred_xyz.reshape(1, -1, 22, 3)
xyz2 = pred_xyz2.reshape(1, -1, 22, 3)

np.save('motion_9layer.npy', xyz.detach().cpu().numpy())
np.save('motion_12layer.npy', xyz2.detach().cpu().numpy())

import visualization.plot_3d_global as plot_3d
pose_vis = plot_3d.draw_to_batch(xyz.detach().cpu().numpy(),clip_text, ['example.gif'])
pose_vis = plot_3d.draw_to_batch(xyz2.detach().cpu().numpy(),clip_text, ['example2.gif'])



b64 = base64.b64encode(open('example.gif','rb').read()).decode('ascii')
display(HTML(f'<img src="data:image/gif;base64,{b64}" />'))

TEST DATA 가져오기

In [9]:
from utils.word_vectorizer import WordVectorizer
from dataset import dataset_TM_eval

w_vectorizer = WordVectorizer('./glove', 'our_vab')
val_loader = dataset_TM_eval.DATALoader(args.dataname, True, 32, w_vectorizer)

100%|██████████| 4384/4384 [00:03<00:00, 1185.73it/s]

Pointer Pointing at 0





In [32]:
test_text = []
name_text = []
for batch in val_loader:
    word_embeddings, pos_one_hots, clip_text, sent_len, pose, m_length, token, name = batch
    name_text.append(list(name))
    test_text.append(list(clip_text))


In [39]:
print(len(test_text))
type(test_text)

145


list

In [123]:
name_list = []
text_list = []
for i in range(144):
    for name,text in zip(name_text[i],test_text[i]):
        name_list.append(str(name))
        text_list.append(text)

In [125]:
import pandas as pd
df = pd.DataFrame({"name" : name_list, "text" : text_list})
df.to_csv("./test_gt.csv", encoding="utf-8")

In [58]:
file_name = './test.txt'

with open(file_name, 'w+') as file:
    for i in range(145):
        for num, i in enumerate(test_text[i]):
            file.write(''.join(i) + "\n")
            #else:
             #   file.write(''.join(i))