In [1]:
import torch
import numpy as np
import os
import pickle
import argparse
import matplotlib.pyplot as plt
from copy import deepcopy
from itertools import repeat
from tqdm import tqdm
from einops import rearrange
import wandb
import time
from torchvision import transforms

from brl_constants import FPS
from brl_constants import PUPPET_GRIPPER_JOINT_OPEN
from brl_constants import TASK_CONFIGS
from utils import load_data # data functions
from utils import sample_box_pose, sample_insertion_pose # robot functions
from utils import compute_dict_mean, set_seed, detach_dict, calibrate_linear_vel, postprocess_base_action # helper functions
from policy import ACTPolicy, CNNMLPPolicy
# from policy import ACTPolicy, CNNMLPPolicy, DiffusionPolicy
from visualize_episodes import save_videos

from detr.models.latent_model import Latent_Model_Transformer
import argparse





    No private macro file found!
    It is recommended to use a private macro file
    To setup, run: python /home/zfei/code/robomimic/robomimic/scripts/setup_macros.py
)[0m


In [2]:
# --- Task selection: camera list comes from the task registry ---------------
task_name = "act_demo_z1_push_red"
task_config = TASK_CONFIGS[task_name]
camera_names = task_config['camera_names']

# --- Settings that are NOT referenced by policy_config in this cell ---------
ckpt_dir = "/mnt/data1/act/act_demo_z1_push_red/ckpt"
policy_class = "ACT"
batch_size = 8
num_steps = 2000
seed = 0
state_dim = 6

# --- Values that feed policy_config below -----------------------------------
kl_weight = 10
chunk_size = 100  # stored in policy_config under the key 'num_queries'
hidden_dim = 512
dim_feedforward = 3200
lr = 1e-5
lr_backbone = 1e-5
backbone = 'resnet18'
enc_layers = 4
dec_layers = 7
nheads = 8

# Keyword form of the same mapping; key order matches the original literal.
policy_config = dict(
    lr=lr,
    num_queries=chunk_size,
    kl_weight=kl_weight,
    hidden_dim=hidden_dim,
    dim_feedforward=dim_feedforward,
    lr_backbone=lr_backbone,
    backbone=backbone,
    enc_layers=enc_layers,
    dec_layers=dec_layers,
    nheads=nheads,
    camera_names=camera_names,
    vq=False,
    vq_class=None,
    vq_dim=None,
    action_dim=6,
    no_encoder=False,
)

In [3]:

def get_args_parser():
    """Build the shared argparse parser holding the model/training options.

    The parser is created with ``add_help=False`` so that it can be passed as
    a ``parents=[...]`` entry to another ``ArgumentParser`` without a
    conflicting ``-h`` option.

    Many defaults are placeholders that the caller is expected to override
    (marked "will be overridden" below); others are accepted only so that
    argument lists shared with the training script parse without error.

    Returns:
        argparse.ArgumentParser: the configured (help-less) parser.
    """
    parser = argparse.ArgumentParser('Set transformer detector', add_help=False)
    parser.add_argument('--lr', default=1e-4, type=float) # will be overridden
    parser.add_argument('--lr_backbone', default=1e-5, type=float) # will be overridden
    parser.add_argument('--batch_size', default=2, type=int) # not used
    parser.add_argument('--weight_decay', default=1e-4, type=float)
    parser.add_argument('--epochs', default=300, type=int) # not used
    parser.add_argument('--lr_drop', default=200, type=int) # not used
    parser.add_argument('--clip_max_norm', default=0.1, type=float, # not used
                        help='gradient clipping max norm')

    # Model parameters
    # * Backbone
    parser.add_argument('--backbone', default='resnet18', type=str, # will be overridden
                        help="Name of the convolutional backbone to use")
    parser.add_argument('--dilation', action='store_true',
                        help="If true, we replace stride with dilation in the last convolutional block (DC5)")
    parser.add_argument('--position_embedding', default='sine', type=str, choices=('sine', 'learned'),
                        help="Type of positional embedding to use on top of the image features")
    # nargs='*' collects zero or more names into a list. The previous
    # type=list turned a single value such as "wrist" into
    # ['w', 'r', 'i', 's', 't'] and rejected a second name outright.
    parser.add_argument('--camera_names', default=[], nargs='*', type=str, # will be overridden
                        help="A list of camera names")

    # * Transformer
    parser.add_argument('--enc_layers', default=4, type=int, # will be overridden
                        help="Number of encoding layers in the transformer")
    parser.add_argument('--dec_layers', default=6, type=int, # will be overridden
                        help="Number of decoding layers in the transformer")
    parser.add_argument('--dim_feedforward', default=2048, type=int, # will be overridden
                        help="Intermediate size of the feedforward layers in the transformer blocks")
    parser.add_argument('--hidden_dim', default=256, type=int, # will be overridden
                        help="Size of the embeddings (dimension of the transformer)")
    parser.add_argument('--dropout', default=0.1, type=float,
                        help="Dropout applied in the transformer")
    parser.add_argument('--nheads', default=8, type=int, # will be overridden
                        help="Number of attention heads inside the transformer's attentions")
    parser.add_argument('--num_queries', default=400, type=int, # will be overridden
                        help="Number of query slots")
    parser.add_argument('--pre_norm', action='store_true')

    # * Segmentation
    parser.add_argument('--masks', action='store_true',
                        help="Train segmentation head if the flag is provided")

    # repeat args in imitate_episodes just to avoid error. Will not be used
    parser.add_argument('--eval', action='store_true')
    parser.add_argument('--onscreen_render', action='store_true')
    parser.add_argument('--ckpt_dir', action='store', type=str, help='ckpt_dir', required=False)
    parser.add_argument('--policy_class', action='store', type=str, help='policy_class, capitalize', required=False)
    parser.add_argument('--task_name', action='store', type=str, help='task_name', required=False)
    parser.add_argument('--seed', action='store', type=int, help='seed', required=False)
    # Help text previously read 'num_epochs', which did not match the option.
    parser.add_argument('--num_steps', action='store', type=int, help='num_steps', required=False)
    parser.add_argument('--kl_weight', action='store', type=int, help='KL Weight', required=False)
    parser.add_argument('--chunk_size', action='store', type=int, help='chunk_size', required=False)
    parser.add_argument('--temporal_agg', action='store_true')

    parser.add_argument('--use_vq', action='store_true')
    parser.add_argument('--vq_class', action='store', type=int, help='vq_class', required=False)
    parser.add_argument('--vq_dim', action='store', type=int, help='vq_dim', required=False)
    parser.add_argument('--load_pretrain', action='store_true', default=False)
    parser.add_argument('--action_dim', action='store', type=int, required=False)
    parser.add_argument('--eval_every', action='store', type=int, default=500, help='eval_every', required=False)
    parser.add_argument('--validate_every', action='store', type=int, default=500, help='validate_every', required=False)
    parser.add_argument('--save_every', action='store', type=int, default=500, help='save_every', required=False)
    parser.add_argument('--resume_ckpt_path', action='store', type=str, help='load_ckpt_path', required=False)
    parser.add_argument('--no_encoder', action='store_true')
    parser.add_argument('--skip_mirrored_data', action='store_true')
    parser.add_argument('--actuator_network_dir', action='store', type=str, help='actuator_network_dir', required=False)
    parser.add_argument('--history_len', action='store', type=int)
    parser.add_argument('--future_len', action='store', type=int)
    parser.add_argument('--prediction_len', action='store', type=int)

    return parser


In [4]:
# Wrap the shared option definitions in a top-level parser. An explicit
# argument list is supplied, so argparse does not read sys.argv (which in a
# notebook kernel holds the kernel's own launch arguments).
parser = argparse.ArgumentParser(
    prog='DETR training and evaluation script',
    parents=[get_args_parser()],
)
args = parser.parse_args(args=["--policy_class", "ACT"])


In [5]:
# Read the pickled config saved next to the checkpoints and display it.
# NOTE: this rebinds `policy_config` to whatever object the pickle contains.
# SECURITY: pickle.load can execute arbitrary code; only open trusted files.
config_path = "/mnt/data1/act/act_demo_z1_push_red/ckpt/config.pkl"
with open(config_path, mode='rb') as config_file:
    policy_config = pickle.load(config_file)
policy_config

{'num_steps': 2000,
 'eval_every': 500,
 'validate_every': 500,
 'save_every': 500,
 'ckpt_dir': '/mnt/data1/act/act_demo_z1_push_red/ckpt',
 'resume_ckpt_path': None,
 'episode_len': 400,
 'state_dim': 6,
 'lr': 1e-05,
 'policy_class': 'ACT',
 'onscreen_render': False,
 'policy_config': {'lr': 1e-05,
  'num_queries': 100,
  'kl_weight': 10,
  'hidden_dim': 512,
  'dim_feedforward': 3200,
  'lr_backbone': 1e-05,
  'backbone': 'resnet18',
  'enc_layers': 4,
  'dec_layers': 7,
  'nheads': 8,
  'camera_names': ['wrist'],
  'vq': False,
  'vq_class': None,
  'vq_dim': None,
  'action_dim': 6,
  'no_encoder': False},
 'task_name': 'act_demo_z1_push_red',
 'seed': 0,
 'temporal_agg': False,
 'camera_names': ['wrist'],
 'real_robot': True,
 'load_pretrain': False,
 'actuator_config': {'actuator_network_dir': None,
  'history_len': None,
  'future_len': None,
  'prediction_len': None}}

In [15]:
# Hand-written policy configuration. Each key appears exactly once: the
# earlier version listed "lr" twice, and a dict literal silently keeps only
# the last value for a repeated key.
policy_config = {
    # Model / optimisation hyperparameters
    "lr": 1e-5,
    "num_queries": 100,
    "kl_weight": 10,
    "hidden_dim": 512,
    "dim_feedforward": 3200,
    "lr_backbone": 1e-05,
    "backbone": "resnet18",
    "enc_layers": 4,
    "dec_layers": 7,
    "nheads": 8,
    "camera_names": ["wrist"],
    "vq": False,
    "vq_class": None,
    "vq_dim": None,
    "action_dim": 6,
    "no_encoder": False,
    # Run-level entries
    "task_name": "act_demo_z1_push_red",
    # NOTE: despite the key name this is a checkpoint *file* path; the loading
    # cell passes policy_config['ckpt_dir'] directly to torch.load.
    "ckpt_dir": "/mnt/data1/act/act_demo_z1_push_red/ckpt/policy_best.ckpt",
    "num_steps": 2000,
    "seed": 0,
    "policy_class": "ACT",
}

In [23]:
# Build the policy from policy_config, restore its weights, move it to the
# GPU and switch it to eval mode.
# The 'ckpt_dir' entry is handed straight to torch.load, so it must name a
# checkpoint file rather than a directory.
weights_path = policy_config['ckpt_dir']

policy = ACTPolicy(policy_config)
loading_status = policy.deserialize(torch.load(weights_path))
print(loading_status)
policy.cuda()
policy.eval()
print('Loaded: ', weights_path)

# Dataset statistics used by the normalisation helpers in a later cell.
# SECURITY: pickle.load can execute arbitrary code; only open trusted files.
stats_path = "/mnt/data1/act/act_demo_z1_push_red/ckpt/dataset_stats.pkl"
with open(stats_path, mode='rb') as stats_file:
    stats = pickle.load(stats_file)



DETR Args:  Namespace(lr=0.0001, lr_backbone=1e-05, batch_size=2, weight_decay=0.0001, epochs=300, lr_drop=200, clip_max_norm=0.1, backbone='resnet18', dilation=False, position_embedding='sine', camera_names=[], enc_layers=4, dec_layers=6, dim_feedforward=2048, hidden_dim=256, dropout=0.1, nheads=8, num_queries=400, pre_norm=False, masks=False, eval=False, onscreen_render=False, ckpt_dir='/mnt/data1/act/act_demo_z1_push_red/ckpt', policy_class='ACT', task_name='act_demo_z1_push_red', seed=0, num_steps=2000, kl_weight=None, chunk_size=None, temporal_agg=False, use_vq=False, vq_class=None, vq_dim=None, load_pretrain=False, action_dim=None, eval_every=500, validate_every=500, save_every=500, resume_ckpt_path=None, no_encoder=False, skip_mirrored_data=False, actuator_network_dir=None, history_len=None, future_len=None, prediction_len=None)
ACT Args:  Namespace(lr=1e-05, lr_backbone=1e-05, batch_size=2, weight_decay=0.0001, epochs=300, lr_drop=200, clip_max_norm=0.1, backbone='resnet18', di

In [28]:
def pre_process(s_qpos):
    """Standardise ``s_qpos`` with stats['qpos_mean'] and stats['qpos_std']."""
    return (s_qpos - stats['qpos_mean']) / stats['qpos_std']


def post_process(a):
    """Undo action standardisation via stats['action_std'] and stats['action_mean']."""
    return a * stats['action_std'] + stats['action_mean']


# NOTE(review): BASE_DELAY's unit is not shown here; it is subtracted from
# num_queries, so presumably it is counted in timesteps -- confirm.
BASE_DELAY = 13
query_frequency = policy_config['num_queries'] - BASE_DELAY
max_timesteps = 400  # may increase for real-world tasks


In [26]:
# Inference loop (appears incomplete in this notebook).
# NOTE(review): `qpos` and `curr_image` are not assigned in any visible cell,
# so this fails with NameError on a fresh kernel unless they are defined
# elsewhere -- confirm where the observations come from.
with torch.inference_mode():
    time0 = time.time()  # start timestamp; not read again in the visible part of this cell
    DT = 1 / FPS  # presumably the per-step period in seconds -- TODO confirm FPS units
    culmulated_delay = 0  # [sic] initialised but never updated in the visible code
    for t in range(max_timesteps):
        # The policy is called on every step; `query_frequency` from the
        # earlier cell is not consulted here, and `all_actions` is overwritten
        # each iteration without being used.
        all_actions = policy(qpos, curr_image)


100

In [2]:
# Open the episode HDF5 file read-only.
# The path assigned here is the one that gets opened; the earlier version
# indexed `h5_paths`, which no visible cell defines, and left `h5data_file`
# unused.
# NOTE(review): `h5py` is not imported in the visible import cell -- make
# sure it is imported before this cell runs.
h5data_file = "/mnt/data1/act/act_demo_z1_push_red/episode_50.hdf5"
f = h5py.File(h5data_file, 'r')