### Installation for Colab
To run in Colab, uncomment and run the following cell.

[Open In Colab](https://colab.research.google.com/github/AssistiveRoboticsUNH/bc_tutorial/blob/main/robomimic_tasks/train_lift_dag.ipynb)

In [None]:
# !git clone https://github.com/ARISE-Initiative/robomimic
# !pip install -e robomimic/

# import sys
# import os
# sys.path.append('./robomimic/')



# # install all system dependencies for mujoco-py
# !sudo apt install curl git libgl1-mesa-dev libgl1-mesa-glx libglew-dev \
#          libosmesa6-dev software-properties-common net-tools unzip vim \
#          virtualenv wget xserver-xorg-dev libglfw3-dev patchelf

# # install mujoco (the official MuJoCo Python bindings)
# !pip install mujoco

# #install robosuite
# !pip install robosuite==1.4.1


# # download lift ph dataset.
# !wget http://downloads.cs.stanford.edu/downloads/rt_benchmark/lift/ph/low_dim_v141.hdf5 -O lift_ph_low_dim_v141.hdf5

# # download lift mh dataset
# !wget http://downloads.cs.stanford.edu/downloads/rt_benchmark/lift/mh/low_dim_v141.hdf5 -O lift_mh_low_dim_v141.hdf5

In [1]:
import os
import json
import h5py
import numpy as np

import robomimic
import robomimic.utils.file_utils as FileUtils
import robomimic.utils.env_utils as EnvUtils
import robomimic.utils.obs_utils as ObsUtils
import imageio

import torch
import torch.nn as nn
import torch.nn.functional as F 
from torch.optim import Adam 
import torch
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from matplotlib import pyplot as plt
from copy import deepcopy
import datetime
from tqdm import tqdm 

    No private macro file found!
    It is recommended to use a private macro file
    To setup, run: python /home/ns/mimicgen/mimicgen_envs/robomimic/robomimic/scripts/setup_macros.py
)[0m


In [2]:
# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


### Load Robomimic Lift Dataset

In [None]:
# HDF5 file holding the Lift "mh" demonstrations (see the download commands in the first cell).
dataset_path = "lift_mh_low_dim_v141.hdf5"

# The handle is intentionally left open: later cells keep reading from `f`.
f = h5py.File(dataset_path, mode="r")
demos = [demo_key for demo_key in f["data"].keys()]
num_demos = len(demos)
print(f'Number of demos: {num_demos}')

Number of demos: 300


In [4]:
# Entries under the 'worse_operator_2' mask are bytes; decode each one to a str demo name.
worse_demo_names = list(map(lambda raw: raw.decode('utf-8'), f['mask']['worse_operator_2']))
# Keep only the first ten of those demonstrations as the starting dataset.
demos = worse_demo_names[:10]
demos

['demo_50',
 'demo_51',
 'demo_52',
 'demo_53',
 'demo_54',
 'demo_55',
 'demo_56',
 'demo_57',
 'demo_58',
 'demo_59']

In [5]:
# Observation entries that are stacked, in this order, to form the policy input.
select_keys = [
    'object',
    'robot0_eef_pos',
    'robot0_eef_quat',
    'robot0_gripper_qpos',
]

In [6]:
# Per-demo chunks are collected first and joined into single arrays afterwards.
D_org_states, D_org_actions = [], []

for demo_name in demos:
    traj = f['data'][demo_name]

    # Stack the selected observation entries side by side for this demo.
    select_obs = np.hstack([traj['obs'][key] for key in select_keys])
    actions = traj['actions']

    D_org_states.append(select_obs)
    D_org_actions.append(actions)

# Join all demos along the first axis so each row is one (state, action) sample.
D_org_states = np.concatenate(D_org_states)
D_org_actions = np.concatenate(D_org_actions)
D_org_states.shape, D_org_actions.shape

((1688, 19), (1688, 7))

### Model

In [7]:
class MLP(nn.Module):
    """Two-layer ReLU feature extractor.

    Maps inputs of width ``input_dim`` to features of width ``size`` through
    Linear -> ReLU -> Linear -> ReLU, stored as ``self.net``.
    """

    def __init__(self, input_dim, size=32):
        super().__init__()
        layers = [
            nn.Linear(input_dim, size),
            nn.ReLU(),
            nn.Linear(size, size),
            nn.ReLU(),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the features produced by ``self.net`` for ``x``."""
        return self.net(x)
    
class RegNet(MLP):
    """MLP trunk followed by a linear head that outputs ``action_dim`` values."""

    def __init__(self, input_dim , size, action_dim):
        super().__init__(input_dim, size)
        self.decoder = nn.Linear(size, action_dim)

    def forward(self, x):
        """Run ``x`` through the inherited trunk ``self.net``, then the linear decoder."""
        features = self.net(x)
        return self.decoder(features)

### Training

In [8]:
# Step size for the Adam optimizer.
learning_rate = 1e-4

# Input and output widths of the policy network.
state_dim, action_dim = 19, 7

# Learner policy, its regression loss, and the optimizer over the learner's parameters.
bc = RegNet(input_dim=state_dim, size=64, action_dim=action_dim)
criterion = nn.MSELoss()
optimizer = Adam(params=bc.parameters(), lr=learning_rate)

In [9]:
def train(model, states, actions, batch_size=64, n_epoch = 3000, print_every=100,
          loss_fn=None, optim=None):
    """Fit ``model`` to (state, action) pairs by minibatch regression.

    Args:
        model: network called as ``model(batch_states.float())``.
        states: sequence/array of state rows, aligned index-by-index with ``actions``.
        actions: sequence/array of target action rows.
        batch_size: minibatch size handed to the ``DataLoader``.
        n_epoch: last epoch index; epochs ``0..n_epoch`` inclusive are run,
            i.e. ``n_epoch + 1`` passes over the data.
        print_every: the mean epoch loss is printed whenever
            ``epoch % print_every == 0``.
        loss_fn: loss callable used as ``loss_fn(prediction, target)``.
            Defaults to the notebook-level ``criterion``.
        optim: optimizer to step. Defaults to the notebook-level ``optimizer``.
            NOTE: that default optimizer was built over ``bc``'s parameters, so
            pass an optimizer over ``model.parameters()`` when training any
            other model; otherwise the wrong parameters are stepped.

    Returns:
        list[float]: the loss of every minibatch, in the order they were seen.
    """
    # Fall back to the notebook globals so existing three-argument calls keep working.
    if loss_fn is None:
        loss_fn = criterion
    if optim is None:
        optim = optimizer

    # Shuffled minibatches of (state, action) pairs; reshuffled on every epoch.
    data_loader = DataLoader(list(zip(states, actions)), batch_size=batch_size, shuffle=True)

    loss_list = []

    for itr in range(n_epoch + 1):
        total_loss = 0.0
        n_batches = 0
        for batch_states, batch_actions in data_loader:
            y_pred = model(batch_states.float())
            loss = loss_fn(y_pred, batch_actions.float())

            optim.zero_grad()
            loss.backward()
            optim.step()

            # Read the scalar once and reuse it for both the running sum and the history.
            batch_loss = loss.item()
            total_loss += batch_loss
            loss_list.append(batch_loss)
            n_batches += 1

        if itr % print_every == 0:
            # max(..., 1) avoids a ZeroDivisionError if the loader yielded no batches.
            print(f'Epoch {itr} Loss: {total_loss/max(n_batches, 1):.8f}')

    return loss_list

In [10]:
# Behavior-cloning baseline: fit the learner on the original demonstrations only.
losses = train(model=bc, states=D_org_states, actions=D_org_actions)

Epoch 0 Loss: 0.12634795
Epoch 100 Loss: 0.04952276
Epoch 200 Loss: 0.01994292
Epoch 300 Loss: 0.01192390
Epoch 400 Loss: 0.01099494
Epoch 500 Loss: 0.01045828
Epoch 600 Loss: 0.01016442
Epoch 700 Loss: 0.01005926
Epoch 800 Loss: 0.00968910
Epoch 900 Loss: 0.00948630
Epoch 1000 Loss: 0.00934580
Epoch 1100 Loss: 0.00931209
Epoch 1200 Loss: 0.00901996
Epoch 1300 Loss: 0.00871296
Epoch 1400 Loss: 0.00862300
Epoch 1500 Loss: 0.00873899
Epoch 1600 Loss: 0.00846909
Epoch 1700 Loss: 0.00830629
Epoch 1800 Loss: 0.00812401
Epoch 1900 Loss: 0.00797418
Epoch 2000 Loss: 0.00781637
Epoch 2100 Loss: 0.00752714
Epoch 2200 Loss: 0.00749926
Epoch 2300 Loss: 0.00725974
Epoch 2400 Loss: 0.00708207
Epoch 2500 Loss: 0.00710755
Epoch 2600 Loss: 0.00683588
Epoch 2700 Loss: 0.00729693
Epoch 2800 Loss: 0.00671106
Epoch 2900 Loss: 0.00678624
Epoch 3000 Loss: 0.00656327


### Inference

In [11]:
# Build the environment from the metadata read out of the dataset file.
env_meta = FileUtils.get_env_metadata_from_dataset(dataset_path)
env = EnvUtils.create_env_from_metadata(
    env_meta=env_meta,
    render=False,           # no on-screen rendering
    render_offscreen=True,  # off-screen rendering to support rendering video frames
)

# Minimal observation spec handed to ObsUtils.
# NOTE(review): presumably this initialisation is needed before the env is used - confirm.
dummy_spec = {"obs": {"low_dim": ["robot0_eef_pos"], "rgb": []}}
ObsUtils.initialize_obs_utils_with_obs_specs(obs_modality_specs=dummy_spec)



Created environment with name Lift
Action size is 7


using obs modality: low_dim with keys: ['robot0_eef_pos']
using obs modality: rgb with keys: []


In [None]:
def rollout(model, env, rollout_horizon = 400, video_path=None, seed=40):
    """Run ``model`` in ``env`` for one episode and record what it saw and did.

    Args:
        model: callable policy mapping a 1-D float tensor of stacked
            observations to an action tensor.
        env: environment exposing ``reset()``, ``get_state()``,
            ``step(action)`` and ``is_success()``.
        rollout_horizon: maximum number of environment steps.
        video_path: accepted for call compatibility; not used by this function.
        seed: seeds NumPy's and PyTorch's global RNGs before the environment
            is reset. Note that this resets the global RNG state on every call.

    Returns:
        tuple: ``(total_reward, rollout_obss, rollout_actions)`` where
        ``total_reward`` is the sum of per-step rewards, ``rollout_obss`` is
        the list of state tensors fed to the model, and ``rollout_actions`` is
        the list of NumPy actions it produced (one entry per executed step).
    """
    total_reward=0
    select_keys=['object', 'robot0_eef_pos', 'robot0_eef_quat', 'robot0_gripper_qpos']

    np.random.seed(seed)
    torch.manual_seed(seed)

    obs = env.reset()
    # The returned simulator state is not used below; the call is kept so the
    # sequence of environment calls is unchanged.
    state_dict = env.get_state()

    # <obs, action> pairs are saved so the visited states can be relabelled by an expert
    rollout_obss = []
    rollout_actions=[]

    for step_i in range(rollout_horizon):
        select_obs=np.hstack( [obs[key] for key in select_keys] )
        state=torch.from_numpy(select_obs).float()

        # Inference only: no_grad avoids building an autograd graph on every step.
        with torch.no_grad():
            act = model(state).detach().cpu().numpy()

        rollout_obss.append(state)
        rollout_actions.append(act)

        next_obs, r, done, _ = env.step(act)

        # accumulate reward
        total_reward += r
        success = env.is_success()["task"]

        # stop as soon as the episode ends or the task is solved
        if done or success:
            break

        # update for next iter
        obs = deepcopy(next_obs)

    return total_reward, rollout_obss, rollout_actions

In [13]:
# n_rollout=20
# s=0
# for i in range(n_rollout):
#     r,_,_=rollout(bc, env, video_path=None)
#     s+=r
#     print(f'Rollout {i} Success: {r}')

# print(f'\nAverage Reward: {s/n_rollout:.2f}')

In [None]:
# torch.save(bc.state_dict(), "expert_lift_ph_95.pth")

# torch.save(bc, "bc_lift_expert_full_ph.pth")

In [15]:
# expert_bc

### load expert policy 

In [None]:
!git clone https://github.com/AssistiveRoboticsUNH/bc_tutorial.git

In [None]:
# Locate the pre-trained expert checkpoint: prefer a copy in the working directory,
# otherwise use the one inside the bc_tutorial repository cloned under /content (Colab).
expert_path = "bc_lift_expert_full_ph.pth"
if not os.path.exists(expert_path):
    expert_path = "/content/bc_tutorial/robomimic_tasks/bc_lift_expert_full_ph.pth"

# The checkpoint is a whole pickled module (not a state_dict), so it must be loaded with
# weights_only=False. Unpickling can execute arbitrary code: only load files you trust.
# map_location="cpu" keeps the expert on the CPU, matching the CPU tensors passed to it later.
expert_bc = torch.load(expert_path, map_location="cpu", weights_only=False)
expert_bc.eval()

  expert_bc = torch.load("bc_lift_expert_full_ph.pth")


RegNet(
  (net): Sequential(
    (0): Linear(in_features=19, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): ReLU()
  )
  (decoder): Linear(in_features=64, out_features=7, bias=True)
)

### Train DAgger

In [17]:
#initial dataset.
D_s = D_org_states
D_a = D_org_actions

n_rollout=20
 
for dagger_itr in range(5): 
    print(f"\ndagger itr: {dagger_itr}")
    print(f"collecting {n_rollout} rollouts")

    s, D_s_new, D_a_new = 0, [], [] 
    for i in tqdm( range(n_rollout) ):
        r,ss,aa=rollout(bc, env, video_path=None)
        D_s_new.extend(ss)
        D_a_new.extend(aa)
        s+=r
        # print(f'Rollout {i} Success: {r} new_ss: {len(ss)}')
    print(f'\nAverage Reward: {s/n_rollout:.2f} total new: {len(D_s_new)}')

    D_s_new = np.stack(D_s_new)
    D_a_new = np.stack(D_a_new)

    print("\nRelabelling actions ")
    state=torch.from_numpy(D_s_new).float()
    D_a_new = expert_bc(state).detach().cpu().numpy()  #relabelled action using expert


    print('aggregating dataset... ')
    D_s = np.vstack([D_s , D_s_new])
    D_a = np.vstack([D_a, D_a_new])

    print(f'Updated dataset: {D_s.shape} , {D_a.shape}')

    print('training ... ')
    losses = train(bc, D_s, D_a)


dagger itr: 0
collecting 20 rollouts


100%|██████████| 20/20 [01:42<00:00,  5.15s/it]



Average Reward: 0.25 total new: 6689

Relabelling actions 
aggregating dataset... 
Updated dataset: (8377, 19) , (8377, 7)
training ... 
Epoch 0 Loss: 0.04852031
Epoch 100 Loss: 0.01301017
Epoch 200 Loss: 0.00870638
Epoch 300 Loss: 0.00695585
Epoch 400 Loss: 0.00575945
Epoch 500 Loss: 0.00482371
Epoch 600 Loss: 0.00427983
Epoch 700 Loss: 0.00384200
Epoch 800 Loss: 0.00357729
Epoch 900 Loss: 0.00334985
Epoch 1000 Loss: 0.00316003
Epoch 1100 Loss: 0.00301083
Epoch 1200 Loss: 0.00288521
Epoch 1300 Loss: 0.00272427
Epoch 1400 Loss: 0.00260760
Epoch 1500 Loss: 0.00249570
Epoch 1600 Loss: 0.00240855
Epoch 1700 Loss: 0.00230840
Epoch 1800 Loss: 0.00226058
Epoch 1900 Loss: 0.00218271
Epoch 2000 Loss: 0.00213247
Epoch 2100 Loss: 0.00208134
Epoch 2200 Loss: 0.00205122
Epoch 2300 Loss: 0.00200033
Epoch 2400 Loss: 0.00198254
Epoch 2500 Loss: 0.00192703
Epoch 2600 Loss: 0.00192311
Epoch 2700 Loss: 0.00186622
Epoch 2800 Loss: 0.00187761
Epoch 2900 Loss: 0.00185235
Epoch 3000 Loss: 0.00180494

dagger itr: 1
collecting 20 rollouts

100%|██████████| 20/20 [01:33<00:00,  4.66s/it]



Average Reward: 0.35 total new: 5823

Relabelling actions 
aggregating dataset... 
Updated dataset: (14200, 19) , (14200, 7)
training ... 
Epoch 0 Loss: 0.17036391
Epoch 100 Loss: 0.00570416
Epoch 200 Loss: 0.00444431
Epoch 300 Loss: 0.00381565
Epoch 400 Loss: 0.00343662
Epoch 500 Loss: 0.00316255
Epoch 600 Loss: 0.00296795
Epoch 700 Loss: 0.00280661
Epoch 800 Loss: 0.00271333
Epoch 900 Loss: 0.00258774
Epoch 1000 Loss: 0.00251942
Epoch 1100 Loss: 0.00245590
Epoch 1200 Loss: 0.00236516
Epoch 1300 Loss: 0.00230745
Epoch 1400 Loss: 0.00225643
Epoch 1500 Loss: 0.00218715
Epoch 1600 Loss: 0.00216913
Epoch 1700 Loss: 0.00211289
Epoch 1800 Loss: 0.00207446
Epoch 1900 Loss: 0.00205388
Epoch 2000 Loss: 0.00200443
Epoch 2100 Loss: 0.00199523
Epoch 2200 Loss: 0.00197539
Epoch 2300 Loss: 0.00196592
Epoch 2400 Loss: 0.00189338
Epoch 2500 Loss: 0.00191083
Epoch 2600 Loss: 0.00187657
Epoch 2700 Loss: 0.00186087
Epoch 2800 Loss: 0.00183245
Epoch 2900 Loss: 0.00181395
Epoch 3000 Loss: 0.00180557

dagger itr: 2
collecting 20 rollouts

100%|██████████| 20/20 [01:29<00:00,  4.50s/it]



Average Reward: 0.35 total new: 5709

Relabelling actions 
aggregating dataset... 
Updated dataset: (19909, 19) , (19909, 7)
training ... 
Epoch 0 Loss: 0.01979364
Epoch 100 Loss: 0.00363477
Epoch 200 Loss: 0.00299972
Epoch 300 Loss: 0.00270542
Epoch 400 Loss: 0.00254873
Epoch 500 Loss: 0.00245105
Epoch 600 Loss: 0.00237050
Epoch 700 Loss: 0.00229540
Epoch 800 Loss: 0.00223223
Epoch 900 Loss: 0.00217266
Epoch 1000 Loss: 0.00213656
Epoch 1100 Loss: 0.00212119
Epoch 1200 Loss: 0.00208631
Epoch 1300 Loss: 0.00204508
Epoch 1400 Loss: 0.00201560
Epoch 1500 Loss: 0.00200054
Epoch 1600 Loss: 0.00198614
Epoch 1700 Loss: 0.00196589
Epoch 1800 Loss: 0.00192577
Epoch 1900 Loss: 0.00193222
Epoch 2000 Loss: 0.00190555
Epoch 2100 Loss: 0.00186881
Epoch 2200 Loss: 0.00188694
Epoch 2300 Loss: 0.00185316
Epoch 2400 Loss: 0.00184261
Epoch 2500 Loss: 0.00182280
Epoch 2600 Loss: 0.00181040
Epoch 2700 Loss: 0.00179870
Epoch 2800 Loss: 0.00178643
Epoch 2900 Loss: 0.00178275
Epoch 3000 Loss: 0.00177958

dagger itr: 3
collecting 20 rollouts

100%|██████████| 20/20 [01:08<00:00,  3.45s/it]



Average Reward: 0.55 total new: 4183

Relabelling actions 
aggregating dataset... 
Updated dataset: (24092, 19) , (24092, 7)
training ... 
Epoch 0 Loss: 0.05826057
Epoch 100 Loss: 0.00429957
Epoch 200 Loss: 0.00347803
Epoch 300 Loss: 0.00316219
Epoch 400 Loss: 0.00294381
Epoch 500 Loss: 0.00282564
Epoch 600 Loss: 0.00271598
Epoch 700 Loss: 0.00261980
Epoch 800 Loss: 0.00254547
Epoch 900 Loss: 0.00250291
Epoch 1000 Loss: 0.00246020
Epoch 1100 Loss: 0.00240868
Epoch 1200 Loss: 0.00236077
Epoch 1300 Loss: 0.00234381
Epoch 1400 Loss: 0.00229690
Epoch 1500 Loss: 0.00226088
Epoch 1600 Loss: 0.00223899
Epoch 1700 Loss: 0.00223163
Epoch 1800 Loss: 0.00221879
Epoch 1900 Loss: 0.00220434
Epoch 2000 Loss: 0.00217011
Epoch 2100 Loss: 0.00216394
Epoch 2200 Loss: 0.00212223
Epoch 2300 Loss: 0.00212612
Epoch 2400 Loss: 0.00210277
Epoch 2500 Loss: 0.00209009
Epoch 2600 Loss: 0.00209287
Epoch 2700 Loss: 0.00207503
Epoch 2800 Loss: 0.00205937
Epoch 2900 Loss: 0.00206275
Epoch 3000 Loss: 0.00203205

dagger itr: 4
collecting 20 rollouts

100%|██████████| 20/20 [00:38<00:00,  1.95s/it]



Average Reward: 0.85 total new: 2064

Relabelling actions 
aggregating dataset... 
Updated dataset: (26156, 19) , (26156, 7)
training ... 
Epoch 0 Loss: 0.01849106
Epoch 100 Loss: 0.00283443
Epoch 200 Loss: 0.00259822
Epoch 300 Loss: 0.00248698
Epoch 400 Loss: 0.00242445
Epoch 500 Loss: 0.00235309
Epoch 600 Loss: 0.00230224
Epoch 700 Loss: 0.00228584
Epoch 800 Loss: 0.00226342
Epoch 900 Loss: 0.00223026
Epoch 1000 Loss: 0.00220018
Epoch 1100 Loss: 0.00218036
Epoch 1200 Loss: 0.00218770
Epoch 1300 Loss: 0.00216330
Epoch 1400 Loss: 0.00213479
Epoch 1500 Loss: 0.00212338
Epoch 1600 Loss: 0.00210447
Epoch 1700 Loss: 0.00209640
Epoch 1800 Loss: 0.00208735
Epoch 1900 Loss: 0.00206631
Epoch 2000 Loss: 0.00207832
Epoch 2100 Loss: 0.00205897
Epoch 2200 Loss: 0.00206263
Epoch 2300 Loss: 0.00203268
Epoch 2400 Loss: 0.00202370
Epoch 2500 Loss: 0.00204858
Epoch 2600 Loss: 0.00203207
Epoch 2700 Loss: 0.00200695
Epoch 2800 Loss: 0.00199966
Epoch 2900 Loss: 0.00200836
Epoch 3000 Loss: 0.00199141


In [None]:
# Evaluate the learner after the DAgger loop: run n_rollout episodes and average the reward.
n_rollout=20
s=0
for i in range(n_rollout):
    # NOTE(review): every episode passes the same seed=42 to rollout - confirm this is intended.
    r,_,_=rollout(bc, env, video_path=None, seed=42)
    s+=r
    # r is the total reward returned by rollout; it is printed under the label "Success".
    print(f'Rollout {i} Success: {r}')

print(f'\nAverage Reward: {s/n_rollout:.2f}')

Rollout 0 Success: 1.0
Rollout 1 Success: 1.0
Rollout 2 Success: 0.0
Rollout 3 Success: 1.0
Rollout 4 Success: 1.0
Rollout 5 Success: 1.0
Rollout 6 Success: 1.0
Rollout 7 Success: 1.0
Rollout 8 Success: 1.0
Rollout 9 Success: 0.0
Rollout 10 Success: 1.0
Rollout 11 Success: 1.0
Rollout 12 Success: 1.0
Rollout 13 Success: 1.0
Rollout 14 Success: 0.0
Rollout 15 Success: 1.0
Rollout 16 Success: 0.0
Rollout 17 Success: 1.0
Rollout 18 Success: 1.0
Rollout 19 Success: 1.0

Average Reward: 0.80


In [None]:
#TODO: try with 3 different seeds