In [1]:
import robosuite as suite
from robosuite.wrappers.gym_wrapper import GymWrapper
import numpy as np
from stable_baselines3 import DDPG , SAC, PPO
from stable_baselines3.common.buffers import ReplayBuffer
from stable_baselines3.common.noise import NormalActionNoise
from sb3_contrib.common.wrappers import TimeFeatureWrapper
import argparse
modelpath = './models/PPO_3.0'

import torch
from torch.utils.data import Dataset,TensorDataset,random_split,DataLoader,SubsetRandomSampler
from torch.utils.data.dataset import Subset
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
import torchvision.models as models

In [2]:
# Pick the compute device once; later cells move models and tensors onto `device`.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("torch.device(cuda)")
    print("torch.cuda.device_count(): ", torch.cuda.device_count())
    for i in range(torch.cuda.device_count()):
        # Pass the index explicitly: without an argument get_device_name()
        # reports the *current* device, so every iteration printed the same name.
        print(torch.cuda.get_device_name(i))
    print("torch.cuda.current_device()", torch.cuda.current_device())
else:
    device = torch.device("cpu")
    print("torch.device(cpu)")

torch.device(cuda)
torch.cuda.device_count():  1
Tesla V100-SXM2-16GB
torch.cuda.current_device() 0


# Load Env

In [3]:
# Keyword options for the Maholo laboratory environment: both renderers are
# disabled and camera observations are turned off.
maholo_env_options = dict(
    has_renderer=False,
    has_offscreen_renderer=False,
    use_camera_obs=False,
    control_freq=50,
    render_camera="frontview",
    render_gpu_device_id=0,
    horizon=400,
)
env = suite.make("MaholoLaboratory", "Maholo", **maholo_env_options)

# Reset once and list every observation entry together with its shape.
initial_observation = env.reset()
for key in initial_observation:
    value = initial_observation[key]
    print(f"Key: {key}, Value.shape: {value.shape}", flush=True)

Key: robot0_joint_pos_cos, Value.shape: (15,)
Key: robot0_joint_pos_sin, Value.shape: (15,)
Key: robot0_joint_vel, Value.shape: (15,)
Key: robot0_right_eef_pos, Value.shape: (3,)
Key: robot0_right_eef_quat, Value.shape: (4,)
Key: robot0_right_gripper_qpos, Value.shape: (2,)
Key: robot0_right_gripper_qvel, Value.shape: (2,)
Key: robot0_left_eef_pos, Value.shape: (3,)
Key: robot0_left_eef_quat, Value.shape: (4,)
Key: robot0_left_gripper_qpos, Value.shape: (2,)
Key: robot0_left_gripper_qvel, Value.shape: (2,)
Key: tube_initial_pos, Value.shape: (3,)
Key: pipette_initial_pos, Value.shape: (3,)
Key: pipette004_pos, Value.shape: (3,)
Key: pipette004_quat, Value.shape: (4,)
Key: tube008_pos, Value.shape: (3,)
Key: tube008_quat, Value.shape: (4,)
Key: gripper1_to_pipette004, Value.shape: (3,)
Key: pipette004_to_tube008, Value.shape: (3,)
Key: tube008_to_initial, Value.shape: (3,)
Key: pipette004_to_initial, Value.shape: (3,)
Key: robot0_proprio-state, Value.shape: (67,)
Key: object-state, Valu

In [4]:
# Wrap the raw environment with GymWrapper, then TimeFeatureWrapper, and
# report the shape of the observation returned by a reset.
# NOTE: this cell rebinds `env`, so re-running it wraps the environment again.
env = TimeFeatureWrapper(GymWrapper(env))
wrapped_observation = env.reset()
print(f"TimeFeature GYM Wrapper obs.shape: {wrapped_observation.shape}", flush=True)

  logger.warn(


TimeFeature GYM Wrapper obs.shape: (100,)


# Load Model

In [5]:
# --- Model configuration -----------------------------------------------------
model_name = "SAC"
learning_rate = 0.001
total_timesteps = 400*1
policy_kwargs = {'net_arch' : [512, 512, 512, 512], 
                'n_critics' : 4,
                }
n_actions = env.robots[0].action_dim
print(f"n_actions: {n_actions}", flush=True)
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.2)

# DDPG and SAC are built with exactly the same arguments, so select the class
# from a table instead of duplicating the constructor call.
off_policy_algorithms = {"DDPG": DDPG, "SAC": SAC}

if model_name in off_policy_algorithms:
    model = off_policy_algorithms[model_name](
        policy="MlpPolicy", env=env, replay_buffer_class=ReplayBuffer, verbose=1,
        gamma=0.95, batch_size=4096, buffer_size=100000,
        learning_rate=learning_rate, action_noise=action_noise,
        policy_kwargs=policy_kwargs,
    )
elif model_name == "PPO":
    # NOTE: the following cells read `model.policy.actor` / `model.policy.critic`;
    # presumably those attributes only exist for the off-policy models — verify
    # before switching to PPO.
    model = PPO(policy="MlpPolicy", env=env, verbose=1, gamma=0.95, batch_size=4096)
else:
    # Fail loudly rather than leaving `model` undefined (or silently reusing a
    # stale `model` left in the kernel by an earlier run).
    raise ValueError(
        f"Unsupported model_name {model_name!r}; expected one of 'DDPG', 'SAC', 'PPO'"
    )

n_actions: 17
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [6]:
# Take the actor network out of the policy and place it on the selected device.
# nn.Module.to() returns the module itself, so the assignment can be chained;
# the bare name on the last line keeps the cell's rich display of the network.
actor_model = model.policy.actor.to(device)
actor_model

Actor(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (latent_pi): Sequential(
    (0): Linear(in_features=100, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=512, bias=True)
    (5): ReLU()
    (6): Linear(in_features=512, out_features=512, bias=True)
    (7): ReLU()
  )
  (mu): Linear(in_features=512, out_features=17, bias=True)
  (log_std): Linear(in_features=512, out_features=17, bias=True)
)

In [7]:
# Take the critic network out of the policy and place it on the selected device.
# nn.Module.to() returns the module itself, so the assignment can be chained;
# the bare name on the last line keeps the cell's rich display of the network.
critic_model = model.policy.critic.to(device)
critic_model

ContinuousCritic(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (qf0): Sequential(
    (0): Linear(in_features=117, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=512, bias=True)
    (5): ReLU()
    (6): Linear(in_features=512, out_features=512, bias=True)
    (7): ReLU()
    (8): Linear(in_features=512, out_features=1, bias=True)
  )
  (qf1): Sequential(
    (0): Linear(in_features=117, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=512, bias=True)
    (5): ReLU()
    (6): Linear(in_features=512, out_features=512, bias=True)
    (7): ReLU()
    (8): Linear(in_features=512, out_features=1, bias=True)
  )
  (qf2): Sequential(
    (0): Linear(in_features=117, out_features=512, bias=True)
    (1): Re

In [8]:
# Smoke-test the actor: feed a single all-ones observation of width 100 and
# print the shape of what comes back.
test_input = torch.ones((1, 100), device=device)
test_output = actor_model(test_input)
print(test_output.shape)

torch.Size([1, 17])


In [9]:
# Smoke-test the critic: feed one all-ones (state, action) pair and print the
# scalar produced by each element of the returned collection.
state_input = torch.ones((1, 100), device=device)
action_input = torch.ones((1, 17), device=device)

test_output = critic_model(state_input, action_input)
for critic_number, value in enumerate(test_output, start=1):
    print(f"Critic {critic_number} Output:", value.item())

Critic 1 Output: 0.0030027865432202816
Critic 2 Output: 0.02625722438097
Critic 3 Output: 0.008443258702754974
Critic 4 Output: -0.03417135775089264


# Train Model

In [27]:
# Load the recorded sequences from disk and report the shape of each array.
action_seq = np.load("./collectdata/action_seq.npy")
obs_seq = np.load("./collectdata/obs_seq.npy")
reward_seq = np.load("./collectdata/reward_seq.npy")
for recorded_array in (action_seq, obs_seq, reward_seq):
    print(recorded_array.shape)

(10, 17)
(10, 100)
(10,)


In [14]:
class set_dataset(Dataset):
    """Paired (input, target) dataset held as float32 tensors.

    Args:
        x: Array-like of inputs; converted with ``torch.tensor(..., dtype=float32)``.
        y: Array-like of targets; converted the same way. Must contain the same
            number of samples (first-dimension length) as ``x``.

    Raises:
        ValueError: If ``x`` and ``y`` do not have the same number of samples.
    """

    def __init__(self, x, y):
        self.x = torch.tensor(x, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
        # Catch misaligned data up front; otherwise the mismatch only shows up
        # later as an IndexError inside the DataLoader (or is silently truncated
        # when y is the longer one, because __len__ follows x).
        if len(self.x) != len(self.y):
            raise ValueError(
                f"x and y must have the same number of samples, "
                f"got {len(self.x)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of samples (length of ``x``)."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the ``(x, y)`` pair stored at position ``idx``."""
        return self.x[idx], self.y[idx]

# Observations are the inputs and the recorded actions are the targets.
train_data = set_dataset(x=obs_seq, y=action_seq)
# Shuffled mini-batches of up to 64 samples.
dataloader_train = DataLoader(dataset=train_data, shuffle=True, batch_size=64)

In [15]:
# Mean-squared error between the actor output and the recorded action.
loss_function = nn.MSELoss()

# Adam over the actor's parameters; the scheduler multiplies the learning rate
# by 0.99 on every scheduler.step() call.
optimizer = torch.optim.Adam(params=actor_model.parameters(), lr=1e-5)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=0.99)

def train(actor_model, device, dataloader_train, loss_function, optimizer):
    """Run one supervised epoch and return the mean per-batch loss.

    Args:
        actor_model: Module being fitted; called as ``actor_model(x)``.
        device: Device each batch is moved to before the forward pass.
        dataloader_train: Iterable yielding ``(x, y)`` tensor pairs.
        loss_function: Callable ``loss_function(output, y)`` returning a scalar
            tensor that supports ``.backward()``.
        optimizer: Optimizer that owns the parameters to update.

    Returns:
        Mean of the per-batch loss values, or NaN when the loader yields no
        batches.
    """
    losses_train = []
    # No optimizer.step() before the loop: there is no fresh gradient yet, and
    # from the second epoch on it would re-apply the stale gradients left over
    # from the previous epoch's last batch.
    actor_model.train()
    # NOTE(review): if `actor_model` is the SAC actor taken from
    # `model.policy.actor`, its forward pass may return *sampled* actions unless
    # called deterministically — confirm, since that would add noise to the loss.
    for x, y in dataloader_train:
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        # Call the module rather than .forward() so registered hooks run.
        output = actor_model(x)
        loss = loss_function(output, y)
        loss.backward()
        optimizer.step()
        losses_train.append(loss.item())
    if not losses_train:
        # Empty loader: report NaN explicitly instead of np.mean([]) + warning.
        return float("nan")
    return np.mean(losses_train)

In [21]:
# Fit the actor for a fixed number of epochs, decaying the learning rate once
# per epoch and logging the mean training loss.
n_epochs = 1000
epoch = 0
while epoch < n_epochs:
    loss_train = train(actor_model, device, dataloader_train, loss_function, optimizer)
    scheduler.step()
    print('EPOCH: {}, Train Loss: {:.3f}'.format(epoch, loss_train))
    epoch += 1

EPOCH: 0, Train Loss: 0.602
EPOCH: 1, Train Loss: 0.582
EPOCH: 2, Train Loss: 0.647
EPOCH: 3, Train Loss: 0.533
EPOCH: 4, Train Loss: 0.705
EPOCH: 5, Train Loss: 0.653
EPOCH: 6, Train Loss: 0.618
EPOCH: 7, Train Loss: 0.635
EPOCH: 8, Train Loss: 0.659
EPOCH: 9, Train Loss: 0.636
EPOCH: 10, Train Loss: 0.610
EPOCH: 11, Train Loss: 0.733
EPOCH: 12, Train Loss: 0.604
EPOCH: 13, Train Loss: 0.625
EPOCH: 14, Train Loss: 0.604
EPOCH: 15, Train Loss: 0.637
EPOCH: 16, Train Loss: 0.646
EPOCH: 17, Train Loss: 0.629
EPOCH: 18, Train Loss: 0.597
EPOCH: 19, Train Loss: 0.700
EPOCH: 20, Train Loss: 0.533
EPOCH: 21, Train Loss: 0.611
EPOCH: 22, Train Loss: 0.613
EPOCH: 23, Train Loss: 0.579
EPOCH: 24, Train Loss: 0.687
EPOCH: 25, Train Loss: 0.583
EPOCH: 26, Train Loss: 0.728
EPOCH: 27, Train Loss: 0.763
EPOCH: 28, Train Loss: 0.687
EPOCH: 29, Train Loss: 0.775
EPOCH: 30, Train Loss: 0.692
EPOCH: 31, Train Loss: 0.558
EPOCH: 32, Train Loss: 0.769
EPOCH: 33, Train Loss: 0.549
EPOCH: 34, Train Loss: 0

EPOCH: 295, Train Loss: 0.570
EPOCH: 296, Train Loss: 0.510
EPOCH: 297, Train Loss: 0.494
EPOCH: 298, Train Loss: 0.448
EPOCH: 299, Train Loss: 0.552
EPOCH: 300, Train Loss: 0.515
EPOCH: 301, Train Loss: 0.545
EPOCH: 302, Train Loss: 0.576
EPOCH: 303, Train Loss: 0.449
EPOCH: 304, Train Loss: 0.551
EPOCH: 305, Train Loss: 0.468
EPOCH: 306, Train Loss: 0.530
EPOCH: 307, Train Loss: 0.554
EPOCH: 308, Train Loss: 0.541
EPOCH: 309, Train Loss: 0.571
EPOCH: 310, Train Loss: 0.602
EPOCH: 311, Train Loss: 0.458
EPOCH: 312, Train Loss: 0.473
EPOCH: 313, Train Loss: 0.523
EPOCH: 314, Train Loss: 0.548
EPOCH: 315, Train Loss: 0.582
EPOCH: 316, Train Loss: 0.508
EPOCH: 317, Train Loss: 0.494
EPOCH: 318, Train Loss: 0.511
EPOCH: 319, Train Loss: 0.606
EPOCH: 320, Train Loss: 0.500
EPOCH: 321, Train Loss: 0.544
EPOCH: 322, Train Loss: 0.526
EPOCH: 323, Train Loss: 0.477
EPOCH: 324, Train Loss: 0.476
EPOCH: 325, Train Loss: 0.534
EPOCH: 326, Train Loss: 0.518
EPOCH: 327, Train Loss: 0.604
EPOCH: 328

EPOCH: 583, Train Loss: 0.495
EPOCH: 584, Train Loss: 0.483
EPOCH: 585, Train Loss: 0.490
EPOCH: 586, Train Loss: 0.567
EPOCH: 587, Train Loss: 0.467
EPOCH: 588, Train Loss: 0.494
EPOCH: 589, Train Loss: 0.487
EPOCH: 590, Train Loss: 0.482
EPOCH: 591, Train Loss: 0.519
EPOCH: 592, Train Loss: 0.510
EPOCH: 593, Train Loss: 0.539
EPOCH: 594, Train Loss: 0.550
EPOCH: 595, Train Loss: 0.492
EPOCH: 596, Train Loss: 0.525
EPOCH: 597, Train Loss: 0.502
EPOCH: 598, Train Loss: 0.461
EPOCH: 599, Train Loss: 0.441
EPOCH: 600, Train Loss: 0.494
EPOCH: 601, Train Loss: 0.503
EPOCH: 602, Train Loss: 0.537
EPOCH: 603, Train Loss: 0.556
EPOCH: 604, Train Loss: 0.632
EPOCH: 605, Train Loss: 0.618
EPOCH: 606, Train Loss: 0.529
EPOCH: 607, Train Loss: 0.581
EPOCH: 608, Train Loss: 0.512
EPOCH: 609, Train Loss: 0.615
EPOCH: 610, Train Loss: 0.495
EPOCH: 611, Train Loss: 0.560
EPOCH: 612, Train Loss: 0.553
EPOCH: 613, Train Loss: 0.560
EPOCH: 614, Train Loss: 0.494
EPOCH: 615, Train Loss: 0.482
EPOCH: 616

EPOCH: 885, Train Loss: 0.494
EPOCH: 886, Train Loss: 0.540
EPOCH: 887, Train Loss: 0.554
EPOCH: 888, Train Loss: 0.525
EPOCH: 889, Train Loss: 0.482
EPOCH: 890, Train Loss: 0.476
EPOCH: 891, Train Loss: 0.543
EPOCH: 892, Train Loss: 0.443
EPOCH: 893, Train Loss: 0.433
EPOCH: 894, Train Loss: 0.511
EPOCH: 895, Train Loss: 0.554
EPOCH: 896, Train Loss: 0.517
EPOCH: 897, Train Loss: 0.518
EPOCH: 898, Train Loss: 0.507
EPOCH: 899, Train Loss: 0.552
EPOCH: 900, Train Loss: 0.429
EPOCH: 901, Train Loss: 0.475
EPOCH: 902, Train Loss: 0.483
EPOCH: 903, Train Loss: 0.457
EPOCH: 904, Train Loss: 0.544
EPOCH: 905, Train Loss: 0.594
EPOCH: 906, Train Loss: 0.503
EPOCH: 907, Train Loss: 0.495
EPOCH: 908, Train Loss: 0.531
EPOCH: 909, Train Loss: 0.431
EPOCH: 910, Train Loss: 0.523
EPOCH: 911, Train Loss: 0.606
EPOCH: 912, Train Loss: 0.533
EPOCH: 913, Train Loss: 0.539
EPOCH: 914, Train Loss: 0.495
EPOCH: 915, Train Loss: 0.506
EPOCH: 916, Train Loss: 0.513
EPOCH: 917, Train Loss: 0.446
EPOCH: 918

In [28]:
# Persist only the actor's parameters (its state_dict), not the whole model.
savepath = "./models/maholo_SAC_actor_weights.pth"
actor_weights = actor_model.state_dict()
torch.save(actor_weights, savepath)
print("Saved to", savepath)

Saved to ./models/maholo_SAC_actor_weights.pth


***

# Render

In [4]:
# Build an on-screen "Lift"/"Panda" environment for visual playback and wrap it
# with the same GymWrapper and TimeFeatureWrapper used earlier in the notebook.
render_env_options = dict(
    env_name="Lift",
    robots="Panda",
    has_renderer=True,
    has_offscreen_renderer=True,
    use_camera_obs=False,
    control_freq=50,
    horizon=50,
)
env = TimeFeatureWrapper(GymWrapper(suite.make(**render_env_options)))

# NOTE(review): `modelpath` is './models/PPO_3.0' but the checkpoint is loaded
# with DDPG.load — confirm the saved algorithm matches the loader class.
model = DDPG.load(modelpath, env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [5]:
# Play one episode with the deterministic policy, rendering every step.
done = False
obs = env.reset()
try:
    while not done:
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, done, _ = env.step(action)
        env.unwrapped.render()
finally:
    # Always close the environment, even if predict/step/render raises, so the
    # renderer is not left open in the kernel.
    env.close()