In [39]:
import robosuite as suite
from robosuite.wrappers.gym_wrapper import GymWrapper
import numpy as np
from stable_baselines3 import DDPG , SAC, PPO
from stable_baselines3.common.buffers import ReplayBuffer
from stable_baselines3.common.noise import NormalActionNoise
from sb3_contrib.common.wrappers import TimeFeatureWrapper
import argparse
# Checkpoint path shared by the notebook: the training section saves the model
# here and the render section loads it back from here.
# NOTE(review): the file name says "PPO" while the training cell currently sets
# model_name = "SAC" — confirm the intended path.
modelpath = './models/PPO_3.0'

import torch
from torch.utils.data import Dataset,TensorDataset,random_split,DataLoader,SubsetRandomSampler
from torch.utils.data.dataset import Subset
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
import torchvision.models as models

In [40]:
# Pick the torch device used by the rest of the notebook: CUDA when it is
# available, otherwise the CPU. The chosen configuration is printed.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("torch.device(cuda)")
    print("torch.cuda.device_count(): ", torch.cuda.device_count())
    for i in range(torch.cuda.device_count()):
        # Pass the index explicitly: without an argument get_device_name()
        # reports the current device on every iteration, not device i.
        print(torch.cuda.get_device_name(i))
    print("torch.cuda.current_device()", torch.cuda.current_device())
else:
    device = torch.device("cpu")
    print("torch.device(cpu)")

torch.device(cuda)
torch.cuda.device_count():  1
Tesla V100-SXM2-16GB
torch.cuda.current_device() 0


# Train

In [11]:
# Create the "MaholoLaboratory" environment with the "Maholo" robot, with both
# renderers and camera observations switched off.
sim_options = dict(
    has_renderer=False,
    has_offscreen_renderer=False,
    use_camera_obs=False,
    control_freq=50,
    render_camera="frontview",
    render_gpu_device_id=0,
    horizon=400,
)
env = suite.make("MaholoLaboratory", "Maholo", **sim_options)

# List every entry of the observation returned by reset, with its shape.
initial_obs = env.reset()
for key, value in initial_obs.items():
    print(f"Key: {key}, Value.shape: {value.shape}", flush=True)

Key: robot0_joint_pos_cos, Value.shape: (15,)
Key: robot0_joint_pos_sin, Value.shape: (15,)
Key: robot0_joint_vel, Value.shape: (15,)
Key: robot0_right_eef_pos, Value.shape: (3,)
Key: robot0_right_eef_quat, Value.shape: (4,)
Key: robot0_right_gripper_qpos, Value.shape: (2,)
Key: robot0_right_gripper_qvel, Value.shape: (2,)
Key: robot0_left_eef_pos, Value.shape: (3,)
Key: robot0_left_eef_quat, Value.shape: (4,)
Key: robot0_left_gripper_qpos, Value.shape: (2,)
Key: robot0_left_gripper_qvel, Value.shape: (2,)
Key: tube_initial_pos, Value.shape: (3,)
Key: pipette_initial_pos, Value.shape: (3,)
Key: pipette004_pos, Value.shape: (3,)
Key: pipette004_quat, Value.shape: (4,)
Key: tube008_pos, Value.shape: (3,)
Key: tube008_quat, Value.shape: (4,)
Key: gripper1_to_pipette004, Value.shape: (3,)
Key: pipette004_to_tube008, Value.shape: (3,)
Key: tube008_to_initial, Value.shape: (3,)
Key: pipette004_to_initial, Value.shape: (3,)
Key: robot0_proprio-state, Value.shape: (67,)
Key: object-state, Valu

In [12]:
# Wrap the simulation in GymWrapper and then TimeFeatureWrapper, and print the
# shape of the observation the wrapped env returns on reset.
# Re-running this cell wraps the already wrapped env again.
env = TimeFeatureWrapper(GymWrapper(env))
print(f"TimeFeature GYM Wrapper obs.shape: {env.reset().shape}", flush=True)

TimeFeature GYM Wrapper obs.shape: (100,)


In [29]:
# Training configuration. `model_name` selects which algorithm is constructed
# below; the remaining values are passed to its constructor.
model_name = "SAC"
learning_rate = 0.001
total_timesteps = 400*1
policy_kwargs = {'net_arch' : [512, 512, 512, 512], 
                'n_critics' : 4,
                }

# Exploration noise is sized from the first robot's action dimension.
n_actions = env.robots[0].action_dim
print(f"n_actions: {n_actions}", flush=True)
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.2)

if model_name == "DDPG":
    model = DDPG(policy="MlpPolicy", env=env, replay_buffer_class=ReplayBuffer, verbose=1, gamma=0.95, batch_size=4096, 
                buffer_size=100000, learning_rate=learning_rate, action_noise=action_noise, policy_kwargs=policy_kwargs)
elif model_name == "SAC":
    model = SAC(policy="MlpPolicy", env=env, replay_buffer_class=ReplayBuffer, verbose=1, gamma = 0.95, batch_size=4096, 
                buffer_size=100000, learning_rate=learning_rate, action_noise=action_noise, policy_kwargs=policy_kwargs)
elif model_name == "PPO":
    # NOTE(review): this branch does not pass learning_rate or policy_kwargs,
    # unlike the two branches above — confirm that is intentional.
    model = PPO(policy="MlpPolicy", env=env, verbose=1, gamma=0.95, batch_size=4096)
else:
    # Fail loudly rather than leaving `model` undefined (or silently reusing a
    # model left in the kernel by an earlier run of this cell).
    raise ValueError(f"Unknown model_name {model_name!r}; expected 'DDPG', 'SAC' or 'PPO'.")

n_actions: 17
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [33]:
# Show the agent object, then its actor and critic networks; a rule of '#'
# characters is emitted between the actor and the critic printouts.
rule = "#############################################################"
print(model)
print(model.actor, end="\n" + rule + "\n")
print(model.critic)

<stable_baselines3.sac.sac.SAC object at 0x149521d85120>
Actor(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (latent_pi): Sequential(
    (0): Linear(in_features=100, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=512, bias=True)
    (5): ReLU()
    (6): Linear(in_features=512, out_features=512, bias=True)
    (7): ReLU()
  )
  (mu): Linear(in_features=512, out_features=17, bias=True)
  (log_std): Linear(in_features=512, out_features=17, bias=True)
)
#############################################################
ContinuousCritic(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (qf0): Sequential(
    (0): Linear(in_features=117, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_fe

In [52]:
# Take the actor network from the policy and place it on the selected device.
# nn.Module.to returns the module itself, so the bare name on the last line
# still displays the actor as the cell output.
actor_model = model.policy.actor.to(device)
actor_model

Actor(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (latent_pi): Sequential(
    (0): Linear(in_features=100, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=512, bias=True)
    (5): ReLU()
    (6): Linear(in_features=512, out_features=512, bias=True)
    (7): ReLU()
  )
  (mu): Linear(in_features=512, out_features=17, bias=True)
  (log_std): Linear(in_features=512, out_features=17, bias=True)
)

In [53]:
# Take the critic network from the policy and place it on the selected device.
# nn.Module.to returns the module itself, so the bare name on the last line
# still displays the critic as the cell output.
critic_model = model.policy.critic.to(device)
critic_model

ContinuousCritic(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (qf0): Sequential(
    (0): Linear(in_features=117, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=512, bias=True)
    (5): ReLU()
    (6): Linear(in_features=512, out_features=512, bias=True)
    (7): ReLU()
    (8): Linear(in_features=512, out_features=1, bias=True)
  )
  (qf1): Sequential(
    (0): Linear(in_features=117, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=512, bias=True)
    (5): ReLU()
    (6): Linear(in_features=512, out_features=512, bias=True)
    (7): ReLU()
    (8): Linear(in_features=512, out_features=1, bias=True)
  )
  (qf2): Sequential(
    (0): Linear(in_features=117, out_features=512, bias=True)
    (1): Re

In [61]:
# Smoke-test the actor: feed one all-ones input of width 100, created directly
# on the selected device, and print the size of what comes back.
test_input = torch.ones(1, 100, device=device)
test_output = actor_model(test_input)
print(test_output.shape)

torch.Size([1, 17])


In [64]:
# Smoke-test the critic with an all-ones state (width 100) and an all-ones
# action (width 17), both created directly on the selected device.
state_input = torch.ones(1, 100, device=device)
action_input = torch.ones(1, 17, device=device)

# The critic call yields an iterable of outputs; print each one as a Python
# scalar, numbered from 1.
test_output = critic_model(state_input, action_input)
for i, q_value in enumerate(test_output):
    print(f"Critic {i + 1} Output:", q_value.item())

Critic 1 Output: -0.005036994814872742
Critic 2 Output: -0.008083839900791645
Critic 3 Output: -0.02584124729037285
Critic 4 Output: -0.039726100862026215


In [9]:
# Train the model and write the checkpoint to `modelpath`.
# The step budget comes from `total_timesteps` set in the configuration cell
# rather than a separate hard-coded literal, so there is a single place to tune it.
model.learn(total_timesteps=total_timesteps)
model.save(modelpath)
print("Saved to ", modelpath)

Saved to  ./models/DDPG


***

# Render

In [4]:
# Build an environment with the on-screen renderer enabled and wrap it the same
# way as the training environment (GymWrapper, then TimeFeatureWrapper).
# NOTE(review): this is "Lift" with a "Panda" robot, whereas the training
# section uses "MaholoLaboratory" with "Maholo" — confirm the saved model's
# observation/action spaces are compatible with this environment.
env = suite.make(
    env_name="Lift",
    robots="Panda",
    has_renderer=True,
    has_offscreen_renderer=True,
    use_camera_obs=False,
    control_freq=50,
    horizon = 50,
)
env = GymWrapper(env)
env = TimeFeatureWrapper(env)

# Load the checkpoint with the algorithm class selected by `model_name` in the
# training configuration, instead of a hard-coded class that can drift out of
# sync with what was actually trained and saved.
model_classes = {"DDPG": DDPG, "SAC": SAC, "PPO": PPO}
model = model_classes[model_name].load(modelpath, env = env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [5]:
# Roll out one episode with the loaded model, rendering every step.
# The environment is closed in a `finally` clause so it is released even when
# prediction, stepping or rendering raises part-way through the episode.
done = False
try:
    obs = env.reset()
    while not done:
        # deterministic=True asks the model for its deterministic action.
        action, _states = model.predict(obs, deterministic = True)
        # The fourth return value is bound to `info` rather than `_`, which
        # would shadow IPython's last-output variable in the notebook.
        obs, reward, done, info = env.step(action)
        env.unwrapped.render()
finally:
    env.close()