In [2]:
# import robosuite as suite
# from robosuite import load_controller_config
# from robosuite.wrappers.gym_wrapper import GymWrapper
import numpy as np
np.set_printoptions(precision=4, suppress=True)
from stable_baselines3 import DDPG , SAC, PPO
from stable_baselines3.common.buffers import ReplayBuffer
from stable_baselines3.common.noise import NormalActionNoise
from sb3_contrib.common.wrappers import TimeFeatureWrapper
import argparse, os, glob

import torch
from torch.utils.data import Dataset,TensorDataset,random_split,DataLoader,SubsetRandomSampler
from torch.utils.data.dataset import Subset
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
import torchvision.models as models

In [2]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("torch.device(cuda)")
    print("torch.cuda.device_count(): ", torch.cuda.device_count())
    for i in range(torch.cuda.device_count()):
        print(torch.cuda.get_device_name())
    print("torch.cuda.current_device()", torch.cuda.current_device())
else:
    device = torch.device("cpu")
    print("torch.device(cpu)")

torch.device(cuda)
torch.cuda.device_count():  1
Tesla V100-SXM2-16GB
torch.cuda.current_device() 0


# Load Env

In [72]:
controller_config = load_controller_config(default_controller="JOINT_POSITION")
env = suite.make(
    "MaholoLaboratory_eefR_Move2Pipette",
    "Maholo",
    controller_configs=controller_config,
    has_renderer=False,
    has_offscreen_renderer=False,
    use_camera_obs=False,
    control_freq=50,
    render_camera="frontview",
    render_gpu_device_id=0,
    horizon=500,
    initialization_noise=None,
)
for key,value in env.reset().items():
    print(f"Key: {key}, Value.shape: {value.shape}", flush=True)

Key: robot0_joint_pos_cos, Value.shape: (15,)
Key: robot0_joint_pos_sin, Value.shape: (15,)
Key: robot0_joint_vel, Value.shape: (15,)
Key: robot0_right_eef_pos, Value.shape: (3,)
Key: robot0_right_eef_quat, Value.shape: (4,)
Key: robot0_right_gripper_qpos, Value.shape: (2,)
Key: robot0_right_gripper_qvel, Value.shape: (2,)
Key: robot0_left_eef_pos, Value.shape: (3,)
Key: robot0_left_eef_quat, Value.shape: (4,)
Key: robot0_left_gripper_qpos, Value.shape: (2,)
Key: robot0_left_gripper_qvel, Value.shape: (2,)
Key: tube008_initial_pos, Value.shape: (3,)
Key: tube008_pos, Value.shape: (3,)
Key: tube008_quat, Value.shape: (4,)
Key: pipette004_initial_pos, Value.shape: (3,)
Key: pipette004_pos, Value.shape: (3,)
Key: pipette004_quat, Value.shape: (4,)
Key: target_pos, Value.shape: (3,)
Key: target_quat, Value.shape: (4,)
Key: g1_to_target_pos, Value.shape: (3,)
Key: g1_to_target_quat, Value.shape: ()
Key: g0_to_target_pos, Value.shape: (3,)
Key: g0_to_target_quat, Value.shape: ()
Key: robot0_

In [73]:
env = GymWrapper(env)
env = TimeFeatureWrapper(env)
print(f"TimeFeature GYM Wrapper obs.shape: {env.reset().shape}", flush=True)

TimeFeature GYM Wrapper obs.shape: (103,)


# Policy Network

In [20]:
input_size = env.reset().shape[0]
output_size = 17

## 1. Transformer

In [34]:
transformer_model = nn.Transformer(nhead=16, num_encoder_layers=12)
src = torch.rand((10, 32, 512))
tgt = torch.rand((20, 32, 512))
out = transformer_model(src, tgt)
print(out.shape)

torch.Size([20, 32, 512])


In [45]:
class TransformerNet(nn.Module):
    def __init__(self, d_model, nhead, num_layers):
        super(TransformerNet, self).__init__()
        
        self.embedding_src = nn.Linear(input_size, d_model)
        self.embedding_tgt = nn.Linear(output_size, d_model)
        
        self.transformer = nn.Transformer(d_model, nhead, num_layers)
        self.fc = nn.Linear(d_model, output_size)

    def forward(self, src, tgt):
        # Embedding layers
        src = self.embedding_src(src).unsqueeze(0)  # Add sequence dimension
        tgt = self.embedding_tgt(tgt).unsqueeze(0)  # Add sequence dimension
        
        # Transformer
        output = self.transformer(src, tgt)
        
        # Final layer to get the action sequence
        return self.fc(output.squeeze(0))

# Example usage:
d_model = 512
nhead = 16
num_layers = 12

model = TransformerNet(d_model, nhead, num_layers)

src = torch.rand(input_size)
tgt = torch.rand(output_size)
action = model(src, tgt)
print(action.shape)  # torch.Size([17])

torch.Size([17])


## 2. ResNet

In [None]:
from stable_baselines3 import SAC
from stable_baselines3.common.policies import ActorCriticPolicy
from gym import spaces

## PPO

In [45]:
class ResidualBlock(nn.Module):
    def __init__(self, dim):
        super(ResidualBlock, self).__init__()
        self.block = nn.Sequential(
            nn.Linear(dim, dim),
            nn.ReLU(),
            nn.Linear(dim, dim)
        )
    
    def forward(self, x):
        return x + self.block(x)

class ResNet(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim=512):
        super(ResNet, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            ResidualBlock(hidden_dim),
            nn.ReLU(),
            ResidualBlock(hidden_dim),
            nn.ReLU(),
            ResidualBlock(hidden_dim),
            nn.ReLU(),
            ResidualBlock(hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )
    
    def forward(self, x):
        return self.network(x)

In [None]:
class CustomSACPolicy(ActorCriticPolicy):
    def __init__(self, observation_space: spaces.Space, action_space: spaces.Space, lr_schedule: Callable[[float], float], *args, **kwargs):
        super(CustomSACPolicy, self).__init__(observation_space, action_space, lr_schedule, *args, **kwargs)
        self._build_custom_extractor(lr_schedule)

    def _build_custom_extractor(self, lr_schedule):
        # Define custom actor network
        self.actor = ResNet(self.observation_space.shape[0], self.action_space.shape[0])
        # Define optimizer for the actor
        self.actor.optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr_schedule(1))
        
        # Define critic networks for SAC (Q-values)
        self.critic = ResNet(self.observation_space.shape[0] + self.action_space.shape[0], 1)
        self.critic.optimizer = torch.optim.Adam(self.critic.parameters(), lr=lr_schedule(1))
        
        self.critic_target = ResNet(self.observation_space.shape[0] + self.action_space.shape[0], 1)
        self.critic_target.load_state_dict(self.critic.state_dict())

    def forward(self, obs: torch.Tensor, deterministic: bool = False):
        # This method should return the action and the actor's latent
        # Here we overwrite the default behavior to use our custom ResNet
        return self.actor(obs), None

In [41]:
# 使用自定义策略训练模型
model = PPO(CustomActorCriticPolicy, env, verbose=1)
model.learn(500*10)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 2e+03    |
|    ep_rew_mean     | -447     |
| time/              |          |
|    fps             | 55       |
|    iterations      | 1        |
|    time_elapsed    | 36       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 2e+03       |
|    ep_rew_mean          | -444        |
| time/                   |             |
|    fps                  | 54          |
|    iterations           | 2           |
|    time_elapsed         | 75          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.014249872 |
|    clip_fraction        | 0.204       |
|    clip_range           | 0.2         |
|    entropy_loss  

<stable_baselines3.ppo.ppo.PPO at 0x14e2de84d7b0>

## SAC

In [78]:
from stable_baselines3 import SAC
from stable_baselines3.common.policies import ActorCriticPolicy
from gym import spaces
import torch as th

class ActorNetwork(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim=512):
        super(ActorNetwork, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            ResidualBlock(hidden_dim),
            nn.ReLU(),
            ResidualBlock(hidden_dim),
            nn.Linear(hidden_dim, output_dim)
        )
        self.action_scale = 1.0
        self.action_bias = 0.0

    def action_log_prob(self, x):
        # Get the logits for the actions
        logits = self.network(x)
        # Create a distribution from the logits
        dist = th.distributions.Normal(logits, 1.0)
        # Sample an action from the distribution
        action = dist.rsample()
        # Scale and shift the action to the desired range
        action = th.tanh(action) * self.action_scale + self.action_bias
        # Calculate the log probability of the action
        log_prob = dist.log_prob(action).sum(axis=-1)
        # Return the action and the log probability
        return action, log_prob

    def forward(self, x):
        return self.network(x)

class CriticNetwork(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim=512):
        super(CriticNetwork, self).__init__()
        self.q1_network = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            ResidualBlock(hidden_dim),
            nn.ReLU(),
            ResidualBlock(hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )
        self.q2_network = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            ResidualBlock(hidden_dim),
            nn.ReLU(),
            ResidualBlock(hidden_dim),
            nn.Linear(hidden_dim, output_dim)
        )
    
    def forward(self, obs, action):
        q1 = self.q1_network(th.cat([obs, action], dim=1))
        q2 = self.q2_network(th.cat([obs, action], dim=1))
        return q1, q2


class CustomSACPolicy(ActorCriticPolicy):
    def __init__(self, observation_space: spaces.Space, action_space: spaces.Space, lr_schedule: Callable[[float], float], *args, **kwargs):
        super(CustomSACPolicy, self).__init__(observation_space, action_space, lr_schedule, *args, **kwargs)
        self._build_custom_extractor(lr_schedule)

    def _build_custom_extractor(self, lr_schedule):
        # Define actor network
        self.actor = ActorNetwork(self.observation_space.shape[0], self.action_space.shape[0])
        self.actor.optimizer = th.optim.Adam(self.actor.parameters(), lr=lr_schedule(1))
        
        # Define critic networks for SAC (Q-values)
        self.critic = CriticNetwork(self.observation_space.shape[0]+self.action_space.shape[0], 1)
        self.critic.optimizer = th.optim.Adam(self.critic.parameters(), lr=lr_schedule(1))
        
        self.critic_target = CriticNetwork(self.observation_space.shape[0]+self.action_space.shape[0], 1)
        self.critic_target.load_state_dict(self.critic.state_dict())

    def forward(self, obs: th.Tensor, deterministic: bool = False):
        # This method should return the action and the actor's latent
        # Here we overwrite the default behavior to use our custom ActorNetwork
        action, log_prob = self.actor.action_log_prob(obs)
        return action, None

In [79]:
# 使用自定义策略训练SAC模型
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.2)
model = SAC(CustomSACPolicy, env=env, replay_buffer_class=ReplayBuffer, verbose=1, gamma = 0.9, batch_size=1000, 
            buffer_size=500000, learning_rate=0.001, action_noise=action_noise)
model.learn(10000)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 500       |
|    ep_rew_mean     | -116      |
| time/              |           |
|    episodes        | 4         |
|    fps             | 34        |
|    time_elapsed    | 58        |
|    total_timesteps | 2000      |
| train/             |           |
|    actor_loss      | -8.7e+23  |
|    critic_loss     | inf       |
|    ent_coef        | 0.847     |
|    ent_coef_loss   | -1.71e+23 |
|    learning_rate   | 0.001     |
|    n_updates       | 1899      |
----------------------------------
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 500       |
|    ep_rew_mean     | -116      |
| time/              |           |
|    episodes        | 8         |
|    fps             | 33        |
|    time_elapsed    | 118       |
|    total_timesteps | 4000   

<stable_baselines3.sac.sac.SAC at 0x14e2dc2c7a00>

# Load Model

In [69]:
actor_model = model.policy.actor.float()
actor_model.to(device)

Actor(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (latent_pi): Sequential(
    (0): Linear(in_features=100, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=512, bias=True)
    (5): ReLU()
    (6): Linear(in_features=512, out_features=512, bias=True)
    (7): ReLU()
  )
  (mu): Linear(in_features=512, out_features=14, bias=True)
  (log_std): Linear(in_features=512, out_features=14, bias=True)
)

In [54]:
critic_model = model.policy.critic.float()
critic_model.to(device)

ContinuousCritic(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (qf0): Sequential(
    (0): Linear(in_features=114, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=512, bias=True)
    (5): ReLU()
    (6): Linear(in_features=512, out_features=512, bias=True)
    (7): ReLU()
    (8): Linear(in_features=512, out_features=1, bias=True)
  )
  (qf1): Sequential(
    (0): Linear(in_features=114, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=512, bias=True)
    (5): ReLU()
    (6): Linear(in_features=512, out_features=512, bias=True)
    (7): ReLU()
    (8): Linear(in_features=512, out_features=1, bias=True)
  )
  (qf2): Sequential(
    (0): Linear(in_features=114, out_features=512, bias=True)
    (1): Re

In [11]:
# test actor model
test_input = torch.ones(1, 100).to(device)
test_output = actor_model(test_input)
print(test_output.size())

torch.Size([1, 14])


In [12]:
# test critic model
state_input = torch.ones(1, 100).to(device)
action_input = torch.ones(1, 14).to(device)

test_output = critic_model(state_input, action_input)
for i, value in enumerate(test_output):
    print(f"Critic {i + 1} Output:", value.item())

Critic 1 Output: 0.0009697331115603447
Critic 2 Output: 0.03320334851741791
Critic 3 Output: -0.0352470688521862
Critic 4 Output: -0.06494784355163574


# Human cloning

In [59]:
class NPYFolder(Dataset):
    def __init__(self, obs_dir, action_dir, transform=None):
        self.transform = transform
        
        # 使用glob匹配文件模式并获取所有的obs和action_OSC文件
        self.obs_files = sorted(glob.glob(os.path.join(obs_dir, "obs_seq_OSC_*")))
        self.action_files = sorted(glob.glob(os.path.join(action_dir, "action_seq_OSC_*")))
        
        assert len(self.obs_files) == len(self.action_files), "Number of obs and action_OSC files must be the same!"

    def __len__(self):
        return len(self.obs_files)

    def __getitem__(self, idx):
        obs = np.load(self.obs_files[idx])
        action = np.load(self.action_files[idx])
        
        sample = {'obs': obs, 'action': action}
        
        if self.transform:
            sample = self.transform(sample)
        
        return sample


obs_dir = "./collectdata/obs"
action_dir = "./collectdata/action_OSC"
dataset = NPYFolder(obs_dir, action_dir)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=True)


sample_batch = next(iter(dataloader))
obs_shape = sample_batch['obs'].shape
action_shape = sample_batch['action'].shape
print(f"obs shape: {obs_shape}")
print(f"action shape: {action_shape}")

obs shape: torch.Size([10, 3000, 100])
action shape: torch.Size([10, 3000, 14])


In [60]:
loss_function = nn.MSELoss()

optimizer = torch.optim.Adam(actor_model.parameters(), lr=0.00001)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)

def train(actor_model, device, dataloader, loss_function, optimizer):
    losses_train = []
    optimizer.step()
    actor_model.train()
    for databatch in dataloader:
        for n in range(databatch['obs'].shape[1]):
            actor_model.zero_grad()
            x = databatch['obs'][:,n,:].float().to(device)
            y = databatch['action'][:,n,:].float().to(device)
            output = actor_model.forward(x)
            loss = loss_function(output, y)
            loss.backward()
            optimizer.step()
            losses_train.append(loss.item())
    return np.mean(losses_train)

In [64]:
n_epochs = 100
for epoch in range(n_epochs):
    loss_train = train(actor_model, device, dataloader, loss_function, optimizer)
    scheduler.step()
    print('EPOCH: {}, Train Loss: {:.3f}'.format(epoch, loss_train))

EPOCH: 0, Train Loss: 0.003
EPOCH: 1, Train Loss: 0.003
EPOCH: 2, Train Loss: 0.003
EPOCH: 3, Train Loss: 0.003
EPOCH: 4, Train Loss: 0.003
EPOCH: 5, Train Loss: 0.003
EPOCH: 6, Train Loss: 0.003
EPOCH: 7, Train Loss: 0.003
EPOCH: 8, Train Loss: 0.003
EPOCH: 9, Train Loss: 0.003
EPOCH: 10, Train Loss: 0.003
EPOCH: 11, Train Loss: 0.003
EPOCH: 12, Train Loss: 0.003
EPOCH: 13, Train Loss: 0.003
EPOCH: 14, Train Loss: 0.003
EPOCH: 15, Train Loss: 0.003
EPOCH: 16, Train Loss: 0.003
EPOCH: 17, Train Loss: 0.003
EPOCH: 18, Train Loss: 0.003
EPOCH: 19, Train Loss: 0.003
EPOCH: 20, Train Loss: 0.003
EPOCH: 21, Train Loss: 0.003
EPOCH: 22, Train Loss: 0.003
EPOCH: 23, Train Loss: 0.003
EPOCH: 24, Train Loss: 0.003
EPOCH: 25, Train Loss: 0.003
EPOCH: 26, Train Loss: 0.003
EPOCH: 27, Train Loss: 0.003
EPOCH: 28, Train Loss: 0.003
EPOCH: 29, Train Loss: 0.003
EPOCH: 30, Train Loss: 0.003
EPOCH: 31, Train Loss: 0.003
EPOCH: 32, Train Loss: 0.003
EPOCH: 33, Train Loss: 0.003
EPOCH: 34, Train Loss: 0

In [65]:
weightpath = "./models/maholo_SAC_OSC_Actor.pth"
torch.save(actor_model.state_dict(), weightpath)
print("Saved to", savepath)

Saved to ./models/maholo_SAC_actor_weights.pth


In [11]:
weightpath = "./models/maholo_SAC_OSC.pth"
policy_kwargs = {'net_arch' : [512, 512, 512, 512], 
                'n_critics' : 4,
                }
model = SAC(policy="MlpPolicy", env=env, policy_kwargs=policy_kwargs)
# model.policy.actor.load_state_dict(torch.load(weightpath))
model.policy.load_state_dict(torch.load(weightpath))

<All keys matched successfully>

***

# Render

In [4]:
env = suite.make(
    env_name="Lift",
    robots="Panda",
    has_renderer=True,
    has_offscreen_renderer=True,
    use_camera_obs=False,
    control_freq=50,
    horizon = 50,
)
env = GymWrapper(env)
env = TimeFeatureWrapper(env)
model = DDPG.load(modelpath, env = env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [5]:
done = False
obs = env.reset()
while not done:
    action, _states = model.predict(obs, deterministic = True)
    obs, reward, done, _ = env.step(action)
    env.unwrapped.render()
env.close()