In [1]:
import pybullet
from hier_env import HierarchicalHumanoidEnv
from low_level_env import LowLevelHumanoidEnv
import time
import numpy as np
import pandas as pd
import ray
from ray.rllib.agents.ppo import PPOTrainer
from ray.tune.registry import register_env
from ray.tune import function
import pickle

Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
def make_env_low(env_config):
    import pybullet_envs

    return LowLevelHumanoidEnv()


def make_env_hier(env_config):
    import pybullet_envs

    return HierarchicalHumanoidEnv()


def policy_mapping_fn(agent_id):
    if agent_id.startswith("low_level_"):
        return "low_level_policy"
    else:
        return "high_level_policy"

In [3]:
ENV_LOW = "HumanoidBulletEnv-v0-Low"
register_env(ENV_LOW, make_env_low)
config_low = {
    "env": ENV_LOW,
    "num_workers": 0,
    "num_envs_per_worker": 1,
    "log_level": "WARN",
    "num_gpus": 1,
    "monitor": True,
    "evaluation_num_episodes": 50,
    "gamma": 0.995,
    "lambda": 0.95,
    "clip_param": 0.2,
    "kl_coeff": 1.0,
    "num_sgd_iter": 20,
    "lr": 0.0005,
    "sgd_minibatch_size": 8000,
    "train_batch_size": 24000,
    "model": {
        "fcnet_hiddens": [1024, 512],
        "fcnet_activation": "tanh",
        "free_log_std": True,
    },
    "batch_mode": "complete_episodes",
    "observation_filter": "NoFilter",
    "framework": "tf",
}

single_env = HierarchicalHumanoidEnv()

ENV_HIER = "HumanoidBulletEnvHier-v0"
register_env(ENV_HIER, make_env_hier)
highLevelPolicy = (
    None,
    single_env.high_level_obs_space,
    single_env.high_level_act_space,
    {
        "model": {
            "fcnet_hiddens": [512, 256],
            "fcnet_activation": "tanh",
            "free_log_std": False,
        },
    },
)

lowLevelPolicy = (
    None,
    single_env.low_level_obs_space,
    single_env.low_level_act_space,
    {
        "model": {
            "fcnet_hiddens": [1024, 512],
            "fcnet_activation": "tanh",
            "free_log_std": True,
        },
    },
)

config = {
    "env": ENV_HIER,
    "num_workers": 0,
    "num_envs_per_worker": 1,
    "multiagent": {
        "policies": {
            "high_level_policy": highLevelPolicy,
            "low_level_policy": lowLevelPolicy,
        },
        "policy_mapping_fn": function(policy_mapping_fn),
    },
    "log_level": "WARN",
    "num_gpus": 1,
    "monitor": True,
    "evaluation_num_episodes": 50,
    "gamma": 0.995,
    "lambda": 0.95,
    "clip_param": 0.2,
    "kl_coeff": 1.0,
    "num_sgd_iter": 20,
    "lr": 0.0005,
    "sgd_minibatch_size": 12000,
    "train_batch_size": 36000,
    "batch_mode": "complete_episodes",
    "observation_filter": "NoFilter",
}



In [4]:
ray.shutdown()
ray.init(ignore_reinit_error=True)

2021-04-19 19:54:18,652	INFO services.py:1172 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'node_ip_address': '192.168.0.108',
 'raylet_ip_address': '192.168.0.108',
 'redis_address': '192.168.0.108:6379',
 'object_store_address': '/tmp/ray/session_2021-04-19_19-54-18_241771_96224/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-04-19_19-54-18_241771_96224/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2021-04-19_19-54-18_241771_96224',
 'metrics_export_port': 42237,
 'node_id': '0cdbc2626075633c8861828439993514de03833c3f2ff0474e6295c4'}

In [6]:
agentLow = PPOTrainer(config_low)
experiment_name = "HWalk_Low_Mimic"
experiment_id = "PPO_HumanoidBulletEnvLow-v0_699c9_00000_0_2021-04-18_22-14-39"
checkpoint_num = "1930"
agentLow.restore(
    "/home/aditya/ray_results/{}/{}/checkpoint_{}/checkpoint-{}".format(
        experiment_name, experiment_id, checkpoint_num, checkpoint_num
    )
)

The following Variables were used a Lambda layer's call (lambda), but
are not present in its tracked objects:
  <tf.Variable 'default_policy/log_std:0' shape=(17,) dtype=float32>
It is possible that this is intended behavior, but it is more likely
an omission. This is a strong indication that this layer should be
formulated as a subclassed Layer rather than a Lambda layer.
2021-04-19 19:54:44,679	INFO trainable.py:371 -- Restored on 192.168.0.108 from checkpoint: /home/aditya/ray_results/HWalk_Low_Mimic/PPO_HumanoidBulletEnvLow-v0_699c9_00000_0_2021-04-18_22-14-39/checkpoint_1930/checkpoint-1930
2021-04-19 19:54:44,680	INFO trainable.py:379 -- Current state after restoring: {'_iteration': 1930, '_timesteps_total': None, '_time_total': 42152.27765059471, '_episodes_total': 1118553}


In [17]:
s1 = agentLow.get_policy("default_policy").get_state()

In [18]:
s1.keys()

odict_keys(['default_policy/log_std', 'default_policy/fc_1/kernel', 'default_policy/fc_1/bias', 'default_policy/fc_2/kernel', 'default_policy/fc_2/bias', 'default_policy/fc_value_1/kernel', 'default_policy/fc_value_1/bias', 'default_policy/fc_out/kernel', 'default_policy/fc_out/bias', 'default_policy/fc_value_2/kernel', 'default_policy/fc_value_2/bias', 'default_policy/value_out/kernel', 'default_policy/value_out/bias', '_optimizer_variables'])

In [33]:
agentHigh = PPOTrainer(config)
experiment_name2 = "HWalk_Hier_Mimic"
experiment_id2 = "PPO_HumanoidBulletEnvHier-v0_3b65d_00000_0_2021-04-19_15-24-09"
checkpoint_num2 = "840"
# agentHigh.restore(
#     "/home/aditya/ray_results/{}/{}/checkpoint_{}/checkpoint-{}".format(
#         experiment_name2, experiment_id2, checkpoint_num2, checkpoint_num2
#     )
# )

The following Variables were used a Lambda layer's call (lambda), but
are not present in its tracked objects:
  <tf.Variable 'low_level_policy/log_std:0' shape=(17,) dtype=float32>
It is possible that this is intended behavior, but it is more likely
an omission. This is a strong indication that this layer should be
formulated as a subclassed Layer rather than a Lambda layer.


In [62]:
s2 = agentHigh.get_policy("low_level_policy").get_state()

In [63]:
s2.keys()

odict_keys(['low_level_policy/log_std', 'low_level_policy/fc_1/kernel', 'low_level_policy/fc_1/bias', 'low_level_policy/fc_2/kernel', 'low_level_policy/fc_2/bias', 'low_level_policy/fc_value_1/kernel', 'low_level_policy/fc_value_1/bias', 'low_level_policy/fc_out/kernel', 'low_level_policy/fc_out/bias', 'low_level_policy/fc_value_2/kernel', 'low_level_policy/fc_value_2/bias', 'low_level_policy/value_out/kernel', 'low_level_policy/value_out/bias', '_optimizer_variables'])

In [64]:
s1.keys()

odict_keys(['default_policy/log_std', 'default_policy/fc_1/kernel', 'default_policy/fc_1/bias', 'default_policy/fc_2/kernel', 'default_policy/fc_2/bias', 'default_policy/fc_value_1/kernel', 'default_policy/fc_value_1/bias', 'default_policy/fc_out/kernel', 'default_policy/fc_out/bias', 'default_policy/fc_value_2/kernel', 'default_policy/fc_value_2/bias', 'default_policy/value_out/kernel', 'default_policy/value_out/bias', '_optimizer_variables'])

In [65]:
s2['low_level_policy/log_std']

array([-1.3385599 , -1.1344466 , -1.1837405 , -1.1487037 , -1.4503701 ,
       -1.5460362 , -1.4583296 , -1.1097896 , -1.42051   , -1.5193747 ,
       -1.5059805 , -0.7800477 , -0.97913575, -1.5724465 , -0.7847039 ,
       -1.0035341 , -1.5840968 ], dtype=float32)

In [66]:
s1['default_policy/log_std']

array([-1.3385599 , -1.1344466 , -1.1837405 , -1.1487037 , -1.4503701 ,
       -1.5460362 , -1.4583296 , -1.1097896 , -1.42051   , -1.5193747 ,
       -1.5059805 , -0.7800477 , -0.97913575, -1.5724465 , -0.7847039 ,
       -1.0035341 , -1.5840968 ], dtype=float32)

In [67]:
w1 = agentLow.get_policy().get_weights()
w2 = agentHigh.get_policy("low_level_policy").get_weights()

In [68]:
w1['default_policy/log_std']

array([-1.3385599 , -1.1344466 , -1.1837405 , -1.1487037 , -1.4503701 ,
       -1.5460362 , -1.4583296 , -1.1097896 , -1.42051   , -1.5193747 ,
       -1.5059805 , -0.7800477 , -0.97913575, -1.5724465 , -0.7847039 ,
       -1.0035341 , -1.5840968 ], dtype=float32)

In [69]:
w2['low_level_policy/log_std']

array([-1.3385599 , -1.1344466 , -1.1837405 , -1.1487037 , -1.4503701 ,
       -1.5460362 , -1.4583296 , -1.1097896 , -1.42051   , -1.5193747 ,
       -1.5059805 , -0.7800477 , -0.97913575, -1.5724465 , -0.7847039 ,
       -1.0035341 , -1.5840968 ], dtype=float32)

In [70]:
agentHigh.get_policy("low_level_policy").set_weights({
    'low_level_policy/log_std': w1['default_policy/log_std']
})

In [71]:
agentHigh.get_policy("low_level_policy").get_weights()['low_level_policy/log_std']

array([-1.3385599 , -1.1344466 , -1.1837405 , -1.1487037 , -1.4503701 ,
       -1.5460362 , -1.4583296 , -1.1097896 , -1.42051   , -1.5193747 ,
       -1.5059805 , -0.7800477 , -0.97913575, -1.5724465 , -0.7847039 ,
       -1.0035341 , -1.5840968 ], dtype=float32)

In [72]:
agentHigh.get_policy("low_level_policy").get_state()['low_level_policy/log_std']

array([-1.3385599 , -1.1344466 , -1.1837405 , -1.1487037 , -1.4503701 ,
       -1.5460362 , -1.4583296 , -1.1097896 , -1.42051   , -1.5193747 ,
       -1.5059805 , -0.7800477 , -0.97913575, -1.5724465 , -0.7847039 ,
       -1.0035341 , -1.5840968 ], dtype=float32)

In [73]:
{ k1: k2 for k1, k2 in zip(s.keys(), s1.keys())}

{'low_level_policy/log_std': 'default_policy/log_std',
 'low_level_policy/fc_1/kernel': 'default_policy/fc_1/kernel',
 'low_level_policy/fc_1/bias': 'default_policy/fc_1/bias',
 'low_level_policy/fc_2/kernel': 'default_policy/fc_2/kernel',
 'low_level_policy/fc_2/bias': 'default_policy/fc_2/bias',
 'low_level_policy/fc_value_1/kernel': 'default_policy/fc_value_1/kernel',
 'low_level_policy/fc_value_1/bias': 'default_policy/fc_value_1/bias',
 'low_level_policy/fc_out/kernel': 'default_policy/fc_out/kernel',
 'low_level_policy/fc_out/bias': 'default_policy/fc_out/bias',
 'low_level_policy/fc_value_2/kernel': 'default_policy/fc_value_2/kernel',
 'low_level_policy/fc_value_2/bias': 'default_policy/fc_value_2/bias',
 'low_level_policy/value_out/kernel': 'default_policy/value_out/kernel',
 'low_level_policy/value_out/bias': 'default_policy/value_out/bias',
 '_optimizer_variables': '_optimizer_variables'}

In [75]:
for a, b in zip(s1.keys(), s2.keys()):
    print(a, b)

default_policy/log_std low_level_policy/log_std
default_policy/fc_1/kernel low_level_policy/fc_1/kernel
default_policy/fc_1/bias low_level_policy/fc_1/bias
default_policy/fc_2/kernel low_level_policy/fc_2/kernel
default_policy/fc_2/bias low_level_policy/fc_2/bias
default_policy/fc_value_1/kernel low_level_policy/fc_value_1/kernel
default_policy/fc_value_1/bias low_level_policy/fc_value_1/bias
default_policy/fc_out/kernel low_level_policy/fc_out/kernel
default_policy/fc_out/bias low_level_policy/fc_out/bias
default_policy/fc_value_2/kernel low_level_policy/fc_value_2/kernel
default_policy/fc_value_2/bias low_level_policy/fc_value_2/bias
default_policy/value_out/kernel low_level_policy/value_out/kernel
default_policy/value_out/bias low_level_policy/value_out/bias
_optimizer_variables _optimizer_variables


In [61]:
s1['_optimizer_variables'].keys()

odict_keys(['default_policy/beta1_power', 'default_policy/beta2_power', 'default_policy/default_policy/log_std/Adam', 'default_policy/default_policy/log_std/Adam_1', 'default_policy/default_policy/fc_1/kernel/Adam', 'default_policy/default_policy/fc_1/kernel/Adam_1', 'default_policy/default_policy/fc_1/bias/Adam', 'default_policy/default_policy/fc_1/bias/Adam_1', 'default_policy/default_policy/fc_2/kernel/Adam', 'default_policy/default_policy/fc_2/kernel/Adam_1', 'default_policy/default_policy/fc_2/bias/Adam', 'default_policy/default_policy/fc_2/bias/Adam_1', 'default_policy/default_policy/fc_value_1/kernel/Adam', 'default_policy/default_policy/fc_value_1/kernel/Adam_1', 'default_policy/default_policy/fc_value_1/bias/Adam', 'default_policy/default_policy/fc_value_1/bias/Adam_1', 'default_policy/default_policy/fc_out/kernel/Adam', 'default_policy/default_policy/fc_out/kernel/Adam_1', 'default_policy/default_policy/fc_out/bias/Adam', 'default_policy/default_policy/fc_out/bias/Adam_1', 'd

In [76]:
s2['_optimizer_variables'].keys()

odict_keys(['low_level_policy/beta1_power', 'low_level_policy/beta2_power', 'low_level_policy/low_level_policy/log_std/Adam', 'low_level_policy/low_level_policy/log_std/Adam_1', 'low_level_policy/low_level_policy/fc_1/kernel/Adam', 'low_level_policy/low_level_policy/fc_1/kernel/Adam_1', 'low_level_policy/low_level_policy/fc_1/bias/Adam', 'low_level_policy/low_level_policy/fc_1/bias/Adam_1', 'low_level_policy/low_level_policy/fc_2/kernel/Adam', 'low_level_policy/low_level_policy/fc_2/kernel/Adam_1', 'low_level_policy/low_level_policy/fc_2/bias/Adam', 'low_level_policy/low_level_policy/fc_2/bias/Adam_1', 'low_level_policy/low_level_policy/fc_value_1/kernel/Adam', 'low_level_policy/low_level_policy/fc_value_1/kernel/Adam_1', 'low_level_policy/low_level_policy/fc_value_1/bias/Adam', 'low_level_policy/low_level_policy/fc_value_1/bias/Adam_1', 'low_level_policy/low_level_policy/fc_out/kernel/Adam', 'low_level_policy/low_level_policy/fc_out/kernel/Adam_1', 'low_level_policy/low_level_policy/f

In [81]:
for a, b in zip(s1['_optimizer_variables'].keys(), s2['_optimizer_variables'].keys()):
    print(a, ' '*(60 - len(a)), b)

default_policy/beta1_power                                    low_level_policy/beta1_power
default_policy/beta2_power                                    low_level_policy/beta2_power
default_policy/default_policy/log_std/Adam                    low_level_policy/low_level_policy/log_std/Adam
default_policy/default_policy/log_std/Adam_1                  low_level_policy/low_level_policy/log_std/Adam_1
default_policy/default_policy/fc_1/kernel/Adam                low_level_policy/low_level_policy/fc_1/kernel/Adam
default_policy/default_policy/fc_1/kernel/Adam_1              low_level_policy/low_level_policy/fc_1/kernel/Adam_1
default_policy/default_policy/fc_1/bias/Adam                  low_level_policy/low_level_policy/fc_1/bias/Adam
default_policy/default_policy/fc_1/bias/Adam_1                low_level_policy/low_level_policy/fc_1/bias/Adam_1
default_policy/default_policy/fc_2/kernel/Adam                low_level_policy/low_level_policy/fc_2/kernel/Adam
default_policy/default_policy/fc_

In [89]:
"default_policy/default_policy/log_std/Adam".replace("default_policy", "low_level_policy")

'low_level_policy/low_level_policy/log_std/Adam'

In [88]:
agentHigh.get_policy("low_level_policy").set_state({
    '_optimizer_variables': s1['_optimizer_variables']
})

AssertionError: No variables in the input matched those in the network. Possible cause: Two networks were defined in the same TensorFlow graph. To fix this, place each network definition in its own tf.Graph.

In [92]:
from collections import OrderedDict

In [97]:
s11 = OrderedDict([(k.replace("default_policy", "low_level_policy"), v) for k, v in s1['_optimizer_variables'].items()])

In [107]:
for a in s11:
    print(a, type(s11[a]))

low_level_policy/beta1_power <class 'numpy.float32'>
low_level_policy/beta2_power <class 'numpy.float32'>
low_level_policy/low_level_policy/log_std/Adam <class 'numpy.ndarray'>
low_level_policy/low_level_policy/log_std/Adam_1 <class 'numpy.ndarray'>
low_level_policy/low_level_policy/fc_1/kernel/Adam <class 'numpy.ndarray'>
low_level_policy/low_level_policy/fc_1/kernel/Adam_1 <class 'numpy.ndarray'>
low_level_policy/low_level_policy/fc_1/bias/Adam <class 'numpy.ndarray'>
low_level_policy/low_level_policy/fc_1/bias/Adam_1 <class 'numpy.ndarray'>
low_level_policy/low_level_policy/fc_2/kernel/Adam <class 'numpy.ndarray'>
low_level_policy/low_level_policy/fc_2/kernel/Adam_1 <class 'numpy.ndarray'>
low_level_policy/low_level_policy/fc_2/bias/Adam <class 'numpy.ndarray'>
low_level_policy/low_level_policy/fc_2/bias/Adam_1 <class 'numpy.ndarray'>
low_level_policy/low_level_policy/fc_value_1/kernel/Adam <class 'numpy.ndarray'>
low_level_policy/low_level_policy/fc_value_1/kernel/Adam_1 <class 'nu

In [102]:
for a, b in zip(s11.keys(), s2['_optimizer_variables'].keys()):
    print(a == b)

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [101]:
agentHigh.get_policy("low_level_policy").set_state({
    '_optimizer_variables': s11
})

AssertionError: No variables in the input matched those in the network. Possible cause: Two networks were defined in the same TensorFlow graph. To fix this, place each network definition in its own tf.Graph.

In [104]:
s11.keys()

odict_keys(['low_level_policy/beta1_power', 'low_level_policy/beta2_power', 'low_level_policy/low_level_policy/log_std/Adam', 'low_level_policy/low_level_policy/log_std/Adam_1', 'low_level_policy/low_level_policy/fc_1/kernel/Adam', 'low_level_policy/low_level_policy/fc_1/kernel/Adam_1', 'low_level_policy/low_level_policy/fc_1/bias/Adam', 'low_level_policy/low_level_policy/fc_1/bias/Adam_1', 'low_level_policy/low_level_policy/fc_2/kernel/Adam', 'low_level_policy/low_level_policy/fc_2/kernel/Adam_1', 'low_level_policy/low_level_policy/fc_2/bias/Adam', 'low_level_policy/low_level_policy/fc_2/bias/Adam_1', 'low_level_policy/low_level_policy/fc_value_1/kernel/Adam', 'low_level_policy/low_level_policy/fc_value_1/kernel/Adam_1', 'low_level_policy/low_level_policy/fc_value_1/bias/Adam', 'low_level_policy/low_level_policy/fc_value_1/bias/Adam_1', 'low_level_policy/low_level_policy/fc_out/kernel/Adam', 'low_level_policy/low_level_policy/fc_out/kernel/Adam_1', 'low_level_policy/low_level_policy/f