In [37]:
import gym
from gym.spaces import Discrete, Tuple, Box
import pybullet_envs
import pybullet

import ray
from ray import tune
from ray.tune import function
from ray.rllib.agents.ppo import PPOTrainer
from ray.tune.registry import register_env
import time

from low_level_env import LowLevelHumanoidEnv
from hier_env import HierarchicalHumanoidEnv

import timeit

from custom_callback import RewardLogCallback

import numpy as np

In [23]:
ray.shutdown()
ray.init(ignore_reinit_error=True)

2021-03-25 07:10:57,601	INFO services.py:1172 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'node_ip_address': '192.168.0.108',
 'raylet_ip_address': '192.168.0.108',
 'redis_address': '192.168.0.108:6379',
 'object_store_address': '/tmp/ray/session_2021-03-25_07-10-57_086284_5564/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-03-25_07-10-57_086284_5564/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2021-03-25_07-10-57_086284_5564',
 'metrics_export_port': 57221,
 'node_id': '31702c85ebbe7e931e2201e09a5352f3783ab2029ba3787622367704'}

In [73]:
def make_env_low(env_config):
    import pybullet_envs
    return LowLevelHumanoidEnv()

def make_env_hier(env_config):
    import pybullet_envs
    return HierarchicalHumanoidEnv()

def make_env(env_config):
    import pybullet_envs
    return gym.make('HumanoidBulletEnv-v0')



ENV = 'HumanoidBulletEnv-v0'
register_env(ENV, make_env)

ENV_LOW = 'HumanoidBulletEnv-v0-Low'
register_env(ENV_LOW, make_env_low)

ENV_HIER = "HumanoidBulletEnv-v0-Hier"
register_env(ENV_HIER, make_env_hier)

TARGET_REWARD = 2000

In [74]:
config = {
        "env": ENV,
        "num_workers": 0,
        "num_gpus": 1,
        "monitor": True,
        "evaluation_num_episodes": 50,
        "gamma": 0.995,
        "lambda": 0.95,
        "clip_param": 0.2,
        "kl_coeff": 1.0,
        "num_sgd_iter": 20,
        "lr": .0005,
        "sgd_minibatch_size": 8000,
        "train_batch_size": 24000,
        "model": {
            "fcnet_hiddens": [256, 128, 64],
            "fcnet_activation": "tanh",
            "free_log_std": True,
        },
        "batch_mode": "complete_episodes",
        "observation_filter": "MeanStdFilter",
    }

In [75]:
config_low = {
    "env": ENV_LOW,
    "num_workers": 0,
    "num_gpus": 1,
    "monitor": False,
    "evaluation_num_episodes": 50,
    "gamma": 0.995,
    "lambda": 0.95,
    "clip_param": 0.2,
    "kl_coeff": 1.0,
    "num_sgd_iter": 20,
    "lr": .0001,
    "sgd_minibatch_size": 8000,
    "train_batch_size": 24000,
    "model": {
        "fcnet_hiddens": [256, 128, 64],
        "fcnet_activation": "tanh",
        "free_log_std": True,
    },
    "batch_mode": "complete_episodes",
    "observation_filter": "MeanStdFilter",
}

In [76]:
def policy_mapping_fn(agent_id):
    if agent_id.startswith("low_level_"):
        return "low_level_policy"
    else:
        return "high_level_policy"

single_env_hier = HierarchicalHumanoidEnv()

highLevelPolicy = (
    None,
    single_env_hier.high_level_obs_space,
    single_env_hier.high_level_act_space,
    {
        "model": {
            "fcnet_hiddens": [256, 128, 64],
            "fcnet_activation": "tanh",
            "free_log_std": True,
        },
    },
)

lowLevelPolicy = (
    None,
    single_env_hier.low_level_obs_space,
    single_env_hier.low_level_act_space,
    {
        "model": {
            "fcnet_hiddens": [256, 128, 64],
            "fcnet_activation": "tanh",
            "free_log_std": True,
        },
    },
)

config_high = {
    "env": ENV_HIER,
    "callbacks": RewardLogCallback,
    "num_workers": 0,
    "num_envs_per_worker": 1,
    "multiagent": {
        "policies": {
            "high_level_policy": highLevelPolicy,
            "low_level_policy": lowLevelPolicy,
        },
        "policy_mapping_fn": function(policy_mapping_fn),
    },
    "log_level": "WARN",
    "num_gpus": 1,
    "monitor": True,
    "evaluation_num_episodes": 50,
    "gamma": 0.995,
    "lambda": 0.95,
    "clip_param": 0.2,
    "kl_coeff": 1.0,
    "num_sgd_iter": 20,
    "lr": 0.001,
    "sgd_minibatch_size": 8000,
    "train_batch_size": 24000,
    "batch_mode": "complete_episodes",
    "observation_filter": "MeanStdFilter",
}



In [77]:
# agent = PPOTrainer(config_low)
# agent = PPOTrainer(config_high)
agent = PPOTrainer(config)

The following Variables were used a Lambda layer's call (lambda), but
are not present in its tracked objects:
  <tf.Variable 'default_policy/log_std:0' shape=(17,) dtype=float32>
It is possible that this is intended behavior, but it is more likely
an omission. This is a strong indication that this layer should be
formulated as a subclassed Layer rather than a Lambda layer.


In [78]:
# Baseline
agent.restore("/home/aditya/ray_results/HWalk/PPO_HumanoidBulletEnv-v0_881b4_00000_0_2021-02-21_17-45-29/checkpoint_710/checkpoint-710")

# Jalan dengan kaki kanan tegak
# agent.restore("/home/aditya/ray_results/HWalk_Low_Mimic/PPO_HumanoidBulletEnv-v0-Low_c0e9e_00000_0_2021-03-23_13-00-57/checkpoint_750/checkpoint-750")

# Frame skip 5
# agent.restore("/home/aditya/ray_results/HWalk_Low_Mimic/PPO_HumanoidBulletEnv-v0-Low_6a5d5_00000_0_2021-03-23_20-29-30/checkpoint_535/checkpoint-535")

# Frame skip 1
# agent.restore("/home/aditya/ray_results/HWalk_Low_Mimic/PPO_HumanoidBulletEnv-v0-Low_32439_00000_0_2021-03-23_22-15-18/checkpoint_545/checkpoint-545")

# Hierarchical high level set frame skip & frame
# agent.restore("/home/aditya/ray_results/HWalk_Hier_Mimic/PPO_HumanoidBulletEnv-v0-Hier_d56bd_00000_0_2021-03-24_23-51-44/checkpoint_2605/checkpoint-2605")

2021-03-25 07:31:10,756	INFO trainable.py:371 -- Restored on 192.168.0.108 from checkpoint: /home/aditya/ray_results/HWalk/PPO_HumanoidBulletEnv-v0_881b4_00000_0_2021-02-21_17-45-29/checkpoint_710/checkpoint-710
2021-03-25 07:31:10,757	INFO trainable.py:379 -- Current state after restoring: {'_iteration': 710, '_timesteps_total': None, '_time_total': 4597.172565937042, '_episodes_total': 229571}


In [79]:
agent.get_weights()['default_policy']['default_policy/fc_1/kernel'].shape
# agent.get_weights()['high_level_policy']['high_level_policy/fc_1/kernel'].shape

(44, 256)

In [72]:
env.close()

In [80]:
# env = make_env_hier(ENV_HIER)
# env = make_env_low(ENV_LOW)
env = make_env(ENV)

In [81]:
env.render()
observation = env.reset()

In [82]:
rew = []
for i in range(5):
    env.render()
    observation = env.reset()
    
    # pybullet.removeAllUserDebugItems()
    # pybullet.addUserDebugLine([0, 0, 0], [env.flat_env.walk_target_x, env.flat_env.walk_target_y, 0])

    done = False
    tempRew = []
    startFrame = env.frame
    start = timeit.default_timer()
    while(not done):
        action = agent.compute_action(observation)
        observation, reward, done, info = env.step(action)
        # tempRew.append(env.calcJointScore())
        time.sleep(1.0/120)
    # stop = timeit.default_timer()
    # rew.append((startFrame, tempRew, stop-start))

# rew = []
# for i in range(5):
#     env.render()
#     observation = env.resetFromFrame(0)
#     done = False
#     tempRew = []
#     startFrame = env.frame
#     start = timeit.default_timer()
#     while(not done):
#         action = dict()
#         if('high_level_agent' in observation):
#             action['high_level_agent'] = agent.compute_action(observation['high_level_agent'], policy_id='high_level_policy')
#         else:
#             action[env.low_level_agent_id] = agent.compute_action(observation[env.low_level_agent_id], policy_id='low_level_policy')
#         observation, reward, f_done, info = env.step(action)
#         done = f_done['__all__']
#         # tempRew.append(env.calcJointScore())
#         time.sleep(1.0/60)
#     # stop = timeit.default_timer()
#     # rew.append((startFrame, tempRew, stop-start))

In [83]:
env.close()

In [84]:
ray.shutdown()

In [43]:
rew[0][2]

23.287814890005393

In [44]:
len(rew[0][1])

1000

In [45]:
1000 / 23.28

42.955326460481096

In [47]:
1000 * 15/23

652.1739130434783

In [50]:
for r in rew:
    print(r[0], len(r[1]))

235 640
125 1000
210 41
140 56
117 1000
46 1000
227 78
47 1000
169 1000
187 43


In [56]:
start = timeit.default_timer()
time.sleep(3.2873985)
stop = timeit.default_timer()
print(stop-start)

3.2890066819963977
