In [1]:
import functools
import gym
import gym.spaces
import pybullet
import pybullet_data
import pybullet_envs
import numpy as np
import torch
from torch import nn

import pfrl
from pfrl import experiments, utils
from pfrl.agents import PPO

In [2]:
# Report whether PyTorch can see a CUDA-capable GPU on this machine.
torch.cuda.is_available()

True

In [3]:
# Experiment-wide seeding configuration.
num_envs = 1  # number of parallel environment processes
seed = 13517013  # base random seed for the whole run

# Seed the global RNGs (Python `random`, NumPy, PyTorch) through PFRL's helper
# so that network initialisation and action sampling are repeatable after a
# kernel restart; previously only the environments received a seed.
utils.set_random_seed(seed)

# One distinct seed per environment process.
process_seeds = np.arange(num_envs) + seed * num_envs
# make_env derives the evaluation seed as 2 ** 32 - 1 - process_seed, so every
# process seed must fit in an unsigned 32-bit integer.
assert process_seeds.max() < 2 ** 32

In [4]:
# Create the environment
def make_env(process_idx, test):
    """Build one seeded, wrapped ``HumanoidBulletEnv-v0`` instance.

    Args:
        process_idx: Index into the module-level ``process_seeds`` array that
            selects this environment's base seed.
        test: When truthy, the seed is mirrored to ``2 ** 32 - 1 - base`` so a
            test environment gets a different seed from its training
            counterpart.

    Returns:
        The environment wrapped (innermost first) in PFRL's
        ``CastObservationToFloat32``, ``Monitor`` (directory ``"out"``) and
        ``Render`` (mode ``'human'``) wrappers.
    """
    env = gym.make("HumanoidBulletEnv-v0")

    base_seed = int(process_seeds[process_idx])
    if test:
        env_seed = 2 ** 32 - 1 - base_seed
    else:
        env_seed = base_seed
    env.seed(env_seed)

    # Stack the PFRL wrappers; the order matches the original cell.
    env = pfrl.wrappers.CastObservationToFloat32(env)
    env = pfrl.wrappers.Monitor(env, "out", force=True)
    return pfrl.wrappers.Render(env, mode='human')

In [5]:
def make_batch_env(test):
    """Create a vectorised environment with ``num_envs`` worker processes.

    Args:
        test: Forwarded to ``make_env`` for every worker; selects the
            test-time seeding.

    Returns:
        A ``pfrl.envs.MultiprocessVectorEnv`` built from one ``make_env``
        factory per process index.
    """
    # Bind the arguments now; each factory is invoked later by the vector env.
    env_fns = [
        functools.partial(make_env, idx, test)
        for idx in range(num_envs)
    ]
    return pfrl.envs.MultiprocessVectorEnv(env_fns)

In [6]:
# Throwaway environment instance, used only to read the task's metadata
# (episode step limit and the observation/action spaces).
sample_env = gym.make("HumanoidBulletEnv-v0")
obs_space, action_space = sample_env.observation_space, sample_env.action_space
timestep_limit = sample_env.spec.max_episode_steps

print("Observation space:", obs_space)
print("Action space:", action_space)

Observation space: Box(-inf, inf, (44,), float32)
Action space: Box(-1.0, 1.0, (17,), float32)




In [7]:
# Observation normaliser sized to the flattened observation vector; it is
# handed to the PPO agent through its `obs_normalizer` argument.
# NOTE(review): clip_threshold=5 presumably bounds the normalised values to
# [-5, 5] -- confirm against the PFRL documentation.
obs_normalizer = pfrl.nn.EmpiricalNormalization(
    obs_space.low.size, clip_threshold=5
)

In [8]:
# Number of scalar entries in one observation / one action; these set the
# input and output widths of the networks defined below.
obs_size, action_size = obs_space.low.size, action_space.low.size

In [43]:
# Policy network: a single 1024-unit tanh hidden layer followed by a linear
# layer of width `action_size`, whose output feeds a Gaussian head with a
# learned, state-independent diagonal covariance.
policy = torch.nn.Sequential(
    nn.Linear(obs_size, 1024),
    nn.Tanh(),
    nn.Linear(1024, action_size),
    pfrl.policies.GaussianHeadWithStateIndependentCovariance(
        action_size=action_size,
        var_type="diagonal",
        # NOTE(review): this maps the learned parameter to exp(3 * param). If
        # the parameter were meant to be log std, the factor would be 2
        # (var = std ** 2 = exp(2 * log_std)) -- confirm that 3 is intentional.
        var_func=lambda x: torch.exp(3 * x),  # variance = exp(3 * param)
        var_param_init=0,  # param = 0 => variance = exp(0) = 1 => std = 1
    ),
)

In [44]:
# Value-function network: same hidden width and activation as the policy, but
# with its own parameters and a single scalar output per observation.
vf = torch.nn.Sequential(
    nn.Linear(obs_size, 1024),
    nn.Tanh(),
    nn.Linear(1024, 1),
)

In [45]:
def ortho_init(layer, gain):
    """Orthogonally initialise a layer's weight and zero its bias, in place.

    Args:
        layer: Module exposing a ``weight`` tensor and a ``bias`` attribute
            (e.g. ``nn.Linear``). ``bias`` may be ``None``.
        gain: Scaling factor passed to ``nn.init.orthogonal_``.

    Returns:
        None. The layer's parameters are modified in place.
    """
    nn.init.orthogonal_(layer.weight, gain=gain)
    # Layers built with bias=False expose `bias` as None; there is nothing to
    # zero in that case, and nn.init.zeros_(None) would raise.
    if layer.bias is not None:
        nn.init.zeros_(layer.bias)

# ortho_init(policy[0], gain=1)
# ortho_init(policy[2], gain=1e-2)
# ortho_init(vf[0], gain=1)
# ortho_init(vf[2], gain=1)

In [46]:
# Bundle the policy and value networks into the single module that is passed
# to both the optimizer and the PPO agent.
model = pfrl.nn.Branched(policy, vf)

In [47]:
# One Adam optimizer over every parameter of `model` (policy and value nets).
opt = torch.optim.Adam(model.parameters(), lr=0.00015, eps=1e-7)

In [48]:
# PPO hyperparameters, forwarded to the PPO constructor as `update_interval`,
# `minibatch_size` and `epochs` respectively.
update_interval = 1024
batch_size = 64
epochs = 20

In [49]:
# Build the PPO agent around the shared model and optimizer.
agent = PPO(
    model,
    opt,
    obs_normalizer=obs_normalizer,
    # Use the first GPU when CUDA is available; otherwise pass a negative id,
    # which PFRL treats as "run on CPU", instead of failing on a GPU-less host.
    gpu=0 if torch.cuda.is_available() else -1,
    update_interval=update_interval,
    minibatch_size=batch_size,
    epochs=epochs,
    clip_eps_vf=None,
    entropy_coef=0,
    standardize_advantages=True,
    gamma=0.995,
    lambd=0.97,
)

In [50]:
# Evaluation / logging schedule, forwarded to the training call below.
eval_n_runs = 100  # passed as eval_n_episodes
eval_interval = 100000  # passed as eval_interval
log_interval = 10000  # passed as log_interval

In [51]:
import logging
# Show INFO-level records so the training progress and evaluation messages
# emitted by pfrl.experiments appear in the cell output.
logging.basicConfig(level=logging.INFO)

In [52]:
# Run batch training with periodic evaluation. Separate vectorised
# environments are created for training (test=False) and evaluation
# (test=True) so that they are seeded differently by make_env.
experiments.train_agent_batch_with_evaluation(
    agent=agent,
    env=make_batch_env(False),
    eval_env=make_batch_env(True),
    outdir="out",
    steps=2000000,
    eval_n_steps=None,  # evaluation length is given in episodes, not steps
    eval_n_episodes=eval_n_runs,
    eval_interval=eval_interval,
    log_interval=log_interval,
    max_episode_len=timestep_limit,  # step limit read from the env spec
    save_best_so_far_agent=False,
)

INFO:pfrl.experiments.train_agent_batch:outdir:out step:10000 episode:513 last_R: -87.18942574971152 average_R:-82.5369220793851
INFO:pfrl.experiments.train_agent_batch:statistics: [('average_value', -44.641235), ('average_entropy', 23.312328), ('average_value_loss', 145.27660858154297), ('average_policy_loss', -0.06800873281434178), ('n_updates', 2880), ('explained_variance', 0.6334418008167624)]
INFO:pfrl.experiments.train_agent_batch:outdir:out step:20000 episode:1017 last_R: -58.50459747700224 average_R:-62.19464056121581
INFO:pfrl.experiments.train_agent_batch:statistics: [('average_value', -32.487648), ('average_entropy', 21.924723), ('average_value_loss', 46.66379871368408), ('average_policy_loss', -0.11448954045772552), ('n_updates', 6080), ('explained_variance', 0.8331256613061919)]
INFO:pfrl.experiments.train_agent_batch:outdir:out step:30000 episode:1515 last_R: -48.97214558664273 average_R:-46.611620747349264
INFO:pfrl.experiments.train_agent_batch:statistics: [('average_va

INFO:pfrl.experiments.train_agent_batch:evaluation episode 40 length: 36 R: -25.466273990763874
INFO:pfrl.experiments.train_agent_batch:evaluation episode 41 length: 26 R: 2.638522980826382
INFO:pfrl.experiments.train_agent_batch:evaluation episode 42 length: 39 R: 8.883444561160285
INFO:pfrl.experiments.train_agent_batch:evaluation episode 43 length: 35 R: 7.674759586533764
INFO:pfrl.experiments.train_agent_batch:evaluation episode 44 length: 29 R: 2.8502010728814633
INFO:pfrl.experiments.train_agent_batch:evaluation episode 45 length: 29 R: 4.107766388692833
INFO:pfrl.experiments.train_agent_batch:evaluation episode 46 length: 28 R: 8.066941942776609
INFO:pfrl.experiments.train_agent_batch:evaluation episode 47 length: 35 R: -8.286732186161682
INFO:pfrl.experiments.train_agent_batch:evaluation episode 48 length: 38 R: 4.519144858936488
INFO:pfrl.experiments.train_agent_batch:evaluation episode 49 length: 27 R: -0.08731545544287744
INFO:pfrl.experiments.train_agent_batch:evaluation ep

INFO:pfrl.experiments.train_agent_batch:outdir:out step:170000 episode:6064 last_R: 8.56794165695464 average_R:20.25621294163793
INFO:pfrl.experiments.train_agent_batch:statistics: [('average_value', 14.0261), ('average_entropy', 6.9025793), ('average_value_loss', 45.46721496582031), ('average_policy_loss', -0.08432886777445674), ('n_updates', 53120), ('explained_variance', 0.7984155570314895)]
INFO:pfrl.experiments.train_agent_batch:outdir:out step:180000 episode:6246 last_R: 5.32391042042145 average_R:16.279872313079057
INFO:pfrl.experiments.train_agent_batch:statistics: [('average_value', 23.716734), ('average_entropy', 6.4508533), ('average_value_loss', 22.135118579864503), ('average_policy_loss', -0.09200455007143318), ('n_updates', 56000), ('explained_variance', 0.7542905209632937)]
INFO:pfrl.experiments.train_agent_batch:outdir:out step:190000 episode:6421 last_R: 34.528274312586284 average_R:27.818822283441282
INFO:pfrl.experiments.train_agent_batch:statistics: [('average_value

INFO:pfrl.experiments.train_agent_batch:evaluation episode 70 length: 72 R: 4.5593619635866975
INFO:pfrl.experiments.train_agent_batch:evaluation episode 71 length: 55 R: 41.997298946231595
INFO:pfrl.experiments.train_agent_batch:evaluation episode 72 length: 71 R: 34.14287819067977
INFO:pfrl.experiments.train_agent_batch:evaluation episode 73 length: 50 R: 40.877447503154684
INFO:pfrl.experiments.train_agent_batch:evaluation episode 74 length: 74 R: 43.41803625414178
INFO:pfrl.experiments.train_agent_batch:evaluation episode 75 length: 51 R: -73.67906335109872
INFO:pfrl.experiments.train_agent_batch:evaluation episode 76 length: 72 R: 51.918179270163826
INFO:pfrl.experiments.train_agent_batch:evaluation episode 77 length: 45 R: 39.15188146285655
INFO:pfrl.experiments.train_agent_batch:evaluation episode 78 length: 67 R: -4.357026763842445
INFO:pfrl.experiments.train_agent_batch:evaluation episode 79 length: 47 R: 42.87332550546998
INFO:pfrl.experiments.train_agent_batch:evaluation epi

KeyboardInterrupt: 