In [1]:
import os
from pathlib import Path
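# Uncomment the lines below if the notebook is not started from the PPGA root directory
# (the checkpoint paths used later in this notebook are relative to it).
# project_root = os.path.join(str(Path.home()), 'PPGA')
# os.chdir(project_root)
# %pwd # should be PPGA root dir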

In [2]:
import pickle

import numpy as np
from attrdict import AttrDict
from RL.ppo import *
from utils.utilities import log
from envs.brax_custom.brax_env import make_vec_env_brax
from models.actor_critic import Actor, PGAMEActor
from pandas import DataFrame

from IPython.display import HTML, Image
from IPython.display import display
from brax.io import html, image

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# ---- params to config ----
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines where CUDA is not available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
env_name = 'humanoid'
# env_name = 'ant'
# env_name = 'walker2d'
# Observation/reward clipping is switched on for humanoid only.
clip_obs_rew = env_name == 'humanoid'
seed = 1111
normalize_obs = True
normalize_rewards = True

# ---- non-configurable params ----
# Observation shape of each supported environment.
obs_shapes = {
    'humanoid': (227,),
    'ant': (87,),
    'halfcheetah': (18,),
    'walker2d': (17,),
}
# Action shape of each supported environment.
action_shapes = {
    'humanoid': (17,),
    'ant': (8,),
    'halfcheetah': (6,),
    'walker2d': (6,),
}

# define the final config objects
# actor_cfg: everything needed to rebuild the policy network.
actor_cfg = AttrDict({
    'obs_shape': obs_shapes[env_name],
    'action_shape': action_shapes[env_name],
    'normalize_obs': normalize_obs,
    'normalize_rewards': normalize_rewards,
})
# env_cfg: settings handed to the environment factory.
env_cfg = AttrDict({
    'env_name': env_name,
    'env_batch_size': None,
    # ant environments use 4 measure dimensions, every other environment 2
    'num_dims': 4 if 'ant' in env_name else 2,
    'envs_per_model': 1,
    'seed': seed,
    'num_envs': 1,
    'clip_obs_rew': clip_obs_rew,
})


In [4]:
# now lets load in a saved archive dataframe and scheduler
# change this to be your own checkpoint path
checkpoint_iter = 2000
# checkpoint directories and files are tagged with the zero-padded, 8-digit iteration number
checkpoint_tag = f'{checkpoint_iter:08d}'
checkpoint_dir = f'experiments/paper_ppga_{env_name}/1111/checkpoints/cp_{checkpoint_tag}'
archive_path = f'{checkpoint_dir}/archive_df_{checkpoint_tag}.pkl'
scheduler_path = f'{checkpoint_dir}/scheduler_{checkpoint_tag}.pkl'

# The paths are relative, so fail early with a message that shows which
# working directory they were resolved against.
for _checkpoint_file in (archive_path, scheduler_path):
    if not os.path.isfile(_checkpoint_file):
        raise FileNotFoundError(
            f'Checkpoint file {_checkpoint_file!r} not found '
            f'(current working directory: {os.getcwd()!r})'
        )

# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file;
# only load checkpoints that come from a trusted source.
with open(archive_path, 'rb') as f:
    archive_df = pickle.load(f)
with open(scheduler_path, 'rb') as f:
    scheduler = pickle.load(f)



In [5]:
# create the environment from env_cfg; `env` is kept as a notebook-level global
# and is the environment that enjoy_brax() resets and steps
env = make_vec_env_brax(env_cfg)

In [6]:
def get_best_elite():
    """Rebuild the policy stored in the archive's best elite.

    Reads the notebook-level ``scheduler``, ``actor_cfg`` and ``device``.
    Prints the elite's objective and measures, deserializes its solution
    into a fresh ``Actor`` and, when observation normalization is enabled,
    restores the observation normalizer saved in the elite's metadata.

    Returns:
        The reconstructed ``Actor``, moved to ``device``.

    Raises:
        ValueError: if the archive reports no best elite.
    """
    best_elite = scheduler.archive.best_elite
    if best_elite is None:
        # Guard against an empty archive instead of failing later with an
        # opaque AttributeError on `None`.
        raise ValueError('The archive is empty; there is no best elite to load')
    print(f'Loading agent with reward {best_elite.objective} and measures {best_elite.measures}')

    # Take every setting from actor_cfg so the network that is built always
    # agrees with the normalizer check below.
    agent = Actor(
        obs_shape=actor_cfg.obs_shape[0],
        action_shape=actor_cfg.action_shape,
        normalize_obs=actor_cfg.normalize_obs,
        normalize_returns=actor_cfg.normalize_rewards,
    )
    agent = agent.deserialize(best_elite.solution).to(device)

    if actor_cfg.normalize_obs:
        norm = best_elite.metadata['obs_normalizer']
        # The metadata holds either a state dict or a normalizer object.
        if isinstance(norm, dict):
            agent.obs_normalizer.load_state_dict(norm)
        else:
            agent.obs_normalizer = norm
    return agent

In [7]:
def get_random_elite():
    """Rebuild the policy of one elite sampled from the archive.

    Reads the notebook-level ``scheduler``, ``actor_cfg`` and ``device``.
    Samples a batch of one elite, prints its objective and measures,
    deserializes its solution into a fresh ``Actor`` and, when observation
    normalization is enabled, restores the observation normalizer saved in
    the elite's metadata.

    Returns:
        The reconstructed ``Actor``, moved to ``device``.
    """
    elite = scheduler.archive.sample_elites(1)
    print(f'Loading agent with reward {elite.objective_batch[0]} and measures {elite.measures_batch[0]}')

    # Take every setting from actor_cfg so the network that is built always
    # agrees with the normalizer check below.
    agent = Actor(
        obs_shape=actor_cfg.obs_shape[0],
        action_shape=actor_cfg.action_shape,
        normalize_obs=actor_cfg.normalize_obs,
        normalize_returns=actor_cfg.normalize_rewards,
    )
    # The batch contains a single solution; flatten it to a 1-D parameter vector.
    agent = agent.deserialize(elite.solution_batch.flatten()).to(device)

    if actor_cfg.normalize_obs:
        norm = elite.metadata_batch[0]['obs_normalizer']
        # The metadata holds either a state dict or a normalizer object.
        if isinstance(norm, dict):
            agent.obs_normalizer.load_state_dict(norm)
        else:
            agent.obs_normalizer = norm
    return agent

In [8]:
def enjoy_brax(agent, render=True, deterministic=True):
    """Roll out ``agent`` for one episode in the notebook-level ``env``.

    Reads the notebook-level ``env``, ``env_cfg``, ``actor_cfg`` and ``device``.

    Args:
        agent: policy to evaluate. ``agent.actor_mean`` is used when
            ``deterministic`` is true, ``agent.get_action`` otherwise. When
            ``actor_cfg.normalize_obs`` is set, observations are normalized
            with the statistics in ``agent.obs_normalizer.obs_rms``.
        render: if true, display an HTML rendering of the rollout and print
            the total reward, the rollout length and the mean measures.
        deterministic: selects between the mean action and a sampled action.

    Returns:
        The accumulated episode reward as a numpy value.
    """
    if actor_cfg.normalize_obs:
        obs_mean, obs_var = agent.obs_normalizer.obs_rms.mean, agent.obs_normalizer.obs_rms.var
        print(f'{obs_mean=}, {obs_var=}')

    obs = env.reset()
    # The rollout starts with the reset state, so it always holds one more
    # entry than the number of environment steps taken.
    rollout = [env.unwrapped._state]
    total_reward = 0
    num_steps = 0
    measures = torch.zeros(env_cfg.num_dims).to(device)
    done = False
    while not done:
        with torch.no_grad():
            obs = obs.unsqueeze(dim=0).to(device)
            if actor_cfg.normalize_obs:
                # small epsilon keeps the division stable for near-zero variance
                obs = (obs - obs_mean) / torch.sqrt(obs_var + 1e-8)

            if deterministic:
                act = agent.actor_mean(obs)
            else:
                act, _, _ = agent.get_action(obs)
            act = act.squeeze()
            obs, rew, done, info = env.step(act.cpu())
            measures += info['measures']
            rollout.append(env.unwrapped._state)
            total_reward += rew
            num_steps += 1
    if render:
        i = HTML(html.render(env.unwrapped._env.sys, [s.qp for s in rollout]))
        display(i)
        print(f'{total_reward=}')
        print(f' Rollout length: {len(rollout)}')
        # Measures were accumulated once per step, so average over the number
        # of steps rather than len(rollout), which also counts the reset state.
        measures /= num_steps
        print(f'Measures: {measures.cpu().numpy()}')
    return total_reward.detach().cpu().numpy()

In [9]:
# Load the best elite from the archive and replay one deterministic episode with rendering.
# NOTE(review): the saved output of this cell ends in a CUDA
# `CUBLAS_STATUS_NOT_INITIALIZED` error; presumably the GPU had no memory left
# for PyTorch (e.g. because another framework reserved it) — TODO confirm.
agent = get_best_elite()
enjoy_brax(agent, render=True, deterministic=True)

Loading agent with reward 9690.89494960308 and measures [0.39760002 0.0024    ]
obs_mean=tensor([ 1.1482e+00,  9.8074e-01,  3.7447e-02,  2.5846e-02,  1.1774e-01,
        -1.9406e-01, -9.7953e-01, -9.7296e-01, -1.8796e-01,  1.0204e-01,
        -2.0870e-01, -6.1069e-01, -4.8137e-01, -2.7638e-01,  9.7254e-02,
        -3.1377e-01, -1.8911e-02, -2.7416e-02,  3.7872e-02, -3.6199e-02,
         3.5579e-01, -5.5705e-03,  3.8614e+00, -8.5031e-02, -6.5890e-02,
         2.0613e-04, -1.3888e-02,  6.1461e-02,  5.4298e-03,  2.8808e-02,
         5.2689e-03, -3.4796e-03, -4.5653e-02,  1.1099e-02, -7.2064e-03,
         2.7921e-04, -1.2132e-02, -4.1929e-03,  8.5052e-02, -7.1529e-02,
         7.8380e-03,  4.9258e-02, -5.0612e-02, -1.0985e-02,  3.4565e-02,
         1.9508e+00,  1.1516e-03,  2.7819e-02,  1.1516e-03,  1.9621e+00,
        -2.3481e-03,  2.7819e-02, -2.3481e-03,  1.8698e+00,  1.0269e+00,
         1.9857e-03,  5.2375e-03,  1.9857e-03,  1.0354e+00, -1.6861e-03,
         5.2375e-03, -1.6861e-03,  

RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`