In [1]:
import os
from pathlib import Path
# project_root = os.path.join(str(Path.home()), 'PPGA')
# os.chdir(project_root)
# %pwd # should be PPGA root dir

In [2]:
import pickle

import numpy as np
from attrdict import AttrDict
from RL.ppo import *
from utils.utilities import log
from envs.brax_custom.brax_env import make_vec_env_brax
from models.actor_critic import Actor, PGAMEActor
from pandas import DataFrame

from IPython.display import HTML, Image
from IPython.display import display
from brax.io import html, image

  self.hub = sentry_sdk.Hub(client)


In [22]:
# params to config
device = torch.device('cuda')
env_name = 'halfcheetah'
env_name = 'humanoid'
# env_name = 'ant'
env_name = 'walker2d'
if env_name == 'humanoid':
    clip_obs_rew = True
else:
    clip_obs_rew = False
seed = 1111
normalize_obs = True
normalize_rewards = True
# non-configurable params
obs_shapes = {
    'humanoid': (227,),
    'ant': (87,),
    'halfcheetah': (18,),
    'walker2d': (17,)
}
action_shapes = {
    'humanoid': (17,),
    'ant': (8,),
    'halfcheetah': (6,),
    'walker2d': (6,)
}

# define the final config objects
actor_cfg = AttrDict({
        'obs_shape': obs_shapes[env_name],
        'action_shape': action_shapes[env_name],
        'normalize_obs': normalize_obs,
        'normalize_rewards': normalize_rewards,
})
env_cfg = AttrDict({
        'env_name': env_name,
        'env_batch_size': None,
        'num_dims': 2 if not 'ant' in env_name else 4,
        'envs_per_model': 1,
        'seed': seed,
        'num_envs': 1,
        'clip_obs_rew': clip_obs_rew
})


In [44]:
# now lets load in a saved archive dataframe and scheduler
# change this to be your own checkpoint path
#archive_path = f'experiments/paper_ppga_{env_name}/1111/checkpoints/cp_00002000/archive_df_00002000.pkl'
archive_path = f'/home/wanzl/project/ppga_il/experiments_4_good_and_diverse_elite_with_measures_top500/IL_ppga_{env_name}_m_cond_gail_archive_bonus_wo_smooth/1111/checkpoints/cp_00002000/archive_df_00002000.pkl'
archive_path = '/home/wanzl/project/ppga_il/experiments_4_good_and_diverse_elite_with_measures_top500/IL_ppga_walker2d_m_cond_reg_gail_RegLoss_MSE_Bonus_single_step_bonus/1111/checkpoints/cp_00002000/archive_df_00002000.pkl'
#scheduler_path = f'experiments/paper_ppga_{env_name}/1111/checkpoints/cp_00002000/scheduler_00002000.pkl'
scheduler_path = f'/home/wanzl/project/ppga_il/experiments_4_good_and_diverse_elite_with_measures_top500/IL_ppga_{env_name}_m_cond_gail_archive_bonus_wo_smooth/1111/checkpoints/cp_00002000/scheduler_00002000.pkl'
scheduler_path = '/home/wanzl/project/ppga_il/experiments_4_good_and_diverse_elite_with_measures_top500/IL_ppga_walker2d_m_cond_reg_gail_RegLoss_MSE_Bonus_single_step_bonus/1111/checkpoints/cp_00002000/scheduler_00002000.pkl'
with open(archive_path, 'rb') as f:
    archive_df = pickle.load(f)
with open(scheduler_path, 'rb') as f:
    scheduler = pickle.load(f)

In [45]:
# create the environment
env = make_vec_env_brax(env_cfg)

In [46]:
def get_best_elite():
    best_elite = scheduler.archive.best_elite
    print(f'Loading agent with reward {best_elite.objective} and measures {best_elite.measures}')
    agent = Actor(obs_shape=actor_cfg.obs_shape[0], action_shape=actor_cfg.action_shape, normalize_obs=normalize_obs, normalize_returns=normalize_rewards).deserialize(best_elite.solution).to(device)
    if actor_cfg.normalize_obs:
        norm = best_elite.metadata['obs_normalizer']
        if isinstance(norm, dict):
            agent.obs_normalizer.load_state_dict(norm)
        else:
            agent.obs_normalizer = norm
    return agent

In [47]:
def get_random_elite():
    elite = scheduler.archive.sample_elites(1)
    print(f'Loading agent with reward {elite.objective_batch[0]} and measures {elite.measures_batch[0]}')
    agent = Actor(obs_shape=actor_cfg.obs_shape[0], action_shape=actor_cfg.action_shape, normalize_obs=normalize_obs, normalize_returns=normalize_rewards).deserialize(elite.solution_batch.flatten()).to(device)
    if actor_cfg.normalize_obs:
        norm = elite.metadata_batch[0]['obs_normalizer']
        if isinstance(norm, dict):
            agent.obs_normalizer.load_state_dict(norm)
        else:
            agent.obs_normalizer = norm
    return agent

In [48]:
def get_measure_elite(measure):
    elite = scheduler.archive.elites_with_measures_single(measure)
    print(f'Loading agent with reward {elite.objective} and measures {elite.measures}')
    agent = Actor(obs_shape=actor_cfg.obs_shape[0], action_shape=actor_cfg.action_shape, normalize_obs=normalize_obs, normalize_returns=normalize_rewards).deserialize(elite.solution).to(device)
    if actor_cfg.normalize_obs:
        norm = elite.metadata['obs_normalizer']
        if isinstance(norm, dict):
            agent.obs_normalizer.load_state_dict(norm)
        else:
            agent.obs_normalizer = norm
    return agent

In [49]:
def enjoy_brax(agent, render=True, deterministic=True):
    if actor_cfg.normalize_obs:
        obs_mean, obs_var = agent.obs_normalizer.obs_rms.mean, agent.obs_normalizer.obs_rms.var
        print(f'{obs_mean=}, {obs_var=}')

    obs = env.reset()
    rollout = [env.unwrapped._state]
    total_reward = 0
    measures = torch.zeros(env_cfg.num_dims).to(device)
    done = False
    while not done:
        with torch.no_grad():
            obs = obs.unsqueeze(dim=0).to(device)
            if actor_cfg.normalize_obs:
                obs = (obs - obs_mean) / torch.sqrt(obs_var + 1e-8)

            if deterministic:
                act = agent.actor_mean(obs)
            else:
                act, _, _ = agent.get_action(obs)
            act = act.squeeze()
            obs, rew, done, info = env.step(act.cpu())
            measures += info['measures']
            rollout.append(env.unwrapped._state)
            total_reward += rew
    if render:
        i = HTML(html.render(env.unwrapped._env.sys, [s.qp for s in rollout]))
        display(i)
        print(f'{total_reward=}')
        print(f' Rollout length: {len(rollout)}')
        measures /= len(rollout)
        print(f'Measures: {measures.cpu().numpy()}')
    return total_reward.detach().cpu().numpy()

In [60]:
agent1 = get_measure_elite([0.8,0.99])
agent2 = get_measure_elite([0.75,0.75])
agent3 = get_measure_elite([0.75,0.25])
agent4 = get_measure_elite([0.25,0.75])
#enjoy_brax(agent1, render=True, deterministic=True)
enjoy_brax(agent1, render=True, deterministic=True)


Loading agent with reward 1015.3101218611002 and measures [0.81980002 0.98550004]
Loading agent with reward 1077.082643074263 and measures [0.75850004 0.75980002]
Loading agent with reward 1641.439247147739 and measures [0.74010003 0.24109998]


  agent = Actor(obs_shape=actor_cfg.obs_shape[0], action_shape=actor_cfg.action_shape, normalize_obs=normalize_obs, normalize_returns=normalize_rewards).deserialize(elite.solution).to(device)


Loading agent with reward 1012.4368193849921 and measures [0.24165498 0.75533593]
obs_mean=tensor([ 1.0761, -0.6388, -0.2689, -0.8916,  0.5126, -0.4979, -1.3253,  0.1942,
         1.0739, -0.0369, -0.1174,  0.0662,  0.1956, -0.0860,  0.1319,  0.3176,
        -0.0181], device='cuda:0'), obs_var=tensor([0.0137, 0.0576, 0.0726, 0.1358, 0.0861, 0.1061, 0.4107, 0.2686, 1.4051,
        0.9117, 1.8437, 3.8457, 7.6250, 4.8381, 4.6738, 8.1415, 9.1122],
       device='cuda:0')


total_reward=tensor(1007.9168, device='cuda:0')
 Rollout length: 1001
Measures: [0.9110889 0.9710289]


array(1007.9168, dtype=float32)