In [4]:
import gym
import ray
import ray.rllib.algorithms.ppo as ppo
from ray.rllib.algorithms.algorithm import Algorithm
import imageio
import numpy as np
from ray import tune, air
from ray.rllib.utils.framework import try_import_torch
from ray.rllib.policy.sample_batch import SampleBatch
torch, _ = try_import_torch()

In [5]:
ray.shutdown()
ray.init()
lstm_cell_size = 64
config = (
    ppo.PPOConfig()
    .environment("CartPole-v1")
    .framework("torch")
    .training(model={"use_lstm": True, "lstm_cell_size": lstm_cell_size})
)

stop = {
    "training_iteration": 10,
    "timesteps_total": 100000,
    "episode_reward_mean": 200,
}

tuner = tune.Tuner(
    "PPO",
    param_space=config.to_dict(),
    run_config=air.RunConfig(stop=stop),
)

results = tuner.fit()
result = results.get_best_result()
algo = Algorithm.from_checkpoint(result.checkpoint)
ppo.PPO(config=config, env="CartPole-v1")
algo.restore(result.checkpoint)
# Perform inference (action computations) based on given env observations

env = gym.make("CartPole-v1")
# Get the initial observation (some value between -10.0 and 10.0).

obs = env.reset()
done = False
# In case the model needs previous-reward/action inputs, keep track of
# these via these variables here (we'll have to pass them into the
# compute_actions methods below).
# Do we need prev-action/reward as part of the input?

init_prev_a = prev_a = None
init_prev_r = prev_r = None
# range(2) b/c h- and c-states of the LSTM.

state = [np.zeros([lstm_cell_size], np.float32) for _ in range(2)]
filename = "testtesttest.mp4"
with imageio.get_writer(filename, fps=30) as video:
    while not done:
        # Compute an action (`a`).
        a, state_out, _ = algo.compute_single_action(
            observation=obs,
            state=state,
            prev_action=prev_a,
            prev_reward=prev_r,
            explore=False,
        )
        # Send the computed action `a` to the env.
        obs, reward, done, info = env.step(a)
        state = state_out
        if init_prev_a is not None:
            prev_a = a
        if init_prev_r is not None:
            prev_r = reward
        video.append_data(env.render(mode="rgb_array"))

2023-02-01 14:47:48,813	INFO worker.py:1538 -- Started a local Ray instance.


0,1
Current time:,2023-02-01 14:49:54
Running for:,00:02:04.83
Memory:,12.6/15.6 GiB

Trial name,status,loc,iter,total time (s),ts,reward,episode_reward_max,episode_reward_min,episode_len_mean
PPO_CartPole-v1_04576_00000,TERMINATED,192.168.1.72:47527,10,116.896,40000,185.91,473,73,185.91


[2m[36m(PPO pid=47527)[0m 2023-02-01 14:47:53,667	INFO algorithm.py:501 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


Trial name,agent_timesteps_total,counters,custom_metrics,date,done,episode_len_mean,episode_media,episode_reward_max,episode_reward_mean,episode_reward_min,episodes_this_iter,episodes_total,experiment_id,hostname,info,iterations_since_restore,node_ip,num_agent_steps_sampled,num_agent_steps_trained,num_env_steps_sampled,num_env_steps_sampled_this_iter,num_env_steps_trained,num_env_steps_trained_this_iter,num_faulty_episodes,num_healthy_workers,num_in_flight_async_reqs,num_remote_worker_restarts,num_steps_trained_this_iter,perf,pid,policy_reward_max,policy_reward_mean,policy_reward_min,sampler_perf,sampler_results,time_since_restore,time_this_iter_s,time_total_s,timers,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
PPO_CartPole-v1_04576_00000,40000,"{'num_env_steps_sampled': 40000, 'num_env_steps_trained': 40000, 'num_agent_steps_sampled': 40000, 'num_agent_steps_trained': 40000}",{},2023-02-01_14-49-54,True,185.91,{},473,185.91,73,16,615,bc2a93bcf6614718891b4852b410c7bd,clem-MS-7B24,"{'learner': {'default_policy': {'learner_stats': {'allreduce_latency': 0.0, 'grad_gnorm': 2.3527032748185177, 'cur_kl_coeff': 0.22500000000000006, 'cur_lr': 5.0000000000000016e-05, 'total_loss': 9.599447546723068, 'policy_loss': -0.0037448132440688147, 'vf_loss': 9.601659614809098, 'vf_explained_var': -0.0234281623876223, 'kl': 0.006812256550837265, 'entropy': 0.5892923270502398, 'entropy_coeff': 0.0}, 'model': {}, 'custom_metrics': {}, 'num_agent_steps_trained': 127.74193548387096, 'num_grad_updates_lifetime': 8835.5, 'diff_num_grad_updates_vs_sampler_policy': 464.5}}, 'num_env_steps_sampled': 40000, 'num_env_steps_trained': 40000, 'num_agent_steps_sampled': 40000, 'num_agent_steps_trained': 40000}",10,192.168.1.72,40000,40000,40000,4000,40000,4000,0,2,0,0,4000,"{'cpu_util_percent': 31.30625, 'ram_util_percent': 81.0}",47527,{},{},{},"{'mean_raw_obs_processing_ms': 0.2770729001245672, 'mean_inference_ms': 0.9536920048681804, 'mean_action_processing_ms': 0.04858773256284574, 'mean_env_wait_ms': 0.04717422188142363, 'mean_env_render_ms': 0.0}","{'episode_reward_max': 473.0, 'episode_reward_min': 73.0, 'episode_reward_mean': 185.91, 'episode_len_mean': 185.91, 'episode_media': {}, 'episodes_this_iter': 16, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [197.0, 124.0, 73.0, 113.0, 100.0, 135.0, 142.0, 87.0, 152.0, 161.0, 178.0, 123.0, 172.0, 158.0, 126.0, 123.0, 191.0, 167.0, 445.0, 182.0, 163.0, 135.0, 148.0, 192.0, 151.0, 119.0, 284.0, 122.0, 176.0, 226.0, 282.0, 160.0, 166.0, 187.0, 181.0, 172.0, 132.0, 131.0, 130.0, 211.0, 133.0, 202.0, 167.0, 162.0, 145.0, 153.0, 81.0, 184.0, 117.0, 147.0, 170.0, 221.0, 152.0, 105.0, 251.0, 165.0, 214.0, 276.0, 116.0, 202.0, 205.0, 242.0, 125.0, 145.0, 223.0, 284.0, 220.0, 183.0, 171.0, 279.0, 261.0, 147.0, 110.0, 186.0, 162.0, 149.0, 127.0, 224.0, 139.0, 122.0, 250.0, 183.0, 286.0, 110.0, 190.0, 192.0, 199.0, 473.0, 245.0, 350.0, 247.0, 268.0, 159.0, 299.0, 103.0, 274.0, 441.0, 162.0, 192.0, 284.0], 'episode_lengths': [197, 124, 73, 113, 100, 135, 142, 87, 152, 161, 178, 123, 172, 158, 126, 123, 191, 167, 445, 182, 163, 135, 148, 192, 151, 119, 284, 122, 176, 226, 282, 160, 166, 187, 181, 172, 132, 131, 130, 211, 133, 202, 167, 162, 145, 153, 81, 184, 117, 147, 170, 221, 152, 105, 251, 165, 214, 276, 116, 202, 205, 242, 125, 145, 223, 284, 220, 183, 171, 279, 261, 147, 110, 186, 162, 149, 127, 224, 139, 122, 250, 183, 286, 110, 190, 192, 199, 473, 245, 350, 247, 268, 159, 299, 103, 274, 441, 162, 192, 284]}, 'sampler_perf': {'mean_raw_obs_processing_ms': 0.2770729001245672, 'mean_inference_ms': 0.9536920048681804, 'mean_action_processing_ms': 0.04858773256284574, 'mean_env_wait_ms': 0.04717422188142363, 'mean_env_render_ms': 0.0}, 'num_faulty_episodes': 0}",116.896,11.3296,116.896,"{'training_iteration_time_ms': 11686.437, 'load_time_ms': 14.615, 'load_throughput': 273693.882, 'learn_time_ms': 8995.509, 'learn_throughput': 444.666, 'synch_weights_time_ms': 1.91}",1675259394,0,40000,10,04576_00000,3.90964


2023-02-01 14:49:55,126	INFO tune.py:762 -- Total run time: 125.32 seconds (124.82 seconds for the tuning loop).
2023-02-01 14:49:55,163	INFO algorithm.py:501 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
2023-02-01 14:50:01,967	INFO trainable.py:790 -- Restored on 192.168.1.72 from checkpoint: /home/clem/ray_results/PPO/PPO_CartPole-v1_04576_00000_0_2023-02-01_14-47-49/checkpoint_000010
2023-02-01 14:50:01,967	INFO trainable.py:799 -- Current state after restoring: {'_iteration': 10, '_timesteps_total': None, '_time_total': 116.89642643928528, '_episodes_total': 615}


In [6]:

# tensorboard --logdir /home/clem/ray_results/
# rllib evaluate /home/clem/ray_results/PPO/PPO_CryptoEnv_fcdc7_00000_0_2023-01-16_15-56-42/checkpoint_000100 --config "{\"env\": \"CartPole-v1\"}" --run PPO --steps 300
