# RLlib Sample Application: CartPole

First, let's make sure that Ray and RLlib are installed…

In [1]:
!pip install ray[rllib]
!pip install ray[debug]
!pip install ray[tune]
!pip install pandas
!pip install requests
!pip install tensorflow







Then we start Ray…

In [2]:
import ray

In [3]:
import ray.rllib.agents.ppo as ppo

In [4]:
ray.shutdown()
ray.init(ignore_reinit_error=True)

2020-03-29 16:09:22,817	INFO resource_spec.py:212 -- Starting Ray with 4.59 GiB memory available for workers and up to 2.31 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-03-29 16:09:23,315	INFO services.py:1078 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m


{'node_ip_address': '192.168.1.244',
 'redis_address': '192.168.1.244:32105',
 'object_store_address': '/tmp/ray/session_2020-03-29_16-09-22_797419_27655/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2020-03-29_16-09-22_797419_27655/sockets/raylet',
 'webui_url': 'localhost:8265',
 'session_dir': '/tmp/ray/session_2020-03-29_16-09-22_797419_27655'}

After a successful launch, there should be a log output line that reads something to the effect of `View the Ray dashboard at localhost:8265` in which case open another browser tab for the Ray dashboard at <http://localhost:8265/>

Next we'll train an RLlib policy with the `CartPole-v0` environment, which is a relatively simple and quick example. For more details about this problem, see the tutorial [*Cartpole - Introduction to Reinforcement Learning (DQN - Deep Q-Learning)*](https://towardsdatascience.com/cartpole-introduction-to-reinforcement-learning-ed0eb5b58288) by [Greg Surma](https://twitter.com/GSurma).

In [5]:
config = ppo.DEFAULT_CONFIG.copy()
config["log_level"] = "WARN"

n_iter = 10
reward_history = []

agent = ppo.PPOTrainer(config, env="CartPole-v0")

for _ in range(n_iter):
    result = agent.train()
    print(result)

    max_reward = result["episode_reward_max"]
    reward_history.append(max_reward)

    file_name = agent.save("/tmp/ppo")
    print(f"\n{file_name}")

2020-03-29 16:09:24,076	INFO trainer.py:420 -- Tip: set 'eager': true or the --eager flag to enable TensorFlow eager execution
2020-03-29 16:09:24,199	INFO trainer.py:580 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
  obj = yaml.load(type_)


[2m[36m(pid=27702)[0m   obj = yaml.load(type_)
[2m[36m(pid=27699)[0m   obj = yaml.load(type_)
{'episode_reward_max': 77.0, 'episode_reward_min': 8.0, 'episode_reward_mean': 21.884615384615383, 'episode_len_mean': 21.884615384615383, 'episodes_this_iter': 182, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [16.0, 15.0, 12.0, 17.0, 18.0, 13.0, 17.0, 13.0, 13.0, 19.0, 11.0, 15.0, 16.0, 17.0, 11.0, 38.0, 21.0, 10.0, 35.0, 11.0, 53.0, 24.0, 15.0, 13.0, 25.0, 18.0, 15.0, 28.0, 14.0, 15.0, 17.0, 44.0, 19.0, 20.0, 12.0, 24.0, 13.0, 19.0, 19.0, 19.0, 16.0, 21.0, 36.0, 14.0, 12.0, 33.0, 43.0, 12.0, 12.0, 13.0, 14.0, 26.0, 16.0, 9.0, 12.0, 14.0, 27.0, 15.0, 24.0, 10.0, 31.0, 17.0, 38.0, 14.0, 11.0, 52.0, 28.0, 22.0, 55.0, 23.0, 33.0, 22.0, 10.0, 14.0, 14.0, 10.0, 17.0, 45.0, 17.0, 20.0, 39.0, 15.0, 17.0, 13.0, 26.0, 13.0, 11.0, 36.0, 17.0, 8.0, 14.0, 23.0, 17.0, 12.0, 17.0, 13.0, 23.0, 29.0, 12.0, 30.0, 32.0

{'episode_reward_max': 200.0, 'episode_reward_min': 10.0, 'episode_reward_mean': 67.14, 'episode_len_mean': 67.14, 'episodes_this_iter': 42, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [123.0, 49.0, 29.0, 117.0, 166.0, 60.0, 54.0, 116.0, 170.0, 149.0, 31.0, 177.0, 85.0, 85.0, 77.0, 79.0, 188.0, 101.0, 50.0, 37.0, 26.0, 27.0, 84.0, 51.0, 77.0, 84.0, 42.0, 50.0, 45.0, 127.0, 23.0, 113.0, 18.0, 151.0, 200.0, 175.0, 13.0, 91.0, 135.0, 152.0, 136.0, 125.0, 18.0, 88.0, 36.0, 36.0, 16.0, 71.0, 61.0, 37.0, 38.0, 12.0, 102.0, 22.0, 102.0, 76.0, 14.0, 13.0, 93.0, 10.0, 59.0, 72.0, 43.0, 14.0, 22.0, 54.0, 22.0, 24.0, 52.0, 117.0, 87.0, 25.0, 68.0, 200.0, 12.0, 47.0, 30.0, 19.0, 11.0, 44.0, 16.0, 24.0, 35.0, 76.0, 18.0, 147.0, 67.0, 17.0, 35.0, 32.0, 24.0, 77.0, 18.0, 62.0, 61.0, 108.0, 32.0, 18.0, 23.0, 69.0], 'episode_lengths': [123, 49, 29, 117, 166, 60, 54, 116, 170, 149, 31, 177, 85, 85, 77, 79, 188, 101, 5

{'episode_reward_max': 200.0, 'episode_reward_min': 13.0, 'episode_reward_mean': 123.8, 'episode_len_mean': 123.8, 'episodes_this_iter': 23, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [200.0, 200.0, 109.0, 200.0, 200.0, 87.0, 200.0, 200.0, 162.0, 127.0, 200.0, 200.0, 200.0, 200.0, 200.0, 14.0, 200.0, 82.0, 154.0, 200.0, 200.0, 200.0, 200.0, 35.0, 32.0, 24.0, 77.0, 18.0, 62.0, 61.0, 108.0, 32.0, 18.0, 23.0, 69.0, 123.0, 49.0, 29.0, 117.0, 166.0, 60.0, 54.0, 116.0, 170.0, 149.0, 31.0, 177.0, 85.0, 85.0, 77.0, 79.0, 188.0, 101.0, 50.0, 37.0, 26.0, 27.0, 84.0, 51.0, 77.0, 84.0, 42.0, 50.0, 45.0, 127.0, 23.0, 113.0, 18.0, 151.0, 200.0, 175.0, 13.0, 91.0, 135.0, 152.0, 136.0, 125.0, 200.0, 200.0, 200.0, 152.0, 200.0, 175.0, 200.0, 178.0, 71.0, 200.0, 200.0, 200.0, 129.0, 179.0, 163.0, 156.0, 200.0, 200.0, 192.0, 200.0, 200.0, 78.0, 125.0], 'episode_lengths': [200, 200, 109, 200, 200, 87, 200, 200, 162, 12

{'episode_reward_max': 200.0, 'episode_reward_min': 13.0, 'episode_reward_mean': 174.15, 'episode_len_mean': 174.15, 'episodes_this_iter': 20, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 196.0, 200.0, 200.0, 200.0, 45.0, 127.0, 23.0, 113.0, 18.0, 151.0, 200.0, 175.0, 13.0, 91.0, 135.0, 152.0, 136.0, 125.0, 200.0, 200.0, 200.0, 152.0, 200.0, 175.0, 200.0, 178.0, 71.0, 200.0, 200.0, 200.0, 129.0, 179.0, 163.0, 156.0, 200.0, 200.0, 192.0, 200.0, 200.0, 78.0, 125.0, 200.0, 200.0, 109.0, 200.0, 200.0, 87.0, 200.0, 200.0, 162.0, 127.0, 200.0, 200.0, 200.0, 200.0, 200.0, 14.0, 200.0, 82.0, 154.0, 200.0, 200.0, 200.0, 200.0, 185.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 197.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0], 'episode_lengths': [200, 200, 200,

{'episode_reward_max': 200.0, 'episode_reward_min': 14.0, 'episode_reward_mean': 192.14, 'episode_len_mean': 192.14, 'episodes_this_iter': 20, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [174.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 172.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 87.0, 200.0, 200.0, 162.0, 127.0, 200.0, 200.0, 200.0, 200.0, 200.0, 14.0, 200.0, 82.0, 154.0, 200.0, 200.0, 200.0, 200.0, 185.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 197.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 196.0, 200.0, 200.0, 200.0, 200.0, 200.0, 176.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 152.0, 200.0, 183.0, 200.0, 157.0, 200.0, 200.0, 200.0, 200.0, 196.0], 'episode_lengths': [174, 20

In [6]:
print(reward_history)

[77.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0]


The history of `max_reward` shows that this model `200` by the third iteration -- which is good, since the [*solution*](https://gym.openai.com/envs/CartPole-v0/) for `CartPole-v0` is to get an average reward of `195.0` over a hundred consecutive trials.

In [7]:
! rllib rollout \
    /tmp/ppo/checkpoint_10/checkpoint-10 \
    --config "{\"env\": \"CartPole-v0\"}" --run PPO \
    --steps 2000

2020-03-29 16:10:23,202	INFO resource_spec.py:212 -- Starting Ray with 4.69 GiB memory available for workers and up to 2.35 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-03-29 16:10:23,608	INFO services.py:1078 -- View the Ray dashboard at [1m[32mlocalhost:8266[39m[22m
2020-03-29 16:10:24,416	INFO trainer.py:420 -- Tip: set 'eager': true or the --eager flag to enable TensorFlow eager execution
2020-03-29 16:10:24,459	INFO trainer.py:580 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
  obj = yaml.load(type_)
2020-03-29 16:10:30,590	INFO trainable.py:416 -- Restored on 192.168.1.244 from checkpoint: /tmp/ppo/checkpoint_10/checkpoint-10
2020-03-29 16:10:30,590	INFO trainable.py:423 -- Current state after restoring: {'_iteration': 10, '_timesteps_total': 40000, '_time_total': 47.38443112373352, '_episodes_total': 455}
[2m[36m(pid=27935)[0m   obj = yam

Now that we've trained a model, we can look at its resulting policy…

In [8]:
import pprint

policy = agent.get_policy()
model = policy.model

pprint.pprint(model.variables())
pprint.pprint(model.value_function())

[<tf.Variable 'default_policy/fc_1/kernel:0' shape=(4, 256) dtype=float32>,
 <tf.Variable 'default_policy/fc_1/bias:0' shape=(256,) dtype=float32>,
 <tf.Variable 'default_policy/fc_value_1/kernel:0' shape=(4, 256) dtype=float32>,
 <tf.Variable 'default_policy/fc_value_1/bias:0' shape=(256,) dtype=float32>,
 <tf.Variable 'default_policy/fc_2/kernel:0' shape=(256, 256) dtype=float32>,
 <tf.Variable 'default_policy/fc_2/bias:0' shape=(256,) dtype=float32>,
 <tf.Variable 'default_policy/fc_value_2/kernel:0' shape=(256, 256) dtype=float32>,
 <tf.Variable 'default_policy/fc_value_2/bias:0' shape=(256,) dtype=float32>,
 <tf.Variable 'default_policy/fc_out/kernel:0' shape=(256, 2) dtype=float32>,
 <tf.Variable 'default_policy/fc_out/bias:0' shape=(2,) dtype=float32>,
 <tf.Variable 'default_policy/value_out/kernel:0' shape=(256, 1) dtype=float32>,
 <tf.Variable 'default_policy/value_out/bias:0' shape=(1,) dtype=float32>]
<tf.Tensor 'Reshape:0' shape=(?,) dtype=float32>


In [9]:
model.base_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
observations (InputLayer)       [(None, 4)]          0                                            
__________________________________________________________________________________________________
fc_1 (Dense)                    (None, 256)          1280        observations[0][0]               
__________________________________________________________________________________________________
fc_value_1 (Dense)              (None, 256)          1280        observations[0][0]               
__________________________________________________________________________________________________
fc_2 (Dense)                    (None, 256)          65792       fc_1[0][0]                       
______________________________________________________________________________________________