# RLlib Sample Application

First, let's make sure that Ray and RLlib are installed…

In [1]:
!pip install ray[rllib]
!pip install ray[debug]
!pip install ray[tune]





Then we start Ray…

In [2]:
import ray
import ray.rllib.agents.ppo as ppo

ray.shutdown()
ray.init(ignore_reinit_error=True)

2020-03-21 18:11:05,442	INFO resource_spec.py:212 -- Starting Ray with 2.39 GiB memory available for workers and up to 1.2 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-03-21 18:11:05,866	INFO services.py:1078 -- View the Ray dashboard at [1m[32mlocalhost:8265[39m[22m


{'node_ip_address': '192.168.1.65',
 'redis_address': '192.168.1.65:50527',
 'object_store_address': '/tmp/ray/session_2020-03-21_18-11-05_431051_2939/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2020-03-21_18-11-05_431051_2939/sockets/raylet',
 'webui_url': 'localhost:8265',
 'session_dir': '/tmp/ray/session_2020-03-21_18-11-05_431051_2939'}

After a successful launch, there should be a log output line that reads something to the effect of `View the Ray dashboard at localhost:8265` in which case open another browser tab for the Ray dashboard at <http://localhost:8265/>

Next we'll train an RLlib policy with the `CartPole-v0` environment, which is a relatively simple and quick example. For more details about this problem, see the tutorial [*Cartpole - Introduction to Reinforcement Learning (DQN - Deep Q-Learning)*](https://towardsdatascience.com/cartpole-introduction-to-reinforcement-learning-ed0eb5b58288) by [Greg Surma](https://twitter.com/GSurma).

In [3]:
config = ppo.DEFAULT_CONFIG.copy()
config["log_level"] = "WARN"

n_iter = 10
reward_history = []

agent = ppo.PPOTrainer(config, env="CartPole-v0")

for _ in range(n_iter):
    result = agent.train()
    print(result)

    max_reward = result["episode_reward_max"]
    reward_history.append(max_reward)

    file_name = agent.save("/tmp/ppo")
    print(f"\n{file_name}")

2020-03-21 18:11:13,165	INFO trainer.py:420 -- Tip: set 'eager': true or the --eager flag to enable TensorFlow eager execution
2020-03-21 18:11:13,211	INFO trainer.py:580 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
  obj = yaml.load(type_)


[2m[36m(pid=2998)[0m   obj = yaml.load(type_)
[2m[36m(pid=2997)[0m   obj = yaml.load(type_)
{'episode_reward_max': 75.0, 'episode_reward_min': 9.0, 'episode_reward_mean': 21.0, 'episode_len_mean': 21.0, 'episodes_this_iter': 187, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [11.0, 23.0, 18.0, 31.0, 48.0, 21.0, 20.0, 27.0, 13.0, 15.0, 36.0, 26.0, 12.0, 13.0, 23.0, 15.0, 49.0, 24.0, 32.0, 38.0, 16.0, 13.0, 12.0, 24.0, 9.0, 14.0, 14.0, 46.0, 20.0, 44.0, 15.0, 15.0, 27.0, 24.0, 20.0, 14.0, 19.0, 14.0, 34.0, 15.0, 30.0, 19.0, 13.0, 25.0, 27.0, 22.0, 27.0, 31.0, 16.0, 16.0, 75.0, 24.0, 35.0, 21.0, 11.0, 14.0, 25.0, 35.0, 13.0, 42.0, 15.0, 17.0, 19.0, 15.0, 30.0, 16.0, 12.0, 10.0, 22.0, 13.0, 12.0, 16.0, 17.0, 35.0, 11.0, 35.0, 27.0, 15.0, 29.0, 13.0, 12.0, 11.0, 15.0, 18.0, 11.0, 19.0, 10.0, 24.0, 42.0, 26.0, 19.0, 31.0, 19.0, 17.0, 13.0, 17.0, 12.0, 12.0, 12.0, 11.0, 14.0, 23.0, 13.0, 16.0, 12.0, 26.

{'episode_reward_max': 200.0, 'episode_reward_min': 11.0, 'episode_reward_mean': 72.05, 'episode_len_mean': 72.05, 'episodes_this_iter': 37, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [121.0, 23.0, 88.0, 163.0, 68.0, 115.0, 80.0, 200.0, 53.0, 32.0, 114.0, 116.0, 169.0, 110.0, 146.0, 109.0, 67.0, 200.0, 123.0, 99.0, 99.0, 84.0, 110.0, 133.0, 47.0, 101.0, 116.0, 106.0, 167.0, 200.0, 29.0, 59.0, 83.0, 34.0, 62.0, 200.0, 149.0, 22.0, 19.0, 25.0, 59.0, 54.0, 66.0, 37.0, 33.0, 72.0, 78.0, 11.0, 91.0, 102.0, 75.0, 41.0, 100.0, 16.0, 23.0, 52.0, 35.0, 37.0, 41.0, 77.0, 66.0, 108.0, 76.0, 55.0, 37.0, 67.0, 75.0, 49.0, 48.0, 22.0, 18.0, 82.0, 145.0, 23.0, 17.0, 25.0, 144.0, 15.0, 85.0, 18.0, 62.0, 65.0, 21.0, 51.0, 73.0, 42.0, 65.0, 27.0, 12.0, 22.0, 34.0, 34.0, 82.0, 87.0, 16.0, 23.0, 27.0, 59.0, 60.0, 27.0], 'episode_lengths': [121, 23, 88, 163, 68, 115, 80, 200, 53, 32, 114, 116, 169, 110, 146, 109, 67, 20

{'episode_reward_max': 200.0, 'episode_reward_min': 12.0, 'episode_reward_mean': 126.64, 'episode_len_mean': 126.64, 'episodes_this_iter': 20, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [200.0, 200.0, 200.0, 200.0, 200.0, 190.0, 200.0, 146.0, 200.0, 200.0, 72.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 191.0, 200.0, 65.0, 21.0, 51.0, 73.0, 42.0, 65.0, 27.0, 12.0, 22.0, 34.0, 34.0, 82.0, 87.0, 16.0, 23.0, 27.0, 59.0, 60.0, 27.0, 121.0, 23.0, 88.0, 163.0, 68.0, 115.0, 80.0, 200.0, 53.0, 32.0, 114.0, 116.0, 169.0, 110.0, 146.0, 109.0, 67.0, 200.0, 123.0, 99.0, 99.0, 84.0, 110.0, 133.0, 47.0, 101.0, 116.0, 106.0, 167.0, 200.0, 29.0, 59.0, 83.0, 34.0, 62.0, 200.0, 149.0, 200.0, 200.0, 200.0, 183.0, 200.0, 200.0, 200.0, 93.0, 55.0, 80.0, 142.0, 200.0, 200.0, 200.0, 97.0, 200.0, 199.0, 14.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0], 'episode_lengths': [200, 200, 200, 200, 200, 190, 200, 146, 200, 

{'episode_reward_max': 200.0, 'episode_reward_min': 14.0, 'episode_reward_mean': 174.49, 'episode_len_mean': 174.49, 'episodes_this_iter': 20, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 196.0, 110.0, 133.0, 47.0, 101.0, 116.0, 106.0, 167.0, 200.0, 29.0, 59.0, 83.0, 34.0, 62.0, 200.0, 149.0, 200.0, 200.0, 200.0, 183.0, 200.0, 200.0, 200.0, 93.0, 55.0, 80.0, 142.0, 200.0, 200.0, 200.0, 97.0, 200.0, 199.0, 14.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 190.0, 200.0, 146.0, 200.0, 200.0, 72.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 191.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 169.0, 168.0, 177.0, 200.0, 152.0, 129.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0], 'episode_lengths': [200, 200, 200, 2

{'episode_reward_max': 200.0, 'episode_reward_min': 70.0, 'episode_reward_mean': 194.6, 'episode_len_mean': 194.6, 'episodes_this_iter': 20, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 190.0, 200.0, 146.0, 200.0, 200.0, 72.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 191.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 169.0, 168.0, 177.0, 200.0, 152.0, 129.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 196.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 70.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0], 'episode_lengths': [200, 200

In [4]:
print(reward_history)

[75.0, 145.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0, 200.0]


The history of `max_reward` shows that this model `200` by the third iteration -- which is good, since the [*solution*](https://gym.openai.com/envs/CartPole-v0/) for `CartPole-v0` is to get an average reward of `195.0` over a hundred consecutive trials.

In [7]:
! rllib rollout \
    /tmp/ppo/checkpoint_10/checkpoint-10 \
    --config "{\"env\": \"CartPole-v0\"}" --run PPO \
    --steps 2000

2020-03-21 18:13:13,752	INFO resource_spec.py:212 -- Starting Ray with 4.15 GiB memory available for workers and up to 2.09 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-03-21 18:13:14,122	INFO services.py:1078 -- View the Ray dashboard at [1m[32mlocalhost:8266[39m[22m
2020-03-21 18:13:14,807	INFO trainer.py:420 -- Tip: set 'eager': true or the --eager flag to enable TensorFlow eager execution
2020-03-21 18:13:14,840	INFO trainer.py:580 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
  obj = yaml.load(type_)
2020-03-21 18:13:19,802	INFO trainable.py:416 -- Restored on 192.168.1.65 from checkpoint: /tmp/ppo/checkpoint_10/checkpoint-10
2020-03-21 18:13:19,802	INFO trainable.py:423 -- Current state after restoring: {'_iteration': 10, '_timesteps_total': 40000, '_time_total': 41.65511107444763, '_episodes_total': 448}
[2m[36m(pid=3040)[0m   obj = yaml.

Now that we've trained a model, we can look at its resulting policy…

In [5]:
import pprint

policy = agent.get_policy()
model = policy.model

pprint.pprint(model.variables())
pprint.pprint(model.value_function())

[<tf.Variable 'default_policy/fc_1/kernel:0' shape=(4, 256) dtype=float32>,
 <tf.Variable 'default_policy/fc_1/bias:0' shape=(256,) dtype=float32>,
 <tf.Variable 'default_policy/fc_value_1/kernel:0' shape=(4, 256) dtype=float32>,
 <tf.Variable 'default_policy/fc_value_1/bias:0' shape=(256,) dtype=float32>,
 <tf.Variable 'default_policy/fc_2/kernel:0' shape=(256, 256) dtype=float32>,
 <tf.Variable 'default_policy/fc_2/bias:0' shape=(256,) dtype=float32>,
 <tf.Variable 'default_policy/fc_value_2/kernel:0' shape=(256, 256) dtype=float32>,
 <tf.Variable 'default_policy/fc_value_2/bias:0' shape=(256,) dtype=float32>,
 <tf.Variable 'default_policy/fc_out/kernel:0' shape=(256, 2) dtype=float32>,
 <tf.Variable 'default_policy/fc_out/bias:0' shape=(2,) dtype=float32>,
 <tf.Variable 'default_policy/value_out/kernel:0' shape=(256, 1) dtype=float32>,
 <tf.Variable 'default_policy/value_out/bias:0' shape=(1,) dtype=float32>]
<tf.Tensor 'Reshape:0' shape=(?,) dtype=float32>


In [6]:
model.base_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
observations (InputLayer)       [(None, 4)]          0                                            
__________________________________________________________________________________________________
fc_1 (Dense)                    (None, 256)          1280        observations[0][0]               
__________________________________________________________________________________________________
fc_value_1 (Dense)              (None, 256)          1280        observations[0][0]               
__________________________________________________________________________________________________
fc_2 (Dense)                    (None, 256)          65792       fc_1[0][0]                       
______________________________________________________________________________________________