In [1]:
import numpy as np
import gym
from ray.rllib.models import ModelCatalog
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork
from gym.spaces import Discrete, Box

import ray
from ray import tune
from ray.rllib.utils import try_import_tf
from ray.tune import grid_search, register_env
import or_gym

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
from or_gym.envs.classic_or.knapsack import KnapsackEnv
# from gym import spaces

In [101]:
class KnapsackEnv1(gym.Env):
    """Knapsack environment with a single 2-D integer observation.

    Each action is the index of an item to add to the knapsack. Nothing in
    this class marks an item as used, so the same index may be chosen again
    on a later step. An episode ends when the knapsack is exactly full or
    when the chosen item does not fit.

    The observation is a ``(2, N + 1)`` array: row 0 holds the item weights
    followed by ``max_weight``; row 1 holds the item values followed by the
    current knapsack weight.
    """

    def __init__(self, config=None):
        """Create the item set, the capacity and the gym spaces.

        Args:
            config: Accepted so the class can be built by env creators that
                pass a config object; it is not read.
        """
        # Reuse notebook-level ``weights``/``values`` when both are defined;
        # if either is missing, fall back to a random 200-item instance.
        try:
            self.item_weights = weights
            self.item_values = values
        except NameError:
            self.item_values = np.random.randint(30, size=200)
            self.item_weights = np.random.randint(1, 20, size=200)
        self.item_numbers = np.arange(len(self.item_weights))
        self.N = len(self.item_weights)
        self.max_weight = 200
        self.current_weight = 0

        # ``gym.spaces`` is used rather than a bare ``spaces`` name, which is
        # not imported by the visible cells and would raise NameError here.
        self.action_space = gym.spaces.Discrete(self.N)
        self.observation_space = gym.spaces.Box(
            0, self.max_weight, shape=(2, self.N + 1), dtype=np.int16)

        self.seed()
        self.reset()

    def step(self, item):
        """Try to add item ``item`` to the knapsack.

        Args:
            item: Index into ``item_weights`` / ``item_values``.

        Returns:
            Tuple ``(state, reward, done, info)``. ``reward`` is the item's
            value when it fits and 0 otherwise; ``done`` is True when the
            knapsack is exactly full or the item did not fit; ``info`` is an
            empty dict.
        """
        candidate_weight = self.item_weights[item] + self.current_weight
        if candidate_weight <= self.max_weight:
            self.current_weight = candidate_weight
            reward = self.item_values[item]
            # bool() keeps ``done`` a plain Python bool, as before.
            done = bool(self.current_weight == self.max_weight)
        else:
            # End trial if over weight; the item is not added.
            reward = 0
            done = True

        self._update_state()
        return self.state, reward, done, {}

    def _get_obs(self):
        """Return the most recently built observation array."""
        return self.state

    def _update_state(self):
        """Rebuild ``self.state`` from the items and the current weight."""
        items = np.vstack([
            self.item_weights,
            self.item_values
        ])
        totals = np.array([[self.max_weight],
                           [self.current_weight]])
        # Cast to the dtype declared by ``observation_space`` so the returned
        # observation agrees with the advertised space.
        self.state = np.hstack([items, totals]).astype(
            self.observation_space.dtype)

    def reset(self):
        """Empty the knapsack and return the initial observation."""
        self.current_weight = 0
        self._update_state()
        return self.state

    def sample_action(self):
        """Return a randomly chosen item index."""
        return np.random.choice(self.item_numbers)

In [3]:
# Instantiate the environment by id through gym's registry.
# NOTE(review): 'Knapsack-v0' is presumably registered as a side effect of
# `import or_gym` in the first cell — confirm.
env = gym.make('Knapsack-v0')

In [4]:
# Shape of the observation returned by reset(); the recorded output is (2, 201).
env.reset().shape

(2, 201)

In [86]:
# Scratch check of how gym renders Tuple spaces. The notebook displays only
# the last expression; the first Tuple is built and then discarded.
# ``gym.spaces`` is spelled out because no visible cell imports a bare
# ``spaces`` name, so the original spelling fails on a fresh kernel.
gym.spaces.Tuple((gym.spaces.Box(0, 200, shape=(1,)),
                  gym.spaces.Box(0, 200, shape=(1,))))
gym.spaces.Tuple((gym.spaces.Discrete(2),
                  gym.spaces.Discrete(2)))

Tuple(Discrete(2), Discrete(2))

In [97]:
# Shape check for the vstack/hstack layout: two length-10 rows stacked into a
# (2, 10) block, then one extra (2, 1) column appended on the right.
a = np.arange(10)
b = np.arange(10)
c = 11
d = 12
y = np.array([[c], [d]])
rows = np.vstack([a, b])
x = np.hstack([rows, np.array([[c], [d]])])
x.shape, y.shape

((2, 11), (2, 1))

In [3]:
def create_env(config_env):
    """Env creator: build and return a new ``KnapsackEnv``.

    ``config_env`` is accepted to fit the creator signature but is not read.
    """
    knapsack_env = KnapsackEnv()
    return knapsack_env

# Obtain TensorFlow through RLlib's import helper.
# NOTE(review): ``tf`` is not referenced by any other visible cell — confirm
# whether this binding is still needed.
tf = try_import_tf()

class CustomModel(TFModelV2):
    """Custom model whose forward pass and value head come from a wrapped
    ``FullyConnectedNetwork`` built with the same constructor arguments."""

    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name):
        """Initialise the base model, then build and register the wrapped net."""
        super().__init__(obs_space, action_space, num_outputs, model_config,
                         name)
        fc_net = FullyConnectedNetwork(
            obs_space, action_space, num_outputs, model_config, name)
        self.model = fc_net
        # Register the wrapped network's variables on this model.
        self.register_variables(fc_net.variables())

    def forward(self, input_dict, state, seq_lens):
        """Return whatever the wrapped network's ``forward`` returns."""
        result = self.model.forward(input_dict, state, seq_lens)
        return result

    def value_function(self):
        """Return the wrapped network's value-function output."""
        return self.model.value_function()

# Register the env creator with Tune under the name used in config["env"].
# The lambda only forwards its argument to create_env, so
# register_env("Knapsack-v0", create_env) would presumably be equivalent.
register_env("Knapsack-v0", lambda config: create_env(config))
# NOTE(review): ignore_reinit_error=True presumably lets this cell be re-run
# without ray.init() failing on an already-started Ray — confirm in Ray docs.
ray.init(ignore_reinit_error=True)
# Make CustomModel selectable through "custom_model": "my_model" below.
ModelCatalog.register_custom_model("my_model", CustomModel)
# NOTE(review): ``x`` rebinds the name an earlier scratch cell used for a
# numpy array; a later cell calls x.dataframe() on this result.
x = tune.run(
    "PPO",
    stop={
        # In the recorded run each iteration sampled 4000 steps and every
        # trial ended at 12000, i.e. on the first iteration past this limit.
        "timesteps_total": 10000,
    },
    config={
        "env": "Knapsack-v0",  # name registered with register_env above
        "model": {
            "custom_model": "my_model",
        },
        "env_config": {
            # create_env does not read its config argument, so this entry is
            # currently not consumed by the environment.
            "version": "Knapsack-v0"
        },
        "vf_share_layers": True,
        "lr": grid_search([1e-2, 1e-4, 1e-6]),  # one trial per learning rate
        "num_workers": 1,  # parallelism
    },
)

2020-03-17 18:53:55,174	INFO resource_spec.py:216 -- Starting Ray with 4.2 GiB memory available for workers and up to 2.11 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-03-17 18:53:55,748	INFO ray_trial_executor.py:121 -- Trial PPO_Knapsack-v0_8ff22748: Setting up new remote runner.


Trial name,status,loc,lr
PPO_Knapsack-v0_8ff22748,RUNNING,,
PPO_Knapsack-v0_8ff5199e,PENDING,,
PPO_Knapsack-v0_8ff71dde,PENDING,,


2020-03-17 18:53:55,916	INFO ray_trial_executor.py:121 -- Trial PPO_Knapsack-v0_8ff5199e: Setting up new remote runner.
2020-03-17 18:53:55,961	INFO ray_trial_executor.py:121 -- Trial PPO_Knapsack-v0_8ff71dde: Setting up new remote runner.


[2m[36m(pid=31969)[0m   _np_qint8 = np.dtype([("qint8", np.int8, 1)])
[2m[36m(pid=31969)[0m   _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
[2m[36m(pid=31969)[0m   _np_qint16 = np.dtype([("qint16", np.int16, 1)])
[2m[36m(pid=31969)[0m   _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
[2m[36m(pid=31969)[0m   _np_qint32 = np.dtype([("qint32", np.int32, 1)])
[2m[36m(pid=31969)[0m   np_resource = np.dtype([("resource", np.ubyte, 1)])
[2m[36m(pid=31968)[0m   _np_qint8 = np.dtype([("qint8", np.int8, 1)])
[2m[36m(pid=31968)[0m   _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
[2m[36m(pid=31968)[0m   _np_qint16 = np.dtype([("qint16", np.int16, 1)])
[2m[36m(pid=31968)[0m   _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
[2m[36m(pid=31968)[0m   _np_qint32 = np.dtype([("qint32", np.int32, 1)])
[2m[36m(pid=31968)[0m   np_resource = np.dtype([("resource", np.ubyte, 1)])
[2m[36m(pid=31972)[0m   _np_qint8 = np.dtype([("qint8", np.int8, 1)])
[2m[36m

[2m[36m(pid=31972)[0m   "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
[2m[36m(pid=31969)[0m   "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
[2m[36m(pid=32097)[0m   _np_qint8 = np.dtype([("qint8", np.int8, 1)])
[2m[36m(pid=32097)[0m   _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
[2m[36m(pid=32097)[0m   _np_qint16 = np.dtype([("qint16", np.int16, 1)])
[2m[36m(pid=32097)[0m   _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
[2m[36m(pid=32097)[0m   _np_qint32 = np.dtype([("qint32", np.int32, 1)])
[2m[36m(pid=32097)[0m   np_resource = np.dtype([("resource", np.ubyte, 1)])
[2m[36m(pid=32110)[0m   _np_qint8 = np.dtype([("qint8", np.int8, 1)])
[2m[36m(pid=32110)[0m   _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
[2m[36m(pid=32110)[0m   _np_qint16 = np.dtype([("qint16", np.int16, 1)])
[2m[36m(pid=32110)[0m   _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
[2m[36m(pid=32110)[0m   _np_qint32 = n

[2m[36m(pid=32097)[0m   "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
[2m[36m(pid=31974)[0m   "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
[2m[36m(pid=32110)[0m   "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
Result for PPO_Knapsack-v0_8ff71dde:
  custom_metrics: {}
  date: 2020-03-17_18-54-16
  done: false
  episode_len_mean: 21.116402116402117
  episode_reward_max: 386.0
  episode_reward_mean: 260.70899470899474
  episode_reward_min: 135.0
  episodes_this_iter: 189
  episodes_total: 189
  experiment_id: d3f0529a6d9245179c750a145ebbd525
  experiment_tag: 2_lr=1e-06
  hostname: ubuntu
  info:
    grad_time_ms: 8852.399
    learner:
      default_policy:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 9.999999974752427e-07
        entropy: 5.297819137573242
        entropy_coeff: 0.0
        kl: 0.0004841416666749865
        policy_loss: -0.007955948822200298
        total_loss: 20465.3046875


Trial name,status,loc,lr,iter,total time (s),timesteps,reward
PPO_Knapsack-v0_8ff22748,RUNNING,,,,,,
PPO_Knapsack-v0_8ff5199e,RUNNING,,,,,,
PPO_Knapsack-v0_8ff71dde,RUNNING,192.168.0.11:31968,1e-06,1.0,13.7145,4000.0,260.709


Result for PPO_Knapsack-v0_8ff22748:
  custom_metrics: {}
  date: 2020-03-17_18-54-16
  done: false
  episode_len_mean: 20.49230769230769
  episode_reward_max: 381.0
  episode_reward_mean: 252.48717948717947
  episode_reward_min: 131.0
  episodes_this_iter: 195
  episodes_total: 195
  experiment_id: 23b8c42222264ac39550581747d91c02
  experiment_tag: 0_lr=0.01
  hostname: ubuntu
  info:
    grad_time_ms: 8808.329
    learner:
      default_policy:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 0.009999999776482582
        entropy: 1.7222822904586792
        entropy_coeff: 0.0
        kl: 14.326783180236816
        policy_loss: 0.6111807823181152
        total_loss: 16856.98828125
        vf_explained_var: 3.845460838647341e-09
        vf_loss: 16853.51171875
    load_time_ms: 85.413
    num_steps_sampled: 4000
    num_steps_trained: 3968
    sample_time_ms: 4292.357
    update_time_ms: 345.843
  iterations_since_restore: 1
  node_ip: 192.168.0.11
  num_healthy_workers: 1
  of

Trial name,status,loc,lr,iter,total time (s),timesteps,reward
PPO_Knapsack-v0_8ff22748,RUNNING,192.168.0.11:31972,0.01,2,25.8691,8000,289.429
PPO_Knapsack-v0_8ff5199e,RUNNING,192.168.0.11:31969,0.0001,1,13.7246,4000,254.104
PPO_Knapsack-v0_8ff71dde,RUNNING,192.168.0.11:31968,1e-06,1,13.7145,4000,260.709


Result for PPO_Knapsack-v0_8ff71dde:
  custom_metrics: {}
  date: 2020-03-17_18-54-29
  done: false
  episode_len_mean: 20.880208333333332
  episode_reward_max: 509.0
  episode_reward_mean: 259.765625
  episode_reward_min: 128.0
  episodes_this_iter: 192
  episodes_total: 381
  experiment_id: d3f0529a6d9245179c750a145ebbd525
  experiment_tag: 2_lr=1e-06
  hostname: ubuntu
  info:
    grad_time_ms: 8758.365
    learner:
      default_policy:
        cur_kl_coeff: 0.10000000149011612
        cur_lr: 9.999999974752427e-07
        entropy: 5.296303749084473
        entropy_coeff: 0.0
        kl: 0.0009387807222083211
        policy_loss: -0.009477922692894936
        total_loss: 20355.326171875
        vf_explained_var: 0.00014808293781243265
        vf_loss: 20355.337890625
    load_time_ms: 53.949
    num_steps_sampled: 8000
    num_steps_trained: 7936
    sample_time_ms: 4054.715
    update_time_ms: 186.239
  iterations_since_restore: 2
  node_ip: 192.168.0.11
  num_healthy_workers: 1
 

Trial name,status,loc,lr,iter,total time (s),timesteps,reward
PPO_Knapsack-v0_8ff22748,TERMINATED,,0.01,3,39.7761,12000,346.351
PPO_Knapsack-v0_8ff5199e,RUNNING,192.168.0.11:31969,0.0001,2,26.1219,8000,280.251
PPO_Knapsack-v0_8ff71dde,RUNNING,192.168.0.11:31968,1e-06,2,26.1566,8000,259.766


Result for PPO_Knapsack-v0_8ff71dde:
  custom_metrics: {}
  date: 2020-03-17_18-54-43
  done: true
  episode_len_mean: 21.03157894736842
  episode_reward_max: 424.0
  episode_reward_mean: 263.0157894736842
  episode_reward_min: 144.0
  episodes_this_iter: 190
  episodes_total: 571
  experiment_id: d3f0529a6d9245179c750a145ebbd525
  experiment_tag: 2_lr=1e-06
  hostname: ubuntu
  info:
    grad_time_ms: 9255.684
    learner:
      default_policy:
        cur_kl_coeff: 0.05000000074505806
        cur_lr: 9.999999974752427e-07
        entropy: 5.293855667114258
        entropy_coeff: 0.0
        kl: 0.001201825449243188
        policy_loss: -0.013342607766389847
        total_loss: 21042.234375
        vf_explained_var: 7.19793388270773e-05
        vf_loss: 21042.248046875
    load_time_ms: 42.466
    num_steps_sampled: 12000
    num_steps_trained: 11904
    sample_time_ms: 3982.143
    update_time_ms: 125.677
  iterations_since_restore: 3
  node_ip: 192.168.0.11
  num_healthy_workers: 1


Trial name,status,loc,lr,iter,total time (s),timesteps,reward
PPO_Knapsack-v0_8ff22748,TERMINATED,,0.01,3,39.7761,12000,346.351
PPO_Knapsack-v0_8ff5199e,TERMINATED,,0.0001,3,40.1578,12000,309.028
PPO_Knapsack-v0_8ff71dde,TERMINATED,,1e-06,3,40.2784,12000,263.016


2020-03-17 18:54:43,580	INFO tune.py:334 -- Returning an analysis object by default. You can call `analysis.trials` to retrieve a list of trials. This message will be removed in future versions of Tune.


In [13]:
# Tabulate the analysis object returned by tune.run; the recorded output has
# one row per trial.
x.dataframe()

Unnamed: 0,episode_reward_max,episode_reward_min,episode_reward_mean,episode_len_mean,episodes_this_iter,timesteps_this_iter,done,timesteps_total,episodes_total,training_iteration,...,info/learner/default_policy/kl,info/learner/default_policy/entropy,info/learner/default_policy/entropy_coeff,config/env,config/env_config,config/lr,config/model,config/num_workers,config/vf_share_layers,logdir
0,351.0,177.0,258.828877,21.385027,187,4000,True,12000,571,3,...,0.197311,2.035856,0.0,Knapsack-v0,{'version': 'Knapsack-v0'},0.01,{'custom_model': 'my_model'},1,True,/home/christian/ray_results/PPO/PPO_Knapsack-v...
1,511.0,176.0,304.865922,22.418994,179,4000,True,12000,556,3,...,0.027375,5.177581,0.0,Knapsack-v0,{'version': 'Knapsack-v0'},0.0001,{'custom_model': 'my_model'},1,True,/home/christian/ray_results/PPO/PPO_Knapsack-v...
2,380.0,134.0,262.0,20.701031,194,4000,True,12000,582,3,...,0.001136,5.294332,0.0,Knapsack-v0,{'version': 'Knapsack-v0'},1e-06,{'custom_model': 'my_model'},1,True,/home/christian/ray_results/PPO/PPO_Knapsack-v...
