*Training Poker Using RLlib*

In [1]:
import sys
from poker_env import PokerEnv
from agents.random_policy import RandomActions
from agents.heuristic_policy import HeuristicPolicy
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.algorithms.a3c import A3C
from ray.rllib.algorithms.sac import SAC
from ray.rllib.algorithms.dqn import DQN
from gym import spaces
import mpu
import numpy as np
import ray
from ray.rllib.models import MODEL_DEFAULTS
from ray.rllib.policy.policy import PolicySpec
from ray.tune.registry import register_env

  'nearest': pil_image.NEAREST,
  'bilinear': pil_image.BILINEAR,
  'bicubic': pil_image.BICUBIC,
  if hasattr(pil_image, 'HAMMING'):
  if hasattr(pil_image, 'BOX'):
  if hasattr(pil_image, 'LANCZOS'):
  from collections import Iterable


In RLlib, a policy-mapping function must be supplied to map each agent ID to the policy that agent should use. We also create and register the learning environment.

In [2]:
def select_policy(agent_id, episode, **kwargs):
    """Return the name of the policy that controls ``agent_id``.

    Agent IDs 0, 1, 2 and 3 map to "a3c", "sac", "dqn" and "ppo"
    respectively; every other ID falls back to "learned4".
    ``episode`` and ``**kwargs`` are accepted but not used.
    """
    known_assignments = (
        (0, "a3c"),
        (1, "sac"),
        (2, "dqn"),
        (3, "ppo"),
    )
    for known_id, policy_name in known_assignments:
        if agent_id == known_id:
            return policy_name
    return "learned4"

def env_creator(config):
    """Build a ``PokerEnv``, passing ``select_policy`` and ``config`` to its constructor."""
    return PokerEnv(select_policy, config)

# Register the environment factory under the name "poker" so it can be
# referred to by that string when building the trainer.
register_env("poker", env_creator)

  and should_run_async(code)


The config describes all aspects of the training. A full list of the parameters is found here: https://github.com/ray-project/ray/blob/master/rllib/algorithms/algorithm_config.py 

In this example, we have used a default config that runs the PPO algorithm. Other heuristic agents have been defined that will play the game with the single learning agent. 

In [3]:
# Dict observation space with two Box entries, "hand" and "community",
# each a 24-element vector bounded to [0, 1].
heuristic_observation_space = spaces.Dict({
    "hand": spaces.Box(0, 1, shape=(24,)),
    "community": spaces.Box(0, 1, shape=(24,)),
})
# Three discrete actions; their meaning is defined by the environment.
action_space = spaces.Discrete(3)

# Defines the learning model's architecture.
# dict.update() mutates in place and returns None, so assigning its result
# would leave `model` as None (and silently alter RLlib's global defaults).
# Build a new dict that layers our overrides on top of the defaults instead.
model = {
    **MODEL_DEFAULTS,
    "fcnet_hiddens": [512, 512],
    "fcnet_activation": "relu",
}

config = (
    PPOConfig()
    # Each rollout worker uses a single CPU.
    .rollouts(num_rollout_workers=8, num_envs_per_worker=1)
    .training(train_batch_size=4000, gamma=0.99, model=model, lr=0.0004)
    .environment(disable_env_checking=True)
    .multi_agent(
        policies={
            # Each spec below carries the default config of another algorithm.
            # NOTE(review): an earlier comment here said these policies do not
            # learn, but all of them are listed in `policies_to_train` below;
            # confirm which behaviour is intended.
            "a3c": PolicySpec(config=A3C.get_default_config()),
            "sac": PolicySpec(config=SAC.get_default_config()),
            "dqn": PolicySpec(config=DQN.get_default_config()),
            # An empty override config; presumably this makes the agent fall
            # back to the default PPO policy (verify against RLlib docs).
            "ppo": PolicySpec(config={}),
        },
        policy_mapping_fn=select_policy,
        policies_to_train=["a3c", "sac", "dqn", "ppo"],
    )
    .resources(num_gpus=0)
    .framework("torch")
)
trainer = config.build(env="poker")

2022-11-10 13:56:45,642	INFO worker.py:1518 -- Started a local Ray instance.
[2m[36m(pid=40758)[0m   import imp
[2m[36m(pid=40759)[0m   import imp
[2m[36m(pid=40764)[0m   import imp
[2m[36m(pid=40763)[0m   import imp
[2m[36m(pid=40762)[0m   import imp
[2m[36m(pid=40760)[0m   import imp
[2m[36m(pid=40757)[0m   import imp
[2m[36m(pid=40761)[0m   import imp
[2m[36m(pid=40758)[0m   'nearest': pil_image.NEAREST,
[2m[36m(pid=40758)[0m   'bilinear': pil_image.BILINEAR,
[2m[36m(pid=40758)[0m   'bicubic': pil_image.BICUBIC,
[2m[36m(pid=40758)[0m   if hasattr(pil_image, 'HAMMING'):
[2m[36m(pid=40758)[0m   if hasattr(pil_image, 'BOX'):
[2m[36m(pid=40758)[0m   if hasattr(pil_image, 'LANCZOS'):
[2m[36m(pid=40763)[0m   'nearest': pil_image.NEAREST,
[2m[36m(pid=40763)[0m   'bilinear': pil_image.BILINEAR,
[2m[36m(pid=40763)[0m   'bicubic': pil_image.BICUBIC,
[2m[36m(pid=40763)[0m   if hasattr(pil_image, 'HAMMING'):
[2m[36m(pid=40763)[0m   if has

In [4]:
# Start up TensorBoard (uncomment the line below to launch it)
#!tensorboard --logdir=~/ray_results --host 0.0.0.0

Training loop: each iteration rolls out x timesteps (where x is train_batch_size). A weight update is then applied using the rollout data.

In [5]:
# Number of times trainer.train() is called; lower this for a quick smoke run.
NUM_TRAINING_ITERATIONS = 1000

for i in range(NUM_TRAINING_ITERATIONS):
    trainer.train()



RayTaskError(TypeError): [36mray::RolloutWorker.sample()[39m (pid=40760, ip=127.0.0.1, repr=<ray.rllib.evaluation.rollout_worker.RolloutWorker object at 0x7fd559daa880>)
  File "/Users/adamprice/Applications/anaconda3/envs/stocktake/lib/python3.8/site-packages/ray/rllib/evaluation/rollout_worker.py", line 810, in sample
    batches = [self.input_reader.next()]
  File "/Users/adamprice/Applications/anaconda3/envs/stocktake/lib/python3.8/site-packages/ray/rllib/evaluation/sampler.py", line 92, in next
    batches = [self.get_data()]
  File "/Users/adamprice/Applications/anaconda3/envs/stocktake/lib/python3.8/site-packages/ray/rllib/evaluation/sampler.py", line 282, in get_data
    item = next(self._env_runner)
  File "/Users/adamprice/Applications/anaconda3/envs/stocktake/lib/python3.8/site-packages/ray/rllib/evaluation/sampler.py", line 734, in _env_runner
    base_env.send_actions(actions_to_send)
  File "/Users/adamprice/Applications/anaconda3/envs/stocktake/lib/python3.8/site-packages/ray/rllib/env/multi_agent_env.py", line 572, in send_actions
    raise e
  File "/Users/adamprice/Applications/anaconda3/envs/stocktake/lib/python3.8/site-packages/ray/rllib/env/multi_agent_env.py", line 565, in send_actions
    obs, rewards, dones, infos = env.step(agent_dict)
  File "/Users/adamprice/Desktop/Poker/poker_env.py", line 186, in step
    self.progress_game_step()
  File "/Users/adamprice/Desktop/Poker/poker_env.py", line 292, in progress_game_step
    self.showdown()
  File "/Users/adamprice/Desktop/Poker/poker_env.py", line 308, in showdown
    self.agents[w].reward_buffer = int(self.pot_size / winners.shape[0]) - self.agents[winners].game_bet
TypeError: unhashable type: 'numpy.ndarray'

In [None]:
# Save a checkpoint of the trainer under the directory named below.
checkpoint_dir = "checkpoint/ppo_poker"
trainer.save(checkpoint_dir)