*Wrapper example*

In [None]:
import sys
from poker_env import PokerEnv
from agents.random_policy import RandomActions
from agents.heuristic_policy import HeuristicPolicy
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.algorithms.ppo import (
    PPOConfig,
    PPOTF1Policy,
    PPOTF2Policy,
    PPOTorchPolicy,
)
from gym import spaces
import mpu
import numpy as np
import ray
from ray.rllib.models import MODEL_DEFAULTS
from ray.rllib.policy.policy import PolicySpec
from ray.tune.registry import register_env

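If you want to modify the observation vectors provided by the environment, there are a couple of hoops you have to jump through: subclass `PokerEnv` and override both `set_empty_obs` and `get_obs`, keeping the two consistent for every policy.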

In [None]:
from typing import Dict, Optional

# The following three methods need to be overridden.
class wrapped_poker_env(PokerEnv):
    """PokerEnv subclass that hands each policy its own observation layout.

    The policy name returned by ``select_policy`` for an agent decides the layout:

    * name contains ``'Heuristic'``: ``{"hand": ..., "community": ...}``
    * name contains ``'SimpleHand'``: ``{"obs": ..., "state": ...}`` where ``obs``
      is the scored hand followed by all bets and every player's chips
    * anything else: ``{"obs": ..., "state": ...}`` where ``obs`` is the hand,
      the summed community cards, all bets and every player's chips
    """

    def __init__(self, select_policy, config: Optional[Dict] = None):
        """Pass ``select_policy`` and ``config`` straight through to ``PokerEnv``."""
        super().__init__(select_policy, config)

    def set_empty_obs(self):
        """Give every agent a zero-filled placeholder observation.

        Required to work around an RLlib bug. Each placeholder needs to be the
        same shape as the observation vector used by the agent's policy.
        """
        for player_id in self.players_ids:
            player = self.agents[player_id]

            if 'Heuristic' in self.select_policy(player.ID, 0):
                player.empty_obs = {"hand": np.zeros(24), "community": np.zeros(24)}
                continue

            # A new observation vector defined in this wrapper; its sizes are
            # listed in the same order get_obs concatenates the parts.
            if 'SimpleHand' in self.select_policy(player.ID, 0):
                player.empty_obs = {"obs": np.zeros(3+16+4), 'state': np.zeros(1)}
                continue

            player.empty_obs = {'obs': np.zeros(24+24+16+4), 'state': np.zeros(1)}

    def get_obs(self, agent):
        """Return the observation for ``agent`` in the layout its policy expects.

        The observation of every policy has to be handled here.
        """
        community_cards_state = np.sum(self.community_cards, axis=0)
        player = self.agents[agent]

        def with_table_state(prefix):
            # Append the flattened bets first, then the chips of every player.
            with_bets = np.concatenate([prefix, self.all_bets.reshape(-1)])
            chips = np.array([self.agents[pid].chips for pid in self.players_ids])
            return np.concatenate([with_bets, chips])

        if 'Heuristic' in self.select_policy(player.ID, 0):
            return {"hand": player.get_hand(), "community": community_cards_state}

        if 'SimpleHand' in self.select_policy(player.ID, 0):
            hand_score = np.array(self.score_hand(player.hand))
            return {"obs": with_table_state(hand_score), "state": np.array([0])}

        # The default observation
        full_view = np.concatenate([player.get_hand(), community_cards_state])
        return {"obs": with_table_state(full_view), "state": np.array([0])}

Set up training with this different observation space

In [None]:
def select_policy(agent_id, episode, **kwargs):
    """Map an agent id to the name of the policy that controls it.

    Ids 0, 1, 2 and 3 map to "learned", "SimpleHand", "Heuristic_10" and
    "Heuristic_100" respectively; any other id falls back to "Heuristic_1000".
    ``episode`` and extra keyword arguments are accepted but not used.
    """
    # Position in the tuple is the agent id the name belongs to.
    assigned_names = ("learned", "SimpleHand", "Heuristic_10", "Heuristic_100")
    for seat, policy_name in enumerate(assigned_names):
        if agent_id == seat:
            return policy_name
    return "Heuristic_1000"

def env_creator(config):
    """Build a wrapped_poker_env that uses select_policy with the given config."""
    return wrapped_poker_env(select_policy, config)

# Register the creator with RLlib under the environment name "poker".
register_env("poker", env_creator)

In [None]:
# Observation space of the heuristic policies: the agent's hand and the community cards.
heuristic_observation_space = spaces.Dict({
    "hand": spaces.Box(0, 1, shape=(24, )),
    "community": spaces.Box(0, 1, shape=(24, )),
})

# Observation space of the "SimpleHand" policy; the shapes match the placeholder
# built for that policy in wrapped_poker_env.set_empty_obs.
simple_hand_observation_space = spaces.Dict({
    "obs": spaces.Box(0, 400, shape=(3+16+4, )),
    "state": spaces.Box(0, 1, shape=(1, )),
})
action_space = spaces.Discrete(3)

# dict.update() returns None and mutates MODEL_DEFAULTS in place, so build a
# separate dict instead: RLlib's defaults with the network size/activation overridden.
model = {**MODEL_DEFAULTS, 'fcnet_hiddens': [512, 512], 'fcnet_activation': 'relu'}

config = (
    PPOConfig()
    .rollouts(num_rollout_workers=4, num_envs_per_worker=1)
    .training(train_batch_size=4000, gamma=0.99, model=model, lr=0.0004)
    .environment(disable_env_checking=True)
    .multi_agent(
        # NOTE(review): select_policy falls back to "Heuristic_1000" for agent ids
        # above 3, but no policy with that name is registered here - confirm the
        # environment never has more than four agents.
        policies={
            "random": PolicySpec(policy_class=RandomActions),
            "Heuristic_10": (HeuristicPolicy, heuristic_observation_space, action_space, {'difficulty': 0}),
            "Heuristic_100": (HeuristicPolicy, heuristic_observation_space, action_space, {'difficulty': 1}),
            "SimpleHand": (PPOTorchPolicy, simple_hand_observation_space, action_space, {}),
            "learned": PolicySpec(
                config={}
            ),
        },
        policy_mapping_fn=select_policy,
        # Notice how this will train both the normal obs and simple obs using PPO.
        policies_to_train=['learned', 'SimpleHand'],
    )
    .resources(num_gpus=0)
    .framework('torch')
)
trainer = config.build(env="poker")


In [None]:
#!tensorboard --logdir=~/ray_results --host 0.0.0.0

In [None]:
# Run 1000 training iterations; the value returned by each train() call is not kept.
for iteration in range(1000):
    trainer.train()

In [None]:
# Save the trainer, passing "checkpoint/ppo_poker" as the checkpoint location.
trainer.save("checkpoint/ppo_poker")