Code for loading a trained policy and turning it into a static agent.

Note: this is a bit of a WIP and currently only works with torch. If you want to implement adversarial or league learning, this might be a good starting point, but it is not a full implementation.

In [1]:
import sys
from poker_env import PokerEnv
from agents.random_policy import RandomActions
from agents.heuristic_policy import HeuristicPolicy
from ray.rllib.algorithms.ppo import PPOConfig
from gym import spaces
import mpu
import numpy as np
import ray
from ray.rllib.models import MODEL_DEFAULTS
from ray.rllib.policy.policy import PolicySpec
from ray.tune.registry import register_env
import tensorflow as tf
import torch

  import distutils.spawn


Load the config that was used to train the policy you want to extract.

In [2]:
def select_policy(agent_id, episode, **kwargs):
    """Return the name of the policy assigned to ``agent_id``.

    Ids 0-3 have explicit assignments; every other id falls back to
    "Heuristic_1000". ``episode`` and ``kwargs`` are accepted but not used.
    """
    assignments = (
        (0, "learned"),
        (1, "Heuristic_10"),
        (2, "Heuristic_100"),
        (3, "Heuristic_1000"),
    )
    for known_id, policy_name in assignments:
        if agent_id == known_id:
            return policy_name
    # Unlisted agent ids share the policy of agent 3.
    return "Heuristic_1000"

def env_creator(config):
    """Build a PokerEnv from ``config``, passing ``select_policy`` as its first argument."""
    return PokerEnv(select_policy, config)

# Register the env factory under the name "poker"; config.build(env="poker")
# refers to the environment by this name.
register_env("poker", lambda config: env_creator(config))

In [3]:
# NOTE(review): presumably this has to match the config that was used to train
# the checkpoint being restored -- confirm.

heuristic_observation_space = spaces.Dict({
    "hand": spaces.Box(0, 1, shape=(24, )),
    "community": spaces.Box(0, 1, shape=(24, )),
})
action_space = spaces.Discrete(3)

# Defines the learning model's architecture. Build a new dict on top of the
# defaults: dict.update() returns None (so `model` would end up being None)
# and would also modify the shared MODEL_DEFAULTS dict in place.
model = {
    **MODEL_DEFAULTS,
    'fcnet_hiddens': [512, 512],
    'fcnet_activation': 'relu',
}

config = (
    PPOConfig()
    # Each rollout worker uses a single cpu
    .rollouts(num_rollout_workers=2, num_envs_per_worker=1)
    .training(train_batch_size=4000, gamma=0.99, model=model, lr=0.0004)
    .environment(disable_env_checking=True)
    .multi_agent(
        policies={
            # These entries use pre-defined policies that do not learn.
            "random": PolicySpec(policy_class=RandomActions),
            "Heuristic_10": (HeuristicPolicy, heuristic_observation_space, action_space, {'difficulty': 0}),
            "Heuristic_100": (HeuristicPolicy, heuristic_observation_space, action_space, {'difficulty': 1}),
            "Heuristic_1000": (HeuristicPolicy, heuristic_observation_space, action_space, {'difficulty': 2}),
            # Passing no policy class makes this agent default to a PPO policy.
            "learned": PolicySpec(
                config={}
            ),
        },
        policy_mapping_fn=select_policy,
        # Only the "learned" policy is trained.
        policies_to_train=['learned'],
    )
    .resources(num_gpus=0)
    .framework('torch')
)
trainer = config.build(env="poker")


2022-11-10 14:15:56,095	INFO worker.py:1528 -- Started a local Ray instance.
[2m[36m(pid=35091)[0m   import distutils.spawn
[2m[36m(pid=35093)[0m   import distutils.spawn
2022-11-10 14:16:05,776	INFO trainable.py:164 -- Trainable.setup took 11.560 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


In [4]:
# Restore the trainer's state from the checkpoint stored at this path.
trainer.restore('checkpoint/ppo_poker/checkpoint_000010')

2022-11-10 14:16:05,819	INFO trainable.py:766 -- Restored on 172.31.11.130 from checkpoint: checkpoint/ppo_poker/checkpoint_000010
2022-11-10 14:16:05,820	INFO trainable.py:775 -- Current state after restoring: {'_iteration': 10, '_timesteps_total': None, '_time_total': 69.48709058761597, '_episodes_total': 4117}


In [5]:
# Get the trained 'learned' policy and export its model to models/ppo_agent
# (later cells load models/ppo_agent/model.pt).
trainer.get_policy('learned').export_model('models/ppo_agent')

In [6]:
# Sanity check: load the exported model and query it with an all-zero
# observation. The model was exported after restoring
# checkpoint/ppo_poker/checkpoint_000010.
model = torch.load('models/ppo_agent/model.pt')
model.eval()
# One row of 69 float64 zeros.
# NOTE(review): 69 is presumably the flattened "obs" (68) plus "state" (1) -- confirm.
dic = {'obs': torch.zeros((1, 69), dtype=torch.float64)}
empty_state = [torch.zeros(0, dtype=torch.float64)]
seq_lens = torch.zeros(1, dtype=torch.float64)
logits = model(dic, empty_state, seq_lens)[0]
# Keep this as the last expression so the notebook displays the chosen action.
torch.argmax(logits)

tensor(2)

Now set up the training env, using the pre-trained agent as a static agent in the environment.

In [7]:
from ray.rllib.policy.policy import Policy
from ray.rllib.utils.spaces.space_utils import (
    get_base_struct_from_space)
from gym import spaces

class TrainedPolicyAgent(Policy):
    """Static, non-learning policy that acts greedily with an exported torch model.

    The model is read from ``models/ppo_agent/model.pt`` and put into eval
    mode once at construction time; this class never updates it. Only torch
    models are supported.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``Policy`` and load the exported model."""
        super().__init__(*args, **kwargs)
        self.model = torch.load('models/ppo_agent/model.pt')
        self.model.eval()

    def get_initial_state(self):
        """Return the initial state list: a single placeholder ``0``."""
        # NOTE(review): compute_actions returns no state outputs; confirm that
        # a non-empty initial state is really wanted here.
        return [0]

    def compute_actions(
            self,
            obs_batch,
            state_batches=None,
            prev_action_batch=None,
            prev_reward_batch=None,
            info_batch=None,
            episodes=None,
            **kwargs
        ):
        """Return the greedy (argmax) action for every observation in the batch.

        Args:
            obs_batch: Array-like batch of observations. The first axis is
                treated as the batch axis and all remaining axes are
                flattened; a single 1-D observation is treated as a batch
                of one.
            state_batches, prev_action_batch, prev_reward_batch, info_batch,
            episodes, **kwargs: Accepted for interface compatibility; unused.

        Returns:
            A tuple ``(actions, [], {})`` where ``actions`` is a list holding
            one integer action per observation.
        """
        obs = np.asarray(obs_batch)
        if obs.size == 0:
            # Nothing to act on: return an empty action batch.
            return [], [], {}

        # Keep one row per observation. Reshaping to (1, -1) would merge a
        # multi-observation batch into a single, wrongly sized row.
        batch_size = obs.shape[0] if obs.ndim > 1 else 1
        flat_obs = torch.tensor(obs.reshape(batch_size, -1))

        # Placeholder state and sequence lengths.
        # NOTE(review): presumably ignored by a feed-forward model -- confirm
        # before using this class with a recurrent model.
        state = [torch.tensor(np.zeros(0))]
        seq_lens = torch.tensor(np.zeros(batch_size))

        # Inference only, so no autograd bookkeeping is needed.
        with torch.no_grad():
            logits = self.model({'obs': flat_obs}, state, seq_lens)[0]

        actions = torch.argmax(logits.reshape(batch_size, -1), dim=-1)
        return actions.tolist(), [], {}

    def get_weights(self):
        """Return ``None``: this policy exposes no weights."""
        return None

    def set_weights(self, weights):
        """Ignore ``weights``: the loaded model is never replaced."""
        return None


In [8]:

def select_policy_new_agent(agent_id, episode, **kwargs):
    """Return the name of the policy assigned to ``agent_id``.

    Ids 0-3 have explicit assignments (id 3 maps to "PPO_Agent"); every
    other id falls back to "Heuristic_1000". ``episode`` and ``kwargs`` are
    accepted but not used.
    """
    assignments = (
        (0, "learned"),
        (1, "Heuristic_10"),
        (2, "Heuristic_100"),
        (3, "PPO_Agent"),
    )
    for known_id, policy_name in assignments:
        if agent_id == known_id:
            return policy_name
    return "Heuristic_1000"

# TODO: this redefines the env_creator from the earlier cell.
def env_creator(config):
    """Build a PokerEnv from ``config``, passing ``select_policy_new_agent`` as its first argument."""
    return PokerEnv(select_policy_new_agent, config)

    
# Register the env factory under the name "poker" again; the same name was
# already registered in an earlier cell.
register_env("poker", lambda config: env_creator(config))

# Observation space declared for the static, pre-trained PPO agent.
PPO_Agent_observation_space = spaces.Dict({
    "obs": spaces.Box(0, 400, shape=(24+24+16+4, )),
    "state": spaces.Box(0, 1, shape=(1, )),
})

heuristic_observation_space = spaces.Dict({
    "hand": spaces.Box(0, 1, shape=(24, )),
    "community": spaces.Box(0, 1, shape=(24, )),
})

action_space = spaces.Discrete(3)

# Build a new dict on top of the defaults: dict.update() returns None (so
# `model` would end up being None) and would also modify the shared
# MODEL_DEFAULTS dict in place.
model = {
    **MODEL_DEFAULTS,
    'fcnet_hiddens': [512, 512],
    'fcnet_activation': 'relu',
}

config = (
    PPOConfig()
    .rollouts(num_rollout_workers=1, num_envs_per_worker=1)
    .training(train_batch_size=4000, gamma=0.99, model=model, lr=0.0004)
    .environment(disable_env_checking=True)
    .multi_agent(
        # NOTE(review): select_policy_new_agent falls back to "Heuristic_1000"
        # for agent ids other than 0-3, but that policy is not registered
        # here -- fine only if such ids never occur; confirm.
        policies={
            "random": PolicySpec(policy_class=RandomActions),
            "Heuristic_10": (HeuristicPolicy, heuristic_observation_space, action_space, {'difficulty': 0}),
            "Heuristic_100": (HeuristicPolicy, heuristic_observation_space, action_space, {'difficulty': 1}),
            # Static agent backed by the previously exported model.
            "PPO_Agent": (TrainedPolicyAgent, PPO_Agent_observation_space, action_space, {}),
            "learned": PolicySpec(
                config={}
            ),
        },
        policy_mapping_fn=select_policy_new_agent,
        # Only the "learned" policy is trained.
        policies_to_train=['learned'],
    )
    .resources(num_gpus=0)
    .framework('torch')
)
trainer = config.build(env="poker")


[2m[36m(pid=35221)[0m   import distutils.spawn


In [9]:
# Run 10 training iterations; the value returned by trainer.train() is discarded.
for i in range(10):
    trainer.train()



In [10]:

# Save a checkpoint under this directory; the notebook displays the returned
# checkpoint path.
trainer.save("checkpoint/ppo_poker_checkpoint_test")



'checkpoint/ppo_poker_checkpoint_test/checkpoint_000010'