In [1]:
import tensorflow.compat.v1 as tf

from open_spiel.python import policy
from open_spiel.python import rl_environment
from open_spiel.python.algorithms import eva
from open_spiel.python.algorithms import exploitability
import pyspiel

class JointPolicy(policy.Policy):
  """Wraps one agent per player into a single policy object for evaluation."""

  def __init__(self, env, agents):
    """Initializes the joint policy.

    Args:
      env: Environment whose `game` attribute is handed to `policy.Policy`.
      agents: Sequence of agents indexed by player id; each must provide an
        `action_probabilities(state)` method.
    """
    super().__init__(env.game, list(range(len(agents))))
    self._agents = agents

  def action_probabilities(self, state, player_id=None):
    """Returns `{action: probability}` for the player to move at `state`.

    Args:
      state: Game state to query.
      player_id: Accepted for interface compatibility but not used; the
        acting player is always taken from `state.current_player()`.

    Returns:
      Dict restricted to the acting player's legal actions, with values looked
      up in the output of that player's agent.
    """
    acting_player = state.current_player()
    allowed = state.legal_actions(acting_player)
    agent_probs = self._agents[acting_player].action_probabilities(state)
    # Keep only legal actions; the agent's output may cover more than those.
    return {move: agent_probs[move] for move in allowed}


def tf_main(game_name, num_episodes, eval_every=1000):
  """Trains EVA agents by self-play and tracks NashConv of their joint policy.

  One `eva.EVAAgent` is created per player of a two-player `game_name`
  environment. The agents play `num_episodes` episodes against each other and
  the NashConv of their joint policy is measured periodically.

  Every call builds its networks inside a fresh `tf.Graph`, so invoking this
  function several times from the same kernel does not keep adding ops and
  variables to the process-wide default graph.

  Args:
    game_name: Name of the game, passed to `rl_environment.Environment`.
    num_episodes: Number of self-play episodes to run.
    eval_every: Positive number of episodes between NashConv measurements.

  Returns:
    List of the measured NashConv values in chronological order.

  Raises:
    ValueError: If `eval_every` is smaller than 1.
  """
  if eval_every < 1:
    raise ValueError("eval_every must be >= 1, got %s" % eval_every)

  env_configs = {"players": 2}
  env = rl_environment.Environment(game_name, **env_configs)
  num_players = env.num_players
  num_actions = env.action_spec()["num_actions"]
  state_size = env.observation_spec()["info_state"][0]

  result = []
  # A dedicated graph per call keeps repeated runs independent of each other:
  # the session below and the variable initializer only see this run's ops.
  with tf.Graph().as_default(), tf.Session() as sess:
    eva_agents = [
        eva.EVAAgent(
            sess,
            env,
            player,
            state_size,
            num_actions,
            batch_size=128,
            learning_rate=0.01,
            mixing_parameter=0.5,
            memory_capacity=int(1e6),
            discount_factor=1.0,
            update_target_network_every=1000,
            epsilon_start=0.06,
            epsilon_end=0.001,
            epsilon_decay_duration=int(1e6))
        for player in range(num_players)
    ]

    joint_policy = JointPolicy(env, eva_agents)
    sess.run(tf.global_variables_initializer())

    for episode in range(num_episodes):
      # The measurement happens before the episode with this 1-based index is
      # played, so it reflects the agents after `episode` completed episodes.
      if (episode + 1) % eval_every == 0:
        conv = exploitability.nash_conv(env.game, joint_policy)
        result.append(conv)
        print("Episode:%s - NashConv: %s" %(episode+1, conv))

      time_step = env.reset()
      while not time_step.last():
        current_player = time_step.observations["current_player"]
        current_agent = eva_agents[current_player]
        step_out = current_agent.step(time_step)
        time_step = env.step([step_out.action])

      # Every agent is stepped once more with the terminal time step.
      for agent in eva_agents:
        agent.step(time_step)

  return result


Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
# Run the Leduc poker experiment and collect one NashConv curve per run.
tf_result = []
for _ in range(1):
    result = tf_main(game_name='leduc_poker', num_episodes=10000)
    print(result)
    tf_result += [result]

Episode:1000 - NashConv: 4.720580705953815
Episode:2000 - NashConv: 4.749483997465015
Episode:3000 - NashConv: 4.762153759840439
Episode:4000 - NashConv: 4.614934653791604
Episode:5000 - NashConv: 4.694948923098359
Episode:6000 - NashConv: 4.58856208386835
Episode:7000 - NashConv: 4.500648809481751
Episode:8000 - NashConv: 4.36771227475684
Episode:9000 - NashConv: 4.4433707451544375
Episode:10000 - NashConv: 4.453912521985062
[4.720580705953815, 4.749483997465015, 4.762153759840439, 4.614934653791604, 4.694948923098359, 4.58856208386835, 4.500648809481751, 4.36771227475684, 4.4433707451544375, 4.453912521985062]


In [None]:
# Repeat of the Leduc poker experiment; note that this rebinds `tf_result`,
# discarding whatever an earlier cell stored under that name.
tf_result = []
for _ in range(1):
    result = tf_main(game_name='leduc_poker', num_episodes=10000)
    print(result)
    tf_result += [result]

Episode:1000 - NashConv: 4.775832579685689
Episode:2000 - NashConv: 4.990296873522524
