In [1]:
import pyspiel
import numpy as np
import matplotlib.pyplot as plt
from open_spiel.python.egt import dynamics
from open_spiel.python.egt.utils import game_payoffs_array
from open_spiel.python import rl_environment
from open_spiel.python import rl_tools
from open_spiel.python.algorithms import tabular_qlearner, boltzmann_tabular_qlearner
print("")

# Candidate two-player matrix games. Each call passes, in order: a short
# name, a long name, the row player's action labels, the column player's
# action labels, and then one payoff table per player (argument roles per
# the pyspiel create_matrix_game API).

# Rock-paper-scissors with uneven stakes; the second table is the exact
# negation of the first, so the game is zero-sum.
biased_rock_paper_scissors = pyspiel.create_matrix_game(
    "brps",
    "biased_rock_paper_scissors",
    ["R", "P", "S"],
    ["R", "P", "S"],
    [[0, -0.25, 0.5], [0.25, 0, -0.05], [-0.5, 0.05, 0]],
    [[0, 0.25, -0.5], [-0.25, 0, 0.05], [0.5, -0.05, 0]],
)

# Both players receive the same payoff: +1 when they pick different
# actions, -1 when they pick the same one.
dispersion = pyspiel.create_matrix_game(
    "d",
    "dispersion",
    ["A", "B"],
    ["A", "B"],
    [[-1, 1], [1, -1]],
    [[-1, 1], [1, -1]],
)

# Coordination game: both matching outcomes pay off, but each player
# prefers a different one.
battle_of_the_sexes = pyspiel.create_matrix_game(
    "bots",
    "battle_of_the_sexes",
    ["O", "M"],
    ["O", "M"],
    [[3, 0], [0, 2]],
    [[2, 0], [0, 3]],
)

# Classic prisoner's dilemma with actions C and D.
prisoner_dilemma = pyspiel.create_matrix_game(
    "pd",
    "prisoner_dilemma",
    ["C", "D"],
    ["C", "D"],
    [[-1, -4], [0, -3]],
    [[-1, 0], [-4, -3]],
)

# Select the game the rest of the notebook runs on.
game = battle_of_the_sexes
print(game)

# Payoff tensor of the selected game (not consumed by the cells shown here).
payoff_matrix = game_payoffs_array(game)


# Create the environment
env = rl_environment.Environment(game)
num_players = env.num_players
num_actions = env.action_spec()["num_actions"]
episodes = 20000

# Single exploration rate used BOTH by the learners and by the policy
# snapshots recorded below. Previously the agents were constructed with
# QLearner's default exploration schedule (0.2 in current OpenSpiel -- verify
# against the installed version) while the snapshots were computed with a
# hard-coded 0.1, so the logged trajectory did not describe the policy that
# was actually being played.
EPSILON = 0.1

# Seed NumPy's global RNG so that Restart & Run All reproduces the same run.
# NOTE(review): assumes QLearner samples its actions through np.random --
# confirm against the installed OpenSpiel version.
SEED = 0
np.random.seed(SEED)

# Create the agents
agents = [
    tabular_qlearner.QLearner(
        player_id=idx,
        num_actions=num_actions,
        epsilon_schedule=rl_tools.ConstantSchedule(EPSILON),
    )
    for idx in range(num_players)
]


def _prob_of_first_action(agent, time_step):
  """Return the probability `agent` currently assigns to action index 0.

  The info state is stringified and the legal actions are read from
  `time_step` for the agent's own player id, then handed to the agent's
  `_get_action_probs` together with the shared EPSILON. The second element
  of that call's result is indexed at 0, exactly as the original inline
  expression did.
  """
  player = agent._player_id
  info_state = str(time_step.observations["info_state"][player])
  legal_actions = time_step.observations["legal_actions"][player]
  _, probs = agent._get_action_probs(info_state, legal_actions, EPSILON)
  return probs[0]


# One policy snapshot per episode and player. NaN-filled (instead of
# np.empty) so an episode that never records a value is visible as a gap
# rather than as uninitialised memory.
traj1 = np.full(episodes, np.nan)
traj2 = np.full(episodes, np.nan)

for cur_episode in range(episodes):
  if cur_episode % 1000 == 0:
    print(f"Episodes: {cur_episode}")
  time_step = env.reset()
  while not time_step.last():
    # Snapshot both policies before the agents act on this time step.
    traj1[cur_episode] = _prob_of_first_action(agents[0], time_step)
    traj2[cur_episode] = _prob_of_first_action(agents[1], time_step)
    # Agents are stepped in player order and their actions applied jointly.
    actions = [agent.step(time_step).action for agent in agents]
    time_step = env.step(actions)
  # Episode is over, step all agents with final info state.
  for agent in agents:
    agent.step(time_step)
print("Done!")





bots()
Episodes: 0
Episodes: 1000
Episodes: 2000
Episodes: 3000
Episodes: 4000
Episodes: 5000
Episodes: 6000
Episodes: 7000
Episodes: 8000
Episodes: 9000
Episodes: 10000
Episodes: 11000
Episodes: 12000
Episodes: 13000
Episodes: 14000
Episodes: 15000
Episodes: 16000
Episodes: 17000
Episodes: 18000
Episodes: 19000
Done!


In [None]:
# Print player 0's current action probabilities at the start of a fresh
# episode, then plot the policy trajectory recorded during training.
time_step = env.reset()
info_state = str(time_step.observations["info_state"][agents[0]._player_id])
legal_actions = time_step.observations["legal_actions"][agents[0]._player_id]

# Same epsilon as the one used when traj1/traj2 were recorded, so the printed
# probabilities are comparable with the plotted ones.
snapshot_epsilon = 0.1
print(agents[0]._get_action_probs(info_state, legal_actions, snapshot_epsilon))

# traj1 / traj2 hold, per episode, the probability each player assigned to
# its action with index 0; plotting one against the other traces the joint
# policy over the course of training.
fig, ax = plt.subplots()
ax.plot(traj1, traj2)
ax.set_xlabel("Player 0: probability of action 0")
ax.set_ylabel("Player 1: probability of action 0")
ax.set_title("Q-learner policy trajectory over training episodes")
plt.show()