In [1]:
import numpy as np
import pyspiel 
import tensorflow.compat.v1 as tf
import torch 
import torch.nn as nn

import algorithms.rcfr as rcfr_tf
import pytorch.rcfr as rcfr_pt
# Put the `tensorflow.compat.v1` API into TF1-style behaviour first ...
tf.disable_v2_behavior()

# ... then turn eager execution back on. The order of these two calls is
# deliberate: `_train_fn` in the TensorFlow example iterates a dataset inside a
# `tf.function`.
# NOTE(review): presumably `rcfr_tf` also relies on eager mode — confirm.
tf.enable_eager_execution()

# Module-level Kuhn poker game. Only `_new_model_tf` reads it; the two example
# functions load their own game object.
_GAME = pyspiel.load_game('kuhn_poker')
# NOTE(review): not referenced by any cell visible in this notebook — confirm
# whether it is still needed.
_BATCH_SIZE = 12

Instructions for updating:
non-resource variables are not supported in the long term


In [8]:
def _new_model_tf():
  """Builds a TensorFlow `DeepRcfrModel` for the module-level `_GAME`.

  Returns:
    An `rcfr_tf.DeepRcfrModel` constructed with `num_hidden_layers=1`,
    `num_hidden_units=13`, `num_hidden_factors=1` and
    `use_skip_connections=True`.
  """
  architecture = dict(
      num_hidden_layers=1,
      num_hidden_units=13,
      num_hidden_factors=1,
      use_skip_connections=True,
  )
  return rcfr_tf.DeepRcfrModel(_GAME, **architecture)

In [23]:
def tnsorflow_example(game_name='kuhn_poker',
                      iterations=100,
                      report_every=10,
                      buffer_size=-1,
                      truncate_negative=False,
                      bootstrap=False,
                      batch_size=100,
                      num_epochs=200,
                      step_size=0.01):
  """Runs RCFR with the TensorFlow implementation and prints exploitability.

  Called with no arguments this reproduces the original hard-coded run
  (Kuhn poker, 100 iterations, a report every 10 iterations).

  NOTE: the function name is misspelled ("tnsorflow"); it is kept as is
  because the notebook calls it under this name.

  Args:
    game_name: Name passed to `pyspiel.load_game`.
    iterations: Number of `evaluate_and_update_policy` calls to make.
    report_every: Print exploitability whenever the iteration index is a
      multiple of this value. A falsy value (0 or None) disables reporting.
    buffer_size: If positive, a `ReservoirRcfrSolver` with this buffer size is
      used; otherwise a plain `RcfrSolver` is used.
    truncate_negative: Forwarded to the solver constructor.
    bootstrap: Forwarded to `RcfrSolver` only; it is not passed to
      `ReservoirRcfrSolver`, so it has no effect when `buffer_size > 0`.
    batch_size: Mini-batch size; the shuffle buffer is `batch_size * 10`.
    num_epochs: Number of times the batched dataset is repeated per training
      call.
    step_size: Learning rate for the Adam optimizer.
  """
  game = pyspiel.load_game(game_name)

  # One regression model per player.
  models = [
      rcfr_tf.DeepRcfrModel(
          game,
          num_hidden_layers=1,
          num_hidden_units=13,
          num_hidden_factors=8,
          use_skip_connections=True)
      for _ in range(game.num_players())
  ]

  if buffer_size > 0:
    solver = rcfr_tf.ReservoirRcfrSolver(
        game,
        models,
        buffer_size,
        truncate_negative=truncate_negative)
  else:
    solver = rcfr_tf.RcfrSolver(
        game,
        models,
        truncate_negative=truncate_negative,
        bootstrap=bootstrap)

  def _train_fn(model, data):
    """Train `model` on `data`.

    `data` is supplied by the solver; `shuffle`, `batch` and `repeat` are
    called on it, so it is assumed to be a `tf.data.Dataset` of `(x, y)`
    pairs — TODO confirm against `rcfr_tf`.
    """
    data = data.shuffle(batch_size * 10)
    data = data.batch(batch_size)
    data = data.repeat(num_epochs)

    # `learning_rate` is the documented keyword; `lr` is a deprecated alias
    # that newer Keras releases no longer accept.
    optimizer = tf.keras.optimizers.Adam(
        learning_rate=step_size, amsgrad=True)

    @tf.function
    def _train():
      for x, y in data:
        optimizer.minimize(
            lambda: tf.losses.huber_loss(y, model(x), delta=0.01),  # pylint: disable=cell-var-from-loop
            model.trainable_variables)

    _train()

  # End of _train_fn
  for i in range(iterations):
    solver.evaluate_and_update_policy(_train_fn)
    # `report_every` is checked first so that 0/None cannot reach the modulo.
    if report_every and i % report_every == 0:
      conv = pyspiel.exploitability(game, solver.average_policy())
      print("Iteration {} exploitability {}".format(i, conv))

In [28]:
def pytorch_example(game_name='kuhn_poker',
                    iterations=100,
                    report_every=10,
                    buffer_size=-1,
                    truncate_negative=False,
                    bootstrap=False,
                    batch_size=100,
                    num_epochs=200,
                    step_size=0.01):
  """Runs RCFR with the PyTorch implementation and prints exploitability.

  Called with no arguments this reproduces the original hard-coded run
  (Kuhn poker, 100 iterations, a report every 10 iterations).

  Args:
    game_name: Name passed to `pyspiel.load_game`.
    iterations: Number of `evaluate_and_update_policy` calls to make.
    report_every: Print exploitability whenever the iteration index is a
      multiple of this value. A falsy value (0 or None) disables reporting.
    buffer_size: If positive, a `ReservoirRcfrSolver` with this buffer size is
      used; otherwise a plain `RcfrSolver` is used.
    truncate_negative: Forwarded to the solver constructor.
    bootstrap: Forwarded to `RcfrSolver` only; it is not passed to
      `ReservoirRcfrSolver`, so it has no effect when `buffer_size > 0`.
    batch_size: Mini-batch size used by the `DataLoader`.
    num_epochs: Number of full passes over the data per training call.
    step_size: Learning rate for the Adam optimizer.
  """
  game = pyspiel.load_game(game_name)

  # One regression model per player.
  models = [
      rcfr_pt.DeepRcfrModel(
          game,
          num_hidden_layers=1,
          num_hidden_units=13,
          num_hidden_factors=8,
          use_skip_connections=True)
      for _ in range(game.num_players())
  ]

  if buffer_size > 0:
    solver = rcfr_pt.ReservoirRcfrSolver(
        game,
        models,
        buffer_size,
        truncate_negative=truncate_negative)
  else:
    solver = rcfr_pt.RcfrSolver(
        game,
        models,
        truncate_negative=truncate_negative,
        bootstrap=bootstrap)

  def _train_fn(model, data):
    """Train `model` on `data`.

    `data` is supplied by the solver and is wrapped in a shuffling
    `DataLoader`; it is assumed to yield `(x, y)` pairs — TODO confirm
    against `rcfr_pt`.
    """
    loader = torch.utils.data.DataLoader(
        data, batch_size=batch_size, shuffle=True)
    # NOTE(review): the TensorFlow variant in this notebook uses a Huber loss
    # with delta=0.01, whereas SmoothL1Loss is used here with its default
    # settings — confirm whether the two are meant to match.
    loss_fn = nn.SmoothL1Loss()
    optimizer = torch.optim.Adam(
        model.parameters(), lr=step_size, amsgrad=True)

    for _ in range(num_epochs):
      for x, y in loader:
        optimizer.zero_grad()
        loss = loss_fn(model(x), y)
        loss.backward()
        optimizer.step()

  # End of _train_fn
  for i in range(iterations):
    solver.evaluate_and_update_policy(_train_fn)
    # `report_every` is checked first so that 0/None cannot reach the modulo.
    if report_every and i % report_every == 0:
      conv = pyspiel.exploitability(game, solver.average_policy())
      print("Iteration {} exploitability {}".format(i, conv))

In [None]:
# Run the TensorFlow RCFR example; it prints exploitability at iteration 0 and
# at every 10th iteration after that.
tnsorflow_example()

Iteration 0 exploitability 0.2506877366166314
Iteration 10 exploitability 0.036713111760547756
Iteration 20 exploitability 0.024237618248428322
Iteration 30 exploitability 0.014768610217531036
Iteration 40 exploitability 0.010061028217080992
Iteration 50 exploitability 0.012175980715234763
Iteration 60 exploitability 0.008652442532953775


In [None]:
# Run the PyTorch RCFR example; it prints exploitability at iteration 0 and at
# every 10th iteration after that.
pytorch_example()