Adapted from `rlcard/examples/leduc_holdem_dqn_pytorch.py`

In [1]:
import os
import sys
# Put the ./rlcard directory (resolved against the current working directory)
# at the front of the module search path, so it is searched before any other
# location when modules are imported later in the notebook.
sys.path.insert(0, os.path.abspath('./rlcard'))

In [2]:
import warnings
# Silence every FutureWarning for the rest of the session to keep cell
# outputs readable.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
import torch
import rlcard
import json
import numpy as np
from DQNAgent import DQNAgent
from rlcard.agents import RandomAgent, CFRAgent
from rlcard.utils import set_global_seed, tournament
from rlcard.utils import Logger

In [4]:
# Fix the seed through rlcard's helper so repeated runs are comparable.
# NOTE(review): presumably this seeds numpy/torch/random together — confirm
# against rlcard.utils.set_global_seed.
set_global_seed(0)

In [5]:
# Shared limit hold'em environment: its action count and state shape size the
# agents' networks, and the tournaments further down are played on it.
eval_env = rlcard.make('limit-holdem', config={'seed': 0})

# Load models

Settings A - 1m hands

In [6]:
# Settings A checkpoint after 1 million hands (models directory 03,
# step 5474450).
# The agent is sized from the evaluation environment (number of actions and
# first state dimension); hidden_neurons presumably has to match the layout
# the checkpoint was trained with — TODO confirm.
dqn_agent_3 = DQNAgent(
    eval_env.action_num,
    eval_env.state_shape[0],
    hidden_neurons=[1024, 512, 1024, 512]
)

# Restore the saved state from disk into the freshly built agent.
dqn_agent_3.load('./models/limit_holdem_dqn/03/step-5474450.pt')

Settings A - 0.5m hands

In [7]:
# Settings A checkpoint after 500,000 hands (models directory 03,
# step 2500210).
# Built with the same sizing arguments as the other agents in this notebook.
dqn_agent_3_half = DQNAgent(
    eval_env.action_num,
    eval_env.state_shape[0],
    hidden_neurons=[1024, 512, 1024, 512]
)

# Restore the saved state from disk into the freshly built agent.
dqn_agent_3_half.load('./models/limit_holdem_dqn/03/step-2500210.pt')

Settings B - 1m hands

In [20]:
# Settings B checkpoint after roughly 1 million hands (models directory 05,
# step 5179089).
# Built with the same sizing arguments as the other agents in this notebook.
dqn_agent_5_end = DQNAgent(
    eval_env.action_num,
    eval_env.state_shape[0],
    hidden_neurons=[1024, 512, 1024, 512]
)

# Restore the saved state from disk into the freshly built agent.
dqn_agent_5_end.load('./models/limit_holdem_dqn/05/step-5179089.pt')

Settings B - 0.5m hands

In [8]:
# Settings B checkpoint after roughly half a million hands (models
# directory 05, step 2533731).
# Built with the same sizing arguments as the other agents in this notebook.
dqn_agent_5 = DQNAgent(
    eval_env.action_num,
    eval_env.state_shape[0],
    hidden_neurons=[1024, 512, 1024, 512]
)

# Restore the saved state from disk into the freshly built agent.
dqn_agent_5.load('./models/limit_holdem_dqn/05/step-2533731.pt')

Settings C - 2m hands

In [22]:
# Settings C checkpoint after roughly 2 million hands (models directory 06,
# step 10014402).
# Built with the same sizing arguments as the other agents in this notebook.
dqn_agent_6_end = DQNAgent(
    eval_env.action_num,
    eval_env.state_shape[0],
    hidden_neurons=[1024, 512, 1024, 512]
)

# Restore the saved state from disk into the freshly built agent.
dqn_agent_6_end.load('./models/limit_holdem_dqn/06/step-10014402.pt')

Settings C - 1m hands

In [23]:
# Settings C checkpoint after roughly 1 million hands (models directory 06,
# step 5186110).
# Built with the same sizing arguments as the other agents in this notebook.
dqn_agent_6_middle = DQNAgent(
    eval_env.action_num,
    eval_env.state_shape[0],
    hidden_neurons=[1024, 512, 1024, 512]
)

# Restore the saved state from disk into the freshly built agent.
dqn_agent_6_middle.load('./models/limit_holdem_dqn/06/step-5186110.pt')

# Check rewards

In [45]:
# Load the card-name -> index table stored next to the limit hold'em game
# code; it is used to set individual card entries in a hand-built observation.
with open('rlcard/rlcard/games/limitholdem/card2index.json', 'r') as file:
    card2index = json.load(file)

In [46]:
# Display the environment's action names.
# NOTE(review): presumably this order matches the order of the four network
# outputs inspected in the following cells — confirm.
eval_env.actions

['call', 'raise', 'fold', 'check']

In [47]:
# Hand-built observation: a length-228 vector of zeros with six entries set.
# NOTE(review): 228 is hard-coded; presumably it equals
# eval_env.state_shape[0], which is what the agents were sized with — confirm.
obs = np.zeros(228)

# Mark the cards S2 and C7 via the card-name -> index table (presumably the
# player's two hole cards — TODO confirm).
obs[card2index['S2']] = 1
obs[card2index['C7']] = 1
# Indices 208, 213, 218 and 223 are spaced five apart. Presumably each one is
# the "no raises yet" slot of one betting round — verify against the
# environment's state encoder.
obs[208] = 1
obs[213] = 1
obs[218] = 1
obs[223] = 1

In [48]:
# Per-action outputs of the Settings A (1m hands) policy network for the
# hand-built observation `obs`.
model_input = torch.tensor(obs,
                           dtype=torch.float,
                           device=dqn_agent_3.device)

# Put the network in evaluation mode before the forward pass.
dqn_agent_3.policy_net.eval()
# Inference only: disable autograd so no graph is built and the displayed
# tensor carries no grad_fn.
with torch.no_grad():
    q_values = dqn_agent_3.policy_net(model_input)
# NOTE(review): the output recorded for this checkpoint is all NaN, which
# suggests its weights diverged during training — treat it as unusable until
# that is investigated.
q_values

tensor([nan, nan, nan, nan], grad_fn=<AddBackward0>)

In [49]:
# Per-action outputs of the Settings A (0.5m hands) policy network for the
# hand-built observation `obs`.
model_input = torch.tensor(obs,
                           dtype=torch.float,
                           device=dqn_agent_3_half.device)

# Put the network in evaluation mode before the forward pass.
dqn_agent_3_half.policy_net.eval()
# Inference only: disable autograd so no graph is built and the displayed
# tensor carries no grad_fn.
with torch.no_grad():
    q_values = dqn_agent_3_half.policy_net(model_input)
q_values

tensor([ 0.0539,  0.4436, -5.2679,  0.1856], grad_fn=<AddBackward0>)

In [50]:
# Per-action outputs of the Settings B (0.5m hands) policy network for the
# hand-built observation `obs`.
model_input = torch.tensor(obs,
                           dtype=torch.float,
                           device=dqn_agent_5.device)

# Put the network in evaluation mode before the forward pass.
dqn_agent_5.policy_net.eval()
# Inference only: disable autograd so no graph is built and the displayed
# tensor carries no grad_fn.
with torch.no_grad():
    q_values = dqn_agent_5.policy_net(model_input)
q_values

tensor([ 0.4005,  0.4427, -0.8425,  0.3784], grad_fn=<AddBackward0>)

In [51]:
# Per-action outputs of the Settings B (1m hands) policy network for the
# hand-built observation `obs`.
model_input = torch.tensor(obs,
                           dtype=torch.float,
                           device=dqn_agent_5_end.device)

# Put the network in evaluation mode before the forward pass.
dqn_agent_5_end.policy_net.eval()
# Inference only: disable autograd so no graph is built and the displayed
# tensor carries no grad_fn.
with torch.no_grad():
    q_values = dqn_agent_5_end.policy_net(model_input)
q_values

tensor([-0.8466, -1.4755, -0.9655, -0.6745], grad_fn=<AddBackward0>)

In [52]:
# Per-action outputs of the Settings C (1m hands) policy network for the
# hand-built observation `obs`.
model_input = torch.tensor(obs,
                           dtype=torch.float,
                           device=dqn_agent_6_middle.device)

# Put the network in evaluation mode before the forward pass.
dqn_agent_6_middle.policy_net.eval()
# Inference only: disable autograd so no graph is built and the displayed
# tensor carries no grad_fn.
with torch.no_grad():
    q_values = dqn_agent_6_middle.policy_net(model_input)
q_values

tensor([ 1.7728,  1.7508, -0.8454,  1.5301], grad_fn=<AddBackward0>)

In [53]:
# Per-action outputs of the Settings C (2m hands) policy network for the
# hand-built observation `obs`.
model_input = torch.tensor(obs,
                           dtype=torch.float,
                           device=dqn_agent_6_end.device)

# Put the network in evaluation mode before the forward pass.
dqn_agent_6_end.policy_net.eval()
# Inference only: disable autograd so no graph is built and the displayed
# tensor carries no grad_fn.
with torch.no_grad():
    q_values = dqn_agent_6_end.policy_net(model_input)
q_values

tensor([ 0.1130,  0.6298, -0.8841,  0.5140], grad_fn=<AddBackward0>)

# Evaluate models against each other

In [15]:
# Baseline opponent: rlcard's RandomAgent, sized with the environment's
# number of actions.
random_agent = RandomAgent(action_num=eval_env.action_num)

In [33]:
def eval(agent1, agent2, n=100000):
    """Play ``agent1`` against ``agent2`` on the shared ``eval_env``.

    Args:
        agent1: Agent placed first in the environment's agent list.
        agent2: Agent placed second in the environment's agent list.
        n: Count forwarded to ``tournament`` (presumably the number of games
            to play — confirm against ``rlcard.utils.tournament``).

    Returns:
        Whatever ``tournament`` returns. The outputs recorded in this
        notebook are two-element lists, presumably one average payoff per
        agent in the order the agents were given.

    Note:
        Defining this function shadows Python's built-in ``eval`` in the
        notebook namespace, and every call replaces the agents currently
        registered on ``eval_env``.
    """
    matchup = [agent1, agent2]
    eval_env.set_agents(matchup)
    results = tournament(eval_env, n)
    return results

A (0.5m) vs B (0.5m)

In [17]:
# Settings A (0.5m hands) listed first, Settings B (0.5m hands) second;
# eval's default n=100000 applies.
eval(dqn_agent_3_half, dqn_agent_5)

[-0.74562, 0.74562]

A (0.5m) vs random

In [18]:
# Settings A (0.5m hands) listed first, the random baseline second;
# eval's default n=100000 applies.
eval(dqn_agent_3_half, random_agent)

[2.19958, -2.19958]

B (0.5m) vs random

In [19]:
# Settings B (0.5m hands) listed first, the random baseline second;
# eval's default n=100000 applies.
eval(dqn_agent_5, random_agent)

[2.45081, -2.45081]

B (0.5m) vs B (1m)

In [28]:
# Two checkpoints of the same Settings B run: 0.5m hands listed first,
# 1m hands second.
eval(dqn_agent_5, dqn_agent_5_end)

[1.766985, -1.766985]

C (1m) vs C (2m)

In [29]:
# Two checkpoints of the same Settings C run: 1m hands listed first,
# 2m hands second.
eval(dqn_agent_6_middle, dqn_agent_6_end)

[0.71339, -0.71339]

C (1m) vs random

In [57]:
# Settings C (1m hands) listed first, the random baseline second;
# eval's default n=100000 applies.
eval(dqn_agent_6_middle, random_agent)

[2.00511, -2.00511]

B (0.5m) vs C (1m)

In [38]:
# Settings B (0.5m hands) listed first, Settings C (1m hands) second.
# The same pairing is also run with the agent order swapped.
eval(dqn_agent_5, dqn_agent_6_middle)

[-0.2025, 0.2025]

C (1m) vs B (0.5m)

In [39]:
# Same pairing as the B-vs-C run, with the agent order swapped:
# Settings C (1m hands) listed first, Settings B (0.5m hands) second.
eval(dqn_agent_6_middle, dqn_agent_5)

[0.21637, -0.21637]

C (1m) vs C (1m)

In [40]:
# Self-play check: the same Settings C (1m hands) agent object fills both
# slots of the agent list.
eval(dqn_agent_6_middle, dqn_agent_6_middle)

[0.04212, -0.04212]

B (0.5m) vs B (0.5m)

In [41]:
# Self-play check: the same Settings B (0.5m hands) agent object fills both
# slots of the agent list.
eval(dqn_agent_5, dqn_agent_5)

[-0.00265, 0.00265]

random vs random

In [56]:
# Baseline check: the random agent fills both slots of the agent list.
eval(random_agent, random_agent)

[0.00573, -0.00573]

# Action Selection

In [92]:
from collections import Counter

# Separate environment with action recording switched on, played as self-play
# between two references to the same Settings C (1m hands) agent.
env = rlcard.make('limit-holdem', config={'seed': 0, 'record_action': True})
env.set_agents([dqn_agent_6_middle, dqn_agent_6_middle])

counts = Counter()      # tally of action names over all counted hands
total_reward = 0        # running sum of the last player's final reward

for i in range(100000):
    trajectories, payoffs = env.run(is_training=False)
    last_player_transitions = trajectories[-1]

    # Hands in which the last player has no transitions are printed for
    # inspection and contribute to neither `counts` nor `total_reward`.
    if not last_player_transitions:
        print(trajectories)
        continue

    # NOTE(review): index 0 is presumably the state and index 2 the reward of
    # the last player's final transition — verify against env.run's
    # transition layout.
    final_transition = last_player_transitions[-1]
    final_state = final_transition[0]
    total_reward += final_transition[2]

    # Each action record is indexed at position 1 for the action name
    # (presumably records are (player, action) pairs — confirm).
    actions = final_state['action_record']
    counts.update(record[1] for record in actions)

In [93]:
# Display the tally of action names collected by the self-play loop.
counts

Counter({'call': 390944, 'raise': 606571, 'check': 31384})

In [94]:
# Display the summed final rewards of the last player over the counted hands.
total_reward

-2489.0