Adapted from `rlcard/examples/leduc_holdem_dqn_pytorch.py`

The cell below clones the project repo, clones my fork of RLCard inside it, and checks out the `dan` branch of the fork. It then mounts Google Drive in the notebook so that checkpoints can be saved there.

In [None]:
# Clone the project repository and make it the working directory.
!git clone https://github.com/DanielKerrigan/CS-GY_9223_Deep_Learning.git

%cd CS-GY_9223_Deep_Learning

# Clone the RLCard fork inside the project directory and switch it to the
# `dan` branch.
!git clone https://github.com/DanielKerrigan/rlcard.git

%cd rlcard

!git checkout dan

# Go back up to the project root.
%cd ..

# Mount Google Drive. drive_dir is the Drive folder that later cells pass to
# the Logger and to agent.save().
from google.colab import drive
drive.mount('/content/gdrive')
drive_dir = '/content/gdrive/My Drive/deep-learning/limit-holdem-dqn-05'

In [None]:
import os
import sys
# Put the local RLCard checkout at the front of the import path so that
# modules under ./rlcard are found before any other sys.path entry.
_rlcard_checkout = os.path.abspath('./rlcard')
sys.path.insert(0, _rlcard_checkout)

In [None]:
import warnings
# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter('ignore', FutureWarning)

In [None]:
import torch
import rlcard
from DQNAgent import DQNAgent
from rlcard.agents import RandomAgent
from rlcard.utils import set_global_seed, tournament
from rlcard.utils import Logger

In [None]:
# Build two independent limit hold'em environments with the same seed:
# `env` is used for training and `eval_env` for evaluation tournaments.
# Each make() call receives its own freshly created config dict.
env, eval_env = (
    rlcard.make('limit-holdem', config={'seed': 0})
    for _ in range(2)
)

In [None]:
# Training schedule.
episode_num = 1_000_000   # number of training episodes to run
evaluate_every = 20_000   # evaluate/checkpoint whenever episode % this == 0
evaluate_num = 10_000     # count passed to tournament() at each evaluation

In [None]:
# Set a global seed of 0 through rlcard.utils.set_global_seed.
# NOTE(review): presumably this seeds the RNGs used during training; which
# libraries it covers is not visible from this notebook -- confirm.
set_global_seed(0)

In [None]:
# Both agents are built with the same action count and state size, taken
# from the training environment.
num_actions = env.action_num
state_size = env.state_shape[0]

# The agent being trained.
# NOTE(review): lr=0.15 looks unusually large for DQN-style training --
# confirm this is the intended value.
agent = DQNAgent(
    num_actions,
    state_size,
    memory=1_500_000,
    lr=0.15,
    batch_size=256,
    update_every=256,
    eps_start=0.9,
    eps_end=0.05,
    target_update=1000,
    hidden_neurons=[1024, 512, 1024, 512],
    clip_grads=True,
    smooth_loss=True,
)

# Fixed evaluation opponent: same hidden layer sizes, all other settings
# left at DQNAgent's defaults, state loaded from a checkpoint saved under
# the `limit-holdem-dqn-03` Drive folder.
opponent_checkpoint = (
    '/content/gdrive/My Drive/deep-learning/limit-holdem-dqn-03/step-5474450.pt'
)
eval_agent = DQNAgent(
    num_actions,
    state_size,
    hidden_neurons=[1024, 512, 1024, 512],
)
eval_agent.load(opponent_checkpoint)

# Training: the agent occupies both seats (self-play).
env.set_agents([agent] * 2)
# Evaluation: the agent in seat 0 against the loaded opponent in seat 1.
eval_env.set_agents([agent, eval_agent])

In [None]:
# Create the Logger used to record evaluation results and plot the learning
# curve. It is given drive_dir (the Google Drive folder) as its location.
logger = Logger(drive_dir)

In [None]:
# History of (agent.train_step, loss value) pairs; the training loop appends
# one entry at each evaluation point where a loss is available.
losses = []

In [None]:
for episode in range(episode_num):
    # Generate data from the training environment (one run per episode).
    trajectories, _ = env.run(is_training=True)

    # Pass the transitions in trajectories[0] to the agent one at a time.
    for ts in trajectories[0]:
        agent.train(ts)

    # Every `evaluate_every` episodes (including episode 0): record the
    # loss, run a tournament in eval_env (agent vs. the loaded checkpoint
    # opponent, not random agents), log the result, and save a checkpoint.
    if episode % evaluate_every == 0:
        # agent.loss can be None (presumably before the first weight
        # update), so read it once and use the guarded value everywhere;
        # calling .item() on None would raise AttributeError.
        loss_value = None if agent.loss is None else agent.loss.item()
        if loss_value is not None:
            # NOTE(review): this keys on agent.train_step while the logger
            # below keys on agent.weight_updates -- confirm this is intended.
            losses.append((agent.train_step, loss_value))
        print(f'\nEpisode: {episode}, Weight updates: {agent.weight_updates}, loss: {loss_value}')
        # Index [0] is the first entry of the tournament result, i.e. the
        # one for seat 0, which is `agent` in eval_env.
        logger.log_performance(agent.weight_updates,
                               tournament(eval_env, evaluate_num)[0])
        agent.save(drive_dir)

In [None]:
# Training is done: close the files held by the logger.
logger.close_files()

In [None]:
# Plot the learning curve from the logged evaluation results.
# 'DQN' is passed as the algorithm name -- presumably used to label the
# curve (confirm against Logger.plot).
logger.plot('DQN')

In [None]:
# Sanity check: run a tournament with the trained agent in both seats, in a
# fresh environment with a different seed (1), and print the result.
eval_env_dqn_dqn = rlcard.make('limit-holdem', config={'seed': 1})
eval_env_dqn_dqn.set_agents([agent] * 2)
payoffs = tournament(eval_env_dqn_dqn, 10_000)
print(payoffs)