In [2]:
import gym
import numpy as np
import tensorflow as tf
from gym import spaces

from tf_agents.agents.dqn import dqn_agent
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.networks import q_network
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import time_step as ts
from tf_agents.trajectories import trajectory
from tf_agents.utils import common

AttributeError: module 'tensorflow.python.framework.type_spec' has no attribute '_NAME_TO_TYPE_SPEC'

In [None]:
# ConnectFourEnv code
class ConnectFourEnv(gym.Env):
    """Two-player Connect Four on a 6x7 grid, using the classic Gym API.

    Board cells hold 0 (empty), 1 or 2 (the player owning the piece). Turns
    alternate automatically: every legal, non-terminal ``step`` hands the move
    to the other player.

    Observations are the board flattened row by row into a length-42 integer
    vector, and ``observation_space`` declares exactly that shape and dtype so
    the declared space and the returned arrays agree.

    Rewards refer to the player who just moved:
      * +10 when the move completes four in a row (episode ends),
      * -10 when the chosen column is already full (episode ends),
      *   0 otherwise, including a draw on a full board (episode ends).
    """

    def __init__(self):
        """Create an empty board with player 1 to move."""
        super(ConnectFourEnv, self).__init__()
        # Integer board so observations carry the dtype the space declares.
        self.board = np.zeros((6, 7), dtype=np.int64)
        self.current_player = 1
        self.action_space = spaces.Discrete(7)
        # `np.int` no longer exists in current NumPy, so a concrete integer
        # type is used. The shape is the flattened board because that is what
        # step() and reset() actually return.
        self.observation_space = spaces.Box(
            low=0, high=2, shape=(self.board.size,), dtype=np.int64)

    def _observation(self):
        """Return a flattened copy of the board (length 42)."""
        return self.board.flatten()

    def step(self, action):
        """Drop the current player's piece into column ``action``.

        Args:
            action: Column index (0-6); anything ``int()`` accepts, such as a
                Python int or a 0-d NumPy integer array.

        Returns:
            ``(observation, reward, done, info)`` where ``observation`` is the
            flattened board after the move and ``info`` is an empty dict.

        Raises:
            IndexError: If ``action`` is outside the board's column range.
        """
        column = int(action)
        # A full column is an illegal move: penalise it and end the episode.
        if np.all(self.board[:, column] != 0):
            return self._observation(), -10, True, {}
        # The piece lands on the empty cell with the largest row index.
        row = np.flatnonzero(self.board[:, column] == 0)[-1]
        self.board[row, column] = self.current_player
        if self.check_win(self.current_player):
            return self._observation(), 10, True, {}
        # No empty cell left and nobody won: draw.
        if np.all(self.board != 0):
            return self._observation(), 0, True, {}
        self.current_player = 1 if self.current_player == 2 else 2
        return self._observation(), 0, False, {}

    def reset(self):
        """Clear the board, give the move to player 1, return the observation."""
        self.board = np.zeros((6, 7), dtype=np.int64)
        self.current_player = 1
        return self._observation()

    def check_win(self, player):
        """Return True if ``player`` has four pieces in a line anywhere.

        Every cell is tried as the start of a 4-cell segment in each of four
        directions: right, down, down-right and up-right. Segments that would
        leave the board are skipped, so all horizontal, vertical and diagonal
        lines are covered, including anti-diagonals that end in the last
        column.
        """
        n_rows, n_cols = self.board.shape
        directions = ((0, 1), (1, 0), (1, 1), (-1, 1))
        for d_row, d_col in directions:
            for r in range(n_rows):
                for c in range(n_cols):
                    end_r = r + 3 * d_row
                    end_c = c + 3 * d_col
                    if not (0 <= end_r < n_rows and 0 <= end_c < n_cols):
                        continue
                    if all(self.board[r + k * d_row, c + k * d_col] == player
                           for k in range(4)):
                        return True
        return False

In [None]:
# Convert the Gym environment to a TensorFlow environment.
# TFPyEnvironment expects a TF-Agents PyEnvironment, so each raw gym.Env is
# first adapted with suite_gym.wrap_env and only then wrapped for TensorFlow.
train_env = tf_py_environment.TFPyEnvironment(
    suite_gym.wrap_env(ConnectFourEnv()))
eval_env = tf_py_environment.TFPyEnvironment(
    suite_gym.wrap_env(ConnectFourEnv()))


In [None]:

# Build the Q-network from the training environment's observation and action
# specs; fc_layer_params requests a single fully connected layer of 100 units.
fc_layer_params = (100,)
q_net = q_network.QNetwork(
    input_tensor_spec=train_env.observation_spec(),
    action_spec=train_env.action_spec(),
    fc_layer_params=fc_layer_params,
)


In [None]:

# Build the DQN agent around q_net and initialize it.
# train_step_counter is handed to the agent and is read back later when
# deciding whether to log.
train_step_counter = tf.Variable(0)
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=0.001)
agent = dqn_agent.DqnAgent(
    time_step_spec=train_env.time_step_spec(),
    action_spec=train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter,
)
agent.initialize()


In [None]:

# Create the replay buffer that stores the agent's collected trajectories.
# Its data spec comes from the agent, its batch size from the training
# environment, and max_length is set to 100000.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    max_length=100_000,
    batch_size=train_env.batch_size,
    data_spec=agent.collect_data_spec,
)


In [None]:

# Pre-fill the replay buffer before training starts: play
# initial_collect_steps moves with a random policy and store every
# (time_step, action, next_time_step) transition.
collect_policy = agent.collect_policy
random_policy = random_tf_policy.RandomTFPolicy(
    time_step_spec=train_env.time_step_spec(),
    action_spec=train_env.action_spec(),
)
initial_collect_steps = 1000
for _ in range(initial_collect_steps):
    time_step = train_env.current_time_step()
    action_step = random_policy.action(time_step)
    next_time_step = train_env.step(action_step.action)
    traj = trajectory.from_transition(
        time_step=time_step,
        action_step=action_step,
        next_time_step=next_time_step,
    )
    replay_buffer.add_batch(traj)


In [None]:

# Expose the replay buffer as a dataset that yields batches of 64 samples,
# each spanning 2 steps, and keep one iterator over it for the training loop.
dataset = replay_buffer.as_dataset(
    sample_batch_size=64,
    num_steps=2,
    num_parallel_calls=3,
).prefetch(3)

iterator = iter(dataset)


In [None]:

# Train the agent.
# Each iteration plays up to two moves (the agent's collect policy, then the
# random opponent), stores both transitions, and runs one training step on a
# batch sampled from the replay buffer.
# NOTE(review): the opponent's transitions go into the same buffer and carry
# the reward from the opponent's point of view. Confirm this is intended.
num_iterations = 20000
log_interval = 200
for _ in range(num_iterations):
    for acting_policy in (agent.collect_policy, random_policy):
        time_step = train_env.current_time_step()
        action_step = acting_policy.action(time_step)
        next_time_step = train_env.step(action_step.action)
        traj = trajectory.from_transition(time_step, action_step, next_time_step)
        replay_buffer.add_batch(traj)
        # The opponent only replies if the agent's move did not end the episode.
        if next_time_step.is_last():
            break

    experience, unused_info = next(iterator)
    train_loss = agent.train(experience).loss

    current_step = train_step_counter.numpy()
    if current_step % log_interval == 0:
        print('step = {0}: loss = {1}'.format(current_step, train_loss))
