In [11]:
import numpy as np

In [21]:
import tensorflow as tf
tf.__version__

'2.4.1'

In [5]:
from kaggle_environments import make, evaluate
from gym import spaces

In [81]:
class ConnectFourGym:
    def __init__(self, agent2="random"):
        ks_env = make("connectx", debug=True)
        self.env = ks_env.train([None, agent2])
        self.rows = ks_env.configuration.rows
        self.columns = ks_env.configuration.columns
        # Learn about spaces here: http://gym.openai.com/docs/#spaces
        self.action_space = spaces.Discrete(self.columns)
        self.observation_space = spaces.Box(low=0, high=2, 
                                            shape=(self.rows,self.columns,1), dtype=np.int)
        # Tuple corresponding to the min and max possible rewards
        self.reward_range = (-10, 1)
        # StableBaselines throws error if these are not defined
        self.spec = None
        self.metadata = None
    def reset(self):
        self.obs = self.env.reset()
        return np.array(self.obs['board']).reshape(self.rows,self.columns,1)
    def change_reward(self, old_reward, done):
        if old_reward == 1: # The agent won the game
            return 1
        elif done: # The opponent won the game
            return -1
        else: # Reward 1/42
            return 1/(self.rows*self.columns)
    def step(self, action):
        # Check if agent's move is valid
        is_valid = (self.obs['board'][int(action)] == 0)
        if is_valid: # Play the move
            self.obs, old_reward, done, _ = self.env.step(int(action))
            reward = self.change_reward(old_reward, done)
        else: # End the game and penalize agent
            reward, done, _ = -10, True, {}
        return np.array(self.obs['board']).reshape(self.rows,self.columns,1), reward, done, _
env = ConnectFourGym(agent2="random")

In [82]:
import os
from pathlib import Path
from stable_baselines3.common.monitor import Monitor 
from stable_baselines3.common.vec_env import DummyVecEnv

# Create directory for logging training information
log_dir_ppo = Path(r'logs/ppo')
log_dir_a2c = Path(r'logs/a2c')
os.makedirs(log_dir_ppo, exist_ok=True)
os.makedirs(log_dir_a2c, exist_ok=True)

# Logging progress
monitor_env_ppo = Monitor(env, str(log_dir_ppo), allow_early_resets=True)
monitor_env_a2c = Monitor(env, str(log_dir_a2c), allow_early_resets=True)

# Create a vectorized environment
vec_env_ppo = DummyVecEnv([lambda: monitor_env_ppo])
vec_env_a2c = DummyVecEnv([lambda: monitor_env_a2c])

In [84]:
from stable_baselines3 import PPO, A2C

policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch=[dict(pi=[32, 16],
                                                          vf=[32, 16])])
model_ppo = PPO("MlpPolicy", vec_env_ppo, policy_kwargs=policy_kwargs, verbose=1)
# model_a2c = A2C("MlpPolicy", vec_env_a2c, policy_kwargs=policy_kwargs, verbose=1)

Using cpu device


In [88]:
model_ppo.learn(total_timesteps=100000)
model_ppo.save('models/ppo')

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 9.69        |
|    ep_rew_mean          | -0.113      |
| time/                   |             |
|    fps                  | 148         |
|    iterations           | 1           |
|    time_elapsed         | 13          |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.010847028 |
|    clip_fraction        | 0.0442      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.9        |
|    explained_variance   | -9.64       |
|    learning_rate        | 0.0003      |
|    loss                 | 3.37        |
|    n_updates            | 50          |
|    policy_gradient_loss | -0.0147     |
|    value_loss           | 9.01        |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 9.5 

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 8.9          |
|    ep_rew_mean          | 0.448        |
| time/                   |              |
|    fps                  | 137          |
|    iterations           | 11           |
|    time_elapsed         | 164          |
|    total_timesteps      | 22528        |
| train/                  |              |
|    approx_kl            | 0.0032726617 |
|    clip_fraction        | 0.0488       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.69        |
|    explained_variance   | -10.5        |
|    learning_rate        | 0.0003       |
|    loss                 | 1.58         |
|    n_updates            | 150          |
|    policy_gradient_loss | -0.0166      |
|    value_loss           | 2.71         |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8.97        |
|    ep_rew_mean          | 0.63        |
| time/                   |             |
|    fps                  | 136         |
|    iterations           | 21          |
|    time_elapsed         | 315         |
|    total_timesteps      | 43008       |
| train/                  |             |
|    approx_kl            | 0.004126774 |
|    clip_fraction        | 0.0763      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.52       |
|    explained_variance   | -1.71       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.336       |
|    n_updates            | 250         |
|    policy_gradient_loss | -0.0168     |
|    value_loss           | 0.788       |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 8.88  

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 8.17         |
|    ep_rew_mean          | 0.701        |
| time/                   |              |
|    fps                  | 140          |
|    iterations           | 31           |
|    time_elapsed         | 451          |
|    total_timesteps      | 63488        |
| train/                  |              |
|    approx_kl            | 0.0070780516 |
|    clip_fraction        | 0.0556       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.39        |
|    explained_variance   | -11.1        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.819        |
|    n_updates            | 350          |
|    policy_gradient_loss | -0.0132      |
|    value_loss           | 1.46         |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 7.87        |
|    ep_rew_mean          | 0.884       |
| time/                   |             |
|    fps                  | 135         |
|    iterations           | 41          |
|    time_elapsed         | 617         |
|    total_timesteps      | 83968       |
| train/                  |             |
|    approx_kl            | 0.006413696 |
|    clip_fraction        | 0.0506      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.2        |
|    explained_variance   | -1.14       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.13        |
|    n_updates            | 450         |
|    policy_gradient_loss | -0.0145     |
|    value_loss           | 0.389       |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 7.84

In [89]:
import pandas as pd
# Plot cumulative reward
with open(os.path.join(str(log_dir_ppo), "monitor.csv"), 'rt') as fh:    
    firstline = fh.readline()
    assert firstline[0] == '#'
    df_ppo = pd.read_csv(fh, index_col=None)['r']

# A2C
import plotly.express as px
fig = px.line(y=df_ppo.rolling(window=10).mean().dropna())
fig.show()

In [90]:
# from evaluation import get_win_percentages
get_win_percentages(agent1=agent, agent2='negamax', n_rounds=100)

Agent 1 Win Percentage: 0.0
Agent 2 Win Percentage: 0.91
Number of Invalid Plays by Agent 1: 9
Number of Invalid Plays by Agent 2: 0


## Next Try

In [66]:
class ConnectFourGym_CNN:
    def __init__(self, agent2="random"):
        ks_env = make("connectx", debug=True)
        self.env = ks_env.train([None, agent2])
        self.rows = ks_env.configuration.rows
        self.columns = ks_env.configuration.columns
        # Learn about spaces here: http://gym.openai.com/docs/#spaces
        self.action_space = spaces.Discrete(self.columns)
        self.observation_space = spaces.Box(low=0, high=2, 
                                            shape=(self.rows,self.columns,1), dtype=np.int)
        # Tuple corresponding to the min and max possible rewards
        self.reward_range = (-10, 1)
        # StableBaselines throws error if these are not defined
        self.spec = None
        self.metadata = None
    def reset(self):
        self.obs = self.env.reset()
        return np.array(self.obs['board']).reshape(self.rows,self.columns, 1)
    def change_reward(self, old_reward, done):
        if old_reward == 1: # The agent won the game
            return 1
        elif done: # The opponent won the game
            return -1
        else: # Reward 1/42
            return 1/(self.rows*self.columns)
    def step(self, action):
        # Check if agent's move is valid
        is_valid = (self.obs['board'][int(action)] == 0)
        if is_valid: # Play the move
            self.obs, old_reward, done, _ = self.env.step(int(action))
            reward = self.change_reward(old_reward, done)
        else: # End the game and penalize agent
            reward, done, _ = -10, True, {}
        return np.array(self.obs['board']).reshape(self.rows,self.columns, 1), reward, done, _
env = ConnectFourGym(agent2="random")

In [70]:
import os
import gym
import torch as th
import torch.nn as nn

from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.monitor import Monitor 
from stable_baselines3.common.vec_env import DummyVecEnv

# Create directory for logging training information
log_dir_ppo_cumstomCNN = Path(r'logs/ppo_cumstomCNN')
os.makedirs(log_dir_ppo_cumstomCNN, exist_ok=True)

# Logging progress
monitor_env_ppo_customCNN = Monitor(env, str(log_dir_ppo_cumstomCNN), allow_early_resets=True)

# Create a vectorized environment
vec_env_ppo_customCNN = DummyVecEnv([lambda: monitor_env_ppo_customCNN])

class CustomCNN(BaseFeaturesExtractor):
    """
    :param observation_space: (gym.Space)
    :param features_dim: (int) Number of features extracted.
        This corresponds to the number of unit for the last layer.
    """

    def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 256):
        super(CustomCNN, self).__init__(observation_space, features_dim)
        # We assume CxHxW images (channels first)
        # Re-ordering will be done by pre-preprocessing or wrapper
        n_input_channels = observation_space.shape[0]
        self.cnn = nn.Sequential(
            nn.Conv2d(n_input_channels, 32, kernel_size=(3,1), stride=2, padding=0),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=(3,1), stride=2, padding=0),
            nn.ReLU(),
            nn.Flatten(),
        )

        # Compute shape by doing one forward pass
        with th.no_grad():
            n_flatten = self.cnn(
                th.as_tensor(observation_space.sample()[None]).float()
            ).shape[1]

        self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim), nn.ReLU())

    def forward(self, observations: th.Tensor) -> th.Tensor:
        return self.linear(self.cnn(observations))

policy_kwargs = dict(
    features_extractor_class=CustomCNN,
    features_extractor_kwargs=dict(features_dim=128),
)

model_cumstomCNN = PPO("CnnPolicy", vec_env_ppo_customCNN, policy_kwargs=policy_kwargs, verbose=1)

Using cpu device


In [75]:
model_cumstomCNN.learn(total_timesteps=5000)
model_cumstomCNN.save('models/ppo_cumstomCNN')
# model_cumstomCNN = PPO.load('models/ppo')

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 10.1         |
|    ep_rew_mean          | -1.36        |
| time/                   |              |
|    fps                  | 156          |
|    iterations           | 1            |
|    time_elapsed         | 13           |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0060184835 |
|    clip_fraction        | 0.0437       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.93        |
|    explained_variance   | -6.66        |
|    learning_rate        | 0.0003       |
|    loss                 | 3.64         |
|    n_updates            | 20           |
|    policy_gradient_loss | -0.00986     |
|    value_loss           | 8.24         |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m

In [76]:
import pandas as pd
# Plot cumulative reward
with open(os.path.join(str(log_dir_ppo_cumstomCNN), "monitor.csv"), 'rt') as fh:    
    firstline = fh.readline()
    assert firstline[0] == '#'
    df_ppo = pd.read_csv(fh, index_col=None)['r']

# A2C
import plotly.express as px
fig = px.line(y=df_ppo.rolling(window=10).mean().dropna())
fig.show()

# Transform it into an agent

In [77]:
def agent(obs, config):
    # Use the best model to select a column
    col, _ = model_cumstomCNN.predict(np.array(obs['board']).reshape(6,7,1))
    # Check if selected column is valid
    is_valid = (obs['board'][int(col)] == 0)
    # If not valid, select random move. 
    if is_valid:
        return int(col)
    else:
        return random.choice([col for col in range(config.columns) if obs.board[int(col)] == 0])

## Check Performance

In [78]:
def get_win_percentages(agent1, agent2, n_rounds=100):
    # Use default Connect Four setup
    config = {'rows': 6, 'columns': 7, 'inarow': 4}
    # Agent 1 goes first (roughly) half the time       
    outcomes = evaluate("connectx", [agent1, agent2], config, [], n_rounds//2)
    # Agent 2 goes first (roughly) half the time      
    outcomes += [[b,a] for [a,b] in evaluate("connectx", [agent2, agent1], config, [], n_rounds-n_rounds//2)]
    print("Agent 1 Win Percentage:", np.round(outcomes.count([1,-1])/len(outcomes), 2))
    print("Agent 2 Win Percentage:", np.round(outcomes.count([-1,1])/len(outcomes), 2))
    print("Number of Invalid Plays by Agent 1:", outcomes.count([None, 0]))
    print("Number of Invalid Plays by Agent 2:", outcomes.count([0, None]))

In [80]:
# from evaluation import get_win_percentages
get_win_percentages(agent1=agent, agent2='negamax', n_rounds=100)

Agent 1 Win Percentage: 0.01
Agent 2 Win Percentage: 0.87
Number of Invalid Plays by Agent 1: 12
Number of Invalid Plays by Agent 2: 0


In [None]:
import random

def agent_random(obs, config):
    valid_moves = [col for col in range(config.columns) if obs.board[col] == 0]
    return random.choice(valid_moves)

def agent_middle(obs, config):
    return config.columns//2

def agent_leftmost(obs, config):
    valid_moves = [col for col in range(config.columns) if obs.board[col] == 0]
    return valid_moves[0]

# Define function to find the best agent
def get_win_percentages(agent1, agent2, n_rounds=100):
    # Use default Connect Four setup
    config = {'rows': 6, 'columns': 7, 'inarow': 4}
    # Agent 1 goes first (roughly) half the time          
    outcomes = evaluate("connectx", [agent1, agent2], config, [], n_rounds//2)
    # Agent 2 goes first (roughly) half the time      
    outcomes += [[b,a] for [a,b] in evaluate("connectx", [agent2, agent1], config, [], n_rounds-n_rounds//2)]
    print("Agent 1 Win Percentage:", np.round(outcomes.count([1,-1])/len(outcomes), 2))
    print("Agent 2 Win Percentage:", np.round(outcomes.count([-1,1])/len(outcomes), 2))
    print("Number of Invalid Plays by Agent 1:", outcomes.count([None, 0]))
    print("Number of Invalid Plays by Agent 2:", outcomes.count([0, None]))

# Deep Reinforcement Learning
import numpy as np
from kaggle_environments import make, evaluate
from gym import spaces

class ConnectFourGym:
    def __init__(self, agent2="random"):
        ks_env = make("connectx", debug=True)
        self.env = ks_env.train([None, agent2])
        self.rows = ks_env.configuration.rows
        self.columns = ks_env.configuration.columns
        # Learn about spaces here: http://gym.openai.com/docs/#spaces
        self.action_space = spaces.Discrete(self.columns)
        self.observation_space = spaces.Box(low=0, high=2, 
                                            shape=(self.rows,self.columns,1), dtype=np.int)
        # Tuple corresponding to the min and max possible rewards
        self.reward_range = (-10, 1)
        # StableBaselines throws error if these are not defined
        self.spec = None
        self.metadata = None
    def reset(self):
        self.obs = self.env.reset()
        return np.array(self.obs['board']).reshape(self.rows,self.columns,1)
    def change_reward(self, old_reward, done):
        if old_reward == 1: # The agent won the game
            return 1
        elif done: # The opponent won the game
            return -1
        else: # Reward 1/42
            return 1/(self.rows*self.columns)
    def step(self, action):
        # Check if agent's move is valid
        is_valid = (self.obs['board'][int(action)] == 0)
        if is_valid: # Play the move
            self.obs, old_reward, done, _ = self.env.step(int(action))
            reward = self.change_reward(old_reward, done)
        else: # End the game and penalize agent
            reward, done, _ = -10, True, {}
        return np.array(self.obs['board']).reshape(self.rows,self.columns,1), reward, done, _

# Create ConnectFour environment
env = ConnectFourGym(agent2="negamax")

import os
from stable_baselines3.common.monitor import Monitor 
from stable_baselines3.common.vec_env import DummyVecEnv
import torch as th

# Create directory for logging training information
log_dir_ppo = "ppo/"
log_dir_a2c = "a2c/"
os.makedirs(log_dir_ppo, exist_ok=True)
os.makedirs(log_dir_a2c, exist_ok=True)

# Logging progress
monitor_env_ppo = Monitor(env, log_dir_ppo, allow_early_resets=True)
monitor_env_a2c = Monitor(env, log_dir_a2c, allow_early_resets=True)

# Create a vectorized environment
vec_env_ppo = DummyVecEnv([lambda: monitor_env_ppo])
vec_env_a2c = DummyVecEnv([lambda: monitor_env_a2c])

from stable_baselines3 import PPO, A2C

policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch=[dict(pi=[32, 16],
                                                          vf=[32, 16])])
model_a2c = A2C("MlpPolicy", vec_env_a2c, policy_kwargs=policy_kwargs, verbose=1)
model_ppo = PPO("MlpPolicy", vec_env_ppo, policy_kwargs=policy_kwargs, verbose=1)


# In[ ]:


# model_a2c.learn(total_timesteps=100000)
model_ppo.learn(total_timesteps=50000)


# In[ ]:


# model.save('models/ppo')
# model = PPO.load('models/ppo')


# ## PPO results

# In[ ]:


import pandas as pd
# Plot cumulative reward
with open(os.path.join(log_dir_ppo, "monitor.csv"), 'rt') as fh:    
    firstline = fh.readline()
    assert firstline[0] == '#'
    df_ppo = pd.read_csv(fh, index_col=None)['r']

# A2C
import plotly.express as px
fig = px.line(df_ppo.rolling(window=10).mean())
fig.show()


# ## A2C Results

# In[ ]:


import pandas as pd
# Plot cumulative reward
with open(os.path.join(log_dir_a2c, "monitor.csv"), 'rt') as fh:    
    firstline = fh.readline()
    assert firstline[0] == '#'
    df_a2c = pd.read_csv(fh, index_col=None)['r']

# A2C
import plotly.express as px
fig = px.line(y=df_a2c.rolling(window=10).mean().dropna())
fig.show()


# In[ ]:


def agent1(obs, config):
    # Use the best model to select a column
    col, _ = model_ppo.predict(np.array(obs['board']).reshape(6,7,1))
    # Check if selected column is valid
    is_valid = (obs['board'][int(col)] == 0)
    # If not valid, select random move. 
    if is_valid:
        return int(col)
    else:
        return random.choice([col for col in range(config.columns) if obs.board[int(col)] == 0])


# In[ ]:


# Create the game environment
env = make("connectx")

# Two random agents play one game round
env.run([agent1, "negamax"])

# Show the game
env.render(mode="ipython")


# In[ ]:


get_win_percentages(agent1=agent1, agent2='negamax')


# ### Write agent to file

# In[ ]:


import inspect
import os

def write_agent_to_file(function, file):
    with open(file, "a" if os.path.exists(file) else "w") as f:
        f.write(inspect.getsource(function))
        print(function, "written to", file)

write_agent_to_file(agent1, "submission.py")

 
