# 1. Create the custom environment

In [3]:
!pip install gym

Collecting gym




  Using cached gym-0.19.0-py3-none-any.whl
Collecting cloudpickle<1.7.0,>=1.2.0
  Using cached cloudpickle-1.6.0-py3-none-any.whl (23 kB)
Installing collected packages: cloudpickle, gym
Successfully installed cloudpickle-1.6.0 gym-0.19.0


In [4]:
import gym
from gym import spaces
import numpy as np
import heuristicBot
import TicTacToe as ttt

In [5]:
def mark_piece(grid, index, player):
    """
    Play one full turn: the agent's move followed by the opponent's reply.

    The board is modified in place and also returned.

    Steps:
        1. Reject the move if the target cell is already occupied.
        2. Place `player` at `index` and evaluate the board.
        3. If the game is not over, let the look-ahead bot reply as the
           other player and evaluate the board again.

    `ttt.gridScore` is interpreted exactly as this function uses it:
    1 is treated as a win for whoever just moved, -1 as "game still
    running", and any other value as a draw.

    Args:
        grid: 3x3 board indexed as grid[row, col]; 0 marks an empty cell.
        index: (row, col) tuple of the cell the agent wants to take.
        player: mark of the agent (1 or 2); the opponent gets the other one.

    Returns:
        (code, grid) where code is
            -2: game continues, -1: invalid move,
             0: draw, 1: player 1 wins, 2: player 2 wins.
    """
    # Guard clause: an occupied cell is an invalid move, board left untouched.
    if grid[index] != 0:
        return -1, grid

    # Agent's move.
    grid[index] = player
    result = ttt.gridScore(grid)
    if result == 1:
        return player, grid
    if result != -1:
        return 0, grid

    # Opponent's reply: an n-step-lookahead bot (depth 2) picks among the free cells.
    opponent = (player % 2) + 1
    free_cells = [(row, col) for row in range(3)
                  for col in range(3) if grid[row, col] == 0]
    reply = heuristicBot.nslAgent(2, grid, free_cells, opponent)
    grid[reply] = opponent
    result = ttt.gridScore(grid)
    if result == 1:
        return opponent, grid
    if result != -1:
        return 0, grid

    # Neither side finished the game.
    return -2, grid

In [117]:
# Custom environment
class TicTacToeEnv(gym.Env):
    """
    Gym environment in which the agent plays Tic-Tac-Toe as `self.player`.

    Each call to `step` plays the agent's move and, if the game is still
    running, the opponent's reply (both handled by `mark_piece`).

    Observation: the 3x3 board (0 = empty, 1 / 2 = player marks).
    Action: an integer in [0, 9) mapped to a (row, col) cell in row-major order.
    """

    def __init__(self):
        """
        Define action and observation space
        They must be gym.spaces objects
        """
        super(TicTacToeEnv, self).__init__()
        self.action_space = spaces.Discrete(9)
        self.observation_space = spaces.Box(low=0, high=2, shape=(3,3), dtype=int)
        self.reward_range = (-10, 1)
        # action id -> (row, col) board coordinate
        self.action_moves = [(i,j) for i in range(3) for j in range(3)]
        self.player = 1
        # Start with an empty board so step()/render() do not fail with
        # AttributeError when called before the first reset().
        self.grid = np.zeros((3, 3))

        # defined else stable baselines throws error
        self.spec = None
        self.metadata = None


    def reset(self):
        """
        Reset the environment to an empty board.

        Returns:
            The fresh 3x3 board (all zeros).
        """
        self.grid = np.zeros((3, 3))

        return self.grid


    def step(self, action):
        """
        Execute one time step within the environment.

        Args:
            action: integer in [0, 9) selecting the cell to mark.

        Returns:
            (observation, reward, done, info) where the reward is
                -10  for an invalid move (cell already taken),
                 +1  when player 1 wins,
                 -1  when player 2 wins,
                 1/9 when the game continues,
                  0  for a draw,
            and `done` is True for every outcome except "game continues".
        """
        score, self.grid = mark_piece(self.grid, self.action_moves[action], self.player)

        # reward
        reward = 0
        if score == -1:
            reward = -10
        elif score == 1:
            reward = 1
        elif score == 2:
            reward = -1
        elif score == -2:
            reward = 1/9

        # the episode ends on every outcome except "continue" (-2)
        done = score != -2

        # info for debugging
        info = {}

        return self.grid, reward, done, info


    def render(self, mode='human'):
        """
        Print the board, but only once the game is over.

        Args:
            mode: accepted for Gym API compatibility and otherwise unused.
                Defaults to 'human' so that a bare `env.render()` works.
        """
        if ttt.gameOver(self.grid, self.player, False)[0]:
            print('-----------------')
            ttt.showGrid(self.grid)
            print('-----------------')

In [118]:
# Instantiate the environment and read the sizes the network needs:
# the observation shape and the number of discrete actions.
env = TicTacToeEnv()
states, actions = env.observation_space.shape, env.action_space.n

In [89]:
# Display the observation shape and the number of actions.
states, actions

((3, 3), 9)

# 2. Dependencies

In [1]:
!pip install keras
!pip install keras-rl2
# !pip install tensorflow==2.3.0

Collecting keras
  Downloading keras-2.6.0-py2.py3-none-any.whl (1.3 MB)
Installing collected packages: keras
Successfully installed keras-2.6.0




Collecting keras-rl2
  Downloading keras_rl2-1.0.5-py3-none-any.whl (52 kB)
Collecting scipy==1.4.1
  Downloading scipy-1.4.1-cp37-cp37m-win_amd64.whl (30.9 MB)
Collecting tensorflow-estimator<2.4.0,>=2.3.0




  Downloading tensorflow_estimator-2.3.0-py2.py3-none-any.whl (459 kB)
Collecting numpy<1.19.0,>=1.16.0
  Downloading numpy-1.18.5-cp37-cp37m-win_amd64.whl (12.7 MB)
Installing collected packages: numpy, tensorflow-estimator, scipy, keras-rl2
  Attempting uninstall: numpy
    Found existing installation: numpy 1.20.3
    Uninstalling numpy-1.20.3:
      Successfully uninstalled numpy-1.20.3
  Attempting uninstall: tensorflow-estimator
    Found existing installation: tensorflow-estimator 2.5.0
    Uninstalling tensorflow-estimator-2.5.0:
      Successfully uninstalled tensorflow-estimator-2.5.0
  Attempting uninstall: scipy
    Found existing installation: scipy 1.6.2
    Uninstalling scipy-1.6.2:
      Successfully uninstalled scipy-1.6.2
Successfully installed keras-rl2-1.0.5 numpy-1.18.5 scipy-1.4.1 tensorflow-estimator-2.3.0


# Test the environment with random actions

In [100]:
# Smoke-test the environment: play a few episodes with uniformly random actions
# and print the accumulated reward of each one.
env = TicTacToeEnv()
episodes = 5
for episode in range(episodes):
    obs = env.reset()
    done = False
    score = 0

    while not done:
        # Pass the render mode explicitly; the environment's render() takes a `mode` argument.
        env.render(mode='human')
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        score += reward
    print(f'Episode:{episode+1}, Score:{score}')
env.close()

Episode:1, Score:-0.7777777777777778
Episode:2, Score:-9.88888888888889
Episode:3, Score:-9.88888888888889
Episode:4, Score:-0.7777777777777778
Episode:5, Score:-0.6666666666666667


# 3. Build a deep learning model with Keras

In [54]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam

In [110]:
def build_model(states, actions):
    """
    Build the fully connected network used by the agent.

    Args:
        states: observation shape; only states[0] and states[1] are used.
        actions: number of output units (one per action).

    Returns:
        An uncompiled Sequential model: Flatten -> Dense(32, relu) ->
        Dense(32, relu) -> Dense(actions, linear). The input shape carries
        a leading axis of size 1 in front of the observation shape.
    """
    return Sequential([
        Flatten(input_shape=(1, states[0], states[1])),
        Dense(32, activation='relu'),
        Dense(32, activation='relu'),
        Dense(actions, activation='linear'),
    ])

In [111]:
# Create the network for the environment's observation shape and action count.
model = build_model(states, actions)

In [112]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_12 (Flatten)         (None, 9)                 0         
_________________________________________________________________
dense_27 (Dense)             (None, 32)                320       
_________________________________________________________________
dense_28 (Dense)             (None, 32)                1056      
_________________________________________________________________
dense_29 (Dense)             (None, 9)                 297       
Total params: 1,673
Trainable params: 1,673
Non-trainable params: 0
_________________________________________________________________


# Build and train the agent with keras-rl

In [58]:
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

In [113]:
def build_agent(model, actions):
    """
    Wrap `model` in a keras-rl DQN agent.

    The agent uses a Boltzmann Q policy, a sequential replay memory
    (limit 50000, window length 1), 10 warm-up steps and a target model
    update setting of 1e-2.

    Args:
        model: Keras model that outputs one value per action.
        actions: number of discrete actions.

    Returns:
        The (not yet compiled) DQNAgent.
    """
    return DQNAgent(
        model=model,
        policy=BoltzmannQPolicy(),
        memory=SequentialMemory(limit=50000, window_length=1),
        nb_actions=actions,
        nb_steps_warmup=10,
        target_model_update=1e-2,
    )

In [114]:
dqn = build_agent(model, actions)
# `learning_rate` is the current name of the optimizer argument; `lr` is a deprecated alias.
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])
# Keep the returned History object instead of letting the cell echo its repr.
history = dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)

Training for 50000 steps ...
Interval 1 (0 steps performed)
2802 episodes - episode_reward: -0.922 [-9.889, 1.333] - loss: 0.480 - mae: 3.358 - mean_q: -0.179

Interval 2 (10000 steps performed)
2700 episodes - episode_reward: -0.419 [-9.889, 1.333] - loss: 0.093 - mae: 3.593 - mean_q: 0.031

Interval 3 (20000 steps performed)
2667 episodes - episode_reward: -0.377 [-9.889, 1.333] - loss: 0.073 - mae: 3.624 - mean_q: 0.058

Interval 4 (30000 steps performed)
2672 episodes - episode_reward: -0.400 [-9.889, 1.333] - loss: 0.067 - mae: 3.626 - mean_q: 0.078

Interval 5 (40000 steps performed)
done, took 924.909 seconds


<tensorflow.python.keras.callbacks.History at 0x17f66629f08>

# 4. Test and save the model

In [115]:
# Run 100 evaluation episodes without rendering and print the mean episode reward.
scores = dqn.test(env, nb_episodes=100, visualize=False)
print(np.mean(scores.history['episode_reward']))

Testing for 100 episodes ...
Episode 1: reward: 0.444, steps: 5
Episode 2: reward: 0.444, steps: 5
Episode 3: reward: 0.444, steps: 5
Episode 4: reward: 0.444, steps: 5
Episode 5: reward: 0.444, steps: 5
Episode 6: reward: 0.444, steps: 5
Episode 7: reward: -0.667, steps: 4
Episode 8: reward: 0.444, steps: 5
Episode 9: reward: 0.444, steps: 5
Episode 10: reward: 0.444, steps: 5
Episode 11: reward: 0.444, steps: 5
Episode 12: reward: 0.444, steps: 5
Episode 13: reward: 0.444, steps: 5
Episode 14: reward: 0.444, steps: 5
Episode 15: reward: -0.667, steps: 4
Episode 16: reward: 0.444, steps: 5
Episode 17: reward: 0.444, steps: 5
Episode 18: reward: 0.444, steps: 5
Episode 19: reward: 0.444, steps: 5
Episode 20: reward: 0.444, steps: 5
Episode 21: reward: 0.444, steps: 5
Episode 22: reward: 0.444, steps: 5
Episode 23: reward: 0.444, steps: 5
Episode 24: reward: 0.444, steps: 5
Episode 25: reward: 0.444, steps: 5
Episode 26: reward: 0.444, steps: 5
Episode 27: reward: 0.444, steps: 5
Episod

In [119]:
# Run 10 evaluation episodes with visualization enabled; the returned history is discarded.
_ = dqn.test(env, nb_episodes=10, visualize=True)

Testing for 10 episodes ...
-----------------
| X || O || O |
| O || X || X |
| X || X || O |
-----------------
Episode 1: reward: 0.444, steps: 5
-----------------
| O || O || X |
| O || X || _ |
| O || X || X |
-----------------
Episode 2: reward: -0.667, steps: 4
-----------------
| O || X || X |
| X || X || O |
| O || O || X |
-----------------
Episode 3: reward: 0.444, steps: 5
-----------------
| O || O || X |
| X || X || O |
| O || X || X |
-----------------
Episode 4: reward: 0.444, steps: 5
-----------------
| O || X || O |
| X || X || O |
| X || O || X |
-----------------
Episode 5: reward: 0.444, steps: 5
-----------------
| X || O || X |
| O || X || X |
| O || X || O |
-----------------
Episode 6: reward: 0.444, steps: 5
-----------------
| O || O || X |
| O || X || _ |
| O || X || X |
-----------------
Episode 7: reward: -0.667, steps: 4
-----------------
| O || O || X |
| X || X || O |
| O || X || X |
-----------------
Episode 8: reward: 0.444, steps: 5
-----------------


In [120]:
# Save the agent's weights to 'dqn_weights.h5f', overwriting any existing file.
dqn.save_weights('dqn_weights.h5f', overwrite=True)

In [121]:
# Drop the in-memory model, agent and environment; the next section rebuilds them from scratch.
del model, dqn, env

# 5. Load and play

In [124]:
# Recreate the environment, network and agent so the saved weights can be loaded into them.
env = TicTacToeEnv()
actions = env.action_space.n
# build_model indexes states[0] and states[1], so pass the full shape tuple,
# not just its first element.
states = env.observation_space.shape
model = build_model(states, actions)
dqn = build_agent(model, actions)
# `learning_rate` is the current name of the optimizer argument; `lr` is a deprecated alias.
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])

In [125]:
# Load the weights stored in 'dqn_weights.h5f' into the rebuilt agent.
dqn.load_weights('dqn_weights.h5f')

In [126]:
# Run 5 evaluation episodes with the reloaded agent and visualization enabled; the returned history is discarded.
_ = dqn.test(env, nb_episodes=5, visualize=True)

Testing for 5 episodes ...
-----------------
| X || O || X |
| X || X || O |
| O || X || O |
-----------------
Episode 1: reward: 0.444, steps: 5
-----------------
| X || O || X |
| X || X || O |
| O || X || O |
-----------------
Episode 2: reward: 0.444, steps: 5
-----------------
| O || O || O |
| O || X || X |
| X || _ || X |
-----------------
Episode 3: reward: -0.667, steps: 4
-----------------
| O || X || O |
| O || X || X |
| X || O || X |
-----------------
Episode 4: reward: 0.444, steps: 5
-----------------
| O || X || X |
| X || X || O |
| O || O || X |
-----------------
Episode 5: reward: 0.444, steps: 5
