Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: Sunday, December 17 ([CET](https://www.timeanddate.com/time/zones/cet))
* Reviews: Dies Natalis Solis Invicti ([CET](https://en.wikipedia.org/wiki/Sol_Invictus))

Notes:

* Reviews will be assigned  on Monday, December 4
* You need to commit in order to be selected as a reviewer (ie. better to commit an empty work than not to commit)

In [1]:
import numpy as np
import pickle
from abc import ABC, abstractmethod
from copy import deepcopy
from itertools import combinations
from random import randint, random
from tqdm import trange

In [2]:
class TicTacToe:
    def __init__(self) -> None:
        # define the board game
        self._board = np.ones((3, 3), dtype=np.uint8) * -1
        # define a board to ease the check winner computation
        self._eqv_board = np.array([[1, 6, 5], [8, 4, 0], [3, 2, 7]], dtype=np.uint8)
        # define a mapping for pretty printing
        self._id_to_block = {-1: '⬜️', 0: '❌', 1: '⭕️'}

    @property
    def board(self) -> np.ndarray:
        # return a copy of the board so that the board cannot be modified from outside
        return deepcopy(self._board)

    def print(self):
        # define a board for pretty printing
        fancy_board = np.chararray(self._board.shape, itemsize=1, unicode=True)
        for i in range(fancy_board.shape[0]):
            for j in range(fancy_board.shape[1]):
                # fill the fancy board
                fancy_board[(i, j)] = self._id_to_block[self._board[(i, j)]]
        print(fancy_board)

    def check_winner(self) -> int:
        # take the tiles belonging to the first player
        player1_tiles = self._board == 0
        # take the tiles belonging to the second player
        player2_tiles = self._board == 1
        # check if the first player has won
        if any(sum(h) == 12 for h in combinations(self._eqv_board[player1_tiles], 3)):
            return 0
        # check if the second player has won
        if any(sum(h) == 12 for h in combinations(self._eqv_board[player2_tiles], 3)):
            return 1
        # no player has won
        return -1

    def is_still_playable(self):
        # check if still there are not taken tiles
        return any((self._board == -1).flatten())

    def move(self, move: tuple[int, int], player_id: int) -> bool:
        # if the player id is not valid
        if player_id >= 2 or player_id <= -1:
            return False
        # check if the move is acceptable
        acceptable = self.is_acceptable(move)
        # if it is
        if acceptable:
            # update the board
            self._board[move] = player_id
        return acceptable

    def is_acceptable(self, move: tuple[int, int]):
        acceptable: bool = move[0] >= 0 and move[0] <= 3 and move[1] >= 0 and move[1] <= 3 and self._board[move] < 0
        return acceptable

In [3]:
def play(game: 'TicTacToe', player1: 'Player', player2: 'Player') -> int:
    # define the players
    players = [player1, player2]
    # set the moving player index
    current_player_idx = 1
    # define a variable to indicate if there is a winner
    winner = -1
    # if we can still play
    while winner < 0 and game.is_still_playable():
        # update the current moving player index
        current_player_idx += 1
        current_player_idx %= len(players)
        # define a variable to check if the chosen move is ok or not
        ok = False
        # while the chosen move is not ok
        while not ok:
            # let the current player make a move
            move = players[current_player_idx].make_move(game, current_player_idx)
            # check if now it is ok
            ok = game.move(move, current_player_idx)
        # check if there is a winner
        winner = game.check_winner()
    # return the winner
    return winner

In [4]:
def show_statistics(player_id: int, player1: 'Player', player2: 'Player', n_matches: int = 1_000):
    counter_wins = 0
    counter_losses = 0
    counter_draws = 0
    for _ in range(1_000):
        game = TicTacToe()
        winner = play(game, player1, player2)
        counter_wins = counter_wins + 1 if winner == player_id else counter_wins
        counter_losses = counter_losses + 1 if winner == (player_id + 1) % 2 else counter_losses
        counter_draws = counter_draws + 1 if winner == -1 else counter_draws
    print(f'Over {n_matches} matches: {counter_wins} wins, {counter_losses} losses and {counter_draws} draw')
    print(f'Wins + Draws percentage: {(counter_wins + counter_draws) / n_matches:.2%}')

In [5]:
class Player(ABC):
    def __init__(self) -> None:
        pass

    @abstractmethod
    def make_move(self, game: 'TicTacToe', player_id: int) -> tuple[int, int]:
        pass

In [6]:
class RandomPlayer(Player):
    def __init__(self) -> None:
        super().__init__()

    def make_move(self, game: TicTacToe, player_id: int) -> tuple[int, int]:
        # return a random move
        return (randint(0, game.board.shape[0] - 1), randint(0, game.board.shape[1] - 1))

## Reinforcement Learning: Q-learning

In [7]:
class QLearningRLPlayer(Player):
    def __init__(
        self,
        n_episodes: int,
        alpha: float,
        gamma: float,
        min_exploration_rate: float,
        exploration_decay_rate: float,
        opponent: 'Player',
    ) -> None:
        super().__init__()
        self._q_table = {}  # define the Action-value function
        self._n_episodes = n_episodes  # define the number of episodes for the training phase
        self._alpha = alpha  # define how much information to incorporate from the new experience
        self._gamma = gamma  # define the discount rate of the Bellman equation
        self._exploration_rate = 1  # define the exploration rate for the training phase
        self._min_exploration_rate = (
            min_exploration_rate  # define the minimum rate for exploration during the training phase
        )
        self._exploration_decay_rate = (
            exploration_decay_rate  # define the exploration decay rate used during the training
        )
        self._opponent = opponent  # define the opponent to play against

    def _move_reward(self, game: 'TicTacToe', move: tuple[int, int], player_id: int) -> int:
        # play a move
        acceptable = game.move(move, player_id)
        # give a negative reward to the agent
        reward = -1
        # if the move is acceptable
        if acceptable:
            # give a positive reward to the agent
            reward = 1
        return reward, acceptable

    def _game_reward(self, player: 'TicTacToe', winner: int) -> int:
        # if there was no winner
        if winner == -1:
            # return no reward
            return 0
        # if the agent is the winner
        elif self == player:
            # give a big positive reward
            return 10
        # give a big negative reward, otherwise
        return -10

    def _map_state_to_index(self, game: 'TicTacToe', player_id: int) -> int:
        # take the current game state
        state = game.board
        # change not taken tiles values to 2
        state[state == -1] = 2
        # map the state to a string in base 3
        state_repr_index = ''.join(str(_) for _ in state.flatten()) + str(player_id)
        return state_repr_index

    def _update_q_table(self, state_repr_index: str, new_state_repr_index: str, action: int, reward: int) -> None:
        # if the current state is unknown
        if state_repr_index not in self._q_table:
            # create its entry in the action-value mapping table
            self._q_table[state_repr_index] = np.zeros((9,))
        # if the next state is unknown
        if new_state_repr_index not in self._q_table:
            # create its entry in the action-value mapping table
            self._q_table[new_state_repr_index] = np.zeros((9,))
        prev_value = self._q_table[state_repr_index][action]
        # update the action-value mapping entry for the current state using Q-learning
        self._q_table[state_repr_index][action] = (1 - self._alpha) * prev_value + self._alpha * (
            reward + self._gamma * np.max(self._q_table[new_state_repr_index])
        )

    def _make_move(self, game: 'TicTacToe', player_id: int) -> tuple[int, int]:
        # get the current state representation
        state_repr_index = self._map_state_to_index(game, player_id)

        # randomly perform exploration
        if random() < self._exploration_rate:
            # by returning a random move
            move = randint(0, 8)
        # perform eploitation, otherwise
        else:
            # if the current state is unknown
            if state_repr_index not in self._q_table:
                # create its entry in the action-value mapping table
                self._q_table[state_repr_index] = np.zeros((9,))
            # take the action with maximum return of rewards
            move = np.argmax(self._q_table[state_repr_index])

        # reshape the move to match the board shape
        move = move // 3, move % 3

        return move

    def make_move(self, game: 'TicTacToe', player_id: int) -> tuple[int, int]:
        # get the current state representation
        state_repr_index = self._map_state_to_index(game, player_id)
        # if the current state is known
        if state_repr_index in self._q_table:
            # take the action with maximum return of rewards
            move = np.argmax(self._q_table[state_repr_index])
            # reshape the move to match the board shape
            move = move // 3, move % 3
            # if the move is acceptable
            if game.is_acceptable(move):
                # return it
                return move
        # perform a random move, otherwise
        return (randint(0, game.board.shape[0] - 1), randint(0, game.board.shape[1] - 1))

    def train(self) -> None:
        # define the history of rewards
        all_rewards = []
        # define how many episodes to run
        pbar = trange(self._n_episodes)
        # define the players
        players = (self, self._opponent)
        # for each episode
        for episode in pbar:
            # define a new game
            game = TicTacToe()
            # sets the rewards to zero
            rewards = 0

            # define a variable to indicate if there is a winner
            winner = -1
            # change players order
            players = (players[1], players[0])
            # define the current player index
            player_idx = 1

            # if we can still play
            while winner < 0 and game.is_still_playable():
                # change player
                player_idx = (player_idx + 1) % 2
                player = players[player_idx]

                # define a variable to check if the chosen move is ok or not
                ok = False
                # if it is our turn
                if self == player:
                    # while the chosen move is not ok
                    while not ok:
                        # get the current state representation
                        state_repr_index = self._map_state_to_index(game, player_idx)
                        # get a move
                        move = self._make_move(game, player_idx)
                        # reshape the move to form an index
                        action = move[0] * 3 + move[1]
                        # perform the move and get the reward
                        reward, ok = self._move_reward(game, move, player_idx)
                        # get the next state representation
                        new_state_repr_index = self._map_state_to_index(game, player_idx)

                        # update the action-value function
                        self._update_q_table(state_repr_index, new_state_repr_index, action, reward)

                        # update the rewards
                        rewards += reward
                # if it is the opponent turn
                else:
                    # while the chosen move is not ok
                    while not ok:
                        # get a move
                        move = player.make_move(game, player_idx)
                        # perform the move
                        ok = game.move(move, player_idx)

                # check if there is a winner
                winner = game.check_winner()

            # update the exploration rate
            self._exploration_rate = np.clip(
                np.exp(-self._exploration_decay_rate * episode), self._min_exploration_rate, 1
            )
            # get the game reward
            reward = self._game_reward(player, winner)
            # update the action-value function
            self._update_q_table(state_repr_index, new_state_repr_index, action, reward)
            # update the rewards
            rewards += reward
            # update the rewards history
            all_rewards.append(rewards)
            pbar.set_description(f'rewards value: {rewards}, current exploration rate: {self._exploration_rate:2f}')

        print(f'** Last 1_000 episodes - Mean rewards value: {sum(all_rewards[-1_000:]) / 1_000:.2f} **')
        print(f'** Last rewards value: {all_rewards[-1]:} **')

In [8]:
q_learning_rl_agent = QLearningRLPlayer(
    n_episodes=100_000,
    alpha=0.1,
    gamma=0.99,
    min_exploration_rate=0.01,
    exploration_decay_rate=2.5e-5,
    opponent=RandomPlayer(),
)
q_learning_rl_agent.train()

rewards value: 14, current exploration rate: 0.082087: 100%|██████████| 100000/100000 [01:35<00:00, 1043.01it/s] 

** Last 1_000 episodes - Mean rewards value: 10.25 **
** Last rewards value: 14 **





In [9]:
len(q_learning_rl_agent._q_table.keys())

9977

In [10]:
with open('./q_learning_rl_agent.pkl', 'wb') as f:
    pickle.dump(q_learning_rl_agent, f)

In [11]:
with open('./q_learning_rl_agent.pkl', 'rb') as f:
    q_learning_rl_agent = pickle.load(f)

In [12]:
game = TicTacToe()
game.print()
player1 = RandomPlayer()
player2 = q_learning_rl_agent
winner = play(game, player1, player2)
game.print()
if winner == -1:
    print(f"Draw!")
else:
    print(f"Winner: Player {winner}")

[['⬜' '⬜' '⬜']
 ['⬜' '⬜' '⬜']
 ['⬜' '⬜' '⬜']]
[['❌' '❌' '❌']
 ['⬜' '❌' '⭕']
 ['⬜' '⭕' '⭕']]
Winner: Player 0


In [13]:
show_statistics(0, q_learning_rl_agent, RandomPlayer())

Over 1000 matches: 899 wins, 22 losses and 79 draw
Wins + Draws percentage: 97.80%


In [14]:
show_statistics(1, RandomPlayer(), q_learning_rl_agent)

Over 1000 matches: 655 wins, 68 losses and 277 draw
Wins + Draws percentage: 93.20%


## Reinforcement Learning: Monte Carlo learning

In [15]:
class MonteCarloRLPlayer(Player):
    def __init__(
        self,
        n_episodes: int,
        gamma: float,
        min_exploration_rate: float,
        exploration_decay_rate: float,
        opponent: 'Player',
    ) -> None:
        super().__init__()
        self._q_table = {}  # define the Action-value function
        self._q_counters = {}
        self._n_episodes = n_episodes  # define the number of episodes for the training phase
        self._gamma = gamma  # define the discount rate of the Bellman equation
        self._exploration_rate = 1  # define the exploration rate for the training phase
        self._min_exploration_rate = (
            min_exploration_rate  # define the minimum rate for exploration during the training phase
        )
        self._exploration_decay_rate = (
            exploration_decay_rate  # define the exploration decay rate used during the training
        )
        self._opponent = opponent  # define the opponent to play against

    def _move_reward(self, game: 'TicTacToe', move: tuple[int, int], player_id: int) -> int:
        # play a move
        acceptable = game.move(move, player_id)
        # give a negative reward to the agent
        reward = -1
        # if the move is acceptable
        if acceptable:
            # give a positive reward to the agent
            reward = 1
        return reward, acceptable

    def _game_reward(self, player: 'TicTacToe', winner: int) -> int:
        # if there was no winner
        if winner == -1:
            # return no reward
            return 0
        # if the agent is the winner
        elif self == player:
            # give a big positive reward
            return 10
        # give a big negative reward, otherwise
        return -10

    def _map_state_to_index(self, game: 'TicTacToe', player_id: int) -> int:
        # take the current game state
        state = game.board
        # change not taken tiles values to 2
        state[state == -1] = 2
        # map the state to a string in base 3
        state_repr_index = ''.join(str(_) for _ in state.flatten()) + str(player_id)
        return state_repr_index

    def _update_q_table(self, state_repr_index: str, action: int, return_of_rewards: float) -> None:
        # if the current state is unknown
        if state_repr_index not in self._q_counters:
            # create its entry in the action-value mapping table
            self._q_table[state_repr_index] = np.zeros((9,))
            self._q_counters[state_repr_index] = np.zeros((9,))
        self._q_counters[state_repr_index][action] += 1
        self._q_table[state_repr_index][action] = (
            self._q_table[state_repr_index][action]
            + (return_of_rewards - self._q_table[state_repr_index][action]) / self._q_counters[state_repr_index][action]
        )

    def _make_move(self, game: 'TicTacToe', player_id: int) -> tuple[int, int]:
        # get the current state representation
        state_repr_index = self._map_state_to_index(game, player_id)

        # randomly perform exploration
        if random() < self._exploration_rate:
            # by returning a random move
            move = randint(0, 8)
        # perform eploitation, otherwise
        else:
            # if the current state is unknown
            if state_repr_index not in self._q_table:
                # create its entry in the action-value mapping table
                self._q_table[state_repr_index] = np.zeros((9,))
                self._q_counters[state_repr_index] = np.zeros((9,))
            # take the action with maximum return of rewards
            move = np.argmax(self._q_table[state_repr_index])

        # reshape the move to match the board shape
        move = move // 3, move % 3

        return move

    def make_move(self, game: 'TicTacToe', player_id: int) -> tuple[int, int]:
        # get the current state representation
        state_repr_index = self._map_state_to_index(game, player_id)
        # if the current state is known
        if state_repr_index in self._q_table:
            # take the action with maximum return of rewards
            move = np.argmax(self._q_table[state_repr_index])
            # reshape the move to match the board shape
            move = move // 3, move % 3
            # if the move is acceptable
            if game.is_acceptable(move):
                # return it
                return move
        # perform a random move, otherwise
        return (randint(0, game.board.shape[0] - 1), randint(0, game.board.shape[1] - 1))

    def train(self) -> None:
        # define the history of rewards
        all_rewards = []
        # define how many episodes to run
        pbar = trange(self._n_episodes)
        # define the players
        players = (self, self._opponent)

        # for each episode
        for episode in pbar:
            # define a new game
            game = TicTacToe()
            # sets the rewards to zero
            rewards = 0

            # define the trajectory
            trajectory = []

            # define a variable to indicate if there is a winner
            winner = -1
            # change players order
            players = (players[1], players[0])
            # define the current player index
            player_idx = 1

            # if we can still play
            while winner < 0 and game.is_still_playable():
                # change player
                player_idx = (player_idx + 1) % 2
                player = players[player_idx]

                # define a variable to check if the chosen move is ok or not
                ok = False
                # if it is our turn
                if self == player:
                    # while the chosen move is not ok
                    while not ok:
                        # get the current state representation
                        state_repr_index = self._map_state_to_index(game, player_idx)
                        # get a move
                        move = self._make_move(game, player_idx)
                        # reshape the move to form an index
                        action = move[0] * 3 + move[1]
                        # perform the move and get the reward
                        reward, ok = self._move_reward(game, move, player_idx)

                        # update the trajectory
                        trajectory.append((state_repr_index, action, reward))

                        # update the rewards
                        rewards += reward
                # if it is the opponent turn
                else:
                    # while the chosen move is not ok
                    while not ok:
                        # get a move
                        move = player.make_move(game, player_idx)
                        # perform the move
                        ok = game.move(move, player_idx)

                # check if there is a winner
                winner = game.check_winner()

            # update the exploration rate
            self._exploration_rate = np.clip(
                np.exp(-self._exploration_decay_rate * episode), self._min_exploration_rate, 1
            )
            # delete last reward
            rewards -= reward
            # delete last tuple in trajectory
            trajectory.pop()
            # get the game reward
            reward = self._game_reward(player, winner)
            # update the trajectory
            trajectory.append((state_repr_index, action, reward))
            # update the rewards
            rewards += reward
            # update the rewards history
            all_rewards.append(rewards)

            # set the current return of rewards
            return_of_rewards = 0
            # for all tuples in trajectory
            for state_repr_index, action, reward in trajectory:
                # update the return of rewards
                return_of_rewards = reward + self._gamma * return_of_rewards
                # update the action-value function
                self._update_q_table(state_repr_index, action, return_of_rewards)

            pbar.set_description(f'rewards value: {rewards}, current exploration rate: {self._exploration_rate:2f}')

        print(f'** Last 1_000 episodes - Mean rewards value: {sum(all_rewards[-1_000:]) / 1_000:.2f} **')
        print(f'** Last rewards value: {all_rewards[-1]:} **')

In [16]:
monte_carlo_rl_agent = MonteCarloRLPlayer(
    n_episodes=100_000,
    gamma=0.99,
    min_exploration_rate=0.01,
    exploration_decay_rate=2.5e-5,
    opponent=RandomPlayer(),
)
monte_carlo_rl_agent.train()

rewards value: 12, current exploration rate: 0.082087: 100%|██████████| 100000/100000 [01:30<00:00, 1109.05it/s] 

** Last 1_000 episodes - Mean rewards value: 7.98 **
** Last rewards value: 12 **





In [26]:
len(monte_carlo_rl_agent._q_table.keys())

4519

In [18]:
with open('./monte_carlo_rl_agent.pkl', 'wb') as f:
    pickle.dump(monte_carlo_rl_agent, f)

In [19]:
with open('./monte_carlo_rl_agent.pkl', 'rb') as f:
    monte_carlo_rl_agent = pickle.load(f)

In [20]:
game = TicTacToe()
game.print()
player1 = RandomPlayer()
player2 = monte_carlo_rl_agent
winner = play(game, player1, player2)
game.print()
if winner == -1:
    print(f"Draw!")
else:
    print(f"Winner: Player {winner}")

[['⬜' '⬜' '⬜']
 ['⬜' '⬜' '⬜']
 ['⬜' '⬜' '⬜']]
[['⬜' '⭕' '⬜']
 ['❌' '⭕' '❌']
 ['⬜' '⭕' '❌']]
Winner: Player 1


In [21]:
show_statistics(0, monte_carlo_rl_agent, RandomPlayer())

Over 1000 matches: 946 wins, 13 losses and 41 draw
Wins + Draws percentage: 98.70%


In [22]:
show_statistics(1, RandomPlayer(), monte_carlo_rl_agent)

Over 1000 matches: 685 wins, 67 losses and 248 draw
Wins + Draws percentage: 93.30%


In [23]:
show_statistics(0, monte_carlo_rl_agent, q_learning_rl_agent)

Over 1000 matches: 0 wins, 0 losses and 1000 draw
Wins + Draws percentage: 100.00%


In [24]:
show_statistics(0, q_learning_rl_agent, monte_carlo_rl_agent)

Over 1000 matches: 1000 wins, 0 losses and 0 draw
Wins + Draws percentage: 100.00%


In [27]:
show_statistics(1, q_learning_rl_agent, monte_carlo_rl_agent)

Over 1000 matches: 0 wins, 1000 losses and 0 draw
Wins + Draws percentage: 0.00%
