Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

In [12]:
from abc import ABC, abstractmethod
from itertools import combinations, product
from collections import namedtuple, defaultdict
from random import choice
from copy import deepcopy

from tqdm.auto import tqdm
import numpy as np

from tqdm import trange

import random

In [13]:
State = namedtuple('State', ['x', 'o'])

In [14]:
MAGIC = [2, 7, 6, 9, 5, 1, 4, 3, 8]

In [15]:
class Player(ABC):
    @abstractmethod
    def make_move(self, game):
        pass


In [16]:
class Game():

    def __init__(self) -> None:
        self._board = np.array([[1, 6, 5], [8, 4, 0], [3, 2, 7]])  # magic board that sum 12
        self._o_cells = set()  # start the game  (player_id = 0)
        self._x_cells = set()  # play for second (player_id = 1)

    def get_state(self) -> tuple[frozenset, frozenset]:
        return (frozenset(self._o_cells), frozenset(self._x_cells))

    def draw(self) -> bool:
        return ((len(self._x_cells) + len(self._o_cells)) == 9) and self.check_winner() == -1

    def won(self, cells) -> bool:
        #Check if it works with the magic sqaure rule
        return any(sum(h) == 12 for h in combinations(cells, 3))

    def check_winner(self) -> int:
        if self.won(self._o_cells):
            return 0
        elif self.won(self._x_cells):
            return 1
        return -1

    def print(self) -> None:
        #Print the board
        for r in range(self._board.shape[0]):
            print('-------------')
            out = '| '
            for c in range(self._board.shape[1]):
                if self._board[r, c] in self._x_cells:
                   token = 'x'
                elif self._board[r, c] in self._o_cells:
                    token = 'o'
                else:
                   token = ' '
                out += token + ' | '
            print(out)
        print('-------------')

    def play(self, player1: Player, player2: Player, QSecond=False) -> int:
        #Play the game
        game = self
        players = [player1, player2]
        if QSecond:
            current_player_idx = 1
        else:
            current_player_idx = 0
        winner = -1
        draw = False
        while winner < 0 and not draw:
            current_player_idx += 1
            current_player_idx %= len(players)
            ok = False
            while not ok:
                pos = players[current_player_idx].make_move(game)
                ok = self.move(pos, current_player_idx)
            winner = self.check_winner()
            draw = self.draw()
            if False:
                self.print()
        return winner

    def move(self, pos: tuple[int, int], player_id: int) -> bool:
        '''Perform a move'''
        if player_id > 1:
            return False
        acceptable: bool = self.valid_move(pos)
        if acceptable:
            # put the player id in the piece
            if player_id == 0:
                self._o_cells.add(self._board[pos])
            elif player_id == 1:
                self._x_cells.add(self._board[pos])
        return acceptable
    
    def valid_move(self, pos: tuple[int, int]) -> bool:
        '''Check if the move is valid'''
        return self._board[pos] not in (self._x_cells | self._o_cells)
    
    def get_possible_moves(self):
        all_moves = product([0, 1, 2], repeat=2)
        legal_moves = []
        for move in all_moves:
            if self.valid_move(move):
                legal_moves.append(move)
        return legal_moves

In [17]:
class RandomPlayer(Player):

    def make_move(self, game) -> tuple[int, int]:
        # choose random position (row,col)
        pos = (random.randint(0, 2), random.randint(0, 2))
        return pos

In [18]:
class QPlayer(Player):
    def __init__(self, learning_rate=0.25, discount_factor=0.9, exploration_prob=0.3):
        self.q_values = {}  # Dictionary to store Q-values for state-action pairs
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_prob = exploration_prob
        self.last_state = None
        self.last_action = None

    def get_legal_moves(self, game: Game):
            # Get legal moves for the current state in the game
            legal_moves = [pos for pos in game.get_possible_moves() if game.valid_move(pos)]
            return legal_moves

    def make_move(self, game):
        state = game.get_state()
        legal_moves = self.get_legal_moves(game)

        # Choose an action using epsilon-greedy strategy
        if np.random.rand() < self.exploration_prob:
            action = choice(legal_moves)
        else:
            action = self.get_best_action(state, legal_moves)

        # Store the state-action pair for updating Q-values in the next turn
        self.last_state = state
        self.last_action = action

        return action

    def get_best_action(self, state, legal_moves):
        # Choose the best action based on Q-values or a random action if not seen before
        if state not in self.q_values:
            return choice(legal_moves)
        q_values_for_state = self.q_values[state]
        best_action = max(legal_moves, key=lambda action: q_values_for_state.get(action, 0))
        return best_action

    def update_q_values(self, reward, game: Game):
        # Update Q-values based on the reward and the transition to the next state
        if self.last_state not in self.q_values:
            self.q_values[self.last_state] = {}
        old_q_value = self.q_values[self.last_state].get(self.last_action, 0)
        next_state = game.get_state()
        max_next_q_value = max(self.q_values.get(next_state, {}).values(), default=0)
        new_q_value = old_q_value + self.learning_rate * (reward + self.discount_factor * max_next_q_value - old_q_value)
        self.q_values[self.last_state][self.last_action] = new_q_value

    def training(self) -> None:
        # Play multiple episodes/games to train the Q-learning player
        num_episodes = 100000

        for episode in range(num_episodes):
            # Reset the game for a new episode
            game = Game()
            reward = 0

            while not game.check_winner():
                # Make a move using epsilon-greedy strategy
                winner = self.make_move(game)

            # Apply the selected action and get the reward
                if winner == -1:
                    # 0 for draw
                    reward = 0
                elif winner == 0:
                    # 10 for win
                    reward = 10
                else:
                    # -10 for lose
                    reward = -10

            # Update Q-values based on the reward and the transition to the next state
            self.update_q_values(reward, game)

        print("Training completed.")
    


In [19]:
def test(player_0, player_1, num_games=1_000):
    #Test on 1000 games
    wins, draws, loses = 0, 0, 0

    for _ in range(num_games):
        g = Game()

        winner = g.play(player_0, player_1)
        wins += 1 if winner == 1 else 0
        loses += 1 if winner == 0 else 0
        draws += 1 if winner == -1 else 0

    print("Wins:", wins)
    print("Loses:", loses)
    print("Draws:", draws)

In [20]:
qlplayer = QPlayer()
qlplayer.training()
test(qlplayer, RandomPlayer())
print()

Training completed.
Wins: 580
Loses: 277
Draws: 143



In [21]:
def single_match(player1, player2):
    #Single match
    g = Game()
    winner = g.play(player1, player2, QSecond=False)
    g.print()
    if winner == -1:
        print("Game endend in Draw.")
    else:
        print(f"Winner: Player {winner}")

In [22]:
qlplayer = QPlayer()
randomplayer = RandomPlayer()
qlplayer.training()
single_match(qlplayer, randomplayer)

Training completed.
-------------
| x | o | x | 
-------------
| x | o | o | 
-------------
| x | x | o | 
-------------
Winner: Player 1
