In [4]:
import random
from game import Game, Move, Player
import numpy as np
from collections import defaultdict
from tqdm.auto import tqdm
import numpy as np
from scipy.sparse import lil_matrix
import pickle
from copy import deepcopy
from itertools import product

  from .autonotebook import tqdm as notebook_tqdm


### Symmetries


In [5]:
# Index permutations of the flattened 5x5 board: ``flat_board[perm]`` yields the
# transformed board. They are derived from a grid of cell ids instead of being
# typed out by hand.
_cell_ids = np.arange(25).reshape(5, 5)

# Quarter-turn rotations (np.rot90 turns counter-clockwise).
shift90 = np.rot90(_cell_ids, 1).ravel()
shift180 = np.rot90(_cell_ids, 2).ravel()
shift270 = np.rot90(_cell_ids, 3).ravel()

# Left/right reflection, alone and composed with each rotation.
mirror = np.fliplr(_cell_ids).ravel()
m_shift90, m_shift180, m_shift270 = (mirror[rotation] for rotation in (shift90, shift180, shift270))

# Every non-identity symmetry of the square; the list order is relied upon by index.
SYMMETRIES = [shift90, shift180, shift270, m_shift90, m_shift180, m_shift270, mirror]
# Slide directions in cyclic order TOP -> LEFT -> BOTTOM -> RIGHT: each entry is
# the previous one turned a quarter turn counter-clockwise.
MOVES_SHIFT = [Move.TOP,Move.LEFT,Move.BOTTOM,Move.RIGHT]

### Game class that uses symmetries

In [8]:
class LearningGame(Game):
    """Game variant whose board can be set directly and whose moves are public.

    A "state" is the hashable pair ``((Xs, Os), player)`` where ``Xs`` and ``Os``
    are 25-element boolean tuples marking the cells equal to 0 and 1 on the
    flattened board, and ``player`` is the id of the player to move.
    """

    def __init__(self):
        super().__init__()

    def set_state(self, state):
        """Replace the board with the one encoded in ``state`` (the player part is ignored)."""
        board, _ = LearningGame.board_from_state(state)
        self._board = board

    def set_board(self, board):
        """Use ``board`` as the current board; the array is stored as is, not copied."""
        self._board = board

    def state_from_board(self, player: int):
        """Encode the current board and ``player`` as a hashable state key."""
        flat = self.get_board().flatten()
        return (tuple(flat == 0), tuple(flat == 1)), player

    @staticmethod
    def board_from_state(state):
        """Decode a state key into ``(board, player)`` with a 5x5 board.

        Empty cells are -1, cells flagged in ``Xs`` are 0 and cells flagged in
        ``Os`` are 1.
        """
        (Xs, Os), player = state
        # An explicit signed dtype: multiplying a uint8 array by -1 only worked
        # through legacy value-based promotion (which produced int16).
        board = np.full(25, -1, dtype=np.int16)
        board[np.asarray(Xs, dtype=bool)] = 0
        board[np.asarray(Os, dtype=bool)] = 1
        return board.reshape(5, 5), player

    def move(self, from_pos: tuple[int, int], slide: Move, player_id: int) -> bool:
        '''Perform a move.

        ``from_pos`` is given as (column, row) and is swapped to (row, column)
        before indexing the board. Returns True when the move was applied; when
        the slide is rejected the taken cell gets its previous value back.
        '''
        if player_id not in (0, 1):
            return False
        cell = (from_pos[1], from_pos[0])
        previous = self._board[cell]  # scalar read, so this is already a copy
        if not self.take(cell, player_id):
            return False
        if not self.slide(cell, slide):
            self._board[cell] = previous  # restore previous
            return False
        return True

    def take(self, from_pos: tuple[int, int], player_id: int) -> bool:
        """Checks that {from_pos} is in the border and marks the cell with {player_id}"""
        row, col = from_pos
        if row not in (0, 4) and col not in (0, 4):
            return False  # the cell is not in the border
        if self._board[from_pos] not in (player_id, -1):
            return False  # the cell belongs to the opponent
        self._board[from_pos] = player_id
        return True

    @staticmethod
    def acceptable_slides(from_position: tuple[int, int]):
        """When taking a piece from {from_position} returns the possible moves (slides)"""
        acceptable_slides = [Move.BOTTOM, Move.TOP, Move.LEFT, Move.RIGHT]
        axis_0 = from_position[0]    # axis_0 = 0 means uppermost row
        axis_1 = from_position[1]    # axis_1 = 0 means leftmost column

        if axis_0 == 0:  # can't move upwards if in the top row...
            acceptable_slides.remove(Move.TOP)
        elif axis_0 == 4:
            acceptable_slides.remove(Move.BOTTOM)

        if axis_1 == 0:
            acceptable_slides.remove(Move.LEFT)
        elif axis_1 == 4:
            acceptable_slides.remove(Move.RIGHT)
        return acceptable_slides

    def slide(self, from_pos: tuple[int, int], slide: Move) -> bool:
        '''Slide the other pieces.

        ``from_pos`` is (row, column). Returns False when ``slide`` is not one
        of ``acceptable_slides(from_pos)``, True otherwise.
        '''
        if slide not in self.acceptable_slides(from_pos):
            return False  # consider raise ValueError('Invalid argument value')
        axis_0, axis_1 = from_pos
        # np.roll performs a rotation of the element of a 1D ndarray.
        # NOTE(review): the whole row/column is rotated, which equals a slide only
        # when the taken piece sits at the far end of that line. Presumably this
        # mirrors the engine in game.py -- confirm before changing it.
        if slide == Move.RIGHT:
            self._board[axis_0] = np.roll(self._board[axis_0], -1)
        elif slide == Move.LEFT:
            self._board[axis_0] = np.roll(self._board[axis_0], 1)
        elif slide == Move.BOTTOM:
            self._board[:, axis_1] = np.roll(self._board[:, axis_1], -1)
        elif slide == Move.TOP:
            self._board[:, axis_1] = np.roll(self._board[:, axis_1], 1)
        return True

    def get_available_moves(self,player: int) -> list[tuple[tuple[int, int], Move]]:
        """List the moves ``player`` can make as ``((column, row), slide)`` pairs.

        Border cells that are images of an already expanded cell under a symmetry
        leaving the current board unchanged are skipped.
        """
        available_moves = []
        l = [0,1,2,3,4]
        l2 = [0,4]
        p1 = list(product(l, l2))
        p2 = list(product(l2, l))
        # Rebuilt on every call: the pop order below fixes the order of the result.
        positions = set(p1) | set(p2)

        # Symmetries under which the current board is invariant.
        original_board = self.get_board().flatten()
        syms = [sym for sym in SYMMETRIES if (original_board == original_board[sym]).all()]

        while positions:
            row, col = positions.pop()
            if self._board[row, col] != -1 and self._board[row, col] != player:
                continue  # opponent's piece: cannot be taken
            for move in self.acceptable_slides((row, col)):
                available_moves.append(((col, row), move))
            # Drop the symmetric twins of this cell from the cells still to visit.
            index = row * 5 + col
            for sym in syms:
                idx_pos = sym[index]
                positions.discard((idx_pos // 5, idx_pos % 5))
        return available_moves


In [9]:

class RandomPlayer(Player):
    """Baseline player that proposes a random cell and a random slide direction."""

    def __init__(self) -> None:
        super().__init__()

    def make_move(self, game: 'Game') -> tuple[tuple[int, int], Move]:
        """Return a random ``(cell, slide)`` pair; no legality check is made here."""
        first = random.randint(0, 4)
        second = random.randint(0, 4)
        directions = [Move.TOP, Move.BOTTOM, Move.LEFT, Move.RIGHT]
        return (first, second), random.choice(directions)

class MyPlayer(Player):
    """Player that takes an immediately winning move when it sees one.

    If no single move wins, it falls back to a random border cell and a random
    slide direction (which is not checked for legality).
    """

    def __init__(self,player) -> None:
        super().__init__()
        self.player = player

    def make_move(self, game: 'Game') -> tuple[tuple[int, int], Move]:
        """Return a winning ``(cell, slide)`` if one exists, else a random one."""
        every_index = [0,1,2,3,4]
        edge_index = [0,4]
        border = list(set(product(every_index, edge_index)) | set(product(edge_index, every_index)))
        directions = [Move.TOP, Move.BOTTOM, Move.LEFT, Move.RIGHT]

        # Try every border cell / direction on a scratch game fed with the real board.
        sandbox = LearningGame()
        for cell, direction in product(border, directions):
            sandbox.set_board(game.get_board())
            if not sandbox.move(cell, direction, self.player):
                continue
            if sandbox.check_winner() == self.player:
                return cell, direction

        fallback_cell = random.choice(border)
        fallback_direction = random.choice(directions)
        return fallback_cell, fallback_direction

class MyPlayer2(Player):
    """Player that always proposes the same move: cell ``(n, 0)`` slid to the bottom."""

    def __init__(self,n: int) -> None:
        super().__init__()
        self.n = n

    def make_move(self, game: 'Game') -> tuple[tuple[int, int], Move]:
        """Return ``((n, 0), Move.BOTTOM)`` whatever the board looks like."""
        return (self.n, 0), Move.BOTTOM

# Q-Learning strategy (tried, but impractical: too many states to store)

In [8]:
class QLearningAgent(Player):
    """Tabular Q-learning player.

    The Q-table maps a state key (see ``LearningGame.state_from_board``) to a
    ``(16, 4)`` array: one row per border cell, one column per slide direction.
    An entry of ``-inf`` marks an action that was rejected by the game.
    """

    # Cell next to the board centre (flat index 12) in each slide direction, and
    # the reverse lookup. Every symmetry keeps the centre fixed, so pushing one of
    # these cells through a permutation tells where a direction ends up.
    _MOVE_TO_NEIGHBOUR = {Move.TOP: 7, Move.BOTTOM: 17, Move.LEFT: 11, Move.RIGHT: 13}
    _NEIGHBOUR_TO_MOVE = {7: Move.TOP, 17: Move.BOTTOM, 11: Move.LEFT, 13: Move.RIGHT}

    def __init__(self, player = 0,alpha = 0.1, gamma = 0.9, epsilon = 0.1):
        super().__init__()
        # Instead of using a 2D matrix, the q_table is implemented as a dictionary where the keys are tuples representing states,
        # and the values are arrays that represent the Q values associated with possible actions in the corresponding state.
        self.q_table = defaultdict(self._new_q_values)
        self.alpha = alpha    # Learning rate
        self.gamma = gamma    # Discount factor
        self.epsilon = epsilon # Exploration rate
        self.player = player # Indicate if it is X or O

    @staticmethod
    def _new_q_values():
        """Q-values of a state that has never been visited: all zeros."""
        return np.zeros((16, 4), dtype=float)

    def translate_action(self,action):
        '''
        from index (i,j) to an action ((column,row),Move)

        Rows 0-9 of the Q-array are the top and bottom board rows (columns 0-4),
        rows 10-15 are the left/right columns for board rows 1-3.
        '''
        pos,m = action[0], action[1]
        if pos < 10:
            tile = pos%5, pos//5 * 4
        else:
            tile = pos%2 * 4, pos // 2 - 4
        return tile,Move(m)

    def perform_actual_state(self,state):
        """Find the key under which ``state`` (or a symmetric twin) is stored.

        Returns ``(key, i)`` where ``i`` is the index in ``SYMMETRIES`` of the
        permutation that produced ``key``, or ``(state, -1)`` when the state is
        stored as is or no symmetric twin is stored.
        """
        if state in self.q_table:
            return state, -1
        state1,player = state
        Xs,Os = state1
        Xs = np.array(Xs)
        Os = np.array(Os)
        for i,s in enumerate(SYMMETRIES):
            actual_state = tuple(map(tuple, (Xs[s], Os[s]))), player
            if actual_state in self.q_table:
                return actual_state, i
        return state, -1

    def choose_action(self, state):
        """Epsilon-greedy choice of an ``(row, column)`` index into the Q-array of ``state``."""
        explore = random.uniform(0, 1) < self.epsilon
        # If not present the state as a key in the dict, it will be created
        q_values = self.q_table[state]
        if explore:
            # Explore: choose a random valid action
            candidates = np.where(q_values != float("-inf"))
        else:
            # Exploit: choose the best action based on current Q-values
            candidates = np.where(q_values == np.max(q_values))
        actions = list(zip(candidates[0], candidates[1]))
        return random.choice(actions)

    def learn(self, state, action, reward, next_state):
        """Update ``Q(state, action)`` and return the stored key of ``next_state``."""
        # Represents the current Q value for the "action" in the "state"
        old_value = self.q_table[state][action]

        actual_next_state, _ = self.perform_actual_state(next_state)
        # The next state is evaluated from the opponent's side, hence the minus
        # sign; a non-zero reward ends the episode so nothing is bootstrapped.
        future_q = 0 if reward != 0 else -np.max(self.q_table[actual_next_state])

        # Bellman Equation: Q(s,a)←(1−α)⋅Q(s,a)+α⋅(r+γ⋅maxQ(s′,a′))
        # where Q(s,a) is the current Q value for state s and action a
        new_value = (1 - self.alpha) * old_value + self.alpha * (reward + self.gamma * future_q)
        self.q_table[state][action] = new_value
        return actual_next_state

    def learn_error(self,state,action):
        """Mark ``action`` as invalid in ``state`` so it is never picked again."""
        self.q_table[state][action] = float("-inf")

    def save_q_table(self,name):
        """Write the Q-table to ``{name}.pkl``."""
        # Save the table as a plain dict using pickle
        with open(f"{name}.pkl", 'wb') as pickle_file:
            pickle.dump(dict(self.q_table), pickle_file)

    def load_q_table(self,name):
        """Replace the Q-table with the one stored in ``{name}.pkl``."""
        # SECURITY: pickle.load can execute arbitrary code; only load files you
        # created yourself.
        with open(f"{name}.pkl", 'rb') as pickle_file:
            loaded_dict = pickle.load(pickle_file)

        # Wrap the loaded dict in a defaultdict again
        self.q_table = defaultdict(self._new_q_values, loaded_dict)

    def set_player(self,player: int):
        """Set which player id this agent plays as."""
        self.player = player

    def make_move(self, game: 'Game') -> tuple[tuple[int, int], Move]:
        """Pick a greedy action for the current board of ``game``.

        The board is looked up in the Q-table either directly or through a
        symmetric twin; in the latter case the chosen action is mapped back to
        the real board before being returned.
        """
        state = LearningGame.state_from_board(game,self.player)
        actual_state,idx = self.perform_actual_state(state)
        # Read-only lookup: playing must not add unseen states to the learned table.
        q_values = self.q_table.get(actual_state)
        if q_values is None:
            q_values = self._new_q_values()
        best = np.where(q_values == np.max(q_values))
        selected = random.choice(list(zip(best[0], best[1])))
        tile,move = self.translate_action(selected)
        if idx < 0:
            return tile,move

        # perform_actual_state builds stored[k] = real[sym[k]], so stored cell k
        # is real cell sym[k]. Map the tile forward through sym (not through its
        # inverse) and the direction through the image of the centre's neighbour.
        sym = SYMMETRIES[idx]
        real_index = int(sym[tile[0] + tile[1] * 5])
        tile = real_index % 5, real_index // 5
        move = self._NEIGHBOUR_TO_MOVE[int(sym[self._MOVE_TO_NEIGHBOUR[move]])]
        return tile,move
    


def train_agent(episodes):
    """Train a fresh ``QLearningAgent`` by self-play for ``episodes`` games and return it.

    The same agent moves for both players. After each move the board is replaced
    by the stored (possibly symmetric) form of the resulting state.
    """
    agent = QLearningAgent()
    for _ in tqdm(range(episodes)):
        g = LearningGame()
        mover = 0
        state = g.state_from_board(mover)
        winner = -1
        while winner < 0:
            # Keep asking for actions until the game accepts one; rejected
            # actions are recorded as invalid.
            while True:
                action = agent.choose_action(state)
                from_pos, slide = agent.translate_action(action)
                if g.move(from_pos, slide, mover):
                    break
                agent.learn_error(state, action)

            winner = g.check_winner()
            opponent = 1 - mover
            if winner == mover:
                reward = 1
            elif winner == opponent:
                reward = -1
            else:
                reward = 0

            # Hand over to the other player and learn from the transition.
            mover = opponent
            next_state = g.state_from_board(mover)
            state = agent.learn(state, action, reward, next_state)
            g.set_state(state)
    return agent

# Train the agent
# (training is disabled; a previously saved table is loaded instead)
#trained_agent = train_agent(100)

# load_q_table appends ".pkl", so this reads q_table100k.pkl.
trained_agent = QLearningAgent(0)
trained_agent.load_q_table("q_table100k")

In [9]:
# Saving is disabled so the loaded table is not overwritten.
#trained_agent.save_q_table("q_table100k")
# Each key of the table is one state, so its length is the number of stored states.
q_table = trained_agent.q_table
print(f"I evaluated {len(q_table)} states")

I evaluated 3586324 states


In [10]:
q_table = trained_agent.q_table
# State whose best action has the highest Q-value in the whole table.
max_state = max(q_table, key=lambda state: np.max(q_table[state]))
print(max_state)
state1,player = max_state
Xs,Os = state1
Xs = np.array(Xs)
Os = np.array(Os)
print("Stato con il massimo valore nella Q-table:\n", LearningGame.board_from_state(max_state))
print("Valori associati:\n", q_table[max_state])

# Disabled diagnostic: look for a stored state with a symmetric twin that is also
# stored and decodes to the same board. Kept as real comments so the cell no
# longer echoes a string literal as its output.
# for key, v in q_table.items():
#     state1,player = key
#     Xs,Os = state1
#     Xs = np.array(Xs)
#     Os = np.array(Os)
#     for i,s in enumerate(SYMMETRIES):
#         X_shift = Xs[s]
#         O_shift = Os[s]
#         actual_state = tuple(map(tuple, (X_shift, O_shift))), player
#         board_now = LearningGame.board_from_state(key)
#         board_s = LearningGame.board_from_state(actual_state)
#         if np.array_equal(board_s, board_now) and actual_state in q_table:
#             print("-"*25)
#             print("Stato iniziale:\n", LearningGame.board_from_state(key))
#             print("Stato simmetrico:\n", LearningGame.board_from_state(actual_state))
#             print("-"*25)
#             break

(((False, False, False, False, True, False, False, False, False, True, False, False, False, False, False, False, False, False, False, True, False, False, False, False, True), (False, False, True, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, True, False, False)), 0)
Stato con il massimo valore nella Q-table:
 (array([[-1, -1,  1, -1,  0],
       [ 1, -1, -1, -1,  0],
       [-1, -1, -1, -1, -1],
       [-1, -1, -1, -1,  0],
       [ 1, -1,  1, -1,  0]], dtype=int16), 0)
Valori associati:
 [[0.    0.    0.    0.   ]
 [0.    0.    0.    0.   ]
 [0.    0.    0.    0.   ]
 [0.    0.    0.    0.   ]
 [0.    0.    0.    0.   ]
 [0.    0.    0.    0.   ]
 [0.    0.    0.    0.   ]
 [0.    0.    0.    0.   ]
 [0.    0.    0.    0.   ]
 [0.    0.    0.    0.   ]
 [0.    0.    0.    0.   ]
 [0.    0.    0.    0.   ]
 [0.    0.    0.    0.271]
 [0.    0.    0.    0.   ]
 [0.    0.     -inf 0.   ]
 [0.    0.    0.

'for key, v in q_table.items():\n    state1,player = key\n    Xs,Os = state1\n    Xs = np.array(Xs)\n    Os = np.array(Os)\n    for i,s in enumerate(SYMMETRIES):\n        X_shift = Xs[s]\n        O_shift = Os[s]\n        actual_state = tuple(map(tuple, (X_shift, O_shift))), player\n        board_now = LearningGame.board_from_state(key)\n        board_s = LearningGame.board_from_state(actual_state)\n        if np.array_equal(board_s, board_now) and actual_state in q_table:\n            print("-"*25)\n            print("Stato iniziale:\n", LearningGame.board_from_state(key))\n            print("Stato simmetrico:\n", LearningGame.board_from_state(actual_state))\n            print("-"*25)\n            break'

# MiniMax Agent (the chosen one)

In [12]:
class QuixoMinimaxAgent(Player):
    """Depth-limited minimax player with pruning against the parent's best value.

    Player 0 maximises the evaluation and player 1 minimises it.
    """

    def __init__(self, player: int, depth: int):
        super().__init__()
        self.player = player
        self.depth = depth
        self.board = LearningGame()
        # Bound handed to the root node; chosen so that it never triggers a cut-off.
        self.initial_val = float('inf') if player == 0 else float('-inf')

    def make_move(self, quixo_game: Game) -> tuple[tuple[int, int], Move]:
        """Return the ``((column, row), slide)`` chosen by a search of ``self.depth`` plies."""
        self.board.set_board(quixo_game.get_board())
        _, move = self.minimax(deepcopy(self.board), depth=self.depth, maximizing_player = self.player, evaluation_prev_node = self.initial_val)
        return move

    def minimax(self, quixo_game: LearningGame, depth: int, maximizing_player: int, evaluation_prev_node: float):
        """Search ``depth`` plies below ``quixo_game``.

        Args:
            quixo_game: position to search; it is copied before each move.
            depth: remaining plies.
            maximizing_player: id of the player to move (0 maximises, any other
                value is treated as player 1 and minimises).
            evaluation_prev_node: best value found so far by the parent node;
                the search of this node stops once it cannot beat it.

        Returns:
            ``(evaluation, best_move)``; ``best_move`` is None at leaves.
        """
        if depth == 0 or quixo_game.check_winner() != -1:
            return self.evaluate(quixo_game), None

        possible_moves = quixo_game.get_available_moves(maximizing_player)

        is_max = maximizing_player == 0
        mover, opponent = (0, 1) if is_max else (1, 0)
        win_value = float('inf') if is_max else float('-inf')
        best_eval = -win_value
        best_move = None
        for tile, move in possible_moves:
            # A forced win was found: nothing can be better.
            if best_eval == win_value:
                break
            # The parent already has an option at least this good for itself.
            if is_max:
                cut_off = best_eval >= evaluation_prev_node
            else:
                cut_off = best_eval <= evaluation_prev_node
            if cut_off:
                break

            new_game = deepcopy(quixo_game)
            if not new_game.move(tile, move, mover):
                print(tile, move)
                continue
            evaluation, _ = self.minimax(new_game, depth - 1, opponent, best_eval)
            improved = evaluation > best_eval if is_max else evaluation < best_eval
            # Always keep the first legal move: when every line loses, no child
            # strictly improves on the initial bound and the move would stay None.
            if best_move is None or improved:
                best_eval = evaluation
                best_move = tile, move
        return best_eval, best_move

    @staticmethod
    def _line_score(cells) -> float:
        """Score one line: count difference / 4, plus a 0.1-per-piece bonus for the side ahead."""
        x_count = int(np.count_nonzero(cells == 0))
        o_count = int(np.count_nonzero(cells == 1))
        val = x_count / 4 - o_count / 4
        if x_count > o_count:
            val += x_count*0.1
        if x_count < o_count:
            val -= o_count*0.1
        return val

    def evaluate(self, quixo_game: LearningGame) -> float:
        """Heuristic value of a position from player 0's point of view.

        Returns ``+inf`` / ``-inf`` when player 0 / player 1 has won. Otherwise
        sums the line scores of every row, column and both diagonals; the first
        and last row and column are weighted 1.5.
        """
        winner = quixo_game.check_winner()
        if winner == 1:
            return float('-inf')
        elif winner == 0:
            return float('inf')

        board = quixo_game._board
        # Interleaved as row 0, column 0, row 1, column 1, ... to keep the
        # summation order (and therefore the floating-point result) stable.
        lines = []
        for i in range(board.shape[0]):
            lines.append(board[i, :])
            lines.append(board[:, i])

        evaluation = 0
        for i, cells in enumerate(lines):
            val = self._line_score(cells)
            if i<2 or i>=len(lines)-2:
                val = val * 1.5
            evaluation += val

        # Main diagonal, then anti-diagonal.
        evaluation += self._line_score(np.diagonal(board))
        evaluation += self._line_score(np.diagonal(np.fliplr(board)))
        return evaluation


In [81]:
# Minimax (depth 2) moves first as player 0 against the win-if-possible player.
n_wins = 0
for i in tqdm(range(1000)):
    g = LearningGame()
    player1, player2 = QuixoMinimaxAgent(0,2), MyPlayer(1)
    winner = g.play(player1, player2)
    n_wins += int(winner == 0)
print(f"Agent wins {n_wins} game")


100%|██████████| 1000/1000 [43:33<00:00,  2.61s/it]

Agent wins 1000 game





In [86]:
# Minimax (depth 2) moves second as player 1 against the win-if-possible player.
n_wins = 0
for i in tqdm(range(1000)):
    g = LearningGame()
    player1, player2 = MyPlayer(0), QuixoMinimaxAgent(1,2)
    winner = g.play(player1, player2)
    n_wins += int(winner == 1)
print(f"Agent wins {n_wins} game")

100%|██████████| 1000/1000 [52:35<00:00,  3.16s/it] 

Agent wins 1000 game



