In [1]:
import random
import numpy as np
from game import Game, Move, Player
from copy import deepcopy
from tqdm import tqdm

In [2]:
class Dummy_Game(object):
    """Stand-alone copy of the Quixo rules used to simulate moves off-line.

    The board is a 5x5 array holding -1 for a neutral tile and the player id
    for a taken tile.  Positions passed to ``single_move`` are ``(x, y)`` =
    ``(column, row)`` pairs; internally the board is indexed ``[row, column]``.
    """

    def __init__(self) -> None:
        # A signed dtype is requested explicitly: the previous
        # ``np.ones(..., dtype=np.uint8) * -1`` only worked through NumPy 1.x
        # value-based promotion (which produced int16) and raises
        # OverflowError on NumPy >= 2.
        self._board = np.full((5, 5), -1, dtype=np.int16)
        self.current_player_idx = 1

    def get_board(self):
        """Return the internal board array (the array itself, not a copy)."""
        return self._board

    def single_move(self, board, from_pos, move, player_id):
        """Apply one move to a copy of ``board``.

        Returns ``(new_board, ok)``; ``new_board`` is a fresh copy and equals
        ``board`` when the move is rejected (``ok`` is False).
        """
        self._board = deepcopy(board)
        self.current_player_idx = player_id
        ok = self.__move(from_pos, move, player_id)
        return deepcopy(self._board), ok

    def check_winner_board(self, board):
        """Adopt ``board`` as the internal board (no copy) and return its winner."""
        self._board = board
        return self.check_winner()

    def check_winner(self) -> int:
        """Return the id filling a whole row, column or diagonal, else -1.

        Rows are checked first, then columns, then the main and the
        anti-diagonal; the first complete line found decides the result.
        """
        board = self._board
        for x in range(board.shape[0]):
            if board[x, 0] != -1 and all(board[x, :] == board[x, 0]): return board[x, 0]
        for y in range(board.shape[1]):
            if board[0, y] != -1 and all(board[:, y] == board[0, y]): return board[0, y]
        main_diag = board.diagonal()
        if main_diag[0] != -1 and all(main_diag == main_diag[0]): return main_diag[0]
        # fliplr turns the anti-diagonal into the main diagonal; element 0 is board[0, -1]
        anti_diag = np.fliplr(board).diagonal()
        if anti_diag[0] != -1 and all(anti_diag == anti_diag[0]): return anti_diag[0]
        return -1

    def __move(self, from_pos: tuple[int, int], slide: Move, player_id: int) -> bool:
        """Take the piece at ``from_pos`` (x, y) and slide it; undo the take on failure."""
        if player_id > 2: return False
        pos = (from_pos[1], from_pos[0])  # (x, y) -> (row, column)
        prev_value = deepcopy(self._board[pos])
        acceptable = self.__take(pos, player_id)
        if acceptable:
            acceptable = self.__slide(pos, slide)
            # the take already stamped the tile: restore it if the slide is illegal
            if not acceptable: self._board[pos] = deepcopy(prev_value)
        return acceptable

    def __take(self, from_pos: tuple[int, int], player_id: int) -> bool:
        """Mark the tile at ``from_pos`` (row, column) as ``player_id`` if it may be taken.

        A tile may be taken when it lies on the border and is neutral or
        already owned by ``player_id``.
        """
        on_border = (
            (from_pos[0] == 0 and from_pos[1] < 5)
            or (from_pos[0] == 4 and from_pos[1] < 5)
            or (from_pos[1] == 0 and from_pos[0] < 5)
            or (from_pos[1] == 4 and from_pos[0] < 5)
        )
        # the board is only read when the position is on the border
        acceptable: bool = on_border and (self._board[from_pos] < 0 or self._board[from_pos] == player_id)
        if acceptable: self._board[from_pos] = player_id
        return acceptable

    def __slide(self, from_pos: tuple[int, int], slide: Move) -> bool:
        """Slide the tile at ``from_pos`` (row, column) to the edge named by ``slide``.

        A tile cannot be pushed toward an edge it already touches.  On success
        the tiles between the origin and the target edge shift by one and the
        taken piece is placed on that edge.
        """
        SIDES = [(0, 0), (0, 4), (4, 0), (4, 4)]
        if from_pos not in SIDES:
            acceptable_top: bool = from_pos[0] == 0 and (slide == Move.BOTTOM or slide == Move.LEFT or slide == Move.RIGHT)
            acceptable_bottom: bool = from_pos[0] == 4 and (slide == Move.TOP or slide == Move.LEFT or slide == Move.RIGHT)
            acceptable_left: bool = from_pos[1] == 0 and (slide == Move.BOTTOM or slide == Move.TOP or slide == Move.RIGHT)
            acceptable_right: bool = from_pos[1] == 4 and (slide == Move.BOTTOM or slide == Move.TOP or slide == Move.LEFT)
        else:
            # corners touch two edges, so only two directions remain
            acceptable_top: bool = from_pos == (0, 0) and (slide == Move.BOTTOM or slide == Move.RIGHT)
            acceptable_left: bool = from_pos == (4, 0) and (slide == Move.TOP or slide == Move.RIGHT)
            acceptable_right: bool = from_pos == (0, 4) and (slide == Move.BOTTOM or slide == Move.LEFT)
            acceptable_bottom: bool = from_pos == (4, 4) and (slide == Move.TOP or slide == Move.LEFT)
        acceptable: bool = acceptable_top or acceptable_bottom or acceptable_left or acceptable_right
        if acceptable:
            row, col = from_pos
            last_row = self._board.shape[0] - 1
            last_col = self._board.shape[1] - 1
            piece = self._board[from_pos]
            if slide == Move.LEFT:
                for i in range(col, 0, -1): self._board[(row, i)] = self._board[(row, i - 1)]
                self._board[(row, 0)] = piece
            elif slide == Move.RIGHT:
                for i in range(col, last_col, 1): self._board[(row, i)] = self._board[(row, i + 1)]
                self._board[(row, last_col)] = piece
            elif slide == Move.TOP:
                for i in range(row, 0, -1): self._board[(i, col)] = self._board[(i - 1, col)]
                self._board[(0, col)] = piece
            elif slide == Move.BOTTOM:
                for i in range(row, last_row, 1): self._board[(i, col)] = self._board[(i + 1, col)]
                self._board[(last_row, col)] = piece
        return acceptable

In [3]:
# Every (i, j) cell lying on the outer ring of the 5x5 board, in row-major order.
border = [
    (i, j)
    for i in range(5)
    for j in range(5)
    if i in (0, 4) or j in (0, 4)
]
# The ring contains no duplicates; going through a set only fixes the
# iteration order that everything derived from BORDER relies on.
BORDER = list(set(border))
print(len(BORDER))

def tile_to_moves(tile):
    """Return the slide directions available from a border tile.

    ``tile`` is indexed as ``(tile[0], tile[1])``; a tile whose first
    coordinate is 0 / 4 loses LEFT / RIGHT and one whose second coordinate is
    0 / 4 loses TOP / BOTTOM.  The remaining directions keep the order
    TOP, BOTTOM, LEFT, RIGHT.
    """
    first, second = tile[0], tile[1]

    forbidden = []
    if first == 0:
        forbidden.append(Move.LEFT)
    if first == 4:
        forbidden.append(Move.RIGHT)
    if second == 0:
        forbidden.append(Move.TOP)
    if second == 4:
        forbidden.append(Move.BOTTOM)

    every_direction = (Move.TOP, Move.BOTTOM, Move.LEFT, Move.RIGHT)
    return [direction for direction in every_direction if direction not in forbidden]

# Allowed slide directions for each border tile.
tile_moves = {}
for tile in BORDER:
    tile_moves[tile] = tile_to_moves(tile)

# Flat list of every (tile, direction) pair; positions in this list are used
# as move ids by the rest of the notebook.
ALL_MOVES = [(tile, move) for tile in BORDER for move in tile_moves[tile]]
N_ALL = len(ALL_MOVES)

class RandomPlayer(Player):
    """Player that picks a uniformly random border tile and slide direction."""

    def __init__(self) -> None:
        super().__init__()

    def make_move(self, game: 'Game') -> tuple[tuple[int, int], Move]:
        """Return a random ``(from_pos, move)`` whose tile is not the opponent's.

        Tiles are re-drawn until the board cell ``[from_pos[1], from_pos[0]]``
        differs from ``1 - game.current_player_idx``; the direction is then
        drawn from the directions pre-computed for that tile.
        """
        opponent = 1 - game.current_player_idx

        while True:
            from_pos = random.choice(BORDER)
            if game.get_board()[from_pos[1], from_pos[0]] != opponent:
                break

        move = random.choice(tile_moves[from_pos])
        return from_pos, move

16


In [4]:
## A state can be represented as two numbers if the 0/1 values are treated as bits

In [5]:
import numpy as np

def state_to_board(state):
    """Decode an integer state into a 5x5 board of -1 / 0 / 1.

    The integer is read as 50 bits (most significant first): the first 25
    bits flag the cells holding -1, the last 25 the cells holding 1.  A cell
    flagged in both planes ends up as 1 because that plane is written last.
    """
    bits = np.array([int(ch) for ch in format(state, '050b')])
    minus_plane, plus_plane = bits.reshape(2, 5, 5)

    board = np.zeros((5, 5), dtype=int)
    for plane, value in ((minus_plane, -1), (plus_plane, 1)):
        board[plane == 1] = value

    return board

def board_to_state(board):
    """Encode a 5x5 board of -1 / 0 / 1 into a single integer.

    Two 25-bit planes are built (cells equal to -1, then cells equal to 1)
    and concatenated, most significant bit first, into one Python int.
    """
    planes = np.zeros((2, 5, 5), dtype=int)
    for plane, value in zip(planes, (-1, 1)):
        # `plane` is a view, so this writes into `planes`
        plane[board == value] = 1

    state = 0
    for bit in planes.flatten():
        state = (state << 1) | int(bit)
    return state



# Round-trip demo: encode a random board and decode it again.
rand_board = np.random.choice([-1, 0, 1], size=(5, 5), replace=True)
print('Board:', rand_board, sep='\n')

rand_state = board_to_state(rand_board)
rand_board = state_to_board(rand_state)

print('\nState:', rand_state, sep='\n')
print('\nBoard:', state_to_board(rand_state), sep='\n')

Board:
[[ 1  1  1  1  1]
 [-1  1  0  1 -1]
 [-1  0  0  0  0]
 [ 1  1 -1  1  0]
 [ 1  1  0  1  1]]

State:
19245781287771

Board:
[[ 1  1  1  1  1]
 [-1  1  0  1 -1]
 [-1  0  0  0  0]
 [ 1  1 -1  1  0]
 [ 1  1  0  1  1]]


In [6]:
# Direction of a slide after 1, 2 or 3 applications of the position rule used
# by `rot`: (x, y) -> (4 - y, x).  Under one application a displacement
# (dx, dy) becomes (-dy, dx), so with TOP = toward y == 0 and LEFT = toward
# x == 0 the directions cycle TOP -> RIGHT -> BOTTOM -> LEFT -> TOP.
# The TOP/BOTTOM entries for 1 and 3 turns used to be swapped, which made the
# table an involution instead of a rotation and mapped legal moves onto
# illegal ones (e.g. ((0, 0), BOTTOM) became ((4, 0), RIGHT)).
dict_rot = {
    (Move.TOP, 1): Move.RIGHT,
    (Move.TOP, 2): Move.BOTTOM,
    (Move.TOP, 3): Move.LEFT,
    (Move.BOTTOM, 1): Move.LEFT,
    (Move.BOTTOM, 2): Move.TOP,
    (Move.BOTTOM, 3): Move.RIGHT,
    (Move.LEFT, 1): Move.TOP,
    (Move.LEFT, 2): Move.RIGHT,
    (Move.LEFT, 3): Move.BOTTOM,
    (Move.RIGHT, 1): Move.BOTTOM,
    (Move.RIGHT, 2): Move.LEFT,
    (Move.RIGHT, 3): Move.TOP,
}

# Direction of a slide after mirroring x -> 4 - x: only LEFT and RIGHT swap.
dict_flip = {
    Move.TOP: Move.TOP,
    Move.BOTTOM: Move.BOTTOM,
    Move.LEFT: Move.RIGHT,
    Move.RIGHT: Move.LEFT,
}

# clockwise rotation (rot_orario): (3, 4) -> (4, 1) -> (1, 0) -> (0, 3) -> (3, 4)
# rule: (xi, yi) -> (yi, 4 - xi)
# counter-clockwise rotation (rot_anti_orario): (3, 4) -> (0, 3) -> (1, 0) -> (4, 1) -> (3, 4)
# rule: (xi, yi) -> (4 - yi, xi)

def rot(n_rot):
    """Build a transform applying ``n_rot`` quarter turns to a move.

    The returned function maps ``(from_pos, move)`` to the rotated pair: the
    position goes through ``(x, y) -> (4 - y, x)`` ``n_rot`` times and the
    direction is looked up in ``dict_rot`` under ``(move, n_rot)``.
    """
    def rot_n(from_pos, move):
        x, y = from_pos
        for _ in range(n_rot):
            x, y = 4 - y, x
        return (x, y), dict_rot[(move, n_rot)]
    return rot_n

def flip(from_pos, move):
    """Mirror a move: ``(x, y) -> (4 - x, y)`` with the direction taken from ``dict_flip``."""
    x, y = from_pos
    mirrored_pos = (4 - x, y)
    return mirrored_pos, dict_flip[move]

def flip_rot(n_rot):
    """Build a transform that rotates a move ``n_rot`` times and then mirrors it."""
    rotate = rot(n_rot)

    def flip_rot_n(from_pos, move):
        return flip(*rotate(from_pos, move))
    return flip_rot_n

# Concrete move transforms for 1, 2 and 3 quarter turns, plain and mirrored.
rot1, rot2, rot3 = rot(1), rot(2), rot(3)
flip_rot1, flip_rot2, flip_rot3 = flip_rot(1), flip_rot(2), flip_rot(3)

# NOTE(review): not referenced anywhere in the visible code - confirm the
# intended pairing with `inverse_simmetries` before relying on it.
verse_simmetries = [rot3, rot2, rot1, flip, flip_rot3, flip_rot2, flip_rot1]

# Indexed by the symmetry id (0..6) returned by `check_simmetries`.
inverse_simmetries = [rot1, rot2, rot3, flip, flip_rot1, flip_rot2, flip_rot3]

def check_simmetries(board, state_list):
    """Look for ``board`` or one of its 7 symmetric variants in ``state_list``.

    Variants are tried in a fixed order and the first hit wins:
    the board itself (id ``None``), ``np.rot90`` applied 1, 2, 3 times
    (ids 0, 1, 2), ``np.fliplr`` of the board (id 3) and that mirror rotated
    1, 2, 3 times (ids 4, 5, 6).

    Returns ``(flattened_variant_tuple, id)`` for the first variant found, or
    ``None`` when no variant is contained in ``state_list``.
    """
    def variants():
        # generated lazily so no transform is computed after a hit
        yield None, board
        turned = board
        for id_simmetry in (0, 1, 2):
            turned = np.rot90(turned)
            yield id_simmetry, turned
        mirrored = np.fliplr(board)
        yield 3, mirrored
        for id_simmetry in (4, 5, 6):
            mirrored = np.rot90(mirrored)
            yield id_simmetry, mirrored

    for id_simmetry, variant in variants():
        key = tuple(variant.flatten())
        if key in state_list:
            return key, id_simmetry

    return None

MOVES_SIMMETRIES = {} #(id_move, id_simmetry) -> id_move

# Reverse index of ALL_MOVES so each transformed move is resolved in O(1).
_move_to_id = {pos_move: id_move for id_move, pos_move in enumerate(ALL_MOVES)}

for id_move, (from_pos, move) in enumerate(ALL_MOVES):
    for id_simmetry, simmetry in enumerate(inverse_simmetries):
        # None when the transformed move is not a listed move.  The earlier
        # version stored the scan variable `i` instead of `idx`, so a miss was
        # silently recorded as the id of the last move.
        MOVES_SIMMETRIES[(id_move, id_simmetry)] = _move_to_id.get(simmetry(from_pos, move))

print(len(MOVES_SIMMETRIES))
print(len(ALL_MOVES) * 7)

308
308


In [7]:
## Discarded: the number of possible states is too large to enumerate up front.

## To be replaced by a lazy check: if a state already exists, retrieve its q-values; otherwise create a random q-value for
## each legal move of that state.


#import itertools
#from tqdm import tqdm
#MATRIX_SIZE = 5
#
#count_all = 0
#for s in itertools.product([-1, 0, 1], repeat= pow(MATRIX_SIZE, 2)): count_all += 1
#print(count_all)
#print('--------------')
#
#states_list = []
#
#for s in tqdm(itertools.product([-1, 0, 1], repeat= pow(MATRIX_SIZE, 2))):
#    if check_simmetries(np.array(s).reshape(MATRIX_SIZE, MATRIX_SIZE), states_list) is None:
#        states_list.append(tuple(s))
#    
#    #if count_all % 100 == 0:
#    #    print((len(states_list), count_all))
#
#print(count_all)
#print(len(states_list))

In [8]:
class MyPlayer(Player):
    """Player driven by win statistics gathered from random self-play.

    ``states_dict`` maps a flattened board (tuple of 25 values) to a list
    ``[wins, visits, children]`` where ``children`` holds
    ``((from_pos, move), child_state)`` pairs recorded during training.
    """

    def __init__(self, eps= 2, simulations_on_new= 1, base_until_move_change= 10) -> None:
        """Create the player and seed the table with the empty board.

        eps: probability threshold for a random pick in ``selection`` when a
            path is given (the default 2 makes that pick always random).
        simulations_on_new: number of random play-outs run by ``simulation``.
        base_until_move_change: how many times the same move may be repeated
            in ``make_move`` before a random move is forced.
        """
        super().__init__()

        self.eps = eps
        self.simulation_on_new = simulations_on_new

        self.states_dict = {}

        self.dummy = Dummy_Game()

        self.train_init()

        self.base_until_move_change = base_until_move_change
        self.last_pos_move = None
        self.until_move_change = self.base_until_move_change
        self.n_move_changes = 0

        self.tot_count = 0
        self.random_count = 0

    def get_random_count(self): return self.random_count
    def get_tot_count(self): return self.tot_count
    def get_n_move_changes(self): return self.n_move_changes
    def reset_counters(self):
        """Reset the play-time counters and the repeated-move countdown."""
        self.n_move_changes = 0
        self.tot_count = 0
        self.random_count = 0
        self.until_move_change = self.base_until_move_change

    def expansion(self, board, next_to_move):
        """Return the legal children of ``board`` as ``((from_pos, move), child_state)`` pairs."""
        children = []
        for from_pos, move in ALL_MOVES:
            new_board, ok = self.dummy.single_move(board, from_pos, move, next_to_move)
            if ok: children.append(((from_pos, move), tuple(new_board.flatten())))

        return children

    def simulation(self, base_board, next_to_move):
        """Play ``simulation_on_new`` random games from ``base_board``.

        Returns ``(win_0, win_1, number_of_games)``; any winner other than 0
        is counted for player 1.
        """
        win_0 = 0
        win_1 = 0

        for _ in range(self.simulation_on_new):

            next_to_play = next_to_move
            board = deepcopy(base_board)

            winner = self.dummy.check_winner_board(board)
            while winner == -1:

                # draw random moves until the rules accept one
                ok = False
                while not ok:
                    from_pos, move = ALL_MOVES[np.random.randint(0, N_ALL)]
                    new_board, ok = self.dummy.single_move(board, from_pos, move, next_to_play)

                board = new_board
                next_to_play = 1 - next_to_play

                winner = self.dummy.check_winner_board(board)

            if winner == 0: win_0 += 1
            else: win_1 += 1

        return win_0, win_1, self.simulation_on_new

    def update(self, states_to_update, win_0, win_1, count):
        """Add an outcome to every state in ``states_to_update``.

        States are credited alternately, starting with ``win_1`` for the
        first one; unknown states are created with an empty child list.
        NOTE(review): in ``train_wrapper`` the first state of the path is the
        one produced by player 0's move - confirm that crediting it with
        ``win_1`` is intended.
        """
        player_responsible = 0

        for state in states_to_update:
            player_responsible = 1 - player_responsible

            amount = win_0 if player_responsible == 0 else win_1

            if state in self.states_dict:
                self.states_dict[state][0] += amount
                self.states_dict[state][1] += count
            else: self.states_dict[state] = [amount, count, []]

    def selection(self, current_state, path= None):
        """Pick a child of ``current_state`` by best wins / visits ratio.

        With ``path`` given (training) the result is a child *state*, chosen
        at random with probability ``eps``; without it the result is the
        ``(from_pos, move)`` leading to the best child.  Returns ``None`` when
        the state is unknown or has no recorded children.
        """
        training = path is not None

        if current_state not in self.states_dict:
            if training: print("STATE NOT IN STATES_DICT -> IT SHOULDN'T HAPPEN")
            return None

        childrens = self.states_dict[current_state][2]
        if not childrens: return None

        if training and np.random.random() < self.eps:
            return childrens[np.random.randint(0, len(childrens))][1]

        values = []
        for _, child_state in childrens:
            wi, ci = self.states_dict[child_state][:2]
            values.append(wi / ci)

        best_id = np.argmax(values)

        if training: return childrens[best_id][1]
        return childrens[best_id][0]

    def train_init(self):
        """Seed ``states_dict`` with play-out statistics for the empty board."""
        # explicit signed dtype: `np.ones(..., dtype=np.uint8) * -1` raises
        # OverflowError on NumPy >= 2 (it produced int16 on NumPy 1.x)
        starting_board = np.full((5, 5), -1, dtype=np.int16)

        win_0, win_1, count = self.simulation(starting_board, 0)

        starting_state = tuple(starting_board.flatten())

        self.update([starting_state], win_0, win_1, count)

    def train_wrapper(self, n_games= 10):
        """Play ``n_games`` random games and record their states and outcomes."""
        for _ in tqdm(range(n_games)):

            board = np.full((5, 5), -1, dtype=np.int16)  # see train_init for the dtype
            new_state = tuple(board.flatten())
            next_to_move = 1

            path = []

            winner = -1
            while winner == -1:

                state = new_state

                # reuse the entry of a symmetric board when one is already stored
                simmetry = check_simmetries(board, self.states_dict)
                if simmetry is None:
                    self.states_dict[state] = [0, 0, []]
                    id_simmetry = None
                else: state, id_simmetry = simmetry

                node = self.states_dict[state]
                next_to_move = 1 - next_to_move

                # draw random moves (mapped through the symmetry) until one is legal
                ok = False
                while not ok:
                    from_pos, move = ALL_MOVES[np.random.randint(0, N_ALL)]
                    if id_simmetry is not None: from_pos, move = inverse_simmetries[id_simmetry](from_pos, move)
                    board, ok = self.dummy.single_move(board, from_pos, move, next_to_move)

                new_state = tuple(board.flatten())

                path.append(new_state)

                if ((from_pos, move), new_state) not in node[2]: node[2].append(((from_pos, move), new_state))

                winner = self.dummy.check_winner_board(board)

            self.update(path, 1 - winner, winner, 1)

        print(f'states explored: {len(self.states_dict)}')

    def make_move(self, game: 'Game') -> tuple[tuple[int, int], Move]:
        """Return the recorded best move for the current board, or a random one.

        A random move is used when the board (or a symmetric one) has no
        recorded children, or when the same move has been proposed
        ``base_until_move_change`` times in a row.
        """
        self.tot_count += 1

        board = game.get_board()
        state = tuple(board.flatten())

        simmetry = check_simmetries(board, self.states_dict)
        if simmetry is not None: state, id_simmetry = simmetry
        else: id_simmetry = None

        pos_move = self.selection(state)

        # map the stored move back through the matched symmetry
        if id_simmetry is not None and pos_move is not None: pos_move = inverse_simmetries[id_simmetry](pos_move[0], pos_move[1])

        rand = False
        if pos_move is None: rand = True
        elif pos_move == self.last_pos_move:
            self.until_move_change -= 1
            if self.until_move_change == 0:
                rand = True
                self.until_move_change = self.base_until_move_change
        else: self.last_pos_move = pos_move

        if not rand:
            return pos_move

        self.random_count += 1
        player_id = game.current_player_idx
        from_pos, move = ALL_MOVES[np.random.randint(0, N_ALL)]
        # only the tile owner is checked here; the direction comes from ALL_MOVES
        while board[from_pos[1], from_pos[0]] == 1 - player_id: from_pos, move = ALL_MOVES[np.random.randint(0, N_ALL)]

        return from_pos, move

In [9]:
class QAgent(Player):
    """Tabular value-learning agent trained against a random opponent.

    ``values`` maps a flattened board (tuple of 25 values) to its learned
    value; ``state_move_to_state`` caches the board produced by a legal
    ``(state, from_pos, move)`` so it is simulated only once.
    """

    def __init__(self, player_id= 0, exploration_factor= 0):
        """player_id: id used for the agent's pieces; exploration_factor:
        probability of a random move in ``choose_move_train``."""
        super().__init__()

        self.exploration_factor = exploration_factor
        self.alpha = 0.5

        self.prev_state = tuple([-1 for _ in range(25)])
        self.state = None

        self.player_id = player_id
        self.opponent_id = 1 - player_id

        self.dummy = Dummy_Game()

        self.values = dict()

        self.state_move_to_state = dict()

    def make_move(self, game: 'Game') -> tuple[tuple[int, int], Move]:
        """Return the greedy move for the board currently held by ``game``."""
        self.state = tuple(game.get_board().flatten())
        return self.make_optimal_move(self.state)

    def choose_move_train(self, state):
        """Return a random move with probability ``exploration_factor``, else the greedy one.

        The random move is drawn from all moves and may be illegal; ``train``
        re-draws in that case.
        """
        if random.random() < self.exploration_factor: pos_move = ALL_MOVES[np.random.randint(0, N_ALL)]
        else: pos_move = self.make_optimal_move(state)

        return pos_move

    def make_move_and_learn(self, state):
        """Run one learning step for ``state`` and return the training move."""
        self.state = state

        self.learn_state(state)

        return self.choose_move_train(state)

    def _simulate(self, board, state, from_pos, move, player):
        """Return ``(new_board, ok)`` for one move, caching legal results only.

        Rejected moves must not be cached: a cache hit is reported as legal,
        so caching them would turn illegal moves into legal ones on the next
        evaluation of the same state.
        NOTE(review): the cache key does not include ``player`` - confirm a
        board can never be evaluated for both players.
        """
        key = (state, from_pos, move)
        cached = self.state_move_to_state.get(key)
        if cached is not None:
            return cached, True

        new_board, ok = self.dummy.single_move(board, from_pos, move, player)
        if ok: self.state_move_to_state[key] = new_board
        return new_board, ok

    def make_optimal_move(self, state):
        """Return a move maximising the worst known value after the opponent's reply.

        Every legal move is scored with the minimum of the known values of
        the boards reachable by one opponent reply, or 1 when none of them is
        known; ties are broken at random.

        Raises Exception when no legal move exists.
        """
        temp_pos_move_list = []
        v = -float('Inf')

        board = np.array(state).reshape(5, 5)
        state = tuple(board.flatten())

        for from_pos, move in ALL_MOVES:

            temp_board, ok = self._simulate(board, state, from_pos, move, self.player_id)
            if not ok: continue

            temp_state = tuple(temp_board.flatten())

            known_values = []
            for from_pos_op, move_op in ALL_MOVES:
                temp_board_op, ok_op = self._simulate(temp_board, temp_state, from_pos_op, move_op, self.opponent_id)
                if not ok_op: continue

                temp_state_op = tuple(temp_board_op.flatten())
                if temp_state_op in self.values: known_values.append(self.values[temp_state_op])

            if known_values: v_temp = min(known_values)
            else: v_temp = 1 # to encourage exploration

            if v_temp > v:
                temp_pos_move_list = [(from_pos, move)]
                v = v_temp
            elif v_temp == v:
                temp_pos_move_list.append((from_pos, move))

        # random.choice raises IndexError (not ValueError) on an empty list,
        # so the empty case is detected explicitly
        if not temp_pos_move_list:
            print('temp state:', temp_pos_move_list)
            raise Exception('temp state empty')

        return random.choice(temp_pos_move_list)

    def reward(self, winner):
        """Return 1 for a win, 0 while the game is undecided (-1), -1 otherwise."""
        if winner == self.player_id:
            R = 1
        elif winner == -1:
            R = 0
        else:
            R = -1
        return R

    def learn_state(self, state):
        """Move the value of ``prev_state`` toward reward + value of ``state``.

        The update only happens when ``player_id`` appears among the values
        of ``state``; ``prev_state`` is always replaced by ``state``.
        """
        if self.player_id in state:
            v_s = self.values.get(self.prev_state, int(0))

            winner = self.dummy.check_winner_board(np.array(state).reshape(5, 5))
            R = self.reward(winner)

            # membership is tested on the same key that is read
            if state in self.values and winner == -1: v_s_tag = self.values[state]
            else: v_s_tag = int(0)

            self.values[self.prev_state] = v_s + self.alpha*(R + v_s_tag - v_s)

        self.prev_state = state

    def train(self, n_games= 10000):
        """Play ``n_games`` against a uniformly random opponent, learning after each turn."""
        for _ in tqdm(range(n_games)):

            # explicit signed dtype: `np.ones(..., dtype=np.uint8) * -1` raises
            # OverflowError on NumPy >= 2 (it produced int16 on NumPy 1.x)
            board = np.full((5, 5), -1, dtype=np.int16)
            state = tuple(board.flatten())

            winner = -1
            while winner == -1:

                from_pos, move = self.make_move_and_learn(state)

                board, ok = self.dummy.single_move(board, from_pos, move, self.player_id)
                while not ok:
                    # an exploratory draw can be illegal: pick again instead of
                    # silently giving the turn away
                    from_pos, move = self.choose_move_train(state)
                    board, ok = self.dummy.single_move(board, from_pos, move, self.player_id)

                state = tuple(board.flatten())

                winner = self.dummy.check_winner_board(board)

                if winner == -1:

                    ok = False
                    while not ok:
                        from_pos, move = ALL_MOVES[np.random.randint(0, N_ALL)]
                        board, ok = self.dummy.single_move(board, from_pos, move, self.opponent_id)

                    state = tuple(board.flatten())

                    winner = self.dummy.check_winner_board(board)

            self.learn_state(state)
            #self.learn_state(state)

In [38]:
# Train the tabular agent, then measure how often it wins as first player
# against the random player.
ThePlayer = QAgent(0, 0.3)
ThePlayer.train(1000)

#

player1, player2 = ThePlayer, RandomPlayer()

n_trials = 100
wins_first = 0

for _ in tqdm(range(n_trials)):
    g = Game()
    winner = g.play(player1, player2)
    wins_first += int(winner == 0)

print(f"Player won {wins_first} / {n_trials} as first")

 14%|█▍        | 138/1000 [00:37<03:54,  3.68it/s]


KeyboardInterrupt: 

In [12]:
import keras.layers as Kl
import keras.models as Km

class DeepQAgent(Player):

    def __init__(self, player_id= 0, exploration_factor= 0.5):
        """Set up the agent state, the value network and the replay buffers.

        player_id: id used for the agent's pieces (the opponent gets
            ``1 - player_id``).
        exploration_factor: stored as ``exploration_factor_after``; the live
            ``exploration_factor`` starts at 1.
        """
        super().__init__()

        print('init')

        # exploration schedule
        self.exploration_factor = 1
        self.until_reduction = 50
        self.exploration_factor_after = exploration_factor
        self.alpha = 0.5

        # previous / current flattened boards
        self.prev_state = (-1,) * 25
        self.state = None

        self.player_id, self.opponent_id = player_id, 1 - player_id

        self.dummy = Dummy_Game()

        self.values = {}
        self.state_move_to_state = {}

        self.value_model = self.create_model()

        self.until_change_move = 5
        self.last_move = None
        self.last_state = None

        # replay buffers: one global and one per reward sign
        self.memory = []
        self.memory_empty = []
        self.memory_win = []
        self.memory_lose = []
        self.batch_size = 10

        print('init done')

    def create_model(self):
        """Build, compile and return the value network (25 -> 75 -> 44 -> 1).

        The two hidden layers use ReLU and the single output is linear; the
        model is compiled with Adam and mean absolute error, and its summary
        is printed.
        """
        layers = [
            Kl.Dense(75, activation='relu', input_dim= 25),
            Kl.Dense(44, activation='relu'),
            Kl.Dense(1, activation='linear'),
        ]

        model = Km.Sequential()
        for layer in layers:
            model.add(layer)

        model.compile(optimizer='adam', loss='mean_absolute_error', metrics=['accuracy'])

        model.summary()
        return model

    def choose_move_train(self, state):
        """Return an exploratory move with probability ``exploration_factor``, else the greedy one.

        The exploratory move is drawn among the moves whose tile does not
        hold the opponent's id; the greedy move is computed from ``self.state``.
        """
        if random.random() >= self.exploration_factor:
            return self.make_optimal_move(self.state)

        board = np.array(state).reshape(5, 5)
        opponent = 1 - self.player_id
        ok_moves = [
            (from_pos, move)
            for from_pos, move in ALL_MOVES
            if board[from_pos[1], from_pos[0]] != opponent
        ]
        return ok_moves[np.random.randint(0, len(ok_moves))]

    def make_move_and_learn(self, state):
        """Record ``state`` as current, run one learning step and return the training move."""
        self.state = state
        self.learn_state(state)

        pos_move = self.choose_move_train(state)
        return pos_move
    
    def make_optimal_move(self, state):
        """Return a move maximising the worst predicted value after the opponent's reply.

        For every legal own move, all legal opponent replies are generated;
        the resulting boards are evaluated by the value model in one batch and
        each own move is scored by the minimum over its replies.  Ties are
        broken at random.
        """
        board = np.array(state).reshape(5, 5)
        state = tuple(board.flatten())

        cache = self.state_move_to_state

        def play(src_board, src_state, pos, slide, player):
            # simulate one move, reusing (and filling) the cache of legal results
            key = (src_state, pos, slide)
            if key in cache:
                return cache[key], True
            result, legal = self.dummy.single_move(src_board, pos, slide, player)
            if legal:
                cache[key] = result
            return result, legal

        reply_boards = []  # flattened boards after each opponent reply
        candidates = []    # (own move, number of replies generated for it)

        for from_pos, move in ALL_MOVES:
            temp_board, ok = play(board, state, from_pos, move, self.player_id)
            if not ok:
                continue

            temp_state = tuple(temp_board.flatten())

            n_replies = 0
            for from_pos_op, move_op in ALL_MOVES:
                temp_board_op, ok_op = play(temp_board, temp_state, from_pos_op, move_op, self.opponent_id)
                if ok_op:
                    reply_boards.append(temp_board_op.reshape(25,))
                    n_replies += 1

            candidates.append(((from_pos, move), n_replies))

        values = self.calc_value(np.array(reply_boards).reshape(-1, 25))

        best_moves = []
        max_v = -9999
        start = 0
        for pos_move, n_replies in candidates:
            # predictions are stored in generation order, one run per own move
            stop = start + n_replies
            pos_move_v = min([values[k] for k in range(start, stop)])
            start = stop

            if pos_move_v > max_v:
                best_moves = [pos_move]
                max_v = pos_move_v
            elif pos_move_v == max_v:
                best_moves.append(pos_move)

        new_state = random.choice(best_moves)

        return new_state

    def reward(self, winner):
        """Return 1 when ``winner`` is this agent, 0 when it is -1, and -1 otherwise."""
        if winner == self.player_id:
            return 1
        return 0 if winner == -1 else -1

    def learn_state(self, state):
        """Store a training target for ``state``, run a training step and remember ``state``."""
        # 1) push the target for the previous state into the replay buffers
        self.calc_target(state)

        # 2) fit the value model on the buffers
        self.train_model(epochs= 10)

        # 3) the state just processed becomes the previous one
        self.prev_state = state

    def calc_value(self, state_array):
        """Return the value model's prediction for ``state_array`` (silent predict call)."""
        predictions = self.value_model.predict(state_array, verbose= 0)
        return predictions

        #########################################################################################################################

    def calc_target(self, state):
        """Compute the target for ``prev_state`` and push it into the replay buffers.

        Nothing is stored unless ``player_id`` appears among the values of
        ``state``.  The sample ``(prev_state, target)`` goes into ``memory``
        and into the buffer matching the reward sign (``memory_empty`` for 0,
        ``memory_win`` for positive, ``memory_lose`` for negative); each
        buffer drops its oldest entry once it exceeds ``batch_size * 10``.
        """
        winner = self.dummy.check_winner_board(np.array(state).reshape(5, 5))

        if self.player_id not in state:
            return

        v_s = self.calc_value(np.array(self.prev_state).reshape(1, 25))
        R = self.reward(winner)

        # the next-state value only counts while the game is undecided
        v_s_tag = self.calc_value(np.array(state).reshape(1, 25)) if winner == -1 else 0

        target = v_s + self.alpha * (R + v_s_tag - v_s)
        sample = (self.prev_state, target)
        capacity = self.batch_size * 10

        if R == 0:
            outcome_buffer = 'memory_empty'
        elif R > 0:
            outcome_buffer = 'memory_win'
        else:
            outcome_buffer = 'memory_lose'

        for buffer_name in ('memory', outcome_buffer):
            buffer = getattr(self, buffer_name)
            buffer.append(sample)
            if len(buffer) > capacity:
                # rebind to a trimmed copy, dropping the oldest sample
                setattr(self, buffer_name, buffer[1:])

            #print('-')
            #print(len(self.memory_empty))
            #print(len(self.memory_win))
            #print(len(self.memory_lose))
            #print('-')

            #

#            target = np.array(v_s + self.alpha * (R + v_s_tag - v_s))
#
#            return target
        
        #########################################################################################################################
            
        #########################################################################################################################

    def train_model(self, epochs= 10):
        """Fit the value network on one class-balanced replay mini-batch.

        Does nothing until each of ``memory_empty``, ``memory_win`` and
        ``memory_lose`` holds more than ``self.batch_size`` samples. After
        that, every call draws ``batch_size`` distinct samples from each buffer,
        interleaves them (empty, win, lose, empty, ...) and fits
        ``self.value_model`` for ``epochs`` epochs on the resulting
        ``3 * batch_size`` samples.
        """
        buffers = (self.memory_empty, self.memory_win, self.memory_lose)

        # Training starts only once every buffer can supply a full batch.
        if any(len(buffer) <= self.batch_size for buffer in buffers):
            return

        # Count trained steps down; at zero switch to the reduced exploration rate.
        if self.until_reduction > 0:
            self.until_reduction -= 1
            if self.until_reduction == 0:
                self.exploration_factor = self.exploration_factor_after

        # Early games have to be played purely at random, otherwise wins are
        # rare and the win buffer never fills.
        draws = []
        for buffer in buffers:
            idx_rand = np.random.choice(range(len(buffer)), size= (self.batch_size,), replace= False)
            draws.append([buffer[i] for i in idx_rand])

        # Interleave one empty / win / lose sample at a time.
        batch = [sample for triple in zip(*draws) for sample in triple]

        X_train = np.array([np.array(sample[0]) for sample in batch])
        Y_train = np.array([sample[1] for sample in batch])

        self.value_model.fit(X_train, Y_train, batch_size= self.batch_size * 3, epochs=epochs, verbose=0)

        #if len(self.memory_empty) > 2 and len(self.memory_win) > 2 and len(self.memory_lose) > 2:
        #    
        #    X_train = []
        #    Y_train = []
        #
        #    for _ in range(self.batch_size):
        #        choice = np.random.randint(-1, 2)
        #        if choice < 0:
        #            X_train.append(random.choice(self.memory_empty)[0])
        #            Y_train.append(random.choice(self.memory_empty)[1])
        #        elif choice == 0:
        #            X_train.append(random.choice(self.memory_win)[0])
        #            Y_train.append(random.choice(self.memory_win)[1])
        #        else:
        #            X_train.append(random.choice(self.memory_lose)[0])
        #            Y_train.append(random.choice(self.memory_lose)[1])
        #
        #    X_train = np.array(X_train)
        #    print(f'X_train shape: {X_train.shape}')
        #    Y_train = np.array(Y_train)
        #    print(f'Y_train shape: {Y_train.shape}')
        #
        #    self.value_model.fit(X_train, Y_train, batch_size= self.batch_size, epochs=epochs, verbose=0)

        #

#    def train_model(self, target, epochs):
#
#        X_train = np.array(self.prev_state).reshape(1, 25)
#
#        if target is not None: self.value_model.fit(X_train, target, epochs=epochs, verbose=0)

        #########################################################################################################################

    def train(self, n_games= 1000):
        """Play ``n_games`` training games, moving first, against a random opponent.

        Every game starts from an empty 5x5 board (all cells ``-1``). On each
        of its turns the agent calls ``make_move_and_learn`` on the current
        flattened state and the opponent answers with a uniformly random legal
        move. When the game ends, ``learn_state`` is called once more on the
        final state so that the outcome is propagated.
        """
        opponent_id = 1 - self.player_id

        for _ in tqdm(range(n_games)):

            # Build the board with a signed dtype. `np.ones(..., dtype=np.uint8) * -1`
            # only worked through NumPy 1.x value-based promotion (yielding int16)
            # and raises OverflowError on NumPy >= 2.
            board = np.full((5, 5), -1, dtype=np.int16)
            state = tuple(board.flatten())

            winner = -1
            while winner == -1:

                # NOTE: symmetry canonicalisation (check_simmetries /
                # inverse_simmetries) is currently disabled.
                from_pos, move = self.make_move_and_learn(state)

                # Play as the id the agent evaluates and is rewarded for,
                # not a hard-coded 0.
                board, ok = self.dummy.single_move(board, from_pos, move, self.player_id)

                # `single_move` can reject a move (the opponent branch below
                # already retries on that). A rejected move leaves the board
                # untouched, so without a retry the agent would silently
                # forfeit its turn.
                while not ok:
                    from_pos, move = ALL_MOVES[np.random.randint(0, N_ALL)]
                    board, ok = self.dummy.single_move(board, from_pos, move, self.player_id)

                state = tuple(board.flatten())

                winner = self.dummy.check_winner_board(board)

                if winner == -1:

                    # Opponent: uniformly random, resampled until legal.
                    from_pos, move = ALL_MOVES[np.random.randint(0, N_ALL)]
                    board, ok = self.dummy.single_move(board, from_pos, move, opponent_id)
                    while not ok:
                        from_pos, move = ALL_MOVES[np.random.randint(0, N_ALL)]
                        board, ok = self.dummy.single_move(board, from_pos, move, opponent_id)

                    state = tuple(board.flatten())

                    winner = self.dummy.check_winner_board(board)

            # Learn from the terminal position.
            self.learn_state(state)

    
    def make_move(self, game: 'Game') -> tuple[tuple[int, int], Move]:
        """Return the agent's move for the current position of ``game``.

        The greedy choice of ``make_optimal_move`` is normally returned. When
        the same state is seen again with the same greedy choice, a counter is
        decremented; once it reaches zero a random move is returned instead,
        drawn from the entries of ``ALL_MOVES`` whose source cell is not owned
        by the opponent, and the counter is reset to 2.
        """
        board = game.get_board()
        self.state = tuple(board.flatten())
        chosen = self.make_optimal_move(self.state)

        repeated = self.last_state == self.state and chosen == self.last_move
        if not repeated:
            self.until_change_move = 2
        else:
            self.until_change_move -= 1
            if self.until_change_move == 0:
                # Break the repetition with a random move from a cell the opponent does not hold.
                candidates = [
                    (from_pos, move)
                    for from_pos, move in ALL_MOVES
                    if board[from_pos[1], from_pos[0]] != 1 - self.player_id
                ]
                chosen = candidates[np.random.randint(0, len(candidates))]
                self.until_change_move = 2

        self.last_state = self.state
        self.last_move = chosen

        return chosen

In [20]:
# Build a fresh agent playing as id 0 and train it.
ThePlayer = DeepQAgent(0)
ThePlayer.train(1000)

# To do:
# - check single memory
# - reward_2
# - train for a while
# - more dense
# - conv

# Evaluate the trained agent, moving first, against a random player.
player1, player2 = ThePlayer, RandomPlayer()

n_trials = 1000
wins_first = 0

for _ in tqdm(range(n_trials)):
    g = Game()
    winner = g.play(player1, player2)
    wins_first += int(winner == 0)

print(f"Player won {wins_first} / {n_trials} as first")

init
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_12 (Dense)            (None, 75)                1950      
                                                                 
 dense_13 (Dense)            (None, 44)                3344      
                                                                 
 dense_14 (Dense)            (None, 1)                 45        
                                                                 
Total params: 5,339
Trainable params: 5,339
Non-trainable params: 0
_________________________________________________________________
init done


  5%|▌         | 533/10000 [42:03<12:27:01,  4.73s/it]


KeyboardInterrupt: 

In [17]:
# Second name for the current agent object (plain assignment, no copy is made).
player1_old = ThePlayer

In [18]:
# Evaluate `player1_old`, moving first, against a random player.
player2 = RandomPlayer()

n_trials = 100
wins_first = 0

for _ in tqdm(range(n_trials)):
    g = Game()
    winner = g.play(player1_old, player2)
    wins_first += int(winner == 0)

print(f"Player won {wins_first} / {n_trials} as first")

100%|██████████| 100/100 [05:51<00:00,  3.52s/it]

Player won 57 / 100 as first





In [95]:
import keras.layers as Kl
import keras.models as Km

class DeepQAgent(Player):
    """Quixo agent that scores successor boards with a small Keras value network.

    The network maps a flattened 5x5 board (25 numbers) to a single value.
    Moves are picked greedily on the value of the board they lead to, and
    learning is a temporal-difference update fitted one sample at a time.
    """

    def __init__(self, player_id= 0, exploration_factor= 0):
        """Create the agent and its untrained value network.

        player_id: id this agent plays as; the opponent is ``1 - player_id``.
        exploration_factor: probability of a uniformly random pick in training.
        """
        super().__init__()

        print('init')

        self.exploration_factor = exploration_factor
        self.alpha = 0.5

        self.prev_state = tuple([-1 for _ in range(25)])
        self.state = None

        self.player_id = player_id
        self.opponent_id = 1 - player_id

        self.dummy = Dummy_Game()

        self.values = dict()

        # Cache: (state, from_pos, move) -> board after that *legal* move.
        self.state_move_to_state = dict()

        self.value_model = self.create_model()

        print('init done')

    def create_model(self):
        """Build, compile and summarise the 25 -> 2 -> 2 -> 1 value network."""
        model = Km.Sequential()
        model.add(Kl.Dense(2, activation='relu', input_dim= 25))
        model.add(Kl.Dense(2, activation='relu'))
        model.add(Kl.Dense(1, activation='linear'))
        # NOTE(review): 'accuracy' is kept from the original compile call; it is
        # probably not informative for a real-valued regression target.
        model.compile(optimizer='adam', loss='mean_absolute_error', metrics=['accuracy'])

        model.summary()
        return model

    def make_move(self, game: 'Game') -> tuple[tuple[int, int], Move]:
        """Return the greedy move for the current board of ``game``."""
        self.state = tuple(game.get_board().flatten())
        return self.make_optimal_move(self.state)

    def choose_move_train(self, state):
        """Return a training move for ``state``: random with probability
        ``exploration_factor``, greedy otherwise.

        A random pick comes straight from ``ALL_MOVES`` and is not checked for
        legality here.
        """
        if random.random() < self.exploration_factor:
            return ALL_MOVES[np.random.randint(0, N_ALL)]
        # Use the argument rather than ``self.state`` so the method also works
        # when called on its own.
        return self.make_optimal_move(state)

    def make_move_and_learn(self, state):
        """Learn from the transition into ``state``, then choose the next move."""
        self.state = state

        self.learn_state(state)

        return self.choose_move_train(state)

    def make_optimal_move(self, state):
        """Return a legal move leading to a highest-valued successor board.

        Ties are broken uniformly at random. Raises ``IndexError`` (from
        ``random.choice``) if no candidate is left.
        """
        board = np.array(state).reshape(5, 5)
        state = tuple(board.flatten())

        successors = []
        pos_moves = []

        for from_pos, move in ALL_MOVES:

            key = (state, from_pos, move)
            temp_board = self.state_move_to_state.get(key)

            if temp_board is None:
                temp_board, ok = self.dummy.single_move(board, from_pos, move, self.player_id)
                if not ok:
                    # Never cache an illegal move: the old code stored its
                    # (unchanged) board and then treated the cache hit as a
                    # legal move on the next visit of this state.
                    continue
                self.state_move_to_state[key] = temp_board

            successors.append(temp_board.reshape(25,))
            pos_moves.append((from_pos, move))

        # One batched prediction for all legal successors.
        values = np.asarray(self.calc_value(np.array(successors).reshape(-1, 25))).reshape(-1)

        best = -float('Inf')
        best_moves = []
        for v_temp, pos_move in zip(values, pos_moves):
            if v_temp > best:
                best_moves = [pos_move]
                best = v_temp
            elif v_temp == best:
                best_moves.append(pos_move)

        return random.choice(best_moves)

    def reward(self, winner):
        """Return +1 if this agent won, 0 if nobody has (``-1``), -1 otherwise."""
        if winner == self.player_id:
            return 1
        if winner == -1:
            return 0
        return -1

    def learn_state(self, state):
        """Run one TD update for ``self.prev_state`` given successor ``state``,
        then make ``state`` the new ``prev_state``."""
        winner = self.dummy.check_winner_board(np.array(state).reshape(5, 5))

        target = self.calc_target(state, winner)

        self.train_model(target, 10)

        self.prev_state = state

    def calc_value(self, state_array):
        """Return the network prediction for ``state_array`` (progress bar off)."""
        return self.value_model.predict(state_array, verbose= 0)

    def calc_target(self, state, winner):
        """Return the TD target for ``self.prev_state``, or ``None``.

        ``None`` is returned while ``self.player_id`` does not occur in
        ``state``. The successor value is used for bootstrapping only while
        ``winner == -1``.
        """
        if self.player_id not in state:
            return None

        v_s = self.calc_value(np.array(self.prev_state).reshape(1, 25))

        R = self.reward(winner)

        if winner == -1:
            v_s_tag = self.calc_value(np.array(state).reshape(1, 25))
        else:
            v_s_tag = 0

        return np.array(v_s + self.alpha * (R + v_s_tag - v_s))

    def train_model(self, target, epochs):
        """Fit the network on ``(self.prev_state, target)``; no-op if ``target`` is ``None``."""
        if target is None:
            return

        X_train = np.array(self.prev_state).reshape(1, 25)
        self.value_model.fit(X_train, target, epochs=epochs, verbose=0)

    def train(self, n_games= 10000):
        """Play ``n_games`` training games, moving first, against a random opponent."""
        for _ in tqdm(range(n_games)):

            # Build the board with a signed dtype. `np.ones(..., dtype=np.uint8) * -1`
            # only worked through NumPy 1.x value-based promotion (yielding int16)
            # and raises OverflowError on NumPy >= 2.
            board = np.full((5, 5), -1, dtype=np.int16)
            state = tuple(board.flatten())

            winner = -1
            while winner == -1:

                # NOTE: symmetry canonicalisation (check_simmetries /
                # inverse_simmetries) is currently disabled.
                from_pos, move = self.make_move_and_learn(state)

                # Play as the id the agent evaluates and is rewarded for,
                # not a hard-coded 0.
                board, ok = self.dummy.single_move(board, from_pos, move, self.player_id)

                # An exploratory pick may be illegal; resample so the agent
                # does not silently forfeit its turn.
                while not ok:
                    from_pos, move = ALL_MOVES[np.random.randint(0, N_ALL)]
                    board, ok = self.dummy.single_move(board, from_pos, move, self.player_id)

                state = tuple(board.flatten())

                winner = self.dummy.check_winner_board(board)

                if winner == -1:

                    # Opponent: uniformly random, resampled until legal.
                    from_pos, move = ALL_MOVES[np.random.randint(0, N_ALL)]
                    board, ok = self.dummy.single_move(board, from_pos, move, self.opponent_id)
                    while not ok:
                        from_pos, move = ALL_MOVES[np.random.randint(0, N_ALL)]
                        board, ok = self.dummy.single_move(board, from_pos, move, self.opponent_id)

                    state = tuple(board.flatten())

                    winner = self.dummy.check_winner_board(board)

            # First call: update the last pre-terminal state towards the outcome.
            self.learn_state(state)
            # Second call: prev_state now equals the terminal state, so its own
            # value is fitted towards the reward as well.
            self.learn_state(state)

In [98]:
# Build a fresh agent playing as id 0 and train it briefly.
ThePlayer = DeepQAgent(0)
ThePlayer.train(100)

# Evaluate the trained agent, moving first, against a random player.
player1, player2 = ThePlayer, RandomPlayer()

n_trials = 100
wins_first = 0

for _ in tqdm(range(n_trials)):
    g = Game()
    winner = g.play(player1, player2)
    wins_first += int(winner == 0)

print(f"Player won {wins_first} / {n_trials} as first")

init
Model: "sequential_27"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_81 (Dense)            (None, 2)                 52        
                                                                 
 dense_82 (Dense)            (None, 2)                 6         
                                                                 
 dense_83 (Dense)            (None, 1)                 3         
                                                                 
Total params: 61
Trainable params: 61
Non-trainable params: 0
_________________________________________________________________
init done


  0%|          | 0/100 [00:00<?, ?it/s]

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


  1%|          | 1/100 [00:07<13:05,  7.93s/it]

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


  1%|          | 1/100 [00:10<17:58, 10.90s/it]

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0





KeyboardInterrupt: 

In [None]:
class Agent(Player):
    """Base class for value-learning Quixo agents with a two-ply lookahead.

    ``make_move_and_learn`` calls ``self.learn_state(state)`` and
    ``make_optimal_move`` calls ``self.calc_value(state)``; neither is defined
    here, so subclasses have to supply them.
    """

    def __init__(self, player_id= 0, exploration_factor=1):
        """Store the hyper-parameters and create the scratch game used for lookahead.

        player_id: id this agent plays as; the opponent is ``1 - player_id``.
        exploration_factor: probability of picking a uniformly random move.
        """
        super().__init__()

        self.exploration_factor = exploration_factor
        self.epsilon = 0.1
        self.alpha = 0.5

        self.prev_state = tuple([-1 for _ in range(25)])
        self.state = None

        self.player_id = player_id
        self.opponent_id = 1 - player_id

        self.dummy = Dummy_Game()

    def choose_move(self, game: 'Game') -> tuple[tuple[int, int], Move]:
        """Return a move for the current board of ``game`` (random or greedy)."""
        board = game.get_board()
        self.state = tuple(board.flatten())

        return self._select_move(self.state)

    def make_move(self, game: 'Game') -> tuple[tuple[int, int], Move]:
        """Same as ``choose_move``; offered under the name the other agents in
        this file use for their game-facing entry point."""
        return self.choose_move(game)

    def make_move_and_learn(self, state):
        """Learn from the transition into ``state``, then return the next move.

        The original called ``self.make_move(state)``, which this class did not
        define and which would have expected a game object, not a state tuple.
        """
        self.learn_state(state)

        self.state = state
        return self._select_move(state)

    def _select_move(self, state):
        """Pick a random entry of ``ALL_MOVES`` with probability
        ``exploration_factor``, the greedy move otherwise."""
        if random.random() < self.exploration_factor:
            return ALL_MOVES[np.random.randint(0, N_ALL)]
        return self.make_optimal_move(state)

    def make_optimal_move(self, state):
        """Return the legal move whose worst-case opponent reply has the highest value.

        For every legal move of this agent, all legal replies of the opponent
        are evaluated with ``self.calc_value`` and the minimum is taken. A move
        without any scored reply gets the optimistic value 1. Ties are broken
        uniformly at random.

        Raises ``Exception('temp state empty')`` if no legal move exists.
        """
        best_moves = []
        v = -float('Inf')

        board = np.array(state).reshape(5, 5)

        for from_pos, move in ALL_MOVES:

            temp_board, ok = self.dummy.single_move(board, from_pos, move, self.player_id)
            if not ok:
                # Illegal for us: not a candidate.
                continue

            reply_values = []
            for from_pos_op, move_op in ALL_MOVES:
                temp_board_op, ok_op = self.dummy.single_move(temp_board, from_pos_op, move_op, self.opponent_id)
                if not ok_op:
                    continue

                value = self.calc_value(tuple(temp_board_op.flatten()))
                # Drop missing values explicitly; `filter(None.__ne__, ...)`
                # depended on the truthiness of NotImplemented.
                if value is not None:
                    reply_values.append(value)

            if reply_values:
                v_temp = np.min(reply_values)
            else:
                # encourage exploration
                v_temp = 1

            if v_temp > v:
                best_moves = [(from_pos, move)]
                v = v_temp
            elif v_temp == v:
                best_moves.append((from_pos, move))

        # random.choice raises IndexError (not ValueError) on an empty list, so
        # the old `except ValueError` never fired; check explicitly instead.
        if not best_moves:
            print('temp state:', best_moves)
            raise Exception('temp state empty')

        # Return the move itself: callers hand this value back as a
        # (from_pos, move) pair, not as a board state.
        return random.choice(best_moves)

    def reward(self, winner):
        """Return +1 if this agent won, 0 if nobody has (``-1``), -1 otherwise."""
        if winner == self.player_id:
            R = 1
        elif winner == -1:
            R = 0
        else:
            R = -1
        return R

class DeepAgent(Agent):
    """``Agent`` whose state values come from a Keras network stored on disk.

    ``tag`` doubles as the player id passed to ``Agent`` and as the suffix of
    the model file name (``model_values<tag>.h5``).
    """

    def __init__(self, tag, exploration_factor=1):
        """Initialise the base agent with ``tag`` as player id and load or build the model."""
        super().__init__(tag, exploration_factor)
        self.tag = tag
        self.value_model = self.load_model()

    @staticmethod
    def state2array(state):
        """Encode ``state`` as a ``(1, len(state))`` array for the network.

        String cells keep the historical mapping ('X' -> 1, 'O' -> -1, any other
        string -> 0). Non-string cells are passed through unchanged; the
        previous code turned every numeric cell into 0, which erased the board.
        """
        symbols = {'X': 1, 'O': -1}
        encoded = [symbols.get(s, 0) if isinstance(s, str) else s for s in state]
        return np.array([encoded])

    def learn_state(self, state, winner=None):
        """Run one TD update for ``self.prev_state`` given successor ``state``.

        ``winner`` may be omitted (``Agent.make_move_and_learn`` passes only
        the state); it is then read from the 5x5 board, as ``DeepQAgent`` does.
        """
        if winner is None:
            winner = self.dummy.check_winner_board(np.array(state).reshape(5, 5))

        target = self.calc_target(state, winner)

        self.train_model(target, 10)

        self.prev_state = state

    def load_model(self):
        """Load ``model_values<tag>.h5`` if it exists, otherwise build a new network."""
        # Local import: no import of Path is visible in this part of the file.
        from pathlib import Path

        # f-string so that an integer tag (a player id) works as well as a string.
        s = f'model_values{self.tag}.h5'
        model_file = Path(s)
        if model_file.is_file():
            model = Km.load_model(s)
            print('load model: ' + s)
        else:
            print('new model')
            # One input per board cell (25, from Agent.prev_state), not the 9
            # inputs of the previous version.
            model = Km.Sequential()
            model.add(Kl.Dense(18, activation='relu', input_dim=len(self.prev_state)))
            model.add(Kl.Dense(18, activation='relu'))
            model.add(Kl.Dense(1, activation='linear'))
            model.compile(optimizer='adam', loss='mean_absolute_error', metrics=['accuracy'])

        model.summary()
        return model

    def calc_value(self, state):
        """Return the network prediction for ``state`` (progress bar off)."""
        return self.value_model.predict(self.state2array(state), verbose=0)

    def calc_target(self, state, winner):
        """Return the TD target for ``self.prev_state``, or ``None``.

        ``None`` is returned while ``self.tag`` does not occur in ``state``.
        Both ``None`` and ``-1`` mean "no winner yet": ``Agent.reward`` uses
        ``-1`` while this method used to test for ``None``, so an open game was
        either rewarded as a loss or treated as finished.
        """
        if winner is None:
            winner = -1

        if self.tag not in state:
            return None

        v_s = self.calc_value(self.prev_state)

        R = self.reward(winner)

        # Bootstrap from the successor only while the game is still open.
        if winner == -1:
            v_s_tag = self.calc_value(state)
        else:
            v_s_tag = 0

        return np.array(v_s + self.alpha * (R + v_s_tag - v_s))

    def train_model(self, target, epochs):
        """Fit the network on ``(self.prev_state, target)``; no-op if ``target`` is ``None``."""
        if target is None:
            return

        X_train = self.state2array(self.prev_state)
        self.value_model.fit(X_train, target, epochs=epochs, verbose=0)


def check_player():
    """Print a banner, then build a ``TicTacToe`` match and call ``play_game()`` on it."""
    # Earlier experiments, kept for reference:
    #   print('QAgent X 1 and QAgent 1 0')
    #   TicTacToe('QAgent', 'QAgent', 1, 0).play_to_learn(1000)
    #   print('DeepAgent X 0.8 and DeepAgent 0.8')
    #   TicTacToe('DeepAgent', 'DeepAgent', 0.8, 0.8).play_to_learn(30000)
    # NOTE(review): the banner text does not match the players built below.
    print('DeepAgent X 0 and QAgent 1, 0')
    TicTacToe('Player', 'DeepAgent', 0.8, 0.8).play_game()


# Invoke the check immediately when this code is executed.
check_player()