In [None]:
import random
from game import Game, Player, Move
import numpy as np
from copy import copy, deepcopy
from hashlib import sha1
from tqdm import tqdm
import matplotlib.pyplot as plt
import json


class Training_game(Game):
    """Game variant exposing the hooks the learning agent needs.

    It gives direct access to the base class's private move routine, lets the
    board be replaced wholesale, and records the final position of each game
    on the player that is being trained.
    """

    def __init__(self) -> None:
        super().__init__()

    def move(self, move, id):
        """Apply ``move`` for player ``id`` and return the base class's result.

        ``move`` is indexed as ``(from_pos, slide)``. The call uses
        ``_Game__move``, the name-mangled form of a double-underscore method
        on ``Game``, which is not reachable under its plain name from a
        subclass. Callers in this class treat the result as a success flag.
        """
        return super()._Game__move(move[0], move[1], id)

    def set_board(self, board):
        """Replace the internal board with ``board`` (stored by reference)."""
        self._board = board

    def change_board(self, move, player_id):
        """Try ``move`` for ``player_id`` and report the board after and before.

        Returns:
            ``(board_after, board_before)`` when the move is accepted, or
            ``None`` when it is rejected. Callers that unpack the result must
            check for ``None`` first.
        """
        board_before = copy(self._board)
        if not self.move(move, player_id):
            return None
        return self.get_board(), board_before

    def play(self, player1: Player, player2: Player, first=True) -> int:
        """Play the game to completion and return the winning player.

        Args:
            player1: Player occupying index 0 of the turn order.
            player2: Player occupying index 1 of the turn order.
            first: Selects which player receives the final board in its
                ``states`` list: ``player1`` when true, ``player2`` otherwise.

        Returns:
            The value reported by ``check_winner`` once it is non-negative.
        """
        players = [player1, player2]
        winner = -1
        while winner < 0:
            self.current_player_idx += 1
            self.current_player_idx %= len(players)
            ok = False
            # Keep asking the same player until the game accepts a move.
            while not ok:
                from_pos, slide = players[self.current_player_idx].make_move(
                    self)
                ok = self.move([from_pos, slide], self.current_player_idx)
            winner = self.check_winner()

        tracked = player1 if first else player2
        # Only learning players keep a ``states`` history; players without one
        # (e.g. RandomPlayer) are skipped instead of raising AttributeError.
        history = getattr(tracked, "states", None)
        if history is not None:
            # Store a snapshot so the history is decoupled from the live board.
            history.append(copy(self._board))
        return winner


class RandomPlayer(Player):
    """Baseline player that proposes uniformly random moves."""

    def __init__(self) -> None:
        super().__init__()

    def make_move(self, game: 'Game') -> tuple[tuple[int, int], Move]:
        """Return a random position and a random slide direction.

        Both coordinates are drawn from 0..4 inclusive and the direction from
        the four ``Move`` values. No legality check is made here and ``game``
        is not inspected.
        """
        first_coord = random.randint(0, 4)
        second_coord = random.randint(0, 4)
        directions = [Move.TOP, Move.BOTTOM, Move.LEFT, Move.RIGHT]
        slide = random.choice(directions)
        return (first_coord, second_coord), slide


class TrainingAgent(Player):
    """Player that learns a table of board values from the games it plays.

    Boards are keyed in ``Q`` by the SHA-1 hex digest of their raw bytes.
    During a game the agent records every board it is asked to move on;
    ``train`` then walks the recorded boards backwards and pulls each value
    towards the discounted value of its successor.
    """

    def __init__(self, player_id, lr, gamma, num_training_games, exploit) -> None:
        """Create an agent with an empty value table.

        Args:
            player_id: Value that marks this agent's pieces on the board and
                that is compared with the game result to detect a win.
            lr: Step size of each value update.
            gamma: Discount applied to the value propagated backwards.
            num_training_games: Number of games over which exploration decays;
                once that many games have been played exploration stops.
            exploit: When true the agent never explores.
        """
        super().__init__()
        self.player_value = player_id
        self.lr = lr
        self.gamma = gamma
        self.exploit = exploit
        self.states = []  # boards seen during the game in progress
        self.Q = {}  # board hash -> learned value
        self.games_played = 0
        self.num_training_games = num_training_games

    def make_move(self, game: 'Game') -> tuple[tuple[int, int], Move]:
        """Record the current board and choose a move epsilon-greedily.

        With probability epsilon a random legal move is returned; otherwise
        each legal move is simulated on a scratch game and the one whose
        resulting board has the highest stored value wins (unseen boards count
        as 0, ties go to the first move generated).

        Raises:
            RuntimeError: If no move is available or the game rejects every
                candidate move.
        """
        current_board = np.copy(game.get_board())
        self.states.append(current_board)

        # Exponentially decaying exploration rate. Comparing the counters
        # directly avoids dividing by zero when num_training_games is 0.
        if self.exploit or self.games_played >= self.num_training_games:
            epsilon = 0
        else:
            epsilon = np.exp(-1.5 * (self.games_played / self.num_training_games))

        moves = self.get_legal_moves(current_board)
        if not moves:
            raise RuntimeError("no legal move available for this player")

        if np.random.rand() < epsilon:
            return random.choice(moves)

        # NOTE(review): the boards scored below are positions reached *after*
        # this agent's move, whereas ``states`` holds the boards seen *before*
        # moving (plus the final board appended by Training_game.play).
        # Confirm that table lookups are expected to hit often enough.
        best_move = None
        highest_value = -np.inf
        for move in moves:
            # Simulate on a private copy so the real game and the recorded
            # board are left untouched.
            trial = Training_game()
            trial.set_board(np.copy(current_board))
            outcome = trial.change_board(move, self.player_value)
            if outcome is None:
                # The game refused this move; it cannot be evaluated.
                continue
            new_board, _ = outcome
            value = self.Q.get(self.hash_board(new_board), 0)
            if value > highest_value:
                highest_value = value
                best_move = move

        if best_move is None:
            raise RuntimeError("the game rejected every candidate move")
        return best_move

    def train(self, num_games, opponent=None):
        """Play ``num_games`` games as second player and update ``Q``.

        After each game the reward (+1 for a win, -1 otherwise) is propagated
        backwards through the recorded boards. Every board and its three
        rotations by 90 degrees receive the same update, so symmetric
        positions share what was learned.

        Args:
            num_games: Number of games to play.
            opponent: Player to train against; a ``RandomPlayer`` is used
                when omitted.
        """
        rival = opponent if opponent is not None else RandomPlayer()
        # NOTE(review): nothing in this class reads ``num_games_played``; the
        # exploration schedule uses ``games_played``, which is never reset.
        # Confirm whether a reset of ``games_played`` was intended here.
        self.num_games_played = 0
        for _ in tqdm(range(num_games)):
            game = Training_game()
            result = game.play(rival, self, first=False)

            reward = 1 if self.player_value == result else -1

            for state in reversed(self.states):
                own_key = self.hash_board(state)
                # Hash each rotation separately. A set prevents a board that
                # maps onto itself under rotation from being updated twice.
                keys = {own_key}
                keys.update(
                    self.hash_board(np.rot90(state, quarter_turns))
                    for quarter_turns in range(1, 4)
                )
                target = self.gamma * reward
                for key in keys:
                    old_value = self.Q.get(key, 0)
                    self.Q[key] = old_value + self.lr * (target - old_value)
                # The freshly updated value becomes the target of the
                # preceding board.
                reward = self.Q[own_key]
            self.states.clear()
            self.games_played += 1

    def get_legal_moves(self, board: np.ndarray):
        """List every ``((col, row), slide)`` move this player may attempt.

        A piece can be picked from any border cell that is neutral (``-1``)
        or already carries this player's value. It may be slid from every
        direction except towards an edge it already lies on, so corner cells
        get two directions and other border cells three. Each move appears
        exactly once.
        """
        rows, cols = board.shape
        last_row, last_col = rows - 1, cols - 1
        moves = []
        for row in range(rows):
            for col in range(cols):
                on_border = row in (0, last_row) or col in (0, last_col)
                if not on_border:
                    continue
                cell = board[row, col]
                if cell != -1 and cell != self.player_value:
                    continue
                # Positions are reported as (column, row).
                position = (col, row)
                if row != 0:
                    moves.append((position, Move.TOP))
                if row != last_row:
                    moves.append((position, Move.BOTTOM))
                if col != 0:
                    moves.append((position, Move.LEFT))
                if col != last_col:
                    moves.append((position, Move.RIGHT))
        return moves

    def hash_board(self, board):
        """Return the SHA-1 hex digest of the board's raw bytes."""
        return sha1(board.tobytes()).hexdigest()

    def epsilon(self, n):
        """Alternative logarithmic exploration schedule.

        Returns ``1 - log2(1 + n / num_training_games)`` while ``n`` is below
        ``num_training_games`` and 0 afterwards. ``make_move`` does not call
        this method; it computes its own exponential schedule.
        """
        if n < self.num_training_games:
            return 1 - np.log2(1 + (n / self.num_training_games))
        return 0


In [None]:
# This cell was used to train the agent before writing the Q-table to JSON. It is commented out so that it does not run when this notebook is imported.


# num_training_games = 2000
# player_id = 1
# num_epochs = 100
# player = TrainingAgent(player_id, 0.01, 0.9, num_training_games, False)
# x = []
# w = []

# for i in range(num_epochs):
#     player.train(num_training_games)
#     r = RandomPlayer()
#     num_games = 1000
#     results = []
#     for _ in tqdm(range(num_games)):
#         r2 = RandomPlayer()
#         t = Training_game()
#         result = t.play(r, player, False)
#         results.append(result)

#     print(f"player winrate: {results.count(player_id) / num_games}")

#     w.append(results.count(player_id) / num_games)
#     x.append(i * num_training_games)
    
# plt.plot(x, w)

In [None]:
# This cell writes the values of the Q-table to a JSON file. It is commented out so that the currently saved file is not overwritten.


# file_path = 'Q2_000_000_second.json'
# with open(file_path, 'w') as file:
#     json.dump(player.Q, file, indent=2) 