In [46]:
import hashlib
import pickle
import time
import numpy as np

In [47]:
rows = 3
cols = 3

learning_rate: float = 0.01
discount_factor: float = 0.05
exploration_rate: float = 0.45 # 45% Memory 65% Greed

default_reward = 100

debug = False

In [48]:
def empty_board():
    return np.zeros((rows, cols))

def get_legal_moves():
    legal = set()
    for i in range(rows):
        for j in range(cols):
            legal.add((i, j))
    return legal

symbol_dict = {
    -1: "0",
    0: " ",
    1: "X"
}

def print_board(board):
    print("_____________")
    for row in range(rows):
        row_string = ""
        for column in range(cols):
            if column == 0:
                row_string += f"| {symbol_dict.get(board[row][column])} |"
            elif column == 1:
                row_string += f" {symbol_dict.get(board[row][column])} |"
            else:
                row_string += f" {symbol_dict.get(board[row][column])} |"
        print(row_string)
        if row != 2:
            print("|---+---+---|")
    print("-------------")

In [49]:
# Board Fields
current_board = empty_board()
legal_moves = get_legal_moves()
current_board_hash = None

# State Fields
game_finished = False
winner = None

# Player Fields
active_player = 1

def check_for_winner(board):
    rows, cols = board.shape
    diag_sum_inc, diag_sum_dec = 0, 0

    # Loop over the rows
    for i in range(rows):
        row_sum = sum(board[i, :])
        if row_sum == 3 or row_sum == -3:
            return True, row_sum // 3
    # Loop over the columns
    for i in range(cols):
        col_sum = sum(board[:, i])
        if col_sum == 3 or col_sum == -3:
            return True, col_sum // 3
        diag_sum_inc += board[i, i]
        diag_sum_dec += board[i, cols - i - 1]
    diag_sum = max(abs(diag_sum_inc), abs(diag_sum_dec))
    if diag_sum == 3:
        return True, diag_sum_inc // 3
    if len(legal_moves) == 0: return True, None
    return False, None

def reset():
    if True:
        print(f"Resetting States")
    # Reset the Current Board
    global current_board
    current_board = empty_board()
    # Reset legal moves for the current board
    global legal_moves
    legal_moves = get_legal_moves()
    # Reset the Currently Hashed Board
    global current_board_hash
    current_board_hash = None
    # Reset the Finished State
    global game_finished
    game_finished = False
    # Reset the Winning Player
    global winner
    winner = None
    # Reset the Active Player
    global active_player
    active_player = 1

def get_hashed_board():
    global current_board_hash
    current_board_hash = str(current_board.reshape(cols * rows))
    return current_board_hash

def update_board_state(position: (int, int), symbol):
    if type(current_board) is None: print("What the actual fuck!?")
    current_board[position[0]][position[1]] = symbol
    if position in legal_moves:
        legal_moves.remove(position)
    if active_player == -1: return  1
    else: return -1

def play(rounds: int = 100):
    global active_player
    global debug
    game_won_1 = 0
    game_won_2 = 0
    game_drawn = 0
    for i in range(rounds):
        if rounds > 100 and i % (rounds / 100) == 0:
            print(f"Round {i}: ")
        is_won, player = check_for_winner(current_board)
        while not is_won:
            if debug:
                print(f"While loop active player {active_player}")
            if active_player == 1:
                move = agent_1.move(legal_moves, current_board, active_player)
                active_player = update_board_state(move, active_player)
                hashed = get_hashed_board()
                agent_1.add_memory(hashed)
                is_won, player = check_for_winner(current_board)
            else:
                move = agent_2.move(legal_moves, current_board, active_player)
                active_player = update_board_state(move, active_player)
                hashed = get_hashed_board()
                agent_2.add_memory(hashed)
                is_won, player = check_for_winner(current_board)
        if is_won:
            if player == 1: game_won_1 += 1
            elif player == None: game_drawn += 1
            else: game_won_2 += 1
            if True:
                print_board(current_board)
                msg = "It's a draw!"
                if player == 1:
                    msg = f"Congratulations on the win: Player 1: '{agent_1.alias}'"
                elif player == -1:
                    msg = f"Congratulations on the win: Player 2: '{agent_2.alias}'"
                print(msg)
            reward(player)
            agent_1.reset()
            agent_2.reset()
            reset()

    reset()
    print(f"Game Statistics:")
    print(f"Game Won by Player 1: {game_won_1} ({str((game_won_1/rounds)*100) + '%'})")
    print(f"Game Won by Player 2: {game_won_2} ({str((game_won_2/rounds)*100) + '%'})")
    print(f"Games that resulted in a draw: {game_drawn} ({str((game_drawn/rounds)*100) + '%'})")
    if isinstance(agent_1, MachineAgent):
        agent_1.save_pickle_policy(f"./a_1_{rounds}_policy")
    if isinstance(agent_2, MachineAgent):
        agent_2.save_pickle_policy(f"./a_2_{rounds}_policy")

def reward(champion):
    if champion == 1:
        agent_1.reward(default_reward)
        agent_2.reward(0)
    elif champion == -1:
        agent_1.reward(0)
        agent_2.reward(default_reward)
    else:
        agent_1.reward(default_reward / 10) # 100 / 10  = 10
        agent_2.reward(default_reward / 2)  # 100 / 02  = 50

In [50]:
def get_agent_hashed_board(board):
    return str(board.reshape(cols * rows))

alias_count = 0

class Agent:
    def __init__(self, alias: str = None):
        self.memory = {}
        self.taken_state_hashes = []
        global alias_count
        if alias is None:
            self.alias = "agent_" + str(alias_count)
            alias_count += 1
        else:
            self.alias = alias

    def move(self, valid_positions, board, symbol):
        pass  # STUB

    def reward(self, reward_value: int):
        pass  # STUB

    def get_alias(self):
        return self.alias

    def add_memory(self, hashed):
        pass  # STUB

    def reset(self):
        if debug:
            print(f"Player: {self.alias} is reset!")
        self.taken_state_hashes = []

minimax_calls = 0

class MachineAgent(Agent):
    def __init__(self):
        self.cache = {}                 # Minimax Cache to save on operational overhead
        self.use_policy_only = False   # Value to determine if the agent should move based off solely the policy or policy + lazy options.
        super().__init__()

    def move(self, valid_positions: [(int, int)], board, symbol: int):
        if debug: print("move") # DEBUG
        action = None
        # If it doesn't have a loaded policy, aka it's training, and it's randomized 0->1 value falls below the exploration rate threshold.
        if not self.use_policy_only and np.random.uniform(0, 1) <= exploration_rate:
            # Lazily choose between "Random Move" and "MiniMax Optimized Move".
            choice = np.random.choice([0,1], p=[0.6, 0.4])
            if choice == 0:     # Random Choice
                move = legal_moves.pop() # Pops a random positional value from the set.
            else:   # MiniMax
                move = self.optimal_move(valid_positions, board.copy(), symbol) # Starts a call chain to find the minimax optimized positional choice.
            action = move
        else: # Otherwise go with a memorized value from the memory policy map.
            policy_max = float("-inf")                          # Set the default to negative infinity
            for move in valid_positions:                        # Loop over all valid legal positions to take
                next_board = board.copy()                       # Copy the board
                next_board[move[0]][move[1]] = symbol           # Set the positional value to be this players sign
                hashed = get_agent_hashed_board(next_board)     # Hash the currently changed board
                policy_value = 0                                # Set the local policy value to 0
                if self.memory.get(hashed) is None:             # If there isn't a memory for this hashed state
                    pass                                        # Then return nothing and let the policy value be 0
                else:                                           # Otherwise
                    policy_value = self.memory.get(hashed)      # Grab the policy value from the memory map
                if policy_value >= policy_max:                  # If the local policy_value is larger than or equal to the policy_max
                    policy_max = policy_value                   # Set the max to the new max
                    action = move                               # Set action to this move

        if action is None:                                          # If there is no action found
            if debug: print("No optimal moves, selecting first")    # DEBUG
            action = valid_positions.pop()                          # Then pop a random value from the valid_positions set.
        return action                                               # Return the chosen action

    def optimal_move(self, valid_positions, board, symbol):
        global minimax_calls
        if debug:
            print("optimal_move")
        best_score = float('-inf')
        best_move = None
        for pos in valid_positions:
            x, y = pos
            board[x][y] = symbol
            if debug:
                minimax_calls = 0
                print("minimax")
            score = self.minimax(board.copy(), 0, True)
            if debug:
                print(f"minimax was called {minimax_calls} times")
            board[x][y] = 0
            if score > best_score:
                best_score = score
                best_move = (x, y)
        return best_move

    def minimax(self, board, depth: int, is_maximizing: bool):
        global minimax_calls
        minimax_calls += 1
        if self.cache.get(get_agent_hashed_board(board)) is not None:
            return self.cache.get(get_agent_hashed_board(board))
        is_won, player = check_for_winner(board)
        if is_won:
            return player
        best_score = None
        if is_maximizing:
            best_score = float('-inf')
            for x in range(board.shape[0]):
                for y in range(board.shape[1]):
                    if board[x][y] == 0:
                        board[x][y] = 1
                        score = self.minimax(board.copy(), depth + 1, False)
                        board[x][y] = 0
                        best_score = float(max(score, best_score))
        else:
            best_score = float('inf')
            for x in range(board.shape[0]):
                for y in range(board.shape[1]):
                    if board[x][y] == 0:
                        board[x][y] = -1
                        score = self.minimax(board.copy(), depth + 1, True)
                        board[x][y] = 0
                        best_score = float(min(score, best_score))

        self.cache[get_agent_hashed_board(board)] = best_score
        return best_score

    def reward(self, reward_value: int):
        if self.use_policy_only:
            pass
        if debug:
            print("reward")
        for state in reversed(self.taken_state_hashes):
            if self.memory.get(state) is None:
                self.memory[state] = 0
                        # Reward /w No Prior Values for State:
                        # Winner: 0.01          * (0.05            * 100          - 0) = 0.050
                        # Loser:  0.01          * (0.05            * 0            - 0) = 0.000
                        # Draw 1: 0.01          * (0.05            * 10           - 0) = 0.005
                        # Draw 2: 0.01          * (0.05            * 50           - 0) = 0.025
            self.memory[state] += learning_rate * (discount_factor * reward_value - self.memory[state])
            reward_value = self.memory[state]

    def add_memory(self, hashed):
        if debug:
            print("add_memory")
        self.taken_state_hashes.append(hashed)

    def save_pickle_policy(self, file):
        if debug:
            print("save_pickle_policy")
        writing = open(file, 'wb')
        pickle.dump(self.memory, writing)
        writing.close()

    def load_pickle_policy(self, file, ignore_policy_only: bool = False):
        if debug:
            print("load_pickle_policy")
        reading = open(file, 'rb')
        self.memory = pickle.load(reading)
        reading.close()
        self.use_policy_only = not ignore_policy_only

class HumanAgent(Agent):
    def __init__(self, name: str):
        super().__init__()
        self.alias = name

    def move(self, valid_positions, board, symbol):
        if debug:
            print("move")
        print(f"Valid Moves: {get_legal_moves()}")
        print(print_board(board))
        print()
        time.sleep(1)
        move = input("Which position do you want to take? [Format: 'X,Y', '0,1']")
        xy = move.split(',')
        move = (int(xy[0]), int(xy[1]))
        if move in valid_positions:
            return move

class RandomAgent(Agent):
    def __init__(self):
        super().__init__()

    def move(self, valid_positions: set, board, symbol):
        move = valid_positions.pop()
        return move # Just pop a random valid position

In [51]:
# Agents for Training
agent_1 = MachineAgent()
agent_2 = MachineAgent()

# Human Player
# Uncomment to play as a player
#agent_1.load_pickle_policy('./a_1_1000000_policy')
#agent_2 = HumanAgent(input("Whats your name?"))

In [52]:
# Logging Disabled
tic = time.time()
play(10000)
toc = time.time()

print(f"Finished running training set in roughly {toc-tic:0.4f} seconds.")

# Logging Enabled
# play(100, debug=True)

Round 0: 
_____________
| X | 0 | 0 |
|---+---+---|
| X | 0 | X |
|---+---+---|
|   | 0 | X |
-------------
Congratulations on the win: Player 2: 'agent_1'
Resetting States
_____________
| 0 | X | X |
|---+---+---|
| 0 | X | 0 |
|---+---+---|
| X | 0 | X |
-------------
It's a draw!
Resetting States
_____________
| 0 | X | X |
|---+---+---|
| 0 | X | 0 |
|---+---+---|
| X | 0 | X |
-------------
It's a draw!
Resetting States
_____________
| X | X | 0 |
|---+---+---|
| 0 | 0 | X |
|---+---+---|
| X | X | 0 |
-------------
It's a draw!
Resetting States
_____________
| X | X | 0 |
|---+---+---|
| 0 | 0 | X |
|---+---+---|
| X | X | 0 |
-------------
It's a draw!
Resetting States
_____________
|   | X | 0 |
|---+---+---|
| X |   | 0 |
|---+---+---|
|   | X | 0 |
-------------
Congratulations on the win: Player 2: 'agent_1'
Resetting States
_____________
| X | X | 0 |
|---+---+---|
| X | X | 0 |
|---+---+---|
| 0 | 0 | X |
-------------
Congratulations on the win: Player 1: 'agent_0'
Resett

In [53]:
reading = open('./a_1_10000_policy', 'rb')
a1_policy = pickle.load(reading)
reading.close()

reading = open('./a_2_10000_policy', 'rb')
a2_policy = pickle.load(reading)
reading.close()

print("Policy Length: " + str(len(a1_policy)))
print("Key-Values for Policy 1:")
for k, v in a1_policy.items(): print(f"Key: {k}, Value: {v}")

print()

print("Policy Length: " + str(len(a2_policy)))
print("Key-Values for Policy 2:")
for k, v in a1_policy.items(): print(f"Key: {k}, Value: {v}")

Policy Length: 34
Key-Values for Policy 1:
Key: [ 1. -1. -1.  1.  0.  1.  0. -1.  1.], Value: 0.0
Key: [ 0. -1. -1.  1.  0.  1.  0.  0.  1.], Value: 0.0
Key: [ 0. -1.  0.  1.  0.  0.  0.  0.  1.], Value: 0.0
Key: [0. 0. 0. 1. 0. 0. 0. 0. 0.], Value: 0.0
Key: [-1.  1.  1. -1.  1. -1.  1. -1.  1.], Value: 0.00995
Key: [ 0.  1.  1. -1.  0. -1.  1. -1.  1.], Value: 7.45e-06
Key: [ 0.  1.  1. -1.  0. -1.  0.  0.  1.], Value: 4.962500000000001e-09
Key: [ 0.  1.  0. -1.  0.  0.  0.  0.  1.], Value: 3.100000000000001e-12
Key: [0. 1. 0. 0. 0. 0. 0. 0. 0.], Value: 0.005671910491245569
Key: [ 1.  1. -1. -1. -1.  1.  1.  1. -1.], Value: 0.00995
Key: [ 1.  1. -1. -1.  0.  1.  0.  1. -1.], Value: 7.45e-06
Key: [ 0.  1.  0. -1.  0.  1.  0.  1. -1.], Value: 4.962500000000001e-09
Key: [ 0.  1.  0. -1.  0.  1.  0.  0.  0.], Value: 3.100000000000001e-12
Key: [ 0.  1.  0.  1.  0. -1.  0.  1. -1.], Value: 0.0
Key: [ 0.  1.  0.  1.  0. -1.  0.  0.  0.], Value: 6.250000000000003e-12
Key: [ 1.  1. -1.  1.  1.

In [54]:
agent_1 = MachineAgent()
agent_1.load_pickle_policy('./a_1_10000_policy')
play(10000)

Round 0: 
_____________
|   | X | 0 |
|---+---+---|
| 0 |   | 0 |
|---+---+---|
| X | X | X |
-------------
Congratulations on the win: Player 1: 'agent_2'
Resetting States
_____________
| 0 | X |   |
|---+---+---|
|   | X | 0 |
|---+---+---|
|   | X |   |
-------------
Congratulations on the win: Player 1: 'agent_2'
Resetting States
_____________
| 0 | X |   |
|---+---+---|
|   | X | 0 |
|---+---+---|
|   | X |   |
-------------
Congratulations on the win: Player 1: 'agent_2'
Resetting States
_____________
| 0 | X |   |
|---+---+---|
| 0 | X | 0 |
|---+---+---|
|   | X | X |
-------------
Congratulations on the win: Player 1: 'agent_2'
Resetting States
_____________
| 0 | X |   |
|---+---+---|
|   | X | 0 |
|---+---+---|
|   | X |   |
-------------
Congratulations on the win: Player 1: 'agent_2'
Resetting States
_____________
| 0 | X |   |
|---+---+---|
| 0 | X | 0 |
|---+---+---|
|   | X | X |
-------------
Congratulations on the win: Player 1: 'agent_2'
Resetting States
____________

In [55]:
agent_2 = RandomAgent()
play(10000)

Round 0: 
_____________
| 0 | X |   |
|---+---+---|
|   | X | 0 |
|---+---+---|
|   | X |   |
-------------
Congratulations on the win: Player 1: 'agent_2'
Resetting States
_____________
| 0 | X |   |
|---+---+---|
|   | X | 0 |
|---+---+---|
|   | X |   |
-------------
Congratulations on the win: Player 1: 'agent_2'
Resetting States
_____________
| 0 | X |   |
|---+---+---|
|   | X | 0 |
|---+---+---|
|   | X |   |
-------------
Congratulations on the win: Player 1: 'agent_2'
Resetting States
_____________
| 0 | X |   |
|---+---+---|
|   | X | 0 |
|---+---+---|
|   | X |   |
-------------
Congratulations on the win: Player 1: 'agent_2'
Resetting States
_____________
| 0 | X |   |
|---+---+---|
|   | X | 0 |
|---+---+---|
|   | X |   |
-------------
Congratulations on the win: Player 1: 'agent_2'
Resetting States
_____________
| 0 | X |   |
|---+---+---|
|   | X | 0 |
|---+---+---|
|   | X |   |
-------------
Congratulations on the win: Player 1: 'agent_2'
Resetting States
____________