Can generate test cases by having random bots play and screenshoting position 1 or 2 moves before game is won. 
Alternatively use Illustrator().get_winning_moves()

For the future considerations

- Reduce coupling in game class (certain variables must be updated simultaneously which is bad)
- Make sure illustrator doesn't use boardgame directly
- Make sure nothing uses boardgame directly - need to practice good OOP principles

List of choices made: 

- Reward lies between 0 and 1 
- Player is -1 or 1 
- Game state includes color 

In [1]:
import numpy as np
from __future__ import annotations

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from matplotlib import pyplot as plt

import pygame
from pygame.locals import *

from itertools import product
import json
import time

pygame 2.0.1 (SDL 2.0.14, Python 3.8.16)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
with open("winning_positions", "r") as f:
    data = json.load(f)
        
    diags = {int(k): v for k, v in data['diags'].items()}
    space_diags = {int(k): v for k, v in data['space_diags'].items()}

In [76]:
class BG: 
    
    def __init__(self):
        self.reset()
               
    # Will update self.reward, self.done
    def check_done(self, a):
        
        def convert(p):
            return p // 16, (p % 16) // 4, p % 4
        
        def unconvert(i, j, k):
            return 16 * i + 4 * j + k
        
        if self.moves_left == 0:
            self.reward = 0.5
            self.done = True
        
        i, j, k = convert(a)
        
        # Straight Line checks
        if sum([self.state[unconvert(i, j, t)] for t in range(4)]) == 4 * self.player or \
            sum([self.state[unconvert(i, t, k)] for t in range(4)]) == 4 * self.player or \
            sum([self.state[unconvert(t, j, k)] for t in range(4)]) == 4 * self.player: 
                self.reward = (self.player + 1) // 2
                self.done = True
                return None
        
        # Diag checks 
        for pos in diags[i] + diags[4+j] + diags[8+k]:
            if sum([self.state[x] for x in pos]) == 4 * self.player:
                self.reward = (self.player + 1) // 2
                self.done = True
                return None
        
        # Space diag checks 
        if a in space_diags and sum([self.state[x] for x in space_diags[a]]) == 4 * self.player:
            self.reward = (self.player + 1) // 2
            self.done = True
            return None
        
    def step(self, a: int) -> List[np.array, int, bool, np.array]:
        
        if self.state[a] != 0: 
            raise ValueError('invalid move!', a, self.get_internal())
            
        self.state[a] = self.player
        self.game_state[0][a + (64 if self.player == -1 else 0)] = 1
        #self.game_state[0][[128, 129]] = 1 - self.game_state[0][[128, 129]]
        self.mask[a] = 0
        self.moves_left -= 1
        self.actions_taken.append(a) 
        
        self.check_done(a)
        self.player *= -1 
        
        return self.game_state, self.reward, self.done, self.mask
    
    def get_state(self):
        return self.game_state
    
    def undo(self): 
        if len(self.actions_taken) == 0:
            raise ValueError('no actions taken yet', self.get_internal())
        
        recent_action = self.actions_taken.pop()
        self.game_state[0][[recent_action, recent_action + 64]] = 0
        #self.game_state[0][[128, 129]] = 1 - self.game_state[0][[128, 129]]
        self.state[recent_action] = 0
        self.mask[recent_action] = 1
        self.moves_left += 1
        self.done = False
        self.player *= -1
        self.reward = 0.5
     
    def set_internal(self, internal_dict: Dict):
        self.game_state = internal_dict.get('game_state', np.zeros((1, 128), dtype = np.float32))
        #self.game_state = internal_dict.get('game_state', np.zeros((1, 130), dtype = np.float32))
        self.state = internal_dict.get('state', np.zeros(64, dtype = np.int32))
        self.mask = internal_dict.get('mask', np.ones(64, dtype = np.float32))
        
        self.reward = internal_dict.get('reward', 0.5)
        self.done = internal_dict.get('done', False)
        
        self.actions_taken = internal_dict.get('actions_taken', [])
        self.moves_left = internal_dict.get('moves_left', 64)
        self.player = internal_dict.get('player', 1.0)
        
    def get_internal(self) -> Dict:
        return {'game_state': self.game_state.copy(),
                'state': self.state.copy(), 
                'mask': self.mask.copy(), 
                'reward': self.reward, 
                'done': self.done, 
                'moves_left': self.moves_left,
                'actions_taken': self.actions_taken.copy(),
                'player': self.player}
        
    def reset(self) -> List[np.array, int, bool, np.array]:
        self.set_internal({})
        #self.game_state[0][128] = 1
        return self.game_state, self.reward, self.done, self.mask
    
    def render(self): 
        conv = {-1: 'O', 0: ' ', 1: 'X'}
        #for row in self.state.reshape(4, 4, 4): 
         #   temp = row.astype(np.int)

         #   lines = ['|'.join(conv[v] for v in row) for row in temp]
         #   board = '\n-------\n'.join(lines)
         #   print(board, end="\n\n")
    
        boards = ['\n-------\n'.join(['|'.join(conv[v] for v in row) for row in temp]) for temp in self.state.reshape(4, 4, 4).astype(int)]
        grid = "\n\n=======\n\n".join(boards)
        print(grid, end="\n\n")

In [460]:
diags

{0: [[0, 5, 10, 15], [3, 6, 9, 12]],
 1: [[16, 21, 26, 31], [19, 22, 25, 28]],
 2: [[32, 37, 42, 47], [35, 38, 41, 44]],
 3: [[48, 53, 58, 63], [51, 54, 57, 60]],
 4: [[0, 17, 34, 51], [3, 18, 33, 48]],
 5: [[4, 21, 38, 55], [7, 22, 37, 52]],
 6: [[8, 25, 42, 59], [11, 26, 41, 56]],
 7: [[12, 29, 46, 63], [15, 30, 45, 60]],
 8: [[0, 20, 40, 60], [12, 24, 36, 48]],
 9: [[1, 21, 41, 61], [13, 25, 37, 49]],
 10: [[2, 22, 42, 62], [14, 26, 38, 50]],
 11: [[3, 23, 43, 63], [15, 27, 39, 51]]}

In [4]:
# Define constants that we will use later on 
BLACK = (0, 0, 0)
WHITE = (255, 255, 255)
RED = (255, 0, 0)
GREEN = (0, 255, 0)
BLUE = (0, 0, 255)

ORANGE = (255, 187, 0)
TEAL = (0, 225, 255)

class UnplayablePlayedError(ValueError):
    pass

class Illustrator:
    
    def __init__(self, boardgame: BG, topleft: np.array, spacing: int, normal_display: bool = False):
        self.bg = boardgame
        
        self.topleft = topleft
        
        self.spacing = spacing
        self.x_shift = np.array([spacing, 0])
        self.y_shift = np.array([0, spacing])
        self.b_shift = np.array([spacing, spacing])
        
        if normal_display: self.type = 1
        else: self.type = 3 # Top row is xy slices, middle row is yz slices, bottom row is xz slices
        
        self.consider = 64
        
    def get_centerrad(self, i, j, k):
        center = self.topleft + self.x_shift * (i + 5 * k) + self.y_shift * j + self.b_shift * 0.5
        rad = 2 * self.spacing // 5
        
        return center, rad    
        
    def get_winning_moves(self, prev_flag = True): 
            
        winning_moves = []
        if prev_flag:
            self.bg.player *= -1
           
        for a in range(64):
            
            if self.bg.mask[a] == 0:
                continue
                    
            self.bg.state[a] = self.bg.player
            
            self.bg.check_done(a)
            if self.bg.done:
                winning_moves.append(a)
                
            self.bg.state[a] = 0
            
            self.bg.done = False
        
        if prev_flag: 
            self.bg.player *= -1
        self.reward = 0.5
            
        return winning_moves        
        
    def get_action(self, pos: np.array): 
        
        def unswap(i, j, k, b): 
            if b == 0: return i, j, k
            elif b == 1: return i, k, j
            return k, i, j
        
        x, y = pos - self.topleft
        a, inpos = divmod(x, 5 * self.spacing)
        b, downpos = divmod(y, 5 * self.spacing)
        
        s = self.spacing  
        
        # Check out of bounds 
        if  y < 0 or x < 0 or y >= 15 * s or downpos >= 4 * s or x >= 20 * s or inpos >= 4 * s: 
            raise UnplayablePlayedError(f"Failed at mousecoor {pos}")
        
        k = a
        j = downpos // self.spacing
        i = inpos // self.spacing 
                
        i, j, k = unswap(i, j, k, b)
        
        action = i + 4 * j + 16 * k
        
        if self.bg.mask[action] == 0:
            raise UnplayablePlayedError(f"Already played action {action}")
        
        return action
        
    def considering(self, pos: np.array): 
        try: 
            self.consider = self.get_action(pos)
        except UnplayablePlayedError: 
            self.consider = 64
    
    def draw(self, gameDisplay: pygame.Surface):
        
        def unconvert(a): 
            return a // 16, (a % 16) // 4, a % 4
        
        def swap(i, j, k, b):            
            if b == 0: return i, j, k
            elif b == 1: return i, k, j
            return j, k, i
        
        board = self.bg.state.reshape(4, 4, 4).astype(int)
        
        for b in range(self.type):
            tl_adj = self.y_shift * b * 5
                    
            #Draw lines
            for x in range(4):
                tl = self.topleft + self.x_shift * x * 5 + tl_adj
                for i in range(1,4):
                    pygame.draw.line(gameDisplay, WHITE, tl + i*self.x_shift, tl + i*self.x_shift + self.y_shift*4)
                    pygame.draw.line(gameDisplay, WHITE, tl + i*self.y_shift, tl + i*self.y_shift + self.x_shift*4)
                
            # Draw tiles  
            for k, j, i in product(range(4), range(4), range(4)):
        
                if not board[k][j][i]:
                    continue
                    
                color = BLUE if board[k][j][i] == 1 else RED

                i, j, k = swap(i, j, k, b)
                    
                center, rad = self.get_centerrad(i, j, k)
                center += tl_adj

                pygame.draw.circle(gameDisplay, color, center, rad)
            
        if self.bg.done: 
            return
               
        for b in range(self.type):
            tl_adj = self.y_shift * b * 5
            
            # Draw winning squares for opponent 
            color = ORANGE if self.bg.player == 1 else TEAL
            for move in self.get_winning_moves():
                k, j, i = unconvert(move) 
                i, j, k = swap(i, j, k, b)

                center, rad = self.get_centerrad(i, j, k)
                center += tl_adj
                pygame.draw.circle(gameDisplay, color, center, rad)    

            # Draw square being considered
            if self.consider != 64: 

                k, j, i = unconvert(self.consider)
                i, j, k = swap(i, j, k, b)

                color = BLUE if self.bg.player == 1 else RED
                center, rad = self.get_centerrad(i, j, k)
                center += tl_adj
                rad /= 2
                pygame.draw.circle(gameDisplay, color, center, rad)  

            # Draw winning squares for opponent 
            color = ORANGE if self.bg.player == -1 else TEAL
            for move in self.get_winning_moves(prev_flag = False):
                k, j, i = unconvert(move)
                i, j, k = swap(i, j, k, b)

                center, rad = self.get_centerrad(i, j, k)
                center += tl_adj
                pygame.draw.circle(gameDisplay, color, center, rad)    
                
class GUI: 
    
    # Create some pygame related environment variables and set the positions of the display
    def __init__(self, topleft: np.array = np.array([60, 60]), spacing: int = 35, normal_display: bool = True):
        
        pygame.init()
        pygame.display.set_caption("4d Tic Tac Toe")
        
        if normal_display: size = (800, 300)
        else: size = (800, 600)
        self.gameDisplay = pygame.display.set_mode(size) 
        
        self.game = BG()
        self.illustrator = Illustrator(self.game, topleft, spacing, normal_display)
        
        self.bots = {}
        self.compete = False

        
    def playable(self, a):
        return self.game.mask[a] == 1
        
    def update(self, action: int) -> bool: 
        
        self.illustrator.consider = 64
        self.game.step(action)
        
        return self.game.done
        
    # Keep collecting user input and running the game until the user exits the window
    def main_loop(self):
        run = True
        play = True
        bot_move = False
        while run: 
            self.draw_all()
            
            if bot_move and self.game.player in self.bots and play:
                bot = self.bots[self.game.player]
                action = bot.choose_action(self.game, test = True)

                play = not self.update(action)
                bot_move = False
            
            for event in pygame.event.get():    
                if event.type == QUIT:
                    run = False
                    pygame.quit()
                    
                elif event.type == MOUSEMOTION and play: 
                    self.illustrator.considering(np.array(event.pos))
        
                elif event.type == MOUSEBUTTONDOWN and play:
                    
                    try: 
                        action = self.illustrator.get_action(np.array(event.pos))
                        play = not self.update(action)
                    
                    except UnplayablePlayedError as e: 
                        pass
                    
                    if self.compete: bot_move = True
        
                elif event.type == KEYDOWN: 
                    if event.key == pygame.K_SPACE:
                        run = False
                        
                    if event.key == pygame.K_u:
                        try: 
                            self.game.undo()
                            play = True
                        except ValueError: 
                            pass
                        
                    if event.key == pygame.K_n:
                        bot_move = True
                        
        pygame.quit()
        
    def connect_bots(self, bots_list):
        self.bots = {k: v for k,v in zip([1, -1], bots_list)}
        self.turn = self.game.player
        
    def enable_competition(self):
        self.compete = True
        
    # Draws all the components and updates the pygame display
    def draw_all(self):
        self.gameDisplay.fill(BLACK)
        self.illustrator.draw(self.gameDisplay)
        pygame.display.update()  

In [5]:
class Human:
    
    def __init__(self): 
        pass
    
    def choose_action(self, env, test = True):
        return int(input("Take an action please: "))
    
class RandomBot: 
    def __init__(self):
        pass
        
    def evaluate(self, env):
        return "IS A RANDOM BOT"
    
    def choose_action(self, env, test = True):
        return (np.random.uniform(low = 0.5, high = 1, size=[64]) * env.mask).argmax()

In [6]:
STATE_SPACE = 128 + 2
ACTION_SPACE = 64

inputs = keras.Input(shape=(STATE_SPACE,), name="inp")
x = layers.Dense(256, activation="relu", name="layer_1")(inputs)
x = layers.Dense(128, activation="relu", name="layer_2")(x)
x = layers.Dense(32, activation="relu", name="layer_3")(x)
outputs = layers.Dense(1, activation="sigmoid", name="value")(x)
value_network = keras.Model(inputs=inputs, outputs=outputs)

In [7]:
class Minimax:
    
    player_to_exp = {1: max, -1: min}

        
    def __init__(self, max_depth = 3, max_search = 500, 
                 debug = False, training = True, explore = True, 
                 epsilon = 0.9, decay = 0.9999, min_epsilon = 0.15):
        
        self.value = 0
        self.e = BG()
        self.max_depth = max_depth
        self.max_search = max_search
        
        self.states = []
        self.rewards = []
        
        self.loss_fn = keras.losses.MSE
        self.optimizer = keras.optimizers.SGD(learning_rate=0.0012, momentum=0.0003, name="SGD")
        self.loss_h = []
        self.episode_loss = []
        
        self.epsilon = epsilon
        self.decay = decay
        self.min_epsilon = min_epsilon
        self.explore = explore
        
        self.debug = debug
        self.training = training
        
        
    def choose_action(self, env, test = False):
        
        self.e.set_internal(env.get_internal())    
        
        self.checked = 0
        self.evals = []
            
        a = self.expand(self.max_depth)[0]
        
        if self.training: 
            r = tf.expand_dims(tf.stack(self.rewards), axis = 1)
            
            l = self.update(tf.stack(self.states), r)
            self.states, self.rewards = [], []
            self.episode_loss.append(tf.math.reduce_mean(l))
            
        if self.explore and not test:      
            if self.epsilon > self.min_epsilon:
                self.epsilon*=self.decay

            if np.random.rand() < self.epsilon:
                p = np.random.uniform(size=(ACTION_SPACE))
                p*= env.mask
                a = int(p.argmax())
        
        return a
        
    def evaluate(self, s):     
        #return tf.squeeze(value_network(s))
    
        start = time.time()
        v = tf.squeeze(value_network(s))
        self.evals.append(time.time() - start)
        return v
    
    @staticmethod
    def pick_random(options):
        ind = np.random.choice(np.arange(len(options)))
        return options[ind]
            
    def expand(self, depth):
        
        if depth == 0 or self.checked > self.max_search: 
        
            self.checked += 1
            return -1, self.evaluate(self.e.get_state())
        
        info = []
        
        for a in self.e.mask.nonzero()[0]:
            s, r, d, m = self.e.step(a)
            
            if d: info.append((a, r))  
            else: info.append((a, self.expand(depth-1)[1])) 
                
            self.e.undo()
            
        _, reward = Minimax.player_to_exp[self.e.player](info, key = lambda x : x[1])
        possibilities = [(x, y) for x,y in info if y == reward]
        self.add_update(self.e.get_state(), reward)
        
        if self.debug and depth == self.max_depth:
            stuff = [(x, round(float(y), 3)) for x, y in info]
            stuff.sort(key = lambda t : t[1] * self.e.player)
            print(stuff[:3], stuff[-3:], "\n")

        return Minimax.pick_random(possibilities)[0], reward
    
    @tf.function(input_signature=(tf.TensorSpec(shape=[None, STATE_SPACE],dtype=tf.float32), 
                                  tf.TensorSpec(shape=[None, 1],dtype=tf.float32)))
    def update(self, s, r):
        
        with tf.GradientTape() as tape:
            r_pred = value_network(s)
            loss = tf.norm(self.loss_fn(r, r_pred))
            
        gradient = tape.gradient(loss, value_network.trainable_weights)
        self.optimizer.apply_gradients(zip(gradient, value_network.trainable_weights))
        
        return loss
        
    def reset_metric(self):
        if len(self.episode_loss) > 0:
            self.loss_h.append(np.mean(self.episode_loss))
            self.episode_loss = []
            
    def add_update(self, s, r):        
        self.states.append(tf.squeeze(s))
        self.rewards.append(r)

In [169]:
import time 
start = time.time()

agents = [Minimax(max_depth = 2), Minimax(max_depth = 2)]
games = 500

for g in range(1, games+1):
    
    env = BG()
    s, r, d, m = env.reset()
    
    index = 0

    while not d: 
        a = agents[index].choose_action(env)
        s, r, d, m = env.step(a)

        index = 1 - index
        
    agents[0].reset_metric()
    agents[1].reset_metric()
    
    if g % 1 == 0: 
        l = (sum(agents[0].loss_h[-10:]) + sum(agents[1].loss_h[-10:]))/20
        print(f"Game number {g} completed in {time.time() - start} seconds with total loss {l}")
        print(f"There was an exploration rate of {agents[0].epsilon} with a game length of {len(env.actions_taken)}")

print(time.time() - start)

Game number 1 completed in 34.72173023223877 seconds with total loss 0.027400816977024078
1 strategies were explored with an exploration rate of 0.8983813762658757
Game number 2 completed in 81.88843727111816 seconds with total loss 0.2318860962986946
2 strategies were explored with an exploration rate of 0.8964070111162198
Game number 3 completed in 111.58340954780579 seconds with total loss 0.2644715905189514
3 strategies were explored with an exploration rate of 0.8952423809429291
Game number 4 completed in 139.80132484436035 seconds with total loss 0.29132389426231386
4 strategies were explored with an exploration rate of 0.8940792638807852
Game number 5 completed in 181.31613516807556 seconds with total loss 0.4431479573249817
5 strategies were explored with an exploration rate of 0.8923820412889367
Game number 6 completed in 218.04813528060913 seconds with total loss 0.5649575471878052
6 strategies were explored with an exploration rate of 0.8908662048517141
Game number 7 complet

Game number 51 completed in 1695.1818075180054 seconds with total loss 0.7162800412625074
51 strategies were explored with an exploration rate of 0.8411679839235603
Game number 52 completed in 1721.6364328861237 seconds with total loss 0.820936449803412
52 strategies were explored with an exploration rate of 0.8402431616448706
Game number 53 completed in 1755.2069051265717 seconds with total loss 0.7717380322515964
53 strategies were explored with an exploration rate of 0.8390675855340806
Game number 54 completed in 1783.4009699821472 seconds with total loss 0.7945739036425948
54 strategies were explored with an exploration rate of 0.8380612580314928
Game number 55 completed in 1809.5194523334503 seconds with total loss 0.7975278845056891
55 strategies were explored with an exploration rate of 0.8371398514430975
Game number 56 completed in 1855.835443496704 seconds with total loss 0.8604960968717933
56 strategies were explored with an exploration rate of 0.8354671613519952
Game number 

Game number 101 completed in 3169.4997959136963 seconds with total loss 0.6021224103868008
101 strategies were explored with an exploration rate of 0.7913879739328935
Game number 102 completed in 3197.6717517375946 seconds with total loss 0.6354925505816936
102 strategies were explored with an exploration rate of 0.7904388305061707
Game number 103 completed in 3227.0514204502106 seconds with total loss 0.7258662424981595
103 strategies were explored with an exploration rate of 0.7894908254253342
Game number 104 completed in 3250.548172712326 seconds with total loss 0.7252049423754215
104 strategies were explored with an exploration rate of 0.7887016897760579
Game number 105 completed in 3286.8525400161743 seconds with total loss 0.8339482434093952
105 strategies were explored with an exploration rate of 0.7875194650194164
Game number 106 completed in 3316.476986885071 seconds with total loss 0.8108820535242558
106 strategies were explored with an exploration rate of 0.786574961251025
G

Game number 151 completed in 4538.130165338516 seconds with total loss 0.8668863553553819
151 strategies were explored with an exploration rate of 0.7477625624525019
Game number 152 completed in 4563.173403739929 seconds with total loss 0.8468203537166119
152 strategies were explored with an exploration rate of 0.7470151362934869
Game number 153 completed in 4597.14680147171 seconds with total loss 0.7465881479904055
153 strategies were explored with an exploration rate of 0.7461938303786563
Game number 154 completed in 4657.377447605133 seconds with total loss 0.7827748341485858
154 strategies were explored with an exploration rate of 0.7447773373696144
Game number 155 completed in 4680.429226875305 seconds with total loss 0.8063348723575473
155 strategies were explored with an exploration rate of 0.7442561496106319
Game number 156 completed in 4734.2991144657135 seconds with total loss 0.68214141856879
156 strategies were explored with an exploration rate of 0.7429919258387404
Game n

Game number 201 completed in 6283.0062375068665 seconds with total loss 0.7742407787591219
201 strategies were explored with an exploration rate of 0.7073904117886922
Game number 202 completed in 6304.344742774963 seconds with total loss 0.7889400128275156
202 strategies were explored with an exploration rate of 0.7068953870276705
Game number 203 completed in 6337.114789009094 seconds with total loss 0.6605241641402244
203 strategies were explored with an exploration rate of 0.7061888096587545
Game number 204 completed in 6357.591466665268 seconds with total loss 0.6591703616082668
204 strategies were explored with an exploration rate of 0.705765202287158
Game number 205 completed in 6383.754285573959 seconds with total loss 0.7292339041829109
205 strategies were explored with an exploration rate of 0.7052007877000672
Game number 206 completed in 6437.972739934921 seconds with total loss 0.805569214373827
206 strategies were explored with an exploration rate of 0.70400290495468
Game nu

Game number 251 completed in 8040.565610408783 seconds with total loss 0.6556061409413815
251 strategies were explored with an exploration rate of 0.6699345374682468
Game number 252 completed in 8059.189292669296 seconds with total loss 0.6953770957887173
252 strategies were explored with an exploration rate of 0.6695326772225488
Game number 253 completed in 8088.54629945755 seconds with total loss 0.7341913156211376
253 strategies were explored with an exploration rate of 0.6689303387885801
Game number 254 completed in 8112.685411930084 seconds with total loss 0.6480713292956353
254 strategies were explored with an exploration rate of 0.6684622280033892
Game number 255 completed in 8144.467058420181 seconds with total loss 0.6017783984541893
255 strategies were explored with an exploration rate of 0.6677940665031871
Game number 256 completed in 8188.466809272766 seconds with total loss 0.6513367474079133
256 strategies were explored with an exploration rate of 0.6668597622596731
Game 

Game number 301 completed in 9777.153287410736 seconds with total loss 0.43647126145660875
301 strategies were explored with an exploration rate of 0.6369412850652153
Game number 302 completed in 9814.53553366661 seconds with total loss 0.5665349304676056
302 strategies were explored with an exploration rate of 0.636240999864276
Game number 303 completed in 9843.591791629791 seconds with total loss 0.5557885952293873
303 strategies were explored with an exploration rate of 0.635668611957722
Game number 304 completed in 9897.439201831818 seconds with total loss 0.581524670869112
304 strategies were explored with an exploration rate of 0.6345888393946029
Game number 305 completed in 9931.34316277504 seconds with total loss 0.5921021372079849
305 strategies were explored with an exploration rate of 0.6338911405904444
Game number 306 completed in 9957.295016288757 seconds with total loss 0.6365025125443935
306 strategies were explored with an exploration rate of 0.633384205131998
Game numb

Game number 351 completed in 11395.33252120018 seconds with total loss 0.37230931110680104
351 strategies were explored with an exploration rate of 0.6053912462305095
Game number 352 completed in 11439.881155967712 seconds with total loss 0.48573819138109686
352 strategies were explored with an exploration rate of 0.604544249171519
Game number 353 completed in 11477.00872707367 seconds with total loss 0.5344441182911396
353 strategies were explored with an exploration rate of 0.603819194938748
Game number 354 completed in 11515.45513510704 seconds with total loss 0.5161965079605579
354 strategies were explored with an exploration rate of 0.6030950102926798
Game number 355 completed in 11546.28967833519 seconds with total loss 0.5425953730940819
355 strategies were explored with an exploration rate of 0.602492186602783
Game number 356 completed in 11586.237943172455 seconds with total loss 0.6224075742065907
356 strategies were explored with an exploration rate of 0.6017094165318352
Gam

KeyboardInterrupt: 

In [171]:
value_network.save("new_model")

INFO:tensorflow:Assets written to: 400_game_3,6h_model\assets


In [1]:
x = agents[0].loss_h
x2 = agents[1].loss_h

plt.plot(np.arange(len(x)), x)
plt.plot(np.arange(len(x2)), x2)

NameError: name 'agents' is not defined

In [8]:
value_network = keras.models.load_model("400_game_3,6h_model")



In [17]:
pygame.quit()

In [9]:
gui = GUI() 

gui.connect_bots([RandomBot(), Minimax(max_depth = 3, max_search = 4000, training = False, debug = True)])
gui.enable_competition()
gui.main_loop()

[(1, 0.276), (17, 0.231), (7, 0.225)] [(37, -0.052), (33, -0.056), (42, -0.074)] 



In [12]:
def simulate(n = 100):
    env = BG()
    
    records = [0, 0, 0]
    for _ in range(n):
        s, r, d, m = env.reset()
        
        agents = [RandomBot(), RandomBot()]
        #agents = [Minimax(max_depth = 1, debug = False, training = False, explore = False), RandomBot()] 
        #agents = [RandomBot(), Minimax(max_depth = 1, debug = False, training = False, explore = False)]

        index = 0

        while not d: 
            a = agents[index].choose_action(env)
            s, r, d, m = env.step(a)

            index = 1 - index

        records[int(2 * r)] += 1
        print(f"{_+1} games passed    ", end = '\r')
    return records
    
simulate()

1 games passed    2 games passed    3 games passed    4 games passed    5 games passed    6 games passed    7 games passed    8 games passed    9 games passed    10 games passed    11 games passed    12 games passed    13 games passed    14 games passed    15 games passed    16 games passed    17 games passed    18 games passed    19 games passed    20 games passed    21 games passed    22 games passed    23 games passed    24 games passed    25 games passed    26 games passed    27 games passed    28 games passed    29 games passed    30 games passed    31 games passed    32 games passed    33 games passed    34 games passed    35 games passed    36 games passed    37 games passed    38 games passed    39 games passed    40 games passed    41 games passed    42 games passed    43 games passed    44 games passed    45 games passed    46 games passed    47 games passed    48 games passed    49 games passed    50 games passed    51 games 

[44, 0, 56]

In [19]:
env = BG()
    
agents = [RandomBot(), RandomBot()]
index = 0

s, r, d, m = env.reset()
while not d: 
    a = agents[index].choose_action(env)
    s, r, d, m = env.step(a)

    index = 1 - index


In [32]:
bot = Minimax(max_depth = 2, debug = True, training = False, explore = False)
bot.choose_action(env, test = True)

[(3, 0.0), (6, 0.0), (16, 0.0)] [(9, 1.0), (14, 1.0), (57, 1.0)] 



57

In [31]:
env.undo()

In [25]:
env.render()

O|O|O| 
-------
X| | |O
-------
X| |X|X
-------
X|X| |X


 | |X|X
-------
 |O| | 
-------
 |O|O| 
-------
X|O|O|X


 |X| |O
-------
 |O| | 
-------
O|X|X|O
-------
X|O| | 


O|X| | 
-------
O| |X| 
-------
X| |X|X
-------
O|O|O| 



In [33]:
bot = Minimax(max_depth = 1, debug = False, training = False, explore = False)

bot.choose_action(env)

9

In [34]:
gui = GUI()
gui.game = env
gui.illustrator.bg = env

gui.main_loop()

In [19]:

env.undo()

In [18]:
env = BG()

s, r, d, m = env.reset()

agents = [RandomBot(), RandomBot()]
index = 0 

while not d: 
    a = agents[index].choose_action(env, test = True)
    s, r, d, m = env.step(a)

    env.render()
    print()
    index = 1 - index

print(r)

 | |X| 
-------
 | | | 
-------
 | | | 
-------
 | | | 


 | | | 
-------
 | | | 
-------
 | | | 
-------
 | | | 


 | | | 
-------
 | | | 
-------
 | | | 
-------
 | | | 


 | | | 
-------
 | | | 
-------
 | | | 
-------
 | | | 


 | |X| 
-------
 | | | 
-------
 | | | 
-------
 | | | 


 | | | 
-------
 | | | 
-------
 | | | 
-------
 | | | 


 | | | 
-------
 | | | 
-------
 | | | 
-------
 | | | 


 | | | 
-------
 | | | 
-------
 | | |O
-------
 | | | 


 | |X| 
-------
 | | | 
-------
 | | | 
-------
 | | | 


 | | | 
-------
 | | | 
-------
 | | | 
-------
X| | | 


 | | | 
-------
 | | | 
-------
 | | | 
-------
 | | | 


 | | | 
-------
 | | | 
-------
 | | |O
-------
 | | | 


 | |X| 
-------
 | | | 
-------
 | | | 
-------
 | | | 


 | | | 
-------
 | | |O
-------
 | | | 
-------
X| | | 


 | | | 
-------
 | | | 
-------
 | | | 
-------
 | | | 


 | | | 
-------
 | | | 
-------
 | | |O
-------
 | | | 


 | |X| 
-------
 | | | 
-------
 | | |X
-------
 | | | 


 | | | 
------

In [145]:
env.render()

X|O| |X
-------
 |O|O|X
-------
 |X|X|O
-------
 |O|X|O


X|O|O|X
-------
 |X|X|O
-------
O|O| | 
-------
X|O| | 


 | | | 
-------
 |O| |X
-------
 |X| |X
-------
O| | | 


 | | | 
-------
 | | | 
-------
 | |X| 
-------
 | | | 



In [147]:
bot = Minimax(-1, max_depth = 2, debug = True)
bot.choose_action(env, test = True)

MIN
[61, 63]
[(2, 1.0), (4, -0.36423319578170776), (8, 1.0), (12, -0.32946106791496277), (20, 1.0), (26, 1.0), (27, 1.0), (30, -0.3352173864841461), (31, 1.0), (32, 1.0), (33, 1.0), (34, -0.3320755958557129), (35, 1.0), (36, -0.334259957075119), (38, 1.0), (40, 1.0), (42, 1.0), (45, -0.334259957075119), (46, 1.0), (47, -0.3338382840156555), (48, 1.0), (49, -0.334259957075119), (50, 1.0), (51, -0.32777732610702515), (52, 1.0), (53, -0.3293707072734833), (54, 1.0), (55, -0.3150988221168518), (56, 1.0), (57, -0.33156096935272217), (59, 1.0), (60, 1.0), (61, -0.3919256031513214), (62, 1.0), (63, -0.3919256031513214)]


61

In [146]:
env.actions_taken

[58,
 1,
 0,
 44,
 7,
 29,
 3,
 5,
 10,
 6,
 14,
 23,
 19,
 24,
 28,
 25,
 16,
 11,
 9,
 37,
 41,
 13,
 43,
 15,
 39,
 17,
 21,
 18,
 22]

In [99]:
value_network(env.game_state)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.18240757]], dtype=float32)>

In [95]:
env.undo()

In [93]:
env.actions_taken

[58,
 5,
 18,
 17,
 63,
 38,
 33,
 41,
 62,
 30,
 6,
 61,
 45,
 39,
 7,
 51,
 12,
 47,
 20,
 15,
 35,
 25,
 0,
 48,
 16,
 19,
 29,
 40,
 55,
 1,
 4,
 10,
 60,
 2,
 36,
 44,
 43,
 27]

In [66]:
def simulate(n=1000):

    env = BG()
    tallies = [0, 0, 0]
    
    for x in range(n):
    
        s, r, d, m = env.reset()

        agents = [RandomBot(), RandomBot()]
        index = 0 

        while not d: 
            a = agents[index].choose_action(env)
            s, r, d, m = env.step(a)

            #env.render()
            #print()
            index = 1 - index

        #print(_)
        
        tallies[int(r) + 1] += 1
    
    return tallies
        
simulate(100000)

DRAW
DRAW
DRAW
DRAW
DRAW
DRAW


[47331, 1, 52668]

In [17]:
def convert(p):
    return p // 16, (p % 16) // 4, p % 4

def unconvert(i, j, k):
    return 16 * i + 4 * j + k


diags = {}

for i in range(4):
    diags[i] = [[unconvert(i, j, j) for j in range(4)], [unconvert(i, j, 3-j) for j in range(4)]]
       
for i in range(4):
    diags[4+i] = [[unconvert(j, i, j) for j in range(4)], [unconvert(j, i, 3-j) for j in range(4)]]
    
for i in range(4):
    diags[8+i] = [[unconvert(j, j, i) for j in range(4)], [unconvert(j, 3-j, i) for j in range(4)]]

    
space_diags = []
space_diags.append([unconvert(i, i, i) for i in range(4)])
space_diags.append([unconvert(3-i, i, i) for i in range(4)])
space_diags.append([unconvert(i, i, 3-i) for i in range(4)])
space_diags.append([unconvert(i, 3-i, i) for i in range(4)])
space_diags

[[0, 5, 10, 15], [3, 6, 9, 12], [0, 17, 34, 51], [3, 18, 33, 48], [0, 20, 40, 60], [48, 36, 24, 12], [48, 53, 58, 63], [51, 54, 57, 60], [12, 29, 46, 63], [15, 30, 45, 60], [3, 23, 43, 63], [51, 39, 27, 15]]


[[0, 21, 42, 63], [48, 37, 26, 15], [3, 22, 41, 60], [12, 25, 38, 51]]

In [123]:
len(diags)

12

# Game speed Test

The percentage of time spent in evaluation must be over 90% to indicate that our game is fast enough

In [9]:
import time

env = BG()
bot = Minimax(max_depth = 2, debug = False) #Make sure the bot.eval lines are commented back in inside the Minimax class
bot.choose_action(env, test = True)


start = time.time()

bot.choose_action(env, test = True)

tot_time = time.time() - start

print(tot_time)
print(bot.checked)
print(sum(bot.evals))
print(len(bot.evals))
print(sum(bot.evals) / tot_time)

1.2429416179656982
560
1.1324031352996826
560
0.9110670355966259


In [10]:
bot.evals[0] #2ms

0.002989530563354492

In [15]:
a = np.array([10,20,30])
b = np.array([2, 4, 5])

In [21]:
2 * np.sqrt(np.log(2) / a) 

array([0.52655377, 0.37232974, 0.30400596])

In [22]:
np.argmax(b)

2

In [26]:
j.mask

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.], dtype=float32)

In [48]:
def default_policy(env):
    log_probs = tf.expand_dims(tf.math.log(env.mask), axis = 0)
    return tf.random.categorical(log_probs, 1)[0, 0].numpy()

e = BG()

a = default_policy(e)
s, r, d, m = e.step(a)   
while not d: 
    a = default_policy(e)
    s, r, d, m = e.step(a)
    
r

1.0

In [49]:
e.mask

array([0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 1., 0., 1., 0., 0., 1., 0.,
       1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       1., 0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0.,
       1., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 1.], dtype=float32)

In [50]:
e.render()

O|X| |O
-------
O| |O|O
-------
O| | |O
-------
 |O|X| 


X| |O|O
-------
 |O| |X
-------
X|X|X|X
-------
X|X|O| 


X|X| |X
-------
X|X|O| 
-------
 | | |X
-------
X|O| |X


O|O|O| 
-------
O|O| |X
-------
X| |X| 
-------
O|O|X| 



In [345]:
mask = env.mask

In [348]:
mask = np.array([0,0,0,0,0,1,0])

In [350]:
np.random.choice(np.where(mask == 1)[0])

5

In [442]:
"""
Problems with current design: 

1) Mask double maintained in Node and game - large storage cost for Nodes

"""

# Selection -> Need UCB of ALL child nodes, need actual nodes as well
# Expansion -> Need to randomly pick unexplored nodes + create node 
# Simulation -> Just need environment, simulation simple
# Backup -> Recurse backwards to update the values of nodes selected

"""
State 
-> Keep track of UCB + node object of child nodes (split mean and UCB because for max add but min subtract)
-> Unexplored mask 
-> Parent responsible for updating child's UCB
-> Note whether is a min or max node 

Fast - Could create a list of tuples of Nodes and information in MCTS and then go thru list to backup
Overhead - Use Publisher - Subscriber Pattern to update all nodes once game result is reached 
"""

class UCBState:
        
    def __init__(self, mask):
        self.unexplored = mask.copy()
        self.action_to_node = [None] * ACTION_SPACE # Indicies represent possible actions

        self.N = 0
        self.n = np.zeros(64)
        self.wins = np.zeros(64)
      
    @property
    def is_unexplored(self):
        return self.unexplored.sum() > 0
    
    def update_explore(self, action, node):
        #print("CALLED")
        self.unexplored[action] = 0
        self.action_to_node[action] = node
    
    def update_values(self, action, result):
        self.N += 1
        self.n[action] += 1 
        self.wins[action] += result        

class MCTS:
        
    def __init__(self, num_rollouts = 20, c = 0.2):
        self.e = BG()
        self.reset_search()
        
        self.num_rollouts = num_rollouts
        self.c = c
        
        self.def_time = 0
        self.tree_time = 0
        self.expand_time = 0
        self.select_time = 0
        self.simulate_time = 0
        
    def times(self):
        return [self.def_time, self.tree_time, self.expand_time, self.select_time, self.simulate_time]

    def default_policy(self, mask = None): # Used during expansion and simulation
        
        s = time.time()
        
        if mask is None: 
            mask = self.e.mask
            
            
        #log_probs = tf.expand_dims(tf.math.log(mask), axis = 0)
        #ans = tf.random.categorical(log_probs, 1)[0, 0].numpy()
        ans = np.random.choice(np.where(mask == 1)[0])
        
        self.def_time += time.time() - s
        return ans
    
    def tree_policy(self, node, test = False, debug = False): 
        # Certain counts will just be 0 (illegal actions and unexplored actions); so only use counts > 0
        s = time.time()
        
        usable = node.n.nonzero()[0]

        wins, n = node.wins[usable], node.n[usable]

        ucb = wins / n
        if not test: ucb += self.c * np.sqrt(2) * np.sqrt(np.log(node.N) / n) * self.e.player # Subtract if argmin 

        ucb *= self.e.player
        
        chosen_ucb_idx = np.random.choice(np.where(ucb == ucb.max())[0])
        ans = usable[chosen_ucb_idx] 
        
        if debug:
            print(ucb)
            print(self.e.player)
            print(sum(node.n))
        
        self.tree_time += time.time() - s
        return ans
    
    def choose_action(self, env, test = False):
        
        root = self.next_node 
        
        counter = 0
        
        for action in env.actions_taken:
            if root.action_to_node[action] is not None: 
                root = root.action_to_node[action]
            else: 
                #print("CREaTING NEW ACTION")
                node = UCBState(env.get_internal()['mask'])
                root.update_explore(action, node)
                root = node        

        for _ in range(self.num_rollouts):
            
            self.e.set_internal(env.get_internal())
            backup = []
            cur = root
                
            cur, r, d = self.select(cur, backup) 
            
            if not d: 
                r, d = self.expand(cur, backup)
                r = self.simulate(r, d)
                
            self.backup_tree(backup, r)
            
        self.e.set_internal(env.get_internal())
            
        a = self.tree_policy(root, test)
        return a
                
    def select(self, cur, backup):
        
        sta = time.time()
        
        r, d = -1, False
        
        while cur is not None and not cur.is_unexplored:
            a = self.tree_policy(cur)
            backup.append((cur, a))
            cur = cur.action_to_node[a]     # Will be None if action leads to terminal state
            s, r, d, m = self.e.step(a)     # If d is true, cur should be None (simulate should have not created a node)
                
        ans = cur, r, d
        self.select_time += time.time() - sta
        
        return ans
    
    def expand(self, cur, backup):
        
        sta = time.time()

        a = self.default_policy(cur.unexplored) 
        s, r, d, m = self.e.step(a) 
        
        backup.append((cur, a))
        
        if d: cur.unexplored[a] = 0
        else: cur.update_explore(a, UCBState(self.e.mask))
            
        ans = r, d
        
        self.expand_time += time.time() - sta
        return ans
            
    def simulate(self, r, d):
        
        sta = time.time()
        
        while not d: 
            a = self.default_policy()
            s, r, d, m = self.e.step(a)
              
        self.simulate_time += time.time() - sta
                
        return r
    
    def backup_tree(self, backup, r):
        for node, action in backup: 
            node.update_values(action, r)
            
    def reset_search(self):
        mask = self.e.get_internal()['mask'].copy()
        self.next_node = UCBState(mask)

In [443]:
def get_depth_node(node, counter = 0): 
    counter += 1
    
    for t in node.action_to_node:
        if t is not None: 
            counter = get_depth_node(t, counter)
            
    return counter

In [444]:
get_depth_node(agents[1].next_node)

10002

In [445]:
get_depth_node(agents[1].next_node)

10002

In [456]:
def simulate(n = 10):
    env = BG()
    
    records = [0, 0, 0]
    for _ in range(n):
        s, r, d, m = env.reset()
        
        agents = [MCTS(5000, 0), MCTS(5000, 0.15)]
        #agents = [Minimax(max_depth = 1, debug = False, training = False, explore = False), RandomBot()] 
        #agents = [RandomBot(), Minimax(max_depth = 1, debug = False, training = False, explore = False)]

        index = 0

        while not d: 
            a = agents[index].choose_action(env)
            s, r, d, m = env.step(a)

            index = 1 - index
        
        reward_to_index = {0 : 2, 0.5: 1, 1: 0} # Player 1 is index 0 
        records[reward_to_index[r]] += 1
        print(f"{_+1} games passed with a reward of {r}   ", end = '\r')
    return records
    
simulate()

10 games passed with a reward of 1.0   

[2, 0, 8]

In [424]:
env = BG()

s, r, d, m = env.reset()

agents = [RandomBot(), MCTS(5000)]
index = 0 

a = agents[0].choose_action(env, test = True)
s, r, d, m = env.step(a)

a = agents[1].choose_action(env, test = True)
s, r, d, m = env.step(a)

CREaTING NEW ACTION
[-0.62745098 -0.50617284 -0.5        -0.53424658 -0.64       -0.5625
 -0.5862069  -0.51282051 -0.58333333 -0.47311828 -0.64       -0.5625
 -0.5        -0.49411765 -0.48351648 -0.45631068 -0.52702703 -0.51282051
 -0.48314607 -0.4375     -0.50617284 -0.57142857 -0.49411765 -0.46464646
 -0.51282051 -0.60714286 -0.49425287 -0.7        -0.5625     -0.48863636
 -0.49411765 -0.65217391 -0.50617284 -0.61111111 -0.48863636 -0.44761905
 -0.61111111 -0.46       -0.57142857 -0.43478261 -0.52702703 -0.46391753
 -0.59649123 -0.55223881 -0.64       -0.57894737 -0.5625     -0.41085271
 -0.57377049 -0.44144144 -0.43362832 -0.47311828 -0.44761905 -0.42735043
 -0.51282051 -0.64       -0.5        -0.4375     -0.48863636 -0.5
 -0.5        -0.45544554 -0.52702703]
-1.0
5000.0


In [433]:
a = agents[1].choose_action(env, test = True)
s, r, d, m = env.step(a)

[-0.50574713 -0.55714286 -0.65957447 -0.51807229 -0.65306122 -0.53947368
 -0.45454545 -0.6        -0.42857143 -0.54929577 -0.59016393 -0.54054054
 -0.4893617  -0.54054054 -0.47916667 -0.47916667 -0.51807229 -0.525
 -0.51807229 -0.53246753 -0.50588235 -0.65957447 -0.4587156  -0.421875
 -0.44444444 -0.6        -0.65957447 -0.48913043 -0.53246753 -0.49450549
 -0.578125   -0.55882353 -0.53947368 -0.54929577 -0.525      -0.51190476
 -0.50588235 -0.58730159 -0.65306122 -0.50574713 -0.54794521 -0.63461538
 -0.56923077 -0.54929577 -0.4893617  -0.50588235 -0.48421053 -0.45794393
 -0.50588235 -0.51190476 -0.48351648 -0.44444444 -0.54929577 -0.65306122
 -0.47916667 -0.48913043 -0.53246753 -0.42857143 -0.54794521 -0.45454545
 -0.45045045]
-1.0
5001.0


In [434]:
env.actions_taken

[20, 48, 24, 25]

In [412]:
agents[1].times()

[5.798228979110718,
 0.5081002712249756,
 0.665541410446167,
 1.222064733505249,
 16.888139247894287]

In [374]:
a = agents[0].choose_action(env, test = True)
s, r, d, m = env.step(a)

a = agents[1].choose_action(env, test = True)
s, r, d, m = env.step(a)

CREaTING NEW ACTION
[0.69230769 0.64285714 0.38461538 0.5625     0.81818182 0.75
 0.37037037 0.69230769 0.6        0.47619048 0.47619048 0.64285714
 0.45454545 0.4        0.45454545 0.5        0.45454545 0.5625
 0.5        0.41666667 0.69230769 0.47619048 0.5        0.47619048
 0.5        0.5        0.55555556 0.64285714 0.69230769 0.5
 0.45454545 0.81818182 0.75       0.27777778 0.5625     0.47619048
 0.41666667 0.47619048 0.45454545 0.45454545 0.45454545 0.52631579
 0.75       0.52941176 0.47619048 0.47619048 0.5625     0.45454545
 0.52631579 0.69230769 0.45454545 0.52631579 0.5625    ]
1.0
1000.0


In [343]:
self.def_time = 0
        self.tree_time = 0
        self.expand_time = 0
        self.select_time = 0
        self.simulate_time = 0

IndentationError: unexpected indent (2647345337.py, line 2)

In [353]:
agents[1].times()

[0.5300107002258301,
 0.03822779655456543,
 0.08184623718261719,
 0.0726308822631836,
 1.621001958847046]

In [310]:
node.n[[usable]]

array([[2., 1., 1., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1.]])

In [292]:
usable = node.n.nonzero()[0]

wins, n = node.wins[[usable]], node.n[[usable]]

ucb = wins / n


In [303]:
node.wins

array([0., 0., 2., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.])

In [293]:
ucb = ucb * - 1

In [302]:
ucb.shape

(1, 18)

In [295]:
usable

array([ 2,  4, 10, 11, 13, 20, 23, 24, 31, 34, 42, 43, 47, 49, 50, 59, 60,
       61], dtype=int64)

In [291]:
node

MCTS 3.0

In [301]:
np.where(ucb == ucb.max())[0]

array([0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [300]:
np.random.choice(np.where(ucb == ucb.max())[0])


0

In [298]:
usable[0] 

2

In [377]:
env = BG()

s, r, d, m = env.reset()

agents = [RandomBot(), MCTS(num_rollouts = 100)]
index = 0 

while not d: 
    a = agents[index].choose_action(env, test = True)
    s, r, d, m = env.step(a)

    print(a)
    env.render()
    print()
    index = 1 - index
    
print(r)

57
 | | | 
-------
 | | | 
-------
 | | | 
-------
 | | | 


 | | | 
-------
 | | | 
-------
 | | | 
-------
 | | | 


 | | | 
-------
 | | | 
-------
 | | | 
-------
 | | | 


 | | | 
-------
 | | | 
-------
 |X| | 
-------
 | | | 


CREaTING NEW ACTION
[0.  1.  0.  1.  0.5 1.  0.  1.  0.5 0.5 1.  1.  1.  0.  1.  0.5 0.5 1.
 0.  1.  1.  0.5 1.  0.5 0.5 1.  1.  1.  0.5 0.5 0.5 0.  1.  0.  1.  1.
 0.  1.  1.  1.  0.5 1.  0.5 0.5 0.5 1.  1.  0.  0.  1.  1.  0.5 1.  0.5
 0.5 0.5 1.  1.  1.  0.5 0.  1.  0.5]
1.0
100.0
58
 | | | 
-------
 | | | 
-------
 | | | 
-------
 | | | 


 | | | 
-------
 | | | 
-------
 | | | 
-------
 | | | 


 | | | 
-------
 | | | 
-------
 | | | 
-------
 | | | 


 | | | 
-------
 | | | 
-------
 |X|O| 
-------
 | | | 


37
 | | | 
-------
 | | | 
-------
 | | | 
-------
 | | | 


 | | | 
-------
 | | | 
-------
 | | | 
-------
 | | | 


 | | | 
-------
 |X| | 
-------
 | | | 
-------
 | | | 


 | | | 
-------
 | | | 
-------
 |X|O| 
-------
 | | | 


CREaTING N

[-1.         -1.         -0.         -1.         -0.5        -0.33333333
 -0.5        -1.         -0.5        -1.         -0.5        -0.5
 -0.5        -1.         -1.         -0.5        -1.         -1.
 -0.5        -0.33333333 -0.5        -0.5        -0.5        -0.5
 -1.         -0.5        -1.         -0.5        -0.5        -1.
 -1.         -1.         -0.33333333 -0.33333333 -0.5        -0.
 -0.5        -0.5        -0.5        -0.5        -0.5        -1.
 -1.         -0.5        -1.         -1.         -0.        ]
-1.0
100.0
5
X|X|X| 
-------
 |O|O| 
-------
 | | |O
-------
 | | | 


 | | | 
-------
 | | |X
-------
 | | | 
-------
 | | | 


 |X| | 
-------
 |X|O| 
-------
X| | | 
-------
O| |O|X


 | | |O
-------
 | | | 
-------
 |X|O| 
-------
 |O| | 


3
X|X|X|X
-------
 |O|O| 
-------
 | | |O
-------
 | | | 


 | | | 
-------
 | | |X
-------
 | | | 
-------
 | | | 


 |X| | 
-------
 |X|O| 
-------
X| | | 
-------
O| |O|X


 | | |O
-------
 | | | 
-------
 |X|O| 
-------
 |O|

In [379]:
env = BG()

s, r, d, m = env.reset()

agents = [RandomBot(), MCTS(1000)]
index = 0 

a = agents[0].choose_action(env, test = True)
s, r, d, m = env.step(a)

a = agents[1].choose_action(env, test = True)
s, r, d, m = env.step(a)

CREaTING NEW ACTION
[0.64285714 0.64285714 0.42857143 0.52941176 0.5625     0.52941176
 0.5625     0.75       0.47368421 0.75       0.6        0.45
 0.47368421 0.8        0.6        0.45       0.75       0.8
 0.69230769 0.52941176 0.5        0.47368421 0.52941176 0.41666667
 0.5625     0.39130435 0.52941176 0.5625     0.47368421 0.8
 0.5625     0.8        0.47368421 0.45       0.5625     0.6
 0.47368421 0.42857143 0.8        0.42857143 0.75       0.64285714
 0.69230769 0.47368421 0.64285714 1.         0.8        0.64285714
 0.5        0.45       0.69230769 0.5        0.88888889 0.4
 0.64285714 0.8        0.6        0.5625     0.8        0.4
 0.64285714 0.52941176 0.52941176]
1.0
1000.0


In [273]:
usable = node.n.nonzero()[0]

wins, n = node.wins[[usable]], node.n[[usable]]

ucb = wins / n
#if not test: ucb += node.c * np.sqrt(2) * np.sqrt(np.log(node.N) / n) * self.e.player # Subtract if argmin 

ucb *= -1
ucb

usable[np.argmax(ucb)]

20

In [271]:
usable

(array([ 2,  4, 10, 11, 13, 20, 23, 24, 31, 34, 42, 43, 47, 49, 50, 59, 60,
        61], dtype=int64),)

In [265]:
node.n

array([0., 0., 2., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 2., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 1., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0.])

In [248]:
a = agents[0].choose_action(env, test = True)
s, r, d, m = env.step(a)

a = agents[1].choose_action(env, test = True)
s, r, d, m = env.step(a)

Creating for action,  40
CALLED
ADDING TO MCTS 3.0 FOR ACTION  17 WITH NODE  MCTS 4.0
CALLED
ADDING TO MCTS 3.0 FOR ACTION  38 WITH NODE  MCTS 4.0
CALLED
ADDING TO MCTS 3.0 FOR ACTION  61 WITH NODE  MCTS 4.0
CALLED
ADDING TO MCTS 3.0 FOR ACTION  24 WITH NODE  MCTS 4.0
CALLED
ADDING TO MCTS 3.0 FOR ACTION  42 WITH NODE  MCTS 4.0
CALLED
ADDING TO MCTS 3.0 FOR ACTION  15 WITH NODE  MCTS 4.0
CALLED
ADDING TO MCTS 3.0 FOR ACTION  32 WITH NODE  MCTS 4.0
CALLED
ADDING TO MCTS 3.0 FOR ACTION  31 WITH NODE  MCTS 4.0
CALLED
ADDING TO MCTS 3.0 FOR ACTION  56 WITH NODE  MCTS 4.0
CALLED
ADDING TO MCTS 3.0 FOR ACTION  54 WITH NODE  MCTS 4.0
CALLED
ADDING TO MCTS 3.0 FOR ACTION  21 WITH NODE  MCTS 4.0
CALLED
ADDING TO MCTS 3.0 FOR ACTION  46 WITH NODE  MCTS 4.0
CALLED
ADDING TO MCTS 3.0 FOR ACTION  9 WITH NODE  MCTS 4.0
CALLED
ADDING TO MCTS 3.0 FOR ACTION  16 WITH NODE  MCTS 4.0
CALLED
ADDING TO MCTS 3.0 FOR ACTION  44 WITH NODE  MCTS 4.0
CALLED
ADDING TO MCTS 3.0 FOR ACTION  51 WITH NODE  MCTS 4.0


In [249]:
env.actions_taken

[18, 8, 40, 1]

In [256]:
agents[1].next_node.action_to_node[18].action_to_node[8].action_to_node[40].action_to_node

[MCTS 4.0,
 None,
 None,
 None,
 None,
 MCTS 4.0,
 None,
 None,
 None,
 MCTS 4.0,
 None,
 None,
 None,
 None,
 None,
 MCTS 4.0,
 MCTS 4.0,
 MCTS 4.0,
 None,
 None,
 None,
 MCTS 4.0,
 None,
 None,
 MCTS 4.0,
 None,
 MCTS 4.0,
 None,
 None,
 None,
 None,
 MCTS 4.0,
 MCTS 4.0,
 None,
 None,
 None,
 None,
 None,
 MCTS 4.0,
 None,
 None,
 None,
 MCTS 4.0,
 None,
 MCTS 4.0,
 None,
 MCTS 4.0,
 None,
 None,
 None,
 None,
 MCTS 4.0,
 None,
 None,
 MCTS 4.0,
 None,
 MCTS 4.0,
 MCTS 4.0,
 None,
 None,
 None,
 MCTS 4.0,
 None,
 None]

In [258]:
agents[1].tree_policy(agents[1].next_node.action_to_node[18].action_to_node[8].action_to_node[40])


1

In [216]:
node = agents[1].next_node.action_to_node[15].action_to_node[56].action_to_node[17]
node

MCTS 3.0

In [None]:
(array([ 2,  4, 10, 11, 13, 23, 24, 42, 43, 49, 60], dtype=int64),)


In [135]:
mt = agents[1]

In [136]:
env.actions_taken

[31, 0, 35]

In [138]:
mt.next_node.action_to_node[31].action_to_node

[<__main__.UCBState at 0x1d9a1584970>,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 <__main__.UCBState at 0x1d9a1584ee0>,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [92]:
agents[1].choose_action(env)

ValueError: False

In [69]:
m = MCTS()
m.default_policy()

6

In [450]:
gui = GUI() 

gui.connect_bots([RandomBot(), MCTS(5000, 0.15)])
gui.main_loop()

In [389]:
gui.bots[-1].times()

[19.951217889785767,
 3.8535499572753906,
 4.175041913986206,
 8.74736213684082,
 61.05209231376648]

In [385]:
pygame.quit()