In [5]:
import numpy as np
import os
import time
from agents import Agent, AlphaFour
from collections import namedtuple, deque
from random import choice, sample
import importlib
from connectboard import ConnectBoard

os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"

import keras

In [6]:
def get_game_state(game_board: np.ndarray) -> np.ndarray:
    """Converts a signed game board into AlphaFour's stacked binary planes.

    The incoming board marks the current player's pieces with 1 and the
    opponent's pieces with -1. The result stacks two binary masks along a
    new leading axis: index 0 flags the current player's pieces and
    index 1 flags the opponent's pieces.

    Args:
        game_board: A numpy array representing the current game state.

    Returns:
        A numpy array with one more dimension than `game_board`, holding
        the two binary masks described above.
    """
    # One mask per marker value, in (current player, opponent) order.
    piece_masks = [np.where(game_board == marker, 1, 0) for marker in (1, -1)]
    return np.stack(piece_masks)

# Record type for one training example: the board state, the move
# probabilities computed from that state, and the value assigned to it.
TrainingSample = namedtuple('TrainingSample', ['state', 'probs', 'value'])

In [10]:
def self_play(agent1: AlphaFour, agent2: AlphaFour) -> list:
    """Plays one game between two agents and records a training sample per state.

    The game state is a stack of three board-sized planes:
      * plane 0 accumulates the moves made by `agent1`,
      * plane 1 accumulates the moves made by `agent2`,
      * plane 2 is a turn indicator: all ones when `agent1` is to move and
        all zeros when `agent2` is to move.

    Args:
        agent1: Agent that moves first. Must provide
            `get_move_with_prob(state)` returning `(move, prob)`, where
            `move` is added to the agent's own plane.
        agent2: Agent that moves second; same interface as `agent1`.

    Returns:
        A list of `TrainingSample(state, probs, value)`, one per recorded
        state in play order, ending with the terminal state (whose
        probabilities are all zero).
    """
    game_board = np.zeros((6, 7))

    # get_game_state only yields the two piece planes, so the turn-indicator
    # plane is appended here; all ones means agent1 opens the game.
    turn_plane = np.ones((1,) + game_board.shape)
    state = np.concatenate([get_game_state(game_board), turn_plane])

    states = []
    probs = []

    while True:
        states.append(state.copy())  # Snapshot of the state before the move

        if state[2].all():
            # Get best move, and probability of all moves from current state
            move, prob = agent1.get_move_with_prob(state)
            state[0] += move
            state[2] = 0  # hand the turn to agent2
        else:
            move, prob = agent2.get_move_with_prob(state)
            state[1] += move
            state[2] = 1  # hand the turn back to agent1

        probs.append(prob)  # Move probabilities from the snapshotted state

        # NOTE(review): `helpers` is not imported in the visible cells; it
        # must already be in the kernel namespace for this call to work.
        # `winner` is treated as returning None while the game is ongoing.
        val = helpers.winner(state[0] - state[1])
        if val is not None:
            # Record the terminal state too; no moves are available from it.
            states.append(state.copy())
            probs.append(np.zeros((1, game_board.shape[1])))

            # If game is over, current player lost unless it's a tie.
            val = -1 * abs(val)
            break

    # Walking backwards from the terminal state, the value flips sign at
    # every ply because the player to move alternates.
    sample_count = len(probs)
    values = [val * (-1) ** i for i in range(sample_count)][::-1]

    return [TrainingSample(s, p, v) for s, p, v in zip(states, probs, values)]

In [7]:
# Smoke test: play a single self-play game between two fresh agents.
D = self_play(agent1=AlphaFour('P1'), agent2=AlphaFour('P2'))

  ucb = np.where(node.N > 0, node.W/node.N + self._EXPLORATION_CONSTANT*node.P/(1+node.N), np.inf)


In [8]:
REPLAY_BUFFER_SIZE = 100000  # Number of past steps to store. This is where our training sample is drawn from
SELF_PLAY_BATCH_SIZE = 100 # How many games to play before updating the buffer
TRAINING_SET_SIZE = 1024    # Size of the training set to sample from the replay buffer

# Bounded buffer: once full, appending new samples discards the oldest ones.
replay_buffer = deque(maxlen=REPLAY_BUFFER_SIZE)

# START OF ONE TRAINING LOOP
# ==========================================================================
players = [AlphaFour('Best'), AlphaFour('New')]

total_count = 0  # Number of samples generated in this batch of games

# Generate new self play games.
for ii in range(SELF_PLAY_BATCH_SIZE):
    print(f'\r{ii}', end='')
    i = np.random.randint(2)  # randomize who plays first
    p1 = players[i]
    p2 = players[1 - i]

    D = self_play(p1, p2)

    total_count += len(D)
    replay_buffer.extend(D)

# random.sample raises ValueError when asked for more items than the
# population holds, so cap the request at the current buffer size. Sampling
# from a list copy also avoids slow random indexing into the deque.
train_set = sample(list(replay_buffer), min(TRAINING_SET_SIZE, len(replay_buffer)))

# Train new bot with updated data

# Play matches between new and old. If new wins more than 55%, replace old with New


# ==========================================================================

18

KeyboardInterrupt: 

In [None]:
# Scratch check: list the legal moves from one sampled state and count how
# many of them land in each column.
# TrainingSample exposes the board under `state` (there is no `S` field).
state = train_set[123].state
# NOTE(review): `helpers` is not imported in the visible cells; it must
# already be in the kernel namespace for this call to work.
moves = helpers.get_legal_moves(state[0] + state[1])
moves = moves[1:,:,:]  # skip the first entry along the leading axis
print(moves)

for col in range(7):
    print(moves[:,:,col].sum())

In [None]:
# Scratch check: build and print the successor state for every column that
# has a legal move, applying each move to plane 0 of the sampled state.
# TrainingSample exposes the board under `state` (there is no `S` field).
state = train_set[123].state
# NOTE(review): `helpers` is not imported in the visible cells; it must
# already be in the kernel namespace for this call to work.
moves = helpers.get_legal_moves(state[0] + state[1])
moves = moves[1:]  # skip the first entry along the leading axis

# Collapse the move and row axes to get one total per column.
col_has_move = moves.sum(axis=0).sum(axis=0)

move_idx = 0  # index of the next unused entry in `moves`
for i in range(7):
    if col_has_move[i]:
        # Only plane 0 changes; the other two planes get zero added.
        new_state = state + np.array([moves[move_idx], np.zeros((6,7)), np.zeros((6,7))])
        move_idx += 1
        print(new_state, '\n\n')
        