# Implementing AlphaZero

In [1]:
%matplotlib inline

import numpy as np
import itertools
import random
import math
import matplotlib.pyplot as plt

import import_ipynb
from rules import Game
from players import RandomPlayer, GreedyPlayer, MCTSPlayer, UCTPlayer, HumanPlayer, Player

importing Jupyter notebook from rules.ipynb
Current player: 0 - Score: 0/0
------------------
  5  5  6  0  5  5
  5  5  4  4  4  0
importing Jupyter notebook from players.ipynb


In [2]:
from keras.models import *
from keras.layers import *
from keras.optimizers import *

Using TensorFlow backend.


In [5]:
# Game parameters; cnn() below reads n_pits as a module-level global.
n_pits = 6
seeds_per_pit = 4

def cnn():
    """Build and compile a 1-D convolutional policy/value network.

    The input is a flat vector of ``n_pits * 2`` values (``n_pits`` is read
    from module scope). It goes through three Conv1D + BatchNorm + ReLU
    stages, then two Dense + BatchNorm + ReLU + Dropout stages, and finally
    splits into two heads.

    Returns:
        A compiled Keras ``Model`` with outputs ``[policy, value]``:
        ``policy`` is a softmax over ``n_pits`` entries (categorical
        cross-entropy loss) and ``value`` is a single ``tanh`` unit
        (mean-squared-error loss). The optimizer is Adam.
    """
    dropout = 0.3
    lr = 0.001
    conv_size = 3

    inputlen = n_pits * 2
    padd_len = conv_size - 1

    # Slice out the last `padd_len` positions along the length axis ...
    Left_pad = Lambda(lambda x: x[:, -padd_len:])

    # ... and put them in front of the sequence (wrap-around padding), so the
    # 'valid' convolutions below return a sequence as long as their input.
    def _conv_pad(x):
        return Concatenate(axis=1)([
            Left_pad(x),
            x
        ])
    Conv_pad = Lambda(_conv_pad)

    board = Input(shape=(inputlen,))
    # Conv1D expects (length, channels); the board has a single channel.
    board_reshaped = Reshape((inputlen, 1))(board)

    # axis=2 is the channel axis of the (batch, length, channels) tensors.
    conv1 = Activation('relu')(BatchNormalization(axis=2)(Conv1D(20, conv_size, padding='valid')(Conv_pad(board_reshaped))))
    conv2 = Activation('relu')(BatchNormalization(axis=2)(Conv1D(20, conv_size, padding='valid')(Conv_pad(conv1))))
    conv3 = Activation('relu')(BatchNormalization(axis=2)(Conv1D(20, conv_size, padding='valid')(Conv_pad(conv2))))

    flat = Flatten()(conv3)

    fc1 = Dropout(dropout)(Activation('relu')(BatchNormalization(axis=1)(Dense(256)(flat))))
    fc2 = Dropout(dropout)(Activation('relu')(BatchNormalization(axis=1)(Dense(128)(fc1))))

    # Two heads on the shared representation.
    policy = Dense(n_pits, activation='softmax', name='policy')(fc2)
    value = Dense(1, activation='tanh', name='value')(fc2)

    model = Model(inputs=board, outputs=[policy, value])
    # Losses are matched to the outputs by position: policy first, value second.
    model.compile(loss=['categorical_crossentropy','mean_squared_error'], optimizer=Adam(lr))

    return model

# Build a throwaway instance just to print the layer summary.
cnn().summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 12)           0                                            
__________________________________________________________________________________________________
reshape_2 (Reshape)             (None, 12, 1)        0           input_3[0][0]                    
__________________________________________________________________________________________________
lambda_4 (Lambda)               multiple             0           reshape_2[0][0]                  
                                                                 activation_4[0][0]               
                                                                 activation_5[0][0]               
__________________________________________________________________________________________________
conv1d_4 (

In [6]:
def bnn(n_pits=6, seeds_per_pit=4, depth=5, width=48, dropout=0.3, lr=0.001, top_dense=128):
    """Build and compile a fully connected policy/value network.

    Args:
        n_pits: size of the policy output; the input has ``n_pits * 2`` values.
        seeds_per_pit: accepted but not used by this function.
        depth: number of Dense + Dropout stages in the shared trunk.
        width: number of units in each trunk Dense layer.
        dropout: dropout rate applied after every hidden Dense layer.
        lr: learning rate passed to the Adam optimizer.
        top_dense: number of units in the hidden layer of each head.

    Returns:
        A compiled Keras ``Model`` with outputs ``[policy, value]``: a softmax
        over ``n_pits`` entries (categorical cross-entropy) and a single
        ``tanh`` unit (mean squared error).
    """
    board = Input(shape=(n_pits * 2,))

    # Shared trunk: `depth` stages of Dense(relu) followed by Dropout.
    trunk = board
    for layer_idx in range(depth):
        hidden = Dense(width, activation='relu', name="dense-%s" % layer_idx)(trunk)
        trunk = Dropout(dropout)(hidden)

    # Policy head.
    policy_hidden = Dense(top_dense, activation='relu', name="policy-dense")(trunk)
    policy_features = Dropout(dropout)(policy_hidden)
    policy = Dense(n_pits, activation='softmax', name='policy')(policy_features)

    # Value head.
    value_hidden = Dense(top_dense, activation='relu', name="value-dense")(trunk)
    value_features = Dropout(dropout)(value_hidden)
    value = Dense(1, activation='tanh', name='value')(value_features)

    network = Model(inputs=board, outputs=[policy, value])
    network.compile(
        loss=['categorical_crossentropy', 'mean_squared_error'],
        optimizer=Adam(lr),
    )
    return network

# Build a throwaway instance (default hyper-parameters) just to print the layer summary.
bnn().summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 12)           0                                            
__________________________________________________________________________________________________
dense-0 (Dense)                 (None, 48)           624         input_4[0][0]                    
__________________________________________________________________________________________________
dropout_10 (Dropout)            (None, 48)           0           dense-0[0][0]                    
__________________________________________________________________________________________________
dense-1 (Dense)                 (None, 48)           2352        dropout_10[0][0]                 
__________________________________________________________________________________________________
dropout_11

# Train

In [None]:
# Play one complete game in which every move comes from the (untrained) network.
model = bnn()
# `root` keeps the starting state; `game` is rebound to the current state at every step.
root = game = Game.start_game()

finished = False
while not finished:
    view = game.view_from_current_player
    # Add a leading batch axis: the model is queried with a batch of one board.
    view = np.reshape(view, (1,) + view.shape)
    
    policy, _ = model.predict_on_batch(view)
    # Greedy choice: take the action with the highest policy output ...
    action = np.argmax(policy[0])
    # ... and fall back to a random legal action when that choice is not legal.
    if action not in game.legal_actions:
        action = random.choice(game.legal_actions)
        
    game, _, finished = game.step(action)
# Report the winner of the finished game to the final state.
game.update_stats(game.winner)
game
# TODO: train on the played game, e.g. model.train_on_batch(x, y) (x and y are not built yet).

In [None]:
# `game` is the last state reached by the self-play loop in the previous cell.
game.show_state()

In [None]:
class AlphaPlayer(Player):
    def __init__(self, player_id, budget, model, mode="competitive"):
        """Create a player that chooses moves with network-guided tree search.

        Args:
            player_id: stored as ``self.player_id``.
            budget: number of ``simulate()`` calls made by each ``get_action()``.
            model: object whose ``predict_on_batch(batch)`` returns a
                ``(policy, value)`` pair, e.g. a network built by ``bnn()``.
            mode: ``"competitive"`` or ``"exploratory"``; selects how
                ``get_action()`` picks the move once the simulations are done.

        Raises:
            AssertionError: if ``mode`` is not one of the two supported values.
        """
        # Reject a bad mode before building anything.
        assert mode in ("competitive", "exploratory")

        self.player_id = player_id
        self.budget = budget
        self.model = model
        self.mode = mode

        # The root is created last: initializing it queries self.model through
        # evaluate_node(), so the model has to be stored first.
        self.root = Game.start_game()
        self.initialize_node(self.root)
    
    def evaluate_node(self, node):
        """Run this player's network on the position held by ``node``.

        Args:
            node: game state exposing ``view_from_current_player`` (an array).

        Returns:
            The ``(policy, value)`` pair returned by
            ``self.model.predict_on_batch`` for a batch holding only this
            position.
        """
        # Use the node that was passed in and the model given to __init__,
        # not whatever `game` / `model` globals the notebook happens to hold.
        view = node.view_from_current_player
        # Add a leading batch axis of size one.
        view = np.reshape(view, (1,) + view.shape)
        policy, value = self.model.predict_on_batch(view)
        return policy, value
    
    def initialize_node(self, node):
        """Evaluate ``node`` with the network and reset its search statistics.

        Sets ``node.W`` and ``node.Q`` to 0 and stores the network's value
        output in ``node.P``. The policy output is discarded.
        """
        _policy, value_estimate = self.evaluate_node(node)

        node.W = node.Q = 0
        # FIXME (carried over from the original): P holds the network's *value*
        # output here; the author marked this assignment as unresolved.
        node.P = value_estimate
        
    def get_action(self):
        """Run ``self.budget`` simulations, then pick a move according to ``self.mode``.

        Returns:
            The result of ``get_competitive_action()`` or
            ``get_exploratory_action()``, depending on the mode; ``None`` if
            the mode is neither of the two.
        """
        for _simulation in range(self.budget):
            self.simulate()

        if self.mode == "competitive":
            chosen = self.get_competitive_action()
        elif self.mode == "exploratory":
            chosen = self.get_exploratory_action()
        else:
            chosen = None
        return chosen
    
    def simulate(self):
        """Play one line from ``self.root`` until a finished game is reached.

        Each visited node first has its unvisited legal children initialized,
        then one action is played. When the game is over, the final node's
        winner is passed to ``node.update_stats``.

        NOTE(review): work in progress. The selection step is a placeholder
        (``action = 1``); the Q + U rule described in the comments below is
        not implemented yet.
        """
        node = self.root
        finished = node.game_finished
        while not finished:
            # Evaluate each child that has not been evaluated yet.
            # NOTE(review): new_node is not kept here; presumably step() records
            # the child on `node` -- confirm in rules.
            for action in node.legal_unvisited_actions:
                new_node, _, _ = node.step(action)
                self.initialize_node(new_node)
            
            # Choose the best child with a combination of Q (reward from previous plays)
            # and U (a priori value), i.e. the upper confidence bound Q(s, a) + U(s, a),
            # where U(s, a) ∝ P(s, a) / (1 + N(s, a)).
            # FIXME: not implemented -- action 1 is always played, with no legality check.
            action = 1
            node, _, finished = node.step(action)
            # NOTE(review): this sets W and Q back to 0 on every visit; if step()
            # returns an already-initialized child, its statistics are lost.
            self.initialize_node(node)
            

        # Backtrack stats
        node.update_stats(node.winner)
        

    def get_competitive_action(self):
        """Return the entry of ``self.children`` with the most playouts.

        Missing children (``None``) count as zero playouts. The choice is
        delegated to ``max_rand``.

        NOTE(review): neither ``self.children`` nor ``max_rand`` is defined or
        imported in the visible part of this notebook -- confirm where they
        come from. Also confirm that the returned entry is what callers of
        ``get_action()`` expect as an action.
        """
        def playout_count(child):
            # An absent child has never been played.
            if child is None:
                return 0
            return child.n_playouts

        return max_rand(self.children, key=playout_count)
    
    def get_exploratory_action(self):
        temperature = 1 if self.root.depth < 10 else 0.001