In [1]:
import tensorflow as tf
from tensorflow.keras import Model,layers,Sequential,utils
import pandas as pd
import numpy as np

In [2]:
class Environment:
    """Square board for an n-in-a-row game.

    Cells of ``board`` hold 0 (empty), 1 or -1 (the two players' pieces).
    ``winner`` is None until ``game_over`` detects a completed line, after
    which it holds the winning piece value (1 or -1).
    """

    def __init__(self,n=15,goal=5):
        self.n = n
        self.goal = goal
        self.board = np.zeros((n,n))
        self.winner = None

    def is_empty(self,i,j):
        """Return True when cell (i, j) holds no piece."""
        return bool(self.board[i,j]==0)

    def kernel_check(self,kernel):
        """Check a ``goal x goal`` window for a completed line.

        A line is a full row, full column, main diagonal or anti-diagonal of
        the window. Because cells are in {-1, 0, 1}, a line sums to ``goal``
        (or ``-goal``) only when every cell belongs to the same player.

        Sets ``self.winner`` and returns True on a win, otherwise returns
        False and leaves ``self.winner`` untouched.
        """
        line_sums = np.concatenate((
            kernel.sum(axis=1),                        # rows
            kernel.sum(axis=0),                        # columns
            [kernel.trace(), kernel[::-1].trace()],    # both diagonals
        ))
        if np.any(line_sums == self.goal):
            self.winner = 1
            return True
        if np.any(line_sums == -self.goal):
            self.winner = -1
            return True
        return False

    def game_over(self):
        """Return True when a player has won or the board is full.

        Wins are looked for first, so that a winning move which also fills
        the last empty cell still records ``self.winner``.
        """
        size = self.goal
        # Number of window positions per axis; windows never run off the board.
        span = self.n-size+1
        for i in range(span):
            for j in range(span):
                # Window side must equal the goal length, otherwise stones
                # with gaps between them could add up to ``goal``.
                if self.kernel_check(self.board[i:i+size,j:j+size]):
                    return True
        # No winner: the game is over only if no empty cell remains.
        return not np.any(self.board == 0)

    def rewards(self,agent):
        """Return 1 if ``agent.piece`` won, 0 if nobody has won, else -1."""
        if self.winner is None:
            return 0
        return 1 if self.winner == agent.piece else -1

    def get_state(self,board):
        """Encode ``board`` as a single base-3 integer.

        Cell ``i`` of the flattened (row-major) board contributes
        ``digit * 3**i`` where empty -> 0, piece 1 -> 1 and piece -1 -> 2.
        Plain Python ints are used so the value stays exact for any board
        size; ``get_board`` is the inverse.
        """
        state = 0
        for i, cell in enumerate(np.asarray(board).flatten()):
            # int(cell) % 3 maps -1 to the digit 2 that get_board expects.
            state += (int(cell) % 3) * 3**i
        return state

    def get_board(self,state):
        """Decode an integer produced by ``get_state`` into an (n, n) array."""
        state = int(state)
        vector = np.zeros(self.n**2)
        for i in range(len(vector)):
            state, digit = divmod(state, 3)
            vector[i] = digit
        vector[vector == 2]=-1
        return np.reshape(vector,(self.n,self.n))

In [None]:
class Agent:
    """Epsilon-greedy player that records its transitions in a replay buffer.

    Parameters
    ----------
    sym : str
        'x' plays piece 1, 'o' plays piece -1.
    env : object
        Game environment; this class uses its ``board``, ``get_state``,
        ``get_board``, ``game_over`` and ``rewards`` members.
    Qvalue, Qtarget : callable
        Q-networks. ``Qvalue`` is called as ``Qvalue(boards, masks)`` with
        ``boards`` of shape (1, n, n) and ``masks`` of shape (1, n*n), and
        must return one score per board cell. ``Qtarget`` is only stored.
    buffersize : int
        Maximum number of transitions kept by ``update_buffer``.
    epsilon : float
        Probability of playing a uniformly random valid move.
    """

    def __init__(self,sym,env,Qvalue,Qtarget,buffersize = 10000,epsilon = 1,):
        if sym == 'x':
            self.piece = 1
        elif sym == 'o':
            self.piece = -1
        else:
            # Fail here rather than with an AttributeError on the first move.
            raise ValueError("sym must be 'x' or 'o', got %r" % (sym,))

        self.env = env
        self.buffersize = buffersize
        self.replay_buffer = []
        self.Qvalue = Qvalue
        self.Qtarget = Qtarget
        self.e = epsilon
        self.batch = None

    def valid_mask(self,board):
        """Return a flat float vector with 1 on occupied cells, 0 on empty ones."""
        mask = np.asarray(board, dtype=float).flatten()
        mask[mask!=0]=1
        return mask

    def move(self):
        """Play one epsilon-greedy move on ``self.env.board``.

        Appends ``(current_state, next_state, next_move, reward)`` to the
        replay buffer, where ``next_move`` is the flat (row-major) cell index.

        Raises
        ------
        ValueError
            If the board has no empty cell.
        """
        board = self.env.board
        mask = self.valid_mask(board)
        valid_moves = np.flatnonzero(mask==0)
        if valid_moves.size == 0:
            raise ValueError('no empty cell left to play on')
        current_state = self.env.get_state(board)

        # rand() is in [0, 1): epsilon=1 always explores, epsilon=0 never does.
        if np.random.rand() < self.e:
            next_move = int(np.random.choice(valid_moves))
        else:
            # One forward pass scores every cell; the argmax is restricted to
            # empty cells so an occupied cell can never be selected.
            q_values = self.Qvalue(board[np.newaxis, ...], mask[np.newaxis, :])
            q_values = np.asarray(q_values, dtype=float).reshape(-1)
            next_move = int(valid_moves[np.argmax(q_values[valid_moves])])

        # Flat index -> (row, col) using the board's actual width.
        action = divmod(next_move, board.shape[1])
        self.env.board[action]=self.piece
        next_state = self.env.get_state(self.env.board)
        self.env.game_over()
        reward = self.env.rewards(self)
        self.replay_buffer.append((current_state,next_state,next_move,reward))

    def update_buffer(self):
        """Drop the oldest transitions so at most ``buffersize`` remain."""
        overflow = len(self.replay_buffer)-self.buffersize
        if overflow>0:
            self.replay_buffer = self.replay_buffer[overflow:]

    def prepare_batch(self,batchsize):
        """Sample ``batchsize`` distinct transitions from the replay buffer.

        Returns
        -------
        (inputs, masks, rewards)
            ``inputs``  : boards decoded from the stored current states,
                          shape (batchsize, n, n).
            ``masks``   : occupied-cell masks, shape (batchsize, n*n).
            ``rewards`` : stored rewards, shape (batchsize,).

        Raises
        ------
        ValueError
            If ``batchsize`` exceeds the number of stored transitions.
        """
        # Sample indices: np.random.choice only accepts 1-D input, so the
        # list of tuples cannot be sampled directly.
        picks = np.random.choice(len(self.replay_buffer),batchsize,replace = False)
        batch = [self.replay_buffer[k] for k in picks]
        state_boards = [self.env.get_board(transition[0]) for transition in batch]
        masks = np.array([self.valid_mask(b) for b in state_boards])
        inputs = np.array(state_boards)
        rewards = np.array([transition[3] for transition in batch], dtype=float)
        return (inputs,masks,rewards)

In [85]:
class QModel(Model):
    """Convolutional Q-network producing one score per board cell.

    Two 5x5 'same'-padded ReLU convolutions, a flatten, and a tanh dense
    layer with ``n_actions`` units (225 by default, i.e. a 15x15 board).
    """

    def __init__(self,n_filters=16,n_actions=225):
        super().__init__()
        self.conv1 = layers.Conv2D(n_filters,(5,5),strides = 1,padding = 'same',activation = 'relu')
        self.conv2 = layers.Conv2D(n_filters*2,(5,5),strides = 1,padding = 'same',activation = 'relu')
        self.flat = layers.Flatten()
        self.final = layers.Dense(n_actions,'tanh')

    def call(self,inputs,mask=None):
        """Score every cell of each board in the batch.

        Parameters
        ----------
        inputs : boards of shape (batchsize, rows, cols); an explicit trailing
            channel axis, (batchsize, rows, cols, 1), is accepted as well.
        mask : optional array/tensor of shape (batchsize, n_actions) with 1 on
            cells that must not be chosen. Those outputs are shifted by -100,
            i.e. below the tanh range [-1, 1].

        Returns
        -------
        Tensor of shape (batchsize, n_actions).
        """
        x = tf.convert_to_tensor(inputs)
        # Conv2D needs a channel axis; add one for plain (batch, rows, cols).
        if x.shape.rank == 3:
            x = x[..., tf.newaxis]
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.flat(x)
        x = self.final(x)
        # `is not None`: comparing an array/tensor with `!=` is elementwise
        # and cannot be used as an `if` condition.
        if mask is not None:
            # Cast so a float64 NumPy mask can be combined with the activations.
            x += tf.cast(mask, x.dtype)*-1e2
        return x

In [86]:
# Two separately constructed QModel instances with default settings.
# Presumably Qtarget is the target network and Qvalue the online network
# of a DQN-style setup -- confirm against the training code.
Qtarget, Qvalue = QModel(), QModel()

In [10]:
def loss_function(outputs,inputs,target_model,masks,rewards):
    """Placeholder loss: currently returns ``outputs`` unchanged.

    The target-network prediction and the keep-mask are computed but not yet
    combined into a loss, and ``rewards`` is not used.

    NOTE(review): the keep-mask flags entries of ``outputs`` that are not
    below -1; presumably those are the entries that did not receive the
    invalid-move penalty -- confirm before finishing the loss.
    """
    # Forward pass through the target network (result not used yet).
    target = target_model(inputs,masks)
    # 1.0 where outputs >= -1 (or NaN), 0.0 where outputs < -1.
    penalised = tf.math.less(outputs, -1)
    loss_mask = tf.cast(tf.math.logical_not(penalised), tf.float32)
    return outputs

In [92]:
# Scratch example: integer matrix in which -100 stands in for penalised entries.
n = tf.constant([[1,2,4],[-100,-100,2],[0,3,-1]])
# Keep-mask: 1.0 where the entry is at least -1, 0.0 otherwise.
l = tf.cast(tf.math.greater_equal(n, -1), tf.float32)

In [99]:
# Masked mean of a constant 25: sum over kept entries divided by their count.
tf.math.divide(tf.reduce_sum(l * 25), tf.cast(tf.math.count_nonzero(l), tf.float32))

<tf.Tensor: id=475, shape=(), dtype=float32, numpy=25.0>

In [100]:
# Number of kept (non-zero) mask entries, as float32.
tf.cast(tf.math.count_nonzero(l), tf.float32)

<tf.Tensor: id=482, shape=(), dtype=float32, numpy=7.0>