# Implementing a Tic Tac Toe RL Algorithm

In [1420]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import time


## Creating Tic Tac Toe Environment

In [327]:
class tictactoe:
    """Square tic-tac-toe board with win and draw detection.

    Cells hold 0 (empty), 1 (computer, drawn as 'X') or 2 (human, drawn as 'O').
    The board is stored as a ``dim x dim`` numpy array in ``self.board``.
    """

    #Cell value -> printed symbol. Lives on the class so drawboard can reach it
    #(it used to be a local of __init__, which made drawboard raise NameError).
    convert = {0: ' ', 1: 'X', 2: 'O'} #0: empty space, 1: computer, 2: human

    def __init__(self, dim):
        """Create a ``dim x dim`` board and put it in its initial state."""
        self.dim = dim
        self.initboardstate()

    #Dimension
    def adddim(self, dim):
        """Set the board dimension (does not resize the stored board)."""
        self.dim = dim
    def getdim(self):
        """Return the board dimension."""
        return self.dim

    #Board State
    def initboardstate(self):
        """Reset the board to zeros, with cell (0, 0) pre-set to 1."""
        self.board = np.zeros((self.dim, self.dim))
        #NOTE(review): the top-left cell is pre-filled with the computer's mark;
        #presumably the computer always opens in the corner - confirm.
        self.board[0,0] = 1
    def getboardstate(self):
        """Return the board array itself (not a copy)."""
        return self.board
    def addboardstate(self, state):
        """Replace the board with ``state`` and update ``dim`` to match it."""
        self.board = state
        #Make sure dimension of board is self consistent
        self.adddim(self.getboardstate().shape[0])

    #Visualizing the board
    def drawboard(self):
        """Print the board, one text line per row, with '_' separator lines."""
        dim = self.dim
        board_state = self.getboardstate()
        for i in range(dim):
            for j in range(dim - 1):
                print("  | " + self.convert[int(board_state[i, j])], end = '')
            print(" | " + self.convert[int(board_state[i, dim-1])] + " | ")
            if i != dim-1:
                print('_'*dim*6)

    #Check for win
    def checkrow(self, board):
        """Return "1win"/"2win" if some row of ``board`` is all 1s/2s, else "Nowin"."""
        for row in board:
            #Skip rows with no marks at all (this also covers zero-length rows,
            #for which np.all would otherwise be vacuously True)
            if not np.any(row != 0):
                continue
            if np.all(row == 1):
                return "1win"
            if np.all(row == 2):
                return "2win"
        return "Nowin"
    def checkcolumn(self, board):
        """Same as checkrow, applied to the columns of ``board``."""
        return self.checkrow(np.transpose(board))
    def getdiagonal(self, board):
        """Return a (2, n) float array: main diagonal in row 0, anti-diagonal in row 1.

        ``n`` is ``board.shape[0]``.
        """
        l = board.shape[0]
        idx = np.arange(l)
        output = np.zeros((2, l))
        output[0] = board[idx, idx]
        output[1] = board[idx, l - 1 - idx]
        return output
    def checkdiagonal(self, board):
        """Same as checkrow, applied to the two diagonals of ``board``."""
        return self.checkrow(self.getdiagonal(board))
    def checkwin(self, board):
        """Return the first win found among rows, columns, diagonals, else "Nowin"."""
        #Each scan runs once; its result is reused instead of being recomputed
        for check in (self.checkrow, self.checkcolumn, self.checkdiagonal):
            result = check(board)
            if result != "Nowin":
                return result
        return "Nowin"
    def checkdraw(self, board):
        """Return "Draw" for a full board with no winner, else the checkwin result."""
        result = self.checkwin(board)
        if result == "Nowin" and np.all(board != 0):
            return "Draw"
        return result

    #Update board
    def updateboard(self, pos, val):
        """Write ``val`` into the cell at ``pos`` = [row, column]."""
        [i,j] = pos
        board = self.getboardstate()
        board[i,j] = val
        self.addboardstate(board)


# Simple Detour

Try to get the agent to fill an arbitrarily sized grid.

In [1523]:
class SimGame2:
    """Toy "fill the grid" environment used as a warm-up for Q-learning.

    The board is a ``board_size x board_size`` array of 0 (empty) and 1
    (filled). A move fills one cell; the game ends when no cell is 0.
    """

    def __init__(self, dim):
        """Create an environment with an empty ``dim x dim`` board."""
        self.board_size = dim
        self.initgame()

    def initgame(self, random = False):
        """Reset the board: all zeros, or random 0/1 cells when ``random`` is true.

        The parameter name shadows the ``random`` module inside this method
        only; it is kept because callers pass it by keyword.
        """
        shape = (self.board_size, self.board_size)
        if not random:
            self.board = np.zeros(shape)
        else:
            self.board = np.random.randint(0, 2, size=shape)

    def dispboard(self, board = "Default"):
        """Print ``board``, or the environment's own board when left at "Default"."""
        #Only compare against the sentinel when the argument is a string:
        #ndarray == str is an elementwise comparison and is ambiguous in an if.
        if isinstance(board, str) and board == "Default":
            print(self.board)
        else:
            print(board)

    def fill(self, pos):
        """Try to fill the cell at ``pos`` = [row, column].

        Returns:
            (reward, done): (10, done) when the cell was empty and is now
            filled, (-10, False) when it was already filled, and (10, True)
            when the board was already full before the move.
        """
        #Check if board is filled
        if np.all(self.board != 0):
            return (10, True)

        [x,y] = pos
        if self.board[x,y] != 0:
            return (-10, False)

        self.board[x,y] = 1
        #Check if game is over
        game_status = bool(np.all(self.board != 0))
        return (10, game_status)

    def getstateid(self, board = "Defined"):
        """Encode a 0/1 board as a number: flattened cell ``i`` contributes ``2**i``.

        Uses the environment's own board when ``board`` is left at "Defined".
        """
        if isinstance(board, str) and board == "Defined":
            board = self.board
        #Flatten once instead of once per cell
        flat = np.asarray(board).flatten()
        weights = 2 ** np.arange(flat.size, dtype=np.int64)
        return (flat * weights).sum()

    def actidtopos(self, id):
        """Convert a flat action index into [row, col] (row-major order)."""
        row, col = divmod(int(id), self.board_size)
        return [row,col]
    def actpostoid(self, pos):
        """Convert [row, col] into the flat row-major action index."""
        [x,y] = pos
        return (self.board_size)*x + y
    def checkwin(self):
        """Return True when every cell of the board equals 1."""
        return bool(np.all(self.board == 1))
    

In [1739]:
class player:
    """Tabular Q-learning agent for a grid environment such as SimGame2.

    The environment must provide ``board_size``, ``board``, ``initgame``,
    ``fill``, ``getstateid``, ``actidtopos``, ``actpostoid``, ``checkwin``
    and ``dispboard``. Call ``genqtable`` before training or predicting.
    """

    def __init__(self, env):
        self.env = env
    def genqtable(self):
        """Allocate a zero Q-table: one row per 0/1 board, one column per cell."""
        board_size = self.env.board_size
        n_cells = board_size**2
        self.qtable = np.zeros((2**n_cells, n_cells))
    def train(self, epsilon = 0.1, gamma = 1, games = 4000):
        """Play ``games`` games from random boards, updating the Q-table in place.

        Args:
            epsilon: threshold on a uniform draw. NOTE: a *random* move is
                taken when the draw is greater than ``epsilon``, so here
                ``epsilon`` is the probability of acting greedily (the
                reverse of the usual epsilon-greedy convention). Kept as-is
                because existing callers rely on it.
            gamma: discount applied to the best Q-value of the next state.
            games: number of games to play.

        The total reward of each game is stored in ``self.rewardlist``.
        """
        game = self.env
        n_cells = game.board_size**2
        r = []
        for _ in range(games):
            game_status = False
            game.initgame(random = True)
            total_reward = 0

            while not game_status:
                #State the action is chosen in; the update below must target
                #this row, not the row of the state reached after the move.
                state = int(game.getstateid())

                if random.random() > epsilon:
                    x = random.randint(0, game.board_size-1)
                    y = random.randint(0, game.board_size-1)
                    pos = [x,y]
                else:
                    idx = np.argmax(self.qtable[state])
                    pos = game.actidtopos(idx)
                action = game.actpostoid(pos)

                reward, game_status = game.fill(pos)
                total_reward += reward

                #A full board is terminal, so there is no future value
                if np.sum(game.board) == n_cells:
                    q_next = 0
                else:
                    q_next = self.qtable[int(game.getstateid())].max()

                self.qtable[state, action] = reward + gamma*q_next

            r.append(total_reward)
        self.rewardlist = r
    def check_accuracy(self, safety = False):
        """Return the fraction of non-full boards whose predicted move is legal.

        Every 0/1 board is enumerated exactly once, using the same bit order
        as the environment's state id (flattened cell ``i`` is bit ``i``).
        A prediction counts as correct when it targets an empty cell.
        ``safety`` is forwarded to ``predict_action``.
        """
        dim = self.env.board_size
        n_cells = dim**2
        bit_positions = np.arange(n_cells)
        correct = 0
        wrong = 0
        for i in range(0, 2**n_cells):
            #Generate board: bit k of i is flattened cell k
            arr = ((i >> bit_positions) & 1).astype(float).reshape(dim, dim)
            if np.all(arr):
                #Full board: there is no legal move to grade
                continue
            #Check accuracy of next move
            [x,y] = self.predict_action(board = arr, safety = safety)
            if arr[x,y] == 0:
                correct += 1
            else:
                wrong += 1

        accuracy = correct/(correct+wrong)
        return accuracy

    def predict_action(self, board = "Current", safety = False):
        """Return the [row, col] of the highest-valued action for ``board``.

        Args:
            board: board to act on; the environment's own board when left at
                "Current".
            safety: when true and the greedy cell is already filled, fall
                back to the best-ranked action whose cell is not filled. If
                every cell is filled, the greedy action is returned.
        """
        #Only compare against the sentinel when the argument is a string:
        #ndarray == str is an elementwise comparison and is ambiguous in an if.
        if isinstance(board, str) and board == "Current":
            board = self.env.board
        idx = int(self.env.getstateid(board))
        qrow = self.qtable[idx]
        pos = self.env.actidtopos(int(np.argmax(qrow)))

        #Check if predicted state is occupied
        [x,y] = pos
        if safety and board[x,y] == 1:
            #Walk all actions from best to worst (stable sort: ties resolve to
            #the lowest index, like argmax), including the lowest-ranked one.
            for actid in np.argsort(-qrow, kind="stable"):
                candidate = self.env.actidtopos(int(actid))
                [cx,cy] = candidate
                if board[cx,cy] != 1:
                    pos = candidate
                    break
        return pos

    def play1move(self):
        """Predict a move for the current board, print it, and apply it."""
        pos = self.predict_action()
        print(pos)
        self.env.fill(pos)
    def play(self, random = False, max_steps = 20):
        """Play greedily until the board is full; return the number of moves.

        The loop stops once more than ``max_steps`` moves have been made, so
        a policy that keeps picking filled cells cannot loop forever.
        """
        self.env.initgame(random)
        steps = 0
        while not self.env.checkwin():
            self.env.dispboard()
            print("Hi")
            self.play1move()
            steps += 1
            if steps > max_steps:
                break
        self.env.dispboard()
        return steps

    #Hyperparameter Optimization Methods
    def gridop(self):
        """Train at 10 epsilon values in [0, 1] and plot accuracy against epsilon.

        NOTE(review): the Q-table is not reset between settings, so each
        point also benefits from the training done for earlier ones.
        """
        epsilon = np.linspace(0, 1, 10)
        acc = []
        for eps in epsilon:
            #Pass the epsilon value itself, not its index in the grid
            self.train(epsilon = eps)
            acc.append(self.check_accuracy())
        plt.plot(epsilon, acc, 'o')

In [1740]:
# Build a 3x3 fill-the-grid environment and train a Q-table player on it.
# NOTE(review): an earlier note here said "4000 games is enough", but this
# call trains for 2000 games - confirm which count is intended.
# In player.train a random move is taken when random.random() > epsilon, so
# epsilon = 0.9 means roughly 10% random moves and 90% greedy moves.
game = SimGame2(3)
p1 = player(game)
p1.genqtable()
p1.train(games = 2000, gamma = 0.2, epsilon = 0.9)


In [1741]:
# Share of the generated boards for which the predicted move targets an empty
# cell, with the occupied-cell fallback (safety) switched on.
p1.check_accuracy(safety = True)



1.0

In [1742]:
# Play one game from a random starting board; the board and each chosen move
# are printed, and the number of moves made is returned.
p1.play(random = True)


[[1 1 0]
 [1 0 1]
 [1 1 0]]
Hi
[0, 2]
[[1 1 1]
 [1 0 1]
 [1 1 0]]
Hi
[1, 1]
[[1 1 1]
 [1 1 1]
 [1 1 0]]
Hi
[2, 2]
[[1 1 1]
 [1 1 1]
 [1 1 1]]




3

# Back to Tic Tac Toe