In [2]:
import numpy as np
import pickle

In [23]:
class State:
    """Board and turn bookkeeping for a two-player, three-in-a-row game.

    The board is a ``BOARD_ROWS x BOARD_COLS`` numpy array in which ``1``
    marks a move by ``p1``, ``-1`` a move by ``p2`` and ``0`` an empty cell.
    ``BOARD_ROWS`` and ``BOARD_COLS`` are module-level names defined outside
    this class. NOTE(review): the win threshold is hard-coded to 3 and the
    diagonals are indexed as for a square board, so this presumably expects
    a 3x3 board -- confirm against the cell that defines the constants.

    The player objects are expected to provide ``chooseAction(positions,
    board, symbol)``, ``addState(board_hash)``, ``feedReward(reward)``,
    ``reset()`` and a ``name`` attribute; those are the only members this
    class uses.
    """

    def __init__(self, p1, p2):
        """Create an empty board; ``p1`` moves first with symbol ``1``."""
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.p1 = p1
        self.p2 = p2
        self.isEnd = False
        self.boardHash = None
        # P1 plays first
        self.playerSymbol = 1

    def getHash(self):
        """Return (and cache in ``self.boardHash``) a string key for the current board."""
        self.boardHash = str(self.board.reshape(BOARD_COLS * BOARD_ROWS))
        return self.boardHash

    def winner(self):
        """Evaluate the board and update ``self.isEnd``.

        Returns:
            ``1`` if a line sums to 3 (p1 wins), ``-1`` if a line sums to -3
            (p2 wins), ``0`` if there is no winner and no empty cell is left
            (tie), or ``None`` while the game is still running.
        """
        # rows
        for i in range(BOARD_ROWS):
            row_sum = sum(self.board[i, :])
            if row_sum == 3:
                self.isEnd = True
                return 1
            if row_sum == -3:
                self.isEnd = True
                return -1

        # columns
        for i in range(BOARD_COLS):
            col_sum = sum(self.board[:, i])
            if col_sum == 3:
                self.isEnd = True
                return 1
            if col_sum == -3:
                self.isEnd = True
                return -1

        # diagonals (main and anti-diagonal)
        diag_sum1 = sum([self.board[i, i] for i in range(BOARD_ROWS)])
        diag_sum2 = sum([self.board[i, BOARD_COLS - i - 1] for i in range(BOARD_COLS)])

        if max(diag_sum1, diag_sum2) == 3:
            self.isEnd = True
            return 1
        if min(diag_sum1, diag_sum2) == -3:
            self.isEnd = True
            return -1

        # tie: nobody won and there is no empty cell left
        if len(self.availablePositions()) == 0:
            self.isEnd = True
            return 0

        self.isEnd = False
        return None

    def availablePositions(self):
        """Return the ``(row, col)`` tuples of all empty cells, in row-major order."""
        # Fix: the original appended to an undefined name (`position`),
        # which raised NameError as soon as an empty cell was found.
        return [
            (i, j)
            for i in range(BOARD_ROWS)
            for j in range(BOARD_COLS)
            if self.board[i, j] == 0
        ]

    def updateState(self, position):
        """Place the current player's symbol at ``position`` and pass the turn."""
        self.board[position] = self.playerSymbol
        # alternate player
        self.playerSymbol = -1 if self.playerSymbol == 1 else 1

    def giveReward(self):
        """Feed the end-of-game reward to both players based on ``winner()``."""
        result = self.winner()
        # backpropagate reward
        if result == 1:
            self.p1.feedReward(1)
            self.p2.feedReward(0)
        elif result == -1:
            self.p1.feedReward(0)
            self.p2.feedReward(1)
        else:
            # Any other result (a tie in normal use) rewards the two players
            # asymmetrically: 0.1 for p1 and 0.5 for p2.
            self.p1.feedReward(0.1)
            self.p2.feedReward(0.5)

    def reset(self):
        """Clear the board and restore the initial turn order."""
        self.board = np.zeros((BOARD_ROWS, BOARD_COLS))
        self.isEnd = False
        self.boardHash = None
        self.playerSymbol = 1

    def _finishTrainingGame(self):
        """Hand out rewards, then reset both players and the board."""
        self.giveReward()
        self.p1.reset()
        self.p2.reset()
        self.reset()

    def play(self, rounds=100):
        """Let ``p1`` and ``p2`` play ``rounds`` games against each other.

        After every move the resulting board hash is recorded on the player
        that moved; when a game ends, rewards are distributed and all state
        is reset for the next round.
        """
        for i in range(rounds):
            if i % 1000 == 0:
                print(f'Round {i}')

            while not self.isEnd:
                # Player 1
                positions = self.availablePositions()
                p1_action = self.p1.chooseAction(positions, self.board, self.playerSymbol)
                # take action and update board
                self.updateState(p1_action)
                board_hash = self.getHash()
                self.p1.addState(board_hash)

                if self.winner() is not None:
                    self._finishTrainingGame()
                    break

                # Player 2
                positions = self.availablePositions()
                p2_action = self.p2.chooseAction(positions, self.board, self.playerSymbol)
                self.updateState(p2_action)
                board_hash = self.getHash()
                self.p2.addState(board_hash)

                if self.winner() is not None:
                    self._finishTrainingGame()
                    break

    # play with human
    # Fix: `play2` and `showBoard` were indented inside `play`, which made
    # them unused local functions instead of methods of the class.
    def play2(self):
        """Play a single game, printing the board after every move.

        Announces the winner (or a tie) and resets the board when the game
        is over. No rewards are fed to the players.
        """
        while not self.isEnd:
            # Player 1
            positions = self.availablePositions()
            p1_action = self.p1.chooseAction(positions, self.board, self.playerSymbol)
            # take action and update board
            self.updateState(p1_action)
            self.showBoard()

            result = self.winner()
            if result is not None:
                if result == 1:
                    print(self.p1.name, 'WINS!')
                else:
                    print('TIE!')
                self.reset()
                break

            # Player 2
            positions = self.availablePositions()
            p2_action = self.p2.chooseAction(positions, self.board, self.playerSymbol)
            self.updateState(p2_action)
            self.showBoard()

            result = self.winner()
            if result is not None:
                if result == -1:
                    print(self.p2.name, 'WINS!')
                else:
                    print('TIE!')
                self.reset()
                break

    def showBoard(self):
        """Print the board as a text grid: ``x`` for 1, ``O`` for -1, blank for 0."""
        symbols = {1: 'x', -1: 'O', 0: ' '}
        n_rows, n_cols = self.board.shape
        # Each cell is 4 characters wide plus one closing bar; this yields
        # the original 13-dash rule for a 3-column board.
        rule = '-' * (4 * n_cols + 1)
        for i in range(n_rows):
            print(rule)
            out = '| '
            for j in range(n_cols):
                out += symbols[int(self.board[i, j])] + ' | '
            print(out)
        print(rule)
        

In [26]:
# Demo board: 1 is drawn as 'x', -1 as 'O' and 0 as an empty cell.
board = np.array([[1, 0, -1], [0, 1, -1], [0, 0, 1]])

# Lookup table from cell value to the character printed for it.
symbols = {1: 'x', -1: 'O', 0: ' '}
rule = '-------------'

# Print a horizontal rule before every row and one more after the last row.
for row in board:
    print(rule)
    print('| ' + ''.join(symbols[int(cell)] + ' | ' for cell in row))
print(rule)

-------------
| x |   | O | 
-------------
|   | x | O | 
-------------
|   |   | x | 
-------------
