In [1]:
import numpy as np
import pickle

In [10]:
# Board dimensions: number of rows and columns of the playing grid.
ROWS, COLS = 3, 3

In [8]:
class State:
    """Game board plus the turn loop that drives two players.

    Cells of ``board`` hold 1 for player 1, -1 for player 2 and 0 when empty.
    ``Psymbol`` is the symbol of the player whose turn it is.
    """

    def __init__(self, p1, p2):
        """Create an empty ROWS x COLS board for players ``p1`` and ``p2``."""
        self.board = np.zeros((ROWS, COLS))
        self.p1 = p1
        self.p2 = p2
        self.boardHash = None
        self.isDone = False
        self.Psymbol = 1  # Player 1 plays first

    def getHash(self):
        """Return (and cache in ``boardHash``) the flattened board as a string."""
        self.boardHash = str(self.board.reshape(-1))
        return self.boardHash

    def det_win(self):
        """Determine the game result and update ``isDone``.

        Returns:
            1 if player 1 owns a complete line, -1 if player 2 does,
            0 if the board is full without a winner, ``None`` if the game
            is still running.
        """
        n_rows, n_cols = self.board.shape

        # (line sum, line length) pairs, in the order rows, columns, diagonals.
        lines = [(self.board[r, :].sum(), n_cols) for r in range(n_rows)]
        lines += [(self.board[:, c].sum(), n_rows) for c in range(n_cols)]
        if n_rows == n_cols:
            # Diagonals only span the whole board when it is square.
            lines.append((sum(self.board[i, i] for i in range(n_rows)), n_rows))
            lines.append(
                (sum(self.board[i, n_cols - i - 1] for i in range(n_rows)), n_rows)
            )

        for total, length in lines:
            # A line filled by one player sums to +length or -length.
            if total == length:
                self.isDone = True
                return 1
            if total == -length:
                self.isDone = True
                return -1

        if len(self.available()) == 0:
            self.isDone = True
            return 0

        self.isDone = False
        return None

    def available(self):
        """Return the (row, col) tuples of all empty cells, in row-major order."""
        n_rows, n_cols = self.board.shape
        return [
            (i, j)
            for i in range(n_rows)
            for j in range(n_cols)
            if self.board[i, j] == 0
        ]

    def updt_brd(self, pos):
        """Place the current player's symbol at ``pos`` and switch the turn."""
        self.board[pos] = self.Psymbol
        self.Psymbol = -1 if self.Psymbol == 1 else 1

    def giveReward(self):
        """Feed end-of-game rewards to both players based on ``det_win()``.

        The winner receives 1 and the loser 0.  Any other result (a draw, or
        an unfinished game) gives 0.1 to player 1 and 0.5 to player 2.
        """
        result = self.det_win()
        if result == 1:
            self.p1.feedReward(1)
            self.p2.feedReward(0)
        elif result == -1:
            self.p1.feedReward(0)
            self.p2.feedReward(1)
        else:
            self.p1.feedReward(0.1)
            self.p2.feedReward(0.5)

    def reset(self):
        """Clear the board and restore the initial turn order."""
        self.board = np.zeros(self.board.shape)
        self.boardHash = None
        self.isDone = False
        self.Psymbol = 1

    def _training_turn(self, player):
        """Let ``player`` move during training; return True if the game ended.

        When the move ends the game, rewards are distributed and both the
        players and the board are reset for the next round.
        """
        action = player.chooseAction(self.available(), self.board, self.Psymbol)
        self.updt_brd(action)
        player.addState(self.getHash())

        if self.det_win() is None:
            return False

        self.giveReward()
        self.p1.reset()
        self.p2.reset()
        self.reset()
        return True

    def play(self, rounds=100):
        """Play ``rounds`` self-play games, rewarding the players after each.

        Progress is printed every 1000 rounds.
        """
        for i in range(rounds):
            if i % 1000 == 0:
                print(f"Round:{i}")
            while not self.isDone:
                # reset() clears isDone, so leave the loop explicitly once
                # either player's move has finished the game.
                if self._training_turn(self.p1) or self._training_turn(self.p2):
                    break

    def _shown_turn(self, action, player, winning_result):
        """Apply ``action``, print the board, and announce the result if any.

        ``winning_result`` is the ``det_win()`` value that means ``player``
        won; any other finished result is announced as a tie.  Returns True
        if the game ended (the board is reset in that case).
        """
        self.updt_brd(action)
        self.showBoard()
        result = self.det_win()
        if result is None:
            return False

        if result == winning_result:
            print(player.name, "wins!")
        else:
            print("Tie!")
        self.reset()
        return True

    def play2(self):
        """Play one displayed game: ``p1`` is asked with the board and symbol,
        ``p2`` only with the list of free positions."""
        while not self.isDone:
            p1_action = self.p1.chooseAction(
                self.available(), self.board, self.Psymbol
            )
            if self._shown_turn(p1_action, self.p1, 1):
                break

            p2_action = self.p2.chooseAction(self.available())
            if self._shown_turn(p2_action, self.p2, -1):
                break

    def showBoard(self):
        """Print the board; player 1 is drawn as 'x', player 2 as 'o'."""
        n_rows, n_cols = self.board.shape
        separator = '-' * (4 * n_cols + 1)  # 13 dashes for a 3-column board
        for i in range(n_rows):
            print(separator)
            out = '| '
            for j in range(n_cols):
                cell = self.board[i, j]
                if cell == 1:
                    token = 'x'
                elif cell == -1:
                    token = 'o'
                elif cell == 0:
                    token = ' '
                else:
                    # Unexpected value: show a marker instead of reusing a
                    # stale token or failing on an unbound name.
                    token = '?'
                out += token + ' | '
            print(out)
        print(separator)
        
class Player:
    """Learning agent that keeps a value estimate per board state.

    ``states_value`` maps a board hash (see ``getHash``) to its estimated
    value; ``states`` lists the hashes visited in the current game.
    """

    def __init__(self, name, exp_rate=0.3):
        """Create a player.

        Args:
            name: Display name of the player.
            exp_rate: Probability of picking a random move instead of the
                best-valued one in ``chooseAction``.
        """
        self.name = name
        self.states = []
        self.lr = 0.2
        self.exp_rate = exp_rate
        self.decay_gamma = 0.9
        self.states_value = {}

    def getHash(self, board):
        """Return the flattened ``board`` rendered as a string."""
        return str(board.reshape(-1))

    def chooseAction(self, positions, current_board, symbol):
        """Pick one of ``positions`` for the player using ``symbol``.

        With probability ``exp_rate`` a random position is returned;
        otherwise the position whose resulting board has the highest stored
        value (unknown boards count as 0, ties go to the later position).

        Raises:
            ValueError: If ``positions`` is empty.
        """
        if len(positions) == 0:
            raise ValueError("no available positions to choose from")

        if np.random.uniform(0, 1) <= self.exp_rate:
            idx = np.random.choice(len(positions))
            action = positions[idx]
        else:
            value_max = -999
            for p in positions:
                next_board = current_board.copy()
                next_board[p] = symbol
                value = self.states_value.get(self.getHash(next_board))
                if value is None:
                    value = 0
                if value >= value_max:
                    value_max = value
                    action = p
            # NOTE(review): the policy file is rewritten on every greedy move.
            # Kept because the script loads 'policy' without any other save
            # call; consider saving once after training instead.
            self.savePolicy()
        return action

    def addState(self, state):
        """Record a visited board hash for the current game."""
        self.states.append(state)

    def feedReward(self, reward):
        """Propagate ``reward`` backwards through the visited states.

        Each state's value moves towards ``decay_gamma * reward`` by a step
        of ``lr``; the updated value is then used as the reward for the
        preceding state.
        """
        for st in reversed(self.states):
            if self.states_value.get(st) is None:
                self.states_value[st] = 0
            self.states_value[st] += self.lr * (
                self.decay_gamma * reward - self.states_value[st]
            )
            reward = self.states_value[st]

    def reset(self):
        """Forget the states visited in the finished game."""
        self.states = []

    def savePolicy(self, file='policy'):
        """Pickle ``states_value`` to ``file`` (default ``'policy'``)."""
        # The context manager closes the file even if pickling fails.
        with open(file, 'wb') as fw:
            pickle.dump(self.states_value, fw)

    def loadPolicy(self, file):
        """Replace ``states_value`` with the table pickled in ``file``."""
        # SECURITY: pickle can execute arbitrary code while loading; only
        # load policy files that come from a trusted source.
        with open(file, 'rb') as fr:
            self.states_value = pickle.load(fr)


class HumanPlayer:
    """Player whose moves are typed in on the console.

    It exposes the same hooks as the learning player (``addState``,
    ``feedReward``, ``reset``) but they do nothing.
    """

    def __init__(self, name):
        """Store the display ``name`` of the human player."""
        self.name = name

    def chooseAction(self, positions):
        """Prompt for a row and column until a free position is entered.

        Args:
            positions: The (row, col) tuples that may be chosen.

        Returns:
            The chosen (row, col) tuple, guaranteed to be in ``positions``.
        """
        while True:
            try:
                row = int(input("Input your action row:"))
                col = int(input("Input your action col:"))
            except ValueError:
                # Non-numeric input used to abort the game; ask again instead.
                print("Please enter whole numbers for row and col.")
                continue
            action = (row, col)
            if action in positions:
                return action

    # append a hash state
    def addState(self, state):
        """No-op: a human player does not track visited states."""
        pass

    # at the end of game, backpropagate and update states value
    def feedReward(self, reward):
        """No-op: a human player does not learn from rewards."""
        pass

    def reset(self):
        """No-op: there is no per-game state to clear."""
        pass


if __name__ == "__main__":
    # Self-play setup: two learning agents sharing one game state.
    trainer_one = Player("p1")
    trainer_two = Player("p2")
    training_game = State(trainer_one, trainer_two)
    # Training is currently disabled; uncomment the lines below to run it.
    # print("training...")
    # training_game.play(50000)

    # Human-vs-computer match: the computer never explores (exp_rate=0) and
    # uses the value table stored in the "policy" file.
    computer = Player("computer", exp_rate=0)
    computer.loadPolicy("policy")
    human = HumanPlayer("human")

    match = State(computer, human)
    match.play2()

-------------
|   |   |   | 
-------------
|   |   |   | 
-------------
|   | x |   | 
-------------
Input your action row:2
Input your action col:2
-------------
|   |   |   | 
-------------
|   |   |   | 
-------------
|   | x | o | 
-------------
-------------
|   |   | x | 
-------------
|   |   |   | 
-------------
|   | x | o | 
-------------
Input your action row:1
Input your action col:1
-------------
|   |   | x | 
-------------
|   | o |   | 
-------------
|   | x | o | 
-------------
-------------
| x |   | x | 
-------------
|   | o |   | 
-------------
|   | x | o | 
-------------
Input your action row:0
Input your action col:1
-------------
| x | o | x | 
-------------
|   | o |   | 
-------------
|   | x | o | 
-------------
-------------
| x | o | x | 
-------------
| x | o |   | 
-------------
|   | x | o | 
-------------
Input your action row:2
Input your action col:0
-------------
| x | o | x | 
-------------
| x | o |   | 
-------------
| o | x | o | 
-------------
