In [1]:
import numpy as np
import pickle # save and load serialized data (dictionaries, lists...)

In [2]:
# Board dimensions (rows x columns) used by the game state.
BOARD_ROWS = BOARD_COLS = 3

In [3]:
class State:
    """Tic-tac-toe environment shared by two players.

    The board is a BOARD_ROWS x BOARD_COLS numpy array in which 0 marks an
    empty cell, 1 a cell taken by ``p1`` and -1 a cell taken by ``p2``.
    ``symbol`` holds the mark that the next move will place; it starts at 1,
    so ``p1`` always moves first. ``isEnd`` mirrors the result of the most
    recent ``checkWinner`` call.
    """

    def __init__(self, p1, p2):
        self.board = np.zeros([BOARD_ROWS, BOARD_COLS])
        self.p1 = p1
        self.p2 = p2
        self.isEnd = False
        self.symbol = 1

    def availablePositions(self):
        """Return the (row, col) tuples of all empty cells in row-major order."""
        return [
            (i, j)
            for i in range(BOARD_ROWS)
            for j in range(BOARD_COLS)
            if self.board[i, j] == 0
        ]

    def updateBoard(self, position):
        """Place the current symbol at ``position`` and pass the turn.

        Raises:
            ValueError: if ``position`` is None. numpy treats a None index as
                a new axis, so the assignment would silently overwrite every
                cell of the board.
        """
        if position is None:
            raise ValueError('position must be a (row, col) tuple, got None')
        self.board[position] = self.symbol
        self.symbol = -1 if self.symbol == 1 else 1

    def checkWinner(self):
        """Evaluate the board and update ``isEnd`` accordingly.

        Returns:
            1 if p1 owns a complete line, -1 if p2 does, 0 if the board is
            full without a winner (tie), None if the game is still running.
        """
        # NOTE: the loop indexes columns with a row index and the diagonal
        # check walks BOARD_ROWS cells, so a square board is assumed.
        for i in range(BOARD_ROWS):
            # Row is checked before column, matching the original order.
            for line_sum in (sum(self.board[i, :]), sum(self.board[:, i])):
                if line_sum == BOARD_COLS:
                    self.isEnd = True
                    return 1
                if line_sum == -BOARD_COLS:
                    self.isEnd = True
                    return -1

        diag_1 = sum(self.board[i, i] for i in range(BOARD_ROWS))
        diag_2 = sum(self.board[i, BOARD_COLS - i - 1] for i in range(BOARD_ROWS))
        if diag_1 == BOARD_COLS or diag_2 == BOARD_COLS:
            self.isEnd = True
            return 1
        if diag_1 == -BOARD_COLS or diag_2 == -BOARD_COLS:
            self.isEnd = True
            return -1

        # tie: no empty cell left and nobody completed a line
        if len(self.availablePositions()) == 0:
            self.isEnd = True
            return 0

        # not finished
        self.isEnd = False
        return None

    def resetState(self):
        """Clear the board and hand the first move back to p1."""
        self.board = np.zeros([BOARD_ROWS, BOARD_COLS])
        self.isEnd = False
        self.symbol = 1

    def reward(self):
        """Feed the end-of-game reward to both players.

        The winner receives 1 and the loser 0. On a tie p1 receives 0.1 and
        p2 receives 0.3. Nothing happens while the game is still running.
        """
        winner = self.checkWinner()
        if winner is None:
            return
        if winner == 1:
            self.p1.feedReward(1)
            self.p2.feedReward(0)
        elif winner == -1:
            self.p1.feedReward(0)
            self.p2.feedReward(1)
        else:
            self.p1.feedReward(0.1)
            self.p2.feedReward(0.3)

    def _trainTurn(self, player):
        """Let ``player`` move during training; return True if the game ended.

        The resulting board is hashed with the moving player's own
        ``getHash`` and appended to that player's state history. When the
        move ends the game, rewards are distributed and both players and the
        board are reset for the next game.
        """
        positions = self.availablePositions()
        action = player.chooseAction(positions, self.board, self.symbol)
        self.updateBoard(action)
        player.addState(player.getHash(self.board))
        if self.checkWinner() is None:
            return False
        self.reward()
        self.p1.resetPlayer()
        self.p2.resetPlayer()
        self.resetState()
        return True

    def train(self, num=10000):
        """Run ``num`` self-play games between p1 and p2, printing progress
        every 1000 games."""
        for i in range(num):
            if i % 1000 == 0:
                print('Game: {}'.format(i))
            while not self.isEnd:
                # p1 moves first; p2 only moves if p1 did not finish the game
                if self._trainTurn(self.p1) or self._trainTurn(self.p2):
                    break

    def playHuman(self):
        """Play one game of p1 (trained agent) against p2 (human), printing
        the board after every move and the result at the end."""
        while not self.isEnd:
            positions = self.availablePositions()
            p1_action = self.p1.chooseAction(positions, self.board, self.symbol)
            self.updateBoard(p1_action)
            self.showBoard()
            win = self.checkWinner()
            if win is None:
                positions = self.availablePositions()
                p2_action = self.p2.chooseAction(positions)
                # An invalid entry is reported as None; ask again instead of
                # writing None into the board (which would fill every cell).
                while p2_action is None:
                    p2_action = self.p2.chooseAction(positions)
                self.updateBoard(p2_action)
                self.showBoard()
                win = self.checkWinner()
            if win is not None:
                if win == 1:
                    print(self.p1.name + ' wins!')
                elif win == -1:
                    print(self.p2.name + ' wins!')
                else:
                    print('Tie!')
                self.resetState()
                break

    def playRandom(self, num=1000):
        """Play ``num`` games of p1 against p2 and print the win/tie tally.

        p1 is queried with (positions, board, symbol); p2 is queried with
        positions only.
        """
        wins = {1: 0, -1: 0}
        for _ in range(num):
            while not self.isEnd:
                positions = self.availablePositions()
                p1_action = self.p1.chooseAction(positions, self.board, self.symbol)
                self.updateBoard(p1_action)
                win = self.checkWinner()
                if win is None:
                    positions = self.availablePositions()
                    p2_action = self.p2.chooseAction(positions)
                    self.updateBoard(p2_action)
                    win = self.checkWinner()
                if win is not None:
                    if win in wins:
                        wins[win] += 1
                    self.resetState()
                    break
        p1, p2 = wins[1], wins[-1]
        print('Trained agent({}) won: {} out of {}'.format(self.p1.name, p1, num))
        print('Untrained agent({}) won: {} out of {}'.format(self.p2.name, p2, num))
        print('Ties: {} out of {}'.format(num - p1 - p2, num))
        print('------------------------------------------')

    def showBoard(self):
        """Print the board as an ASCII grid (p1: x, p2: o)."""
        tokens = {1: 'x', -1: 'o', 0: ' '}
        for i in range(BOARD_ROWS):
            print('-------------')
            out = '| '
            for j in range(BOARD_COLS):
                # '?' makes an unexpected cell value visible instead of
                # failing on an unbound token.
                out += tokens.get(self.board[i, j], '?') + ' | '
            print(out)
        print('-------------')
                
            

In [4]:
class Player:
    """Learning agent that keeps a value estimate per board hash.

    Attributes:
        name: label used in messages and in the saved policy file name.
        exp_rate: probability of picking a random move instead of the greedy one.
        lr: step size of the value update in ``feedReward``.
        gamma: discount applied to the reward propagated backwards.
        states_value: dict mapping board hash -> estimated value.
        states: board hashes recorded during the current game.
        num_games: stored from the ``num`` argument; not read inside this class.
    """

    def __init__(self, name, num=10000, exploration_rate=0.2):
        self.name = name
        self.exp_rate = exploration_rate
        self.lr = 0.5
        self.gamma = 0.9
        self.states_value = {}
        self.states = []
        self.num_games = num

    def getHash(self, board):
        """Return the string form of the flattened board, used as dict key."""
        return str(board.reshape(board.size))

    def chooseAction(self, positions, board, symbol):
        """Pick a move from ``positions`` for the player placing ``symbol``.

        With probability ``exp_rate`` a random position is returned;
        otherwise the position whose resulting board has the highest stored
        value (unknown boards count as 0). Ties go to the last such position.

        Raises:
            ValueError: if ``positions`` is empty.
        """
        if len(positions) == 0:
            raise ValueError('no available positions to choose from')

        if np.random.uniform(0, 1) <= self.exp_rate:
            return positions[np.random.choice(len(positions))]

        best_value = float('-inf')
        action = None
        for p in positions:
            new_board = board.copy()
            new_board[p] = symbol
            value = self.states_value.get(self.getHash(new_board))
            if value is None:
                value = 0
            # '<=' keeps the original tie-breaking: the last best move wins
            if best_value <= value:
                best_value = value
                action = p
        return action

    def addState(self, state):
        """Record a board hash visited in the current game."""
        self.states.append(state)

    def feedReward(self, reward):
        """Propagate ``reward`` backwards through the recorded states.

        Each state moves towards ``gamma * reward`` by a fraction ``lr``; the
        updated value then serves as the reward for the preceding state.
        """
        for state in reversed(self.states):
            value = self.states_value.get(state)
            if value is None:
                value = 0
            value += self.lr * (self.gamma * reward - value)
            self.states_value[state] = value
            reward = value

    def resetPlayer(self):
        """Forget the states of the finished game (learned values are kept)."""
        self.states = []

    def savePolicy(self, num):
        """Pickle ``states_value`` to the file 'policy_<name>_<num>'."""
        # 'with' guarantees the handle is closed even if pickling fails
        with open('policy_' + str(self.name) + '_' + str(num), 'wb') as fw:
            pickle.dump(self.states_value, fw)

    def loadPolicy(self, file):
        """Replace ``states_value`` with the dict pickled in ``file``.

        WARNING: unpickling can execute arbitrary code; only load policy
        files that this notebook (or another trusted source) produced.
        """
        with open(file, 'rb') as fr:
            self.states_value = pickle.load(fr)
        

In [5]:
class Human:
    """Player whose moves are typed in on standard input."""

    def __init__(self, name='human'):
        self.name = name

    def chooseAction(self, positions):
        """Prompt for a row and a column until a legal move is entered.

        Args:
            positions: list of (row, col) tuples that are still free.

        Returns:
            The chosen (row, col) tuple; always an element of ``positions``.

        Raises:
            ValueError: if ``positions`` is empty (no legal move exists, so
                prompting would never terminate).
        """
        if len(positions) == 0:
            raise ValueError('no available positions to choose from')

        # Keep asking instead of returning None: a None action used as a
        # numpy index would overwrite the whole board.
        while True:
            try:
                print('Input row number: ')
                row = int(input())
                print('Input col number: ')
                col = int(input())
            except ValueError:
                # non-integer input
                print('Invalid selected position.')
                continue
            action = (row, col)
            if action in positions:
                return action
            print('Invalid selected position.')

In [6]:
class UntrainedAgent:
    """Baseline opponent that picks its move at random."""

    def __init__(self, name='untrained'):
        self.name = name

    def chooseAction(self, positions):
        """Return one entry of ``positions`` selected via ``np.random.choice``."""
        index = np.random.choice(len(positions))
        return positions[index]

In [7]:
# TRAINING
# For every game budget, train a fresh pair of agents by self-play and
# save both learned value tables to disk.
episodes = [10000, 20000, 50000, 100000]
for episode in episodes:
    p1, p2 = Player('player_1'), Player('player_2')

    # self-play for `episode` games
    state = State(p1, p2)
    state.train(episode)

    # files are named policy_<player name>_<episode>
    p1.savePolicy(episode)
    p2.savePolicy(episode)

Game: 0
Game: 1000
Game: 2000
Game: 3000
Game: 4000
Game: 5000
Game: 6000
Game: 7000
Game: 8000
Game: 9000
Game: 0
Game: 1000
Game: 2000
Game: 3000
Game: 4000
Game: 5000
Game: 6000
Game: 7000
Game: 8000
Game: 9000
Game: 10000
Game: 11000
Game: 12000
Game: 13000
Game: 14000
Game: 15000
Game: 16000
Game: 17000
Game: 18000
Game: 19000
Game: 0
Game: 1000
Game: 2000
Game: 3000
Game: 4000
Game: 5000
Game: 6000
Game: 7000
Game: 8000
Game: 9000
Game: 10000
Game: 11000
Game: 12000
Game: 13000
Game: 14000
Game: 15000
Game: 16000
Game: 17000
Game: 18000
Game: 19000
Game: 20000
Game: 21000
Game: 22000
Game: 23000
Game: 24000
Game: 25000
Game: 26000
Game: 27000
Game: 28000
Game: 29000
Game: 30000
Game: 31000
Game: 32000
Game: 33000
Game: 34000
Game: 35000
Game: 36000
Game: 37000
Game: 38000
Game: 39000
Game: 40000
Game: 41000
Game: 42000
Game: 43000
Game: 44000
Game: 45000
Game: 46000
Game: 47000
Game: 48000
Game: 49000
Game: 0
Game: 1000
Game: 2000
Game: 3000
Game: 4000
Game: 5000
Game: 6000
Game:

In [8]:
# TESTING
# Evaluate each saved player_1 policy, played greedily (no exploration),
# over 1000 games against an opponent that moves at random.
for episode in episodes:
    p2 = UntrainedAgent('untrained')

    p1 = Player('computer', exploration_rate=0)
    p1.loadPolicy('policy_player_1_' + str(episode))

    print('Number of episodes trained: {}'.format(episode))
    state = State(p1, p2)
    state.playRandom(1000)

Number of episodes trained: 10000
Trained agent(computer) won: 997 out of 1000
Untrained agent(untrained) won: 0 out of 1000
Ties: 3 out of 1000
------------------------------------------
Number of episodes trained: 20000
Trained agent(computer) won: 906 out of 1000
Untrained agent(untrained) won: 42 out of 1000
Ties: 52 out of 1000
------------------------------------------
Number of episodes trained: 50000
Trained agent(computer) won: 962 out of 1000
Untrained agent(untrained) won: 3 out of 1000
Ties: 35 out of 1000
------------------------------------------
Number of episodes trained: 100000
Trained agent(computer) won: 952 out of 1000
Untrained agent(untrained) won: 7 out of 1000
Ties: 41 out of 1000
------------------------------------------


In [10]:
# PLAY HUMAN
# opponent controlled from the keyboard
p2 = Human('human')

# greedy agent (no exploration) using the player_1 policy saved after
# 10000 training games
p1 = Player('computer', exploration_rate=0)
p1.loadPolicy('policy_player_1_' + str(10000))

state = State(p1, p2)
state.playHuman()

-------------
|   |   |   | 
-------------
|   |   |   | 
-------------
|   |   | x | 
-------------
Input row number: 
1
Input col number: 
1
-------------
|   |   |   | 
-------------
|   | o |   | 
-------------
|   |   | x | 
-------------
-------------
|   |   |   | 
-------------
|   | o | x | 
-------------
|   |   | x | 
-------------
Input row number: 
0
Input col number: 
2
-------------
|   |   | o | 
-------------
|   | o | x | 
-------------
|   |   | x | 
-------------
-------------
|   |   | o | 
-------------
|   | o | x | 
-------------
| x |   | x | 
-------------
Input row number: 
2
Input col number: 
1
-------------
|   |   | o | 
-------------
|   | o | x | 
-------------
| x | o | x | 
-------------
-------------
|   | x | o | 
-------------
|   | o | x | 
-------------
| x | o | x | 
-------------
Input row number: 
0
Input col number: 
0
-------------
| o | x | o | 
-------------
|   | o | x | 
-------------
| x | o | x | 
-------------
-------------
| o | x | 