In [18]:
import numpy as np
import time
import copy

In [19]:
class TicTacToe:
    def __init__(self):
        self.state = np.zeros(9)
        self.empty_spots = np.arange(9)
        self.qtable = {}
        self.done = False
        self.reward = 0
    
    def init(self):
        self.state.fill(0)
        self.empty_spots = np.arange(9)
        player_side = np.random.choice([1,-1])
        opponent_side = -1*player_side
        self.done = False
        return str(self.state),player_side,opponent_side
    
    def check_win(self):
        self.done = True
        for i in range(3):
            if np.absolute(self.state[3*i]+self.state[3*i+1]+self.state[3*i+2]) == 3:
                return True
            if np.absolute(self.state[i]+self.state[i+3]+self.state[i+6]) == 3:
                return True
        if np.absolute(self.state[0] + self.state[4] + self.state[8]) == 3:
            return True
        if np.absolute(self.state[2] + self.state[4] + self.state[6]) == 3:
            return True
        
        self.done = False
        return False
    
    def check_draw(self):
        if np.any(self.state == 0):
            return False
        self.done = True
        return True
    
    def action(self,pos,side):
        self.state[pos] = side
        if self.check_win():
            self.reward = 5
            return self.reward
        if self.check_draw():
            self.reward = 0.5
            return self.reward
        self.reward = 0.1
        return self.reward

    
    def random_agent_step(self,side):
        # self.render()
        # print(side, self.qtable, self.empty_spots)
        pos = np.random.choice(self.empty_spots)
        self.reward = self.action(pos,side)
        return pos
        
    def safe_agent_step(self,side):
        for pos in self.empty_spots:
            self.reward = self.action(pos,side)
            if self.reward == 5:
                return pos
            else:
                self.state[pos] = 0
        
        for pos in self.empty_spots:
            self.reward= self.action(pos,-1*side)
            if self.reward == 5:
                self.reward = self.action(pos,side)
                return pos
            else:
                self.state[pos] = 0
                
        pos = self.random_agent_step(side)
        return pos
    
    def player_to_char(self,pos):
        if self.state[pos] == 1:
            return 'x'
        if self.state[pos] == -1:
            return 'o'
        return ' '
                
    def display(self):
        for i in range(3):
            print(self.player_to_char(i*3),'||',self.player_to_char(i*3 + 1),'||',self.player_to_char(i*3 +2) )
            if i != 2:
                print('===='*3)
                
        print("")
    def update_qtable(self,):
        if str(self.state) not in self.qtable:
            self.qtable[str(self.state)] = np.zeros(9)
        return self.qtable
    def chose_action(self,):
        if str(self.state) in self.qtable:
            max_pos = np.argmax(self.qtable[str(self.state)][self.empty_spots])
            return self.empty_spots[max_pos]
        else:
            self.qtable[str(self.state)] = np.zeros(9)
            return self.empty_spots[0]

In [20]:
def train(game,learning_rate,gamma,opponent_choice, num_train_games = 10000, num_val_games = 100):
    epsilon = 1
    max_epsilon = 1
    min_epsilon = 0.01
    decay_rate = 0.05
    
    for episode in range(num_train_games):
        if episode%500 == 0:
            print('--------------Episode : {} --------------'.format(episode))
        state,player_side,opponent_side = game.init()
        opponent = np.random.choice(opponent_choice)
        turn = 1
        
        if episode == 0:
            game.update_qtable()
        while not game.done:
            if turn == opponent_side:
                if opponent == 0:
                    pos = game.random_agent_step(opponent_side)
                else:
                    pos = game.safe_agent_step(opponent_side)
            else:
                threshold = np.random.uniform(0,1)
                if threshold > epsilon:
                    pos = game.chose_action()
                    game.action(pos,player_side)
                else:
                    pos = game.random_agent_step(player_side)

            game.update_qtable()
            game.qtable[state][pos] = game.qtable[state][pos] + learning_rate*(game.reward-gamma*np.max(game.qtable[str(game.state)]) - game.qtable[state][pos])
            turn *= -1
            game.empty_spots = game.empty_spots[game.empty_spots != pos]
            state = str(game.state)
        
        if episode%500 == 0 or episode==num_train_games-1:
            test(copy.deepcopy(game),game.qtable,num_test_games = num_val_games,opponent_choice=opponent_choice, flag=0)
            
        epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
    

def test(game,qtable,num_test_games=100,opponent_choice=[0],flag=0):
    wins_random = 0
    draws_random = 0
    loss_random = 0
    wins_safe = 0
    draws_safe = 0
    loss_safe = 0
    games_random = 0

    game.init()
    game.qtable = qtable

    for eps in range(num_test_games):
        state,player_side,opponent_side = game.init()
        finish = False
        opponent = np.random.choice(opponent_choice)
        turn = 1
        
        if opponent == 0:
            games_random += 1
        while not game.done:
            if turn == opponent_side:
                if opponent == 0:
                    pos = game.random_agent_step(opponent_side)
                else:
                    pos = game.safe_agent_step(opponent_side)
            else:
                pos = game.chose_action()
                reward = game.action(pos,player_side)
            if game.done and game.reward == 5 and turn == player_side:
                if opponent == 0:
                    wins_random += 1
                else:
                    wins_safe += 1
            if game.done and game.reward == 5 and turn == opponent_side:
                if opponent == 0:
                    loss_random += 1
                else:
                    loss_safe += 1
            if game.done and game.reward == 0.5:
                if opponent == 0:
                    draws_random += 1
                else:
                    draws_safe += 1

            turn *= -1
            state = str(game.state)
            game.empty_spots = game.empty_spots[game.empty_spots != pos]

    if flag == 0:
        print('Games : ',num_test_games ,'\tWins : ',wins_random+wins_safe,'\tDraws : ',draws_random+draws_safe,'\tLosses : ',loss_random+loss_safe)
    elif flag == 1:
        print('Stats :')
        print('\t\t\t Games Played \t Games Won \t Games drawn \t Games Lost')
        print('Against Random Agent\t',games_random,'\t\t',wins_random,'\t\t',draws_random,'\t\t',loss_random)
        print('Against Safe Agent\t',num_test_games-games_random,'\t\t',wins_safe,'\t\t',draws_safe,'\t\t',loss_safe)
        print('Total\t\t\t',num_test_games,'\t\t',wins_random+wins_safe,'\t\t',draws_random+draws_safe,'\t\t',loss_random+loss_safe)

# Trained against Random agent

In [21]:
game_random = TicTacToe()
train(game_random,0.7,0.7,[0])

--------------Episode : 0 --------------
Games :  100 	Wins :  52 	Draws :  6 	Losses :  42
--------------Episode : 500 --------------
Games :  100 	Wins :  54 	Draws :  13 	Losses :  33
--------------Episode : 1000 --------------
Games :  100 	Wins :  61 	Draws :  14 	Losses :  25
--------------Episode : 1500 --------------
Games :  100 	Wins :  59 	Draws :  6 	Losses :  35
--------------Episode : 2000 --------------
Games :  100 	Wins :  56 	Draws :  16 	Losses :  28
--------------Episode : 2500 --------------
Games :  100 	Wins :  58 	Draws :  15 	Losses :  27
--------------Episode : 3000 --------------
Games :  100 	Wins :  71 	Draws :  10 	Losses :  19
--------------Episode : 3500 --------------
Games :  100 	Wins :  69 	Draws :  10 	Losses :  21
--------------Episode : 4000 --------------
Games :  100 	Wins :  72 	Draws :  12 	Losses :  16
--------------Episode : 4500 --------------
Games :  100 	Wins :  75 	Draws :  9 	Losses :  16
--------------Episode : 5000 --------------
Gam

In [23]:
test(game_random,game_random.qtable,100,[0,1],flag=1)

Stats :
			 Games Played 	 Games Won 	 Games drawn 	 Games Lost
Against Random Agent	 48 		 36 		 3 		 9
Against Safe Agent	 52 		 7 		 24 		 21
Total			 100 		 43 		 27 		 30


# Trained against safe agent

In [24]:
game_safe = TicTacToe()
train(game_safe,0.7,0.7,[1])

--------------Episode : 0 --------------
Games :  100 	Wins :  0 	Draws :  18 	Losses :  82
--------------Episode : 500 --------------
Games :  100 	Wins :  5 	Draws :  21 	Losses :  74
--------------Episode : 1000 --------------
Games :  100 	Wins :  14 	Draws :  37 	Losses :  49
--------------Episode : 1500 --------------
Games :  100 	Wins :  12 	Draws :  58 	Losses :  30
--------------Episode : 2000 --------------
Games :  100 	Wins :  9 	Draws :  62 	Losses :  29
--------------Episode : 2500 --------------
Games :  100 	Wins :  27 	Draws :  49 	Losses :  24
--------------Episode : 3000 --------------
Games :  100 	Wins :  32 	Draws :  52 	Losses :  16
--------------Episode : 3500 --------------
Games :  100 	Wins :  22 	Draws :  58 	Losses :  20
--------------Episode : 4000 --------------
Games :  100 	Wins :  25 	Draws :  58 	Losses :  17
--------------Episode : 4500 --------------
Games :  100 	Wins :  40 	Draws :  53 	Losses :  7
--------------Episode : 5000 --------------
Game

In [27]:
test(game_safe,game_safe.qtable,100,[0,1],flag=1)

Stats :
			 Games Played 	 Games Won 	 Games drawn 	 Games Lost
Against Random Agent	 56 		 42 		 8 		 6
Against Safe Agent	 44 		 15 		 27 		 2
Total			 100 		 57 		 35 		 8


# Trained against both agents (chosen randomly at every epoch)

In [28]:
game_both = TicTacToe()
train(game_both,0.7,0.7,[0,1])

--------------Episode : 0 --------------
Games :  100 	Wins :  26 	Draws :  14 	Losses :  60
--------------Episode : 500 --------------
Games :  100 	Wins :  28 	Draws :  25 	Losses :  47
--------------Episode : 1000 --------------
Games :  100 	Wins :  36 	Draws :  26 	Losses :  38
--------------Episode : 1500 --------------
Games :  100 	Wins :  27 	Draws :  29 	Losses :  44
--------------Episode : 2000 --------------
Games :  100 	Wins :  32 	Draws :  17 	Losses :  51
--------------Episode : 2500 --------------
Games :  100 	Wins :  33 	Draws :  21 	Losses :  46
--------------Episode : 3000 --------------
Games :  100 	Wins :  34 	Draws :  23 	Losses :  43
--------------Episode : 3500 --------------
Games :  100 	Wins :  45 	Draws :  25 	Losses :  30
--------------Episode : 4000 --------------
Games :  100 	Wins :  36 	Draws :  29 	Losses :  35
--------------Episode : 4500 --------------
Games :  100 	Wins :  49 	Draws :  22 	Losses :  29
--------------Episode : 5000 --------------


In [29]:
test(game_both,game_both.qtable,100,[0,1],flag=1)

Stats :
			 Games Played 	 Games Won 	 Games drawn 	 Games Lost
Against Random Agent	 46 		 36 		 8 		 2
Against Safe Agent	 54 		 23 		 29 		 2
Total			 100 		 59 		 37 		 4
