In [1]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:

# https://deeplearningcourses.com/c/artificial-intelligence-reinforcement-learning-in-python
# Simple reinforcement learning algorithm for learning tic-tac-toe
# Use the update rule: V(s) = V(s) + alpha*(V(s') - V(s))
# Use the epsilon-greedy policy:
#   action|s = argmax[over all actions possible from state s]{ V(s) }  if rand > epsilon
#   action|s = select random action from possible actions from state s if rand < epsilon
#
#
# INTERESTING THINGS TO TRY:
#
# Currently, both agents use the same learning strategy while they play against each other.
# What if they have different learning rates?
# What if they have different epsilons? (probability of exploring)
#   Who will converge faster?
# What if one agent doesn't learn at all?
#   Poses an interesting philosophical question: If there's no one around to challenge you,
#   can you reach your maximum potential?
# What happens if you change the learning rate, i.e. alpha?
# Does changing the epsilon value make the agent more intelligent?
#   How about decaying epsilon?
# How many times should two AI agents play against each other?
from __future__ import print_function, division
from builtins import range, input

# Side length of the tic-tac-toe board: the Environment allocates a
# LENGTH x LENGTH grid and all board loops iterate over range(LENGTH).
LENGTH = 3

## ENVIRONMENT

In [3]:
# this class represents a tic tac toe game
class Environment:
    """Tic-tac-toe board plus the rules for detecting the end of a game.

    Each cell of ``board`` holds 0 (empty), ``x`` (1, player 1) or
    ``o`` (-1, player 2).
    """

    def __init__(self):
        self.x = 1   # marker written to the board for player 1
        self.o = -1  # marker written to the board for player 2
        self.board = np.zeros((LENGTH, LENGTH))
        self.winner = None
        self.ended = False
        # each cell takes one of 3 values, so there are 3^(cells) encodings
        self.num_states = 3**(LENGTH*LENGTH)

    def game_over(self):
        """Return True if a player owns a full line or the board is full.

        ``self.winner`` is refreshed on every call (winning symbol, or None
        for a draw / unfinished game) and ``self.ended`` is set to True as
        soon as the game is found to be over.
        """
        # sums of every row, every column and both diagonals, in that order
        line_sums = [self.board[r].sum() for r in range(LENGTH)]
        line_sums += [self.board[:, c].sum() for c in range(LENGTH)]
        line_sums.append(np.trace(self.board))
        line_sums.append(np.trace(np.flip(self.board, axis=0)))

        for player in (self.x, self.o):
            # a line sums to player*LENGTH only when every cell is `player`
            target = player*LENGTH
            if any(total == target for total in line_sums):
                self.winner = player
                self.ended = True
                return True

        # nobody completed a line: either a draw or the game continues
        self.winner = None
        if not np.any(self.board == 0):
            # no empty cell left -> draw
            self.ended = True
            return True
        return False

    def is_empty(self, i, j):
        """Tell whether the cell at row i, column j is still unoccupied."""
        return self.board[i,j]==0

    def reward(self, sym):
        """Return 1 when the game is over and `sym` won it, otherwise 0.

        `sym` is expected to be ``self.x`` or ``self.o``.
        """
        if self.game_over() and self.winner==sym:
            return 1
        return 0

    def get_state(self):
        """Encode the board as an integer by reading it as a base-3 number.

        Cells are visited row by row; the k-th visited cell contributes
        digit * 3**k with digit 0 (empty), 1 (x) or 2 (o).
        """
        state = 0
        for k in range(LENGTH*LENGTH):
            i, j = divmod(k, LENGTH)
            cell = self.board[i,j]
            if cell == 0:
                digit = 0
            elif cell == self.x:
                digit = 1
            elif cell == self.o:
                digit = 2
            state += digit*(3**k)
        return state

    # Example board
    # -------------
    # | x |   |   |
    # -------------
    # |   |   |   |
    # -------------
    # |   |   | o |
    # -------------

    def draw_board(self):
        """Print the board as an ASCII grid, one text line per board row."""
        divider = '-------------'
        for i in range(LENGTH):
            print(divider)
            row_text = ""
            for j in range(LENGTH):
                cell = self.board[i,j]
                if cell == self.x:
                    row_text += "| x "
                elif cell == self.o:
                    row_text += "| o "
                else:
                    row_text += "|   "
            print(row_text + "|")
        print(divider)
        

## AGENT

In [4]:
class Agent:
    """Epsilon-greedy tic-tac-toe player that learns a table of state values.

    The value table ``V`` (indexable by the integer returned by
    ``env.get_state()``) is supplied through ``set_V`` and refined by
    ``update`` once per finished game.
    """

    def __init__(self, eps=0.1, alpha=0.5, verbose=False):
        self.eps = eps  # probability of choosing a random action instead of greedy
        self.alpha = alpha  # learning rate used in update()
        self.verbose = verbose  # when True, print which kind of move is taken
        self.state_history = []  # states seen during the current game

    def set_symbol(self, sym):
        """Store the board marker (env.x or env.o) this agent plays with."""
        self.sym = sym

    def set_V(self, V):
        """Install the state-value table; it is updated in place by update()."""
        self.V = V

    def reset_history(self):
        """Forget the states recorded for the current game."""
        self.state_history = []

    def update_state_history(self, s):
        """Record state `s`.

        This is not done inside take_action because an agent only moves on
        every other turn, while the history must grow on every turn.
        """
        self.state_history.append(s)

    @staticmethod
    def _empty_cells(env):
        """Return the (row, col) pairs of all empty cells in row-major order."""
        rows, cols = env.board.shape
        return [(i, j)
                for i in range(rows)
                for j in range(cols)
                if env.is_empty(i, j)]

    def take_action(self, env):
        """Place this agent's symbol on `env.board` using epsilon-greedy choice.

        With probability ``eps`` a uniformly random empty cell is chosen;
        otherwise the empty cell whose resulting state has the highest value
        in ``V`` is chosen (the first such cell in row-major order on ties).

        Raises:
            ValueError: if the board has no empty cell.
        """
        possible_moves = self._empty_cells(env)
        if not possible_moves:
            raise ValueError('no empty cell left on the board')

        r = np.random.rand()
        if r < self.eps:
            # take a random action
            if self.verbose:
                print('Taking a random action')
            idx = np.random.choice(len(possible_moves))
            next_move = possible_moves[idx]
        else:
            # take the best action based on current state values
            pos_value = {}
            next_move = None
            # -inf (plus the None guard) guarantees a move is always selected,
            # even when every candidate state is valued at -1 or lower
            best_value = float('-inf')
            for (i, j) in possible_moves:
                # temporarily play the move to look up the resulting state
                env.board[i,j] = self.sym
                state = env.get_state()
                env.board[i,j] = 0
                value = self.V[state]
                pos_value[(i,j)] = value
                if next_move is None or value > best_value:
                    best_value = value
                    next_move = (i,j)

            # show the value of every empty cell next to the occupied ones
            if self.verbose:
                print('Taking a greedy action')
                rows, cols = env.board.shape
                for i in range(rows):
                    print('-------------------')
                    for j in range(cols):
                        if env.is_empty(i,j):
                            # print the V value
                            print('| %.2f'%pos_value[(i,j)], end="")
                        elif env.board[i,j] == env.x:
                            print("|  x  ", end="")
                        elif env.board[i,j] == env.o:
                            print("|  o  ", end="")
                    print('|')
                print('-------------------')

        # make the move
        env.board[next_move[0], next_move[1]] = self.sym

    def update(self, env):
        """Back up values through the states of the game that just finished.

        Walking the history from the newest state to the oldest:
            V(prev) = V(prev) + alpha*(target - V(prev))
        where the target for the newest state is ``env.reward(self.sym)`` and
        for every earlier state the freshly updated value of its successor.
        Must be called only at the end of an episode; the history is cleared
        afterwards.
        """
        V_next = env.reward(self.sym)
        for prev in reversed(self.state_history):
            self.V[prev] = self.V[prev] + self.alpha*(V_next - self.V[prev])
            V_next = self.V[prev]

        # clear the game history
        self.reset_history()
        
            

## HUMAN

In [5]:
class Human:
    """Interactive player that reads its moves from standard input.

    It exposes the same methods play_game calls on an Agent; the learning
    hooks (update, update_state_history) are deliberate no-ops.
    """

    def __init__(self):
        pass

    def set_symbol(self, sym):
        """Store the board marker (env.x or env.o) the human plays with."""
        self.sym = sym

    def update(self, env):
        """No-op: a human player has no value table to update."""
        pass

    def update_state_history(self, s):
        """No-op: a human player keeps no state history."""
        pass

    def take_action(self, env):
        """Prompt until a legal move is entered, then place it on env.board.

        Input that is not exactly two comma-separated integers, that lies
        outside the board, or that targets an occupied cell is rejected with
        a message and the prompt is shown again instead of raising.
        """
        rows, cols = env.board.shape
        while True:
            move = input('Enter coordinates i,j for your next move (i,j=0...2): ')
            try:
                # unpacking also fails (ValueError) on too few/many parts
                i, j = (int(part) for part in move.split(','))
            except ValueError:
                print('Please enter two integers separated by a comma, e.g. 0,2')
                continue
            # reject negatives explicitly: numpy would wrap them around
            if not (0 <= i < rows and 0 <= j < cols):
                print('Coordinates are outside the board, try again')
                continue
            if env.is_empty(i,j):
                env.board[(i,j)] = self.sym
                break
            print('That cell is already taken, try again')

## V VALUE

In [6]:
def initial_V_x(env, state_winner_triples):
    """Build the starting value table for the player using ``env.x``.

    `state_winner_triples` yields (state, winner, ended) tuples. Values:
        finished game won by x  ->  1
        finished game won by o  -> -1
        finished game, no winner (draw) -> 0.5
        unfinished game         ->  0
    States not listed keep the default 0. Returns a float array of length
    ``env.num_states`` indexed by state.
    """
    V = np.zeros(env.num_states)
    for state, winner, ended in state_winner_triples:
        if not ended:
            V[state] = 0
        elif winner == env.x:
            V[state] = 1
        elif winner == env.o:
            V[state] = -1
        else:
            V[state] = 0.5
    return V


def initial_V_o(env, state_winner_triples):
    """Build the starting value table for the player using ``env.o``.

    `state_winner_triples` yields (state, winner, ended) tuples. Values:
        finished game won by o  ->  1
        finished game won by x  -> -1
        finished game, no winner (draw) -> 0.5
        unfinished game         ->  0
    States not listed keep the default 0. Returns a float array of length
    ``env.num_states`` indexed by state.
    """
    V = np.zeros(env.num_states)
    for state, winner, ended in state_winner_triples:
        if not ended:
            V[state] = 0
        elif winner == env.o:
            V[state] = 1
        elif winner == env.x:
            V[state] = -1
        else:
            V[state] = 0.5
    return V
    

In [7]:
# get all the states of the game and winner for the state if game is ended
def get_state_hash_and_winner(env, i=0, j=0):
    """Enumerate every way of filling the board from cell (i, j) onwards.

    Cells are filled recursively in row-major order with each of
    0, env.x and env.o. For every completed assignment one tuple
    (state, winner, ended) is collected, where state is env.get_state(),
    ended is the result of env.game_over() and winner is env.winner
    read right after that call.

    The board size is taken from ``env.board.shape`` rather than being
    hard-coded, so the enumeration works for any board dimensions.

    Note: `env.board` is mutated and not restored; on return every cell
    from (i, j) onwards holds env.o, the last value tried.

    Returns:
        list of (state, winner, ended) tuples, 3**(number of cells filled).
    """
    last_row = env.board.shape[0] - 1
    last_col = env.board.shape[1] - 1

    results = []
    for v in [0, env.x, env.o]:
        env.board[i,j] = v
        if j == last_col:
            # end of the row: either the board is complete or move to next row
            if i == last_row:
                # the board is full, collect results and return
                state = env.get_state()
                ended = env.game_over()
                winner = env.winner
                results.append((state, winner, ended))
            else:
                results += get_state_hash_and_winner(env, i+1, 0)
        else:
            # increment j. i stays the same
            results += get_state_hash_and_winner(env, i, j+1)
    return results

## PLAY GAME DEFINITION

In [8]:
def play_game(p1, p2, env, verbose=False):
    """Play one complete game on `env` and let both players learn from it.

    p1 always moves first and the players then alternate until
    env.game_over() reports the end. After every move the resulting state is
    appended to the history of BOTH players. When `verbose` is set the board
    is drawn before each move and once more after the game. Finally
    update(env) is called on p1 and then on p2.
    """
    mover = None  # nobody has moved yet, so the first switch selects p1
    while not env.game_over():
        # hand the turn to the other player
        mover = p2 if mover == p1 else p1

        # show the position to whoever is about to move
        if verbose:
            env.draw_board()

        mover.take_action(env)

        # both players record the state reached after this move
        board_state = env.get_state()
        for player in (p1, p2):
            player.update_state_history(board_state)

    if verbose:
        env.draw_board()

    # end of episode: value function updates
    for player in (p1, p2):
        player.update(env)

## TRAIN AI

In [None]:
# training configuration
N_EPISODES = 15000         # number of self-play games
P2_INITIAL_EPS = 1         # p2 starts out fully random
EPS_DECAY_EPISODES = 2000  # after this many games p2's epsilon has halved

# initialize player 1 and player 2
p1 = Agent(eps=0.1, alpha=0.5, verbose=False)
p2 = Agent(eps=P2_INITIAL_EPS, alpha=0.5, verbose=False)

# create tic tac toe environment
# (only used for its symbols and to enumerate states; every game below is
# played on a fresh Environment)
env = Environment()

# give each player their symbol
p1.set_symbol(env.x)
p2.set_symbol(env.o)

# get the initial V values
state_winner_triples = get_state_hash_and_winner(env)
Vx = initial_V_x(env, state_winner_triples)
Vo = initial_V_o(env, state_winner_triples)

# set the initial V values for player 1 and player 2 if both are AI
p1.set_V(Vx)
p2.set_V(Vo)

# train p1 as player 1
for i in tqdm(range(N_EPISODES)):
    # Decay epsilon from its INITIAL value: eps_i = eps_0 / (1 + i/decay).
    # Dividing the current value instead would compound the divisors and
    # drive epsilon to ~0 within the first few hundred games.
    p2.eps = P2_INITIAL_EPS/(i/EPS_DECAY_EPISODES+1)
    play_game(p1, p2, Environment(), verbose=False)

 71%|████████████████████████████████████████████████████▉                      | 10585/15000 [00:26<00:10, 419.26it/s]

## HUMAN VS AI

In [252]:
# The human plays x (moves first) against the trained agent p2, which plays o.
# Relies on `env`, `p1` and `p2` created by the training cell above.
human = Human()

# give each player their symbol
human.set_symbol(env.x)
p1.verbose=True
p2.verbose=True
# no exploration when playing against a human: p2 always moves greedily
p2.eps=0
while True:
    play_game(human, p2, Environment(), True)
    # accept 'Y', 'y' and surrounding whitespace; anything else stops
    answer = input('play again ? Y/N')
    if answer.strip().upper() != 'Y':
        break


-------------
|   |   |   |
-------------
|   |   |   |
-------------
|   |   |   |
-------------
Enter coordinates i,j for your next move (i,j=0...2): 1,1
-------------
|   |   |   |
-------------
|   | x |   |
-------------
|   |   |   |
-------------
Taking a greedy action
-------------------
| -0.15| -0.11| -0.11|
-------------------
| -0.16|  x  | 0.00|
-------------------
| -0.13| -0.10| -0.17|
-------------------
-------------
|   |   |   |
-------------
|   | x | o |
-------------
|   |   |   |
-------------
Enter coordinates i,j for your next move (i,j=0...2): 0,0
-------------
| x |   |   |
-------------
|   | x | o |
-------------
|   |   |   |
-------------
Taking a greedy action
-------------------
|  x  | -0.13| 0.13|
-------------------
| -0.12|  x  |  o  |
-------------------
| -0.12| -0.16| -0.16|
-------------------
-------------
| x |   | o |
-------------
|   | x | o |
-------------
|   |   |   |
-------------
Enter coordinates i,j for your next move (i,j=0...2): 


KeyboardInterrupt: 

19683