# Part IIB: Solving Tic-Tac-Toe using $\varepsilon$-soft On-Policy Techniques

- Here, you will implement an On-Policy algorithm using $\varepsilon$-soft policies in order to make a Tic-Tac-Toe engine capable of playing Tic-Tac-Toe on an $N \times N$ board.

- You can read about the algorithm to be used in [my notes](../report.pdf) or in [Sutton and Barto](../SuttonBarto.pdf).

- The Tic-Tac-Toe engine must simulate episodes beforehand and use the knowledge it gained to play against the human player.

- Since Tic-Tac-Toe is a two-player game, the opponent must be simulated as part of the environment to convert this into an MDP. This can be done in two ways: either you can make the opponent another instance of the engine, or the opponent can play randomly. You will implement both these techniques.

In [None]:
'''
For playing TicTacToe run all the cells and wait till the model is trained
You will be asked to enter the number of the cell where you want to make your move. For example, in a 3x3 TicTacToe the cells are numbered:
1 | 2 | 3
4 | 5 | 6
7 | 8 | 9
The model trains for a 3x3 TicTacToe by default. You can modify the values (N, number of iterations, etc.) at your convenience, but training the model for a bigger N will take a lot of time.
'''

In [8]:
import numpy as np

In [15]:
def find_2D_array_index(arr_3d, target_arr) -> int:
    """Return the position of the first 2-D slice of ``arr_3d`` equal to ``target_arr``.

    Slices are compared element-wise along the first axis of ``arr_3d``.

    Returns:
        The index of the first matching slice as a plain ``int``, or ``None``
        when no slice matches (despite the ``int`` annotation).
    """
    # One boolean per slice: True where every element of the slice matches.
    slice_matches = np.all(arr_3d == target_arr, axis=(1, 2))
    if not slice_matches.any():
        return None
    # argmax on a boolean array yields the position of the first True.
    return int(np.argmax(slice_matches))

In [10]:
class TicTacToe:
    """Rules of Tic-Tac-Toe on an N x N NumPy board.

    A cell holds 1 (printed as "X"), -1 (printed as "O") or 0 (empty).
    Cells are addressed by a flat, row-major index ``N * row + col``.
    """

    def __init__(self, N=3):
        """Store the board side length ``N``."""
        self.N = N

    def win_check(self, board) -> "int | None":
        """Classify ``board`` as won, drawn or still in progress.

        Returns:
            1 if any row, column or diagonal sums to ``N``, -1 if any sums to
            ``-N``, 0 if the board is full without such a line, and ``None``
            if the game is not over yet.
        """
        column_sums = np.sum(board, axis=0)
        row_sums = np.sum(board, axis=1)
        diagonal_sums = np.array([
            sum(board[i][i] for i in range(self.N)),
            sum(board[i][self.N - 1 - i] for i in range(self.N)),
        ])
        line_sums = (column_sums, row_sums, diagonal_sums)

        if any(np.any(sums == self.N) for sums in line_sums):
            return 1
        if any(np.any(sums == -self.N) for sums in line_sums):
            return -1
        if np.all(board != 0):
            return 0
        return None

    def valid_moves(self, board):
        """Return the flat indices of the empty cells as a float array.

        The array is empty when the game is already over. The float dtype is
        kept so callers see the same type as before.
        """
        if self.win_check(board) is not None:
            return np.array([])
        # Row-major flat positions of zeros are exactly N * row + col.
        return np.flatnonzero(np.asarray(board) == 0).astype(float)

    def make_move(self, board, move: float):
        """Return a copy of ``board`` with the mover's mark placed at ``move``.

        The mover is inferred from the board: 1 when the cells sum to zero,
        otherwise -1. ``board`` itself is not modified.

        Raises:
            ValueError: if ``move`` is outside ``0 .. N*N - 1`` or the target
                cell is already occupied.
        """
        move = int(move)
        # Reject out-of-range indices explicitly; a negative index would
        # otherwise wrap around silently and mark the wrong cell.
        if not 0 <= move < self.N ** 2:
            raise ValueError("Invalid move: Cell out of range.")
        row, col = divmod(move, self.N)
        new_board = np.copy(board)
        if new_board[row][col] != 0:
            raise ValueError("Invalid move: Already occupied.")
        new_board[row][col] = 1 if np.sum(board) == 0 else -1
        return new_board

    def game_print_board(self, board):
        """Print ``board`` row by row, each row followed by a dashed separator."""
        # 2 * N dashes reproduces the original "------" for the default N = 3.
        separator = "-" * (2 * self.N)
        for row in board:
            line = "|"
            for cell in row:
                if cell == 1:
                    line += "X|"
                elif cell == -1:
                    line += "O|"
                else:
                    line += " |"
            print(line)
            print(separator)

In [11]:
class Initializer(TicTacToe):
    """Enumerate the reachable boards and allocate the per-state tables.

    After construction the following containers are index-aligned, one entry
    per discovered board:

    * ``states``  - array of boards, shape ``(num_states, N, N)``
    * ``returns`` - for each state, ``N**2`` lists of recorded returns
    * ``Q``       - array of action values, shape ``(num_states, N**2)``
    * ``policy``  - array of action probabilities, same shape as ``Q``
    """

    def __init__(self, N=3):
        """Create the tables for the empty board, then discover all other states."""
        super().__init__(N)
        self.states = np.zeros((1, self.N, self.N))
        self.returns = [[[] for _ in range(self.N**2)]]
        self.Q = np.zeros((1, self.N**2))
        # Every cell is playable on the empty board, so any distribution works.
        self.policy = np.random.dirichlet(np.ones(self.N**2)).reshape(1, -1)
        self.recursive(np.zeros((self.N, self.N)), 1)

    def recursive(self, board, current_player):
        """Depth-first registration of every board reachable from ``board``.

        ``current_player`` is only passed down (negated) to the recursive
        call; the mover is actually derived by ``make_move`` from the board.
        """
        for move in self.valid_moves(board):
            new_board = self.make_move(board, move)
            if find_2D_array_index(self.states, new_board) is not None:
                # Already registered through another move order.
                continue

            self.states = np.append(self.states, new_board.reshape(1, self.N, self.N), axis=0)
            self.returns.append([[] for _ in range(self.N**2)])
            win = self.win_check(new_board)
            if win is None:
                self.Q = np.append(self.Q, np.zeros((1, self.N**2)), axis=0)

                # Random initial policy restricted to the empty cells.
                new_valid_moves = self.valid_moves(new_board)
                random_array = np.random.rand(self.N**2)
                mask = np.isin(np.arange(self.N**2), new_valid_moves, invert=True)
                random_array[mask] = 0
                random_array = random_array / np.sum(random_array)

                self.policy = np.append(self.policy, random_array.reshape(1, -1), axis=0)
                self.recursive(new_board, -current_player)
            else:
                # Terminal board: every Q entry is set to the game outcome.
                self.Q = np.append(self.Q, win * np.ones((1, self.N**2)), axis=0)
                # No move is possible here, so store an all-zero row instead of
                # uninitialised memory (np.empty), which showed up as garbage
                # whenever the policy table was printed.
                self.policy = np.append(self.policy, np.zeros((1, self.N**2)), axis=0)

    def return_values(self):
        """Return the ``(states, returns, Q, policy)`` tables."""
        return self.states, self.returns, self.Q, self.policy

    def print_stats(self):
        """Print the state count and the table entries of two sample states."""
        print("Total Number of states : ", len(self.states))
        print("First state : \n", self.states[0])
        print("Policy for first state : \n", self.policy[0])
        print("Q for first state : \n", self.Q[0])
        print("Return for first state so far : \n", self.returns[0])

        # The second sample is state 1000 when it exists; clamp the index so
        # small boards with fewer states do not raise IndexError.
        sample = min(1000, len(self.states) - 1)
        print("\nSecond state : \n", self.states[sample])
        print("Policy for second state : \n", self.policy[sample])
        print("Q for second state : \n", self.Q[sample])
        print("Return for second state so far : \n", self.returns[sample])

In [12]:
class RLAgent(Initializer):
    """Self-play learner that keeps an epsilon-soft policy over the state tables.

    Each simulated episode is played with both sides sampling from the shared
    ``policy`` table; afterwards the returns, ``Q`` and ``policy`` entries of
    the states visited by the learning side are updated.
    """

    def __init__(self, N=3, gamma=0.75, epsilon=0.8):
        """Build the state tables and store the hyper-parameters.

        Args:
            N: board side length.
            gamma: factor applied to a later return when it is propagated to
                an earlier move of the same episode.
            epsilon: probability mass spread uniformly over the valid moves.
        """
        super().__init__(N)
        self.gamma = gamma
        self.epsilon = epsilon
        self.episode_states = np.array([])
        self.episode_actions = np.array([])
        self.opponent_actions = np.array([])

    def train(self, iterations=200000):
        """Simulate ``iterations`` episodes, alternating the learning side.

        Progress is printed roughly every tenth of the run.
        """
        # max(1, ...) avoids a modulo-by-zero when iterations < 10.
        report_every = max(1, iterations // 10)
        sim_player = 1
        for episode in range(1, iterations + 1):
            self.simulate_episode(sim_player)
            sim_player = -sim_player
            if episode % report_every == 0:
                print(f"{episode}/{iterations} done !")

    def simulate_episode(self, sim_player):
        """Play one game from the empty board and update the policy.

        Moves of ``sim_player`` are recorded as (state index, action) pairs;
        moves of the other side are recorded in ``opponent_actions`` only.
        """
        self.episode_states = np.array([])
        self.episode_actions = np.array([])
        self.opponent_actions = np.array([])
        player = 1
        board = np.zeros((self.N, self.N))
        while self.win_check(board) is None:
            index = int(find_2D_array_index(self.states, board))
            # Both sides draw their move from the same policy table.
            mover = np.random.choice(np.arange(self.N**2), p=self.policy[index])
            if player == sim_player:
                self.episode_states = np.append(self.episode_states, np.array([index]), axis=0)
                self.episode_actions = np.append(self.episode_actions, np.array([mover]), axis=0)
            else:
                self.opponent_actions = np.append(self.opponent_actions, np.array([mover]), axis=0)

            board = self.make_move(board, mover)
            player = -player
        self.update_policy(sim_player)

    def update_policy(self, sim_player: int):
        """Update returns, Q and policy for the moves of the last episode.

        The recorded moves are processed from last to first. A move whose
        follow-up position is terminal gets return +1 / -1 / 0 (win / loss /
        draw from ``sim_player``'s point of view); any other move gets
        ``gamma`` times the return just recorded for the following move.
        """
        # Walk the episode backwards so later returns exist before earlier ones.
        self.episode_states = self.episode_states[::-1]
        self.episode_actions = self.episode_actions[::-1]
        for k in range(len(self.episode_states)):
            index = int(self.episode_states[k])
            board = self.states[index]
            move = int(self.episode_actions[k])
            resulting_board = self.make_move(board, move)
            next_valid_moves = self.valid_moves(resulting_board)
            # If the opponent's final move is still playable on the resulting
            # board, the outcome is judged after applying it; otherwise the
            # resulting board itself is judged.
            # NOTE(review): the opponent's *last* action is used for every k,
            # not only for the final move - confirm this is intended.
            if self.opponent_actions[-1] in next_valid_moves:
                win = self.win_check(self.make_move(resulting_board, self.opponent_actions[-1]))
            else:
                win = self.win_check(resulting_board)
            if win == sim_player:
                self.returns[index][move].append(1)
            elif win == (-sim_player):
                self.returns[index][move].append(-1)
            elif win == 0:
                self.returns[index][move].append(0)
            else:
                # Non-terminal: discount the return recorded for the move that
                # followed this one (position k-1 in the reversed episode).
                resulting_index = int(self.episode_states[k-1])
                previous_move = int(self.episode_actions[k-1])
                self.returns[index][move].append(self.returns[resulting_index][previous_move][-1]*self.gamma)
            # Q is the plain average of all returns recorded for (state, move).
            self.Q[index][move] = sum(self.returns[index][move])/len(self.returns[index][move])
            # Only moves with positive probability (the valid ones) compete.
            positive_indices = np.where(self.policy[index] > 0)[0]
            max_index = positive_indices[np.argmax(self.Q[index][positive_indices])]

            # sign() turns the row into a 0/1 mask of valid moves; its sum is
            # the number of valid moves.
            self.policy[index] = np.sign(self.policy[index])
            non_zero_indices = np.nonzero(self.policy[index])[0]
            non_max_indices = non_zero_indices[non_zero_indices != max_index]
            value_buf = np.sum(self.policy[index])
            self.policy[index][max_index] = 1 - self.epsilon + self.epsilon / value_buf
            self.policy[index][non_max_indices] = self.epsilon / value_buf

    def optimal_policy(self, board):
        """Return the flat index of the most probable move for ``board``.

        Raises:
            ValueError: if ``board`` is not one of the enumerated states.
        """
        index = find_2D_array_index(self.states, board)
        if index is None:
            # Indexing the table with None would silently return a meaningless move.
            raise ValueError("Unknown board: not a reachable state.")
        return np.argmax(self.policy[index])

    def show_policy(self):
        """Print up to the last 100 states together with their policy rows."""
        for i in range(min(100, len(self.states))):
            self.game_print_board(self.states[-i-1])
            print(self.policy[-i-1], "\n")

In [13]:
class TicTacToeEngine:
    """Interactive console game: a human (X, moves first) against a trained RLAgent."""

    def __init__(self, N=3, player=1):
        """Store the settings and immediately run training plus the game loop.

        Args:
            N: board side length.
            player: stored and forwarded to ``game``, which does not currently
                read it.
        """
        self.N = N
        self.player = player
        self.game(self.N, self.player)

    def _read_human_move(self, board, N):
        """Prompt until the user enters the 1-based number of an empty cell.

        Returns the corresponding 0-based flat index.
        """
        while True:
            raw = input("Enter cell number : ")
            try:
                move = int(raw) - 1
            except ValueError:
                print("Invalid Input!!")
                continue
            # Range check first: a negative index would wrap to another cell.
            if not 0 <= move < N * N:
                print("Invalid Input!!")
                continue
            if board[move // N][move % N] != 0:
                print("Invalid Input!!")
                continue
            return move

    def game(self, N=3, player=1):
        """Train a fresh agent, then play games until the user declines a rematch."""
        Agent = RLAgent(N)
        Agent.train()
        a = "Y"
        while True:
            # Accept the answer regardless of case or surrounding whitespace.
            choice = a.strip().lower()
            if choice == "y":
                board = np.zeros((N, N))
                Agent.game_print_board(board)
                while Agent.win_check(board) is None:
                    print("Your move -")
                    move = self._read_human_move(board, N)
                    board = Agent.make_move(board, move)
                    Agent.game_print_board(board)
                    if Agent.win_check(board) is not None:
                        break
                    print("Computer's move - ")
                    board = Agent.make_move(board, Agent.optimal_policy(board))
                    Agent.game_print_board(board)
                win = Agent.win_check(board)
                if win == 1:
                    print("Congrats!! You won :)")
                elif win == -1:
                    print("Sorry you lost against the might of my model")
                else:
                    print("OHHH... It's a DRAW!!")
            elif choice == "n":
                break
            else:
                print("Invalid Input!!")
            a = input("Do you want to play again? (Y/n)")

In [14]:
# Constructing the engine trains the agent and then starts the interactive
# 3x3 game loop (the constructor calls game()), so this line blocks on input.
Engine = TicTacToeEngine(3)

20000/200000 done !
40000/200000 done !
60000/200000 done !
80000/200000 done !
100000/200000 done !
120000/200000 done !
140000/200000 done !
160000/200000 done !
180000/200000 done !
200000/200000 done !
| | | |
------
| | | |
------
| | | |
------
Your move -
| | | |
------
| |X| |
------
| | | |
------
Computer's move - 
| | | |
------
| |X| |
------
| | |O|
------
Your move -
|X| | |
------
| |X| |
------
| | |O|
------
Computer's move - 
|X| |O|
------
| |X| |
------
| | |O|
------
Your move -
|X| |O|
------
| |X|X|
------
| | |O|
------
Computer's move - 
|X| |O|
------
|O|X|X|
------
| | |O|
------
Your move -
|X|X|O|
------
|O|X|X|
------
| | |O|
------
Computer's move - 
|X|X|O|
------
|O|X|X|
------
| |O|O|
------
Your move -
|X|X|O|
------
|O|X|X|
------
|X|O|O|
------
OHHH... It's a DRAW!!
