# Part IIB: Solving Tic-Tac-Toe using $\varepsilon$-soft On-Policy Techniques

- Here, you will implement an On-Policy algorithm using $\varepsilon$-soft policies in order to make a Tic-Tac-Toe engine capable of playing Tic-Tac-Toe on an $N \times N$ board.

- You can read about the algorithm to be used in [my notes](../report.pdf) or in [Sutton and Barto](../SuttonBarto.pdf).

- The Tic-Tac-Toe engine must simulate episodes before hand and used the knowledge it gained to play against the human player.

- Since Tic-Tac-Toe is a two player game, the opponent must be simulated as part of the environment to convert this into an MDP. This can be done in two ways, either you can make the opponent another instance of the engine, or the opponent can play randomly. You will implement both these techniques.

# Opponent playing randomly 


In [3]:
import random
import numpy as np
import copy
from itertools import chain

In [54]:
class State:
    def __init__(self,board,player) -> None:
        self.board=board
        self.actions=self.moves_left()
        self.Q=np.zeros(len(self.moves_left()))
        self.move_played=None                         # To keep track of move played at this state for Q value updation
        self.player=player
        self.visits=0


    def moves_left(self):
        return [(i, j) for i in range(len(self.board)) for j in range(len(self.board)) if self.board[i][j] == 0]
    
    def find_best_move(self):
        return self.moves_left()[np.argmax(self.Q)]

class TicTacToeEngine(State):
    '''
    Implement the Engine here, add whatever helper classes you may need.
    '''

    

    def __init__(self,epsilon,n,start=True,randomOpp=True) -> None:   
        # Engine is second player as default 
        # You can play with O's(Second player) by start=False parameter
        # Engine Trains using an opponent that plays random moves
        # You can change the training opponent of the engine using randomOpp=False
        self.n=n
        self.epsilon=epsilon
        self.states={}
        self.randomOpp=randomOpp
        self.start=start

    def print_board(self):
        output = ''
        for i in range(self.n):
            for j in range(self.n):
                if self.board[i][j] == 1:
                    output += ' X  '
                elif self.board[i][j] == -1:
                    output += ' O  '
                else:
                    output += ' -  '
            output += '\n'
        print(output)

    def won(self):
        for i in range(len(self.board)):
            for j in range(len(self.board)):
                if self.board[i][0] == 0 or self.board[i][j] != self.board[i][0]:
                    break
            else:
                return self.board[i][0]
            for j in range(len(self.board)):
                if self.board[0][i] == 0 or self.board[j][i] != self.board[0][i]:
                    break
            else:
                return self.board[0][i]
        for i in range(len(self.board)):
            if self.board[0][0] == 0 or self.board[i][i] != self.board[0][0]:
                break
        else:
            return self.board[0][0]
        for i in range(len(self.board)):
            if self.board[0][len(self.board) - 1] == 0 or self.board[i][len(self.board) - i - 1] != self.board[0][len(self.board) - 1]:
                break
        else:
            return self.board[0][len(self.board) - 1]
        return 0

        
    def train(self):
        
        for _ in range(100000):
            trajectory=[]
            self.board=[[0]*self.n for _ in range(self.n)]
            play=1
            while(True):
                temp_board=copy.deepcopy(self.board)
                state=tuple(chain.from_iterable(temp_board))    # This is done because Lists being mutable cannot be key of a dictionary
                trajectory.append(temp_board)  
                if state not in self.states.keys():
                    self.states[state]=State(temp_board,play)
                if((self.randomOpp and not(play==1^self.start)) or self.epsilon<random.random()):
                    self.states[state].move_played=random.choice(self.moves_left())
                else:
                    self.states[state].move_played=self.states[state].find_best_move()

                self.board[self.states[state].move_played[0]][self.states[state].move_played[1]]=play

                if(self.won()!=0):
                    break
                else:
                    if(len(self.moves_left())==0):
                        break
                play*=-1

            for board in trajectory:
                state=tuple(chain.from_iterable(board))
                self.states[state].visits+=1
                move=self.states[state].actions.index(self.states[state].move_played)
                reward=self.states[state].player*self.won()
                self.states[state].Q[move]+=(reward-self.states[state].Q[move])/self.states[state].visits
   

    def play(self):
        self.board=[[0]*self.n for _ in range(self.n)]
        player=1 if self.start else -1
        if not self.start:
            state=tuple(chain.from_iterable(self.board))
            print(self.states[state].Q)
            move=self.states[state].find_best_move()
            self.board[move[0]][move[1]]=1
            self.print_board()

        while True:
            wrong_move=True
            while wrong_move:
                try:

                    row = int(input("Enter row number (Top Row is 0): "))
                    col = int(input("Enter column number (Left column is 0): "))
                    if self.board[row][col] == 0:
                        wrong_move = False
                    else:
                        print("That box is already occupied. Try again")
                except (ValueError, IndexError):
                        print("Wrong input, Try again")

            self.board[row][col]=player
            self.print_board()
            if(self.won()!=0):
                break
            else:
                if(len(self.moves_left())==0):
                    print("Draw")
                    break

            state=tuple(chain.from_iterable(self.board))
            if state in self.states.keys():
                print(self.states[state].Q)
                move=self.states[state].find_best_move()
    
            else:
                move=random.choice(self.moves_left())
            self.board[move[0]][move[1]]=-player
            self.print_board()
            if(self.won()!=0):
                break
            else:
                if(len(self.moves_left())==0):
                    print("Draw")
                    break               

        if(self.won()==player):
            print("You won")
        elif(self.won()==-player):
            print("Computer Won")

        



In [55]:
tictac=TicTacToeEngine(0.8,3)   #0.8 gave best till now
tictac.train()
tictac.play()

 -   -   -  
 -   X   -  
 -   -   -  

[-0.25927827 -0.24695004 -0.41945152 -0.35999124 -0.31311036 -0.40939287
 -0.47431692 -0.41581997]
 -   O   -  
 -   X   -  
 -   -   -  

 -   O   -  
 -   X   -  
 -   -   X  

[-0.31127073 -0.45517557 -0.369726   -0.40848208 -0.3328807  -0.95239817]
 O   O   -  
 -   X   -  
 -   -   X  

 O   O   X  
 -   X   -  
 -   -   X  

[-0.48266901 -0.33001416 -0.39042923 -1.        ]
 O   O   X  
 -   X   O  
 -   -   X  

 O   O   X  
 X   X   O  
 -   -   X  

[ 0. -1.]
 O   O   X  
 X   X   O  
 O   -   X  

Wrong input, Try again
 O   O   X  
 X   X   O  
 O   X   X  

Draw


In [60]:
def print_board(board,n):
    output = ''
    for i in range(n):
        for j in range(n):
            if board[i][j] == 1:
                output += ' X  '
            elif board[i][j] == -1:
                output += ' O  '
            else:
                output += ' -  '
        output += '\n'
    print(output)

for i,states in enumerate(tictac.states.values()):
    print_board(states.board,3)
    print(states.Q)

 -   -   -  
 -   -   -  
 -   -   -  

[0.21745711 0.25873081 0.18805061 0.08726879 0.35964159 0.5023495
 0.08399152 0.07461564 0.30803966]
 -   -   -  
 -   -   X  
 -   -   -  

[-0.11571664 -0.37861932 -0.12295199 -0.47410851  0.12141595 -0.18074692
 -0.09639718 -0.42514338]
 -   -   -  
 -   -   X  
 -   -   O  

[ 0.37885624 -0.33631268  0.12110071  0.12351     0.2943525   0.16150884
  0.11885063]
 X   -   -  
 -   -   X  
 -   -   O  

[-0.35208451 -0.14359638 -0.39508938 -0.70285628  0.17070708  0.21176003]
 X   -   -  
 -   O   X  
 -   -   O  

[0.37448248 0.16143927 0.13580269 0.60099921 0.13555321]
 X   -   -  
 -   O   X  
 X   -   O  

[-0.57919382 -0.14260489 -0.16901545 -0.20075174]
 X   O   -  
 -   O   X  
 X   -   O  

[-0.30136657  1.          0.32748301]
 -   X   -  
 -   -   -  
 -   -   -  

[-0.19126541 -0.1009854  -0.27539445  0.16115633 -0.34971906 -0.14500608
 -0.39270416 -0.03886168]
 O   X   -  
 -   -   -  
 -   -   -  

[-0.07052141  0.19866913  0.2572563

In [29]:
xoo
---
xx-
tictac.play()

[-0.29346943 -0.12980025  0.01019578  0.11465792  0.0325866   0.01152551
  0.00775525 -0.02262677  0.00875   ]
 -   -   -  
 X   -   -  
 -   -   -  

 -   -   -  
 X   -   -  
 -   -   O  

[-0.04019764 -0.11828212  0.02191696  0.10232969 -0.0560381   0.09828299
 -0.04424464]
 -   -   -  
 X   X   -  
 -   -   O  

 -   -   -  
 X   X   O  
 -   -   O  

[-0.47189066 -0.22455902  0.10374383 -0.12646858 -0.11567651]
 -   -   X  
 X   X   O  
 -   -   O  

 -   -   X  
 X   X   O  
 O   -   O  

[-0.9588121  -0.42478709  0.098193  ]
 -   -   X  
 X   X   O  
 O   X   O  

Wrong input, Try again
 -   O   X  
 X   X   O  
 O   X   O  

[0.]
 X   O   X  
 X   X   O  
 O   X   O  

Draw


In [37]:
from itertools import chain
 
ini_list = [[1, 2, 3],
            [3, 6, 7],
            [7, 5, 4]]
             
# printing initial list
print ("initial list ", str(ini_list))
 
# converting 2d list into 1d
# using chain.from_iterables
flatten_list = tuple(chain.from_iterable(ini_list))
print(flatten_list)

initial list  [[1, 2, 3], [3, 6, 7], [7, 5, 4]]
(1, 2, 3, 3, 6, 7, 7, 5, 4)


In [35]:
if 1 in a.keys():
    print("hell yeah")

AttributeError: 'tuple' object has no attribute 'keys'

In [50]:
play=1
start=True

print(not(play==1 ^start))

True
