# LAB10

The goal of this lab is to use Reinforcement Learning to create an agent capable of playing tic tac toe.

In [2]:
from itertools import permutations
from random import choice
from math import ceil
from tqdm import tqdm

## Defining the game
We save the positions of the Xs and Os in separate sets and we map the board as a magic-square:

| 2     | 9     | 4     |
|-------|-------|-------|
| **7** | **5** | **3** |
| **6** | **1** | **8** |

To check if any player has won we check if exists a permutation of 3 cells with their symbol whose sum is 15.

In [3]:
class TicTacToe:
    def __init__(self):
        self.x_pos = set()
        self.o_pos = set()
        self.moves = set(range(1, 10)) #available moves
        self.next = 'x'

    def play(self, pos): #Returns True if the move was accepted and played, False otherwise
        if pos>9 or pos<1 or pos not in self.moves: #square non-existent or already taken
            return False
        if self.next == 'x':
            self.x_pos.add(pos)
            self.moves.remove(pos)
            self.next = 'o'
        else:
            self.o_pos.add(pos)
            self.moves.remove(pos)
            self.next = 'x'
        return True
    
    def check_victory(self):
        states = [None, 'x wins', 'o wins', 'invalid state', 'the game was a tie']
        res = 0
        for t in permutations(self.x_pos, 3):
            if sum(t) == 15:
                if res==0 or res==1:
                    res = 1
                else:
                    res = 3
                break
        for t in permutations(self.o_pos, 3):
            if sum(t) == 15:
                if res==0 or res==2:
                    res = 2
                else:
                    res = 3
                break
        if res == 0 and not self.moves:
            res = 4
        return states[res], res
    
    def board(self):  #shows the board state
        coord = {
        1: (2, 1),
        2: (0, 0),
        3: (1 ,2),
        4: (0, 2),
        5: (1, 1),
        6: (2, 0),
        7: (1, 0),
        8: (2, 2),
        9: (0, 1),
        }
        mat = [["-" for _ in range(3)] for _ in range(3)]
        for i in range(1, 10):
            if i in self.x_pos:
                mat[coord[i][0]][coord[i][1]] = 'x'
            elif i in self.o_pos:
                mat[coord[i][0]][coord[i][1]] = 'o'
        res = ""
        for i in range(len(mat)):
            for j in range(len(mat[0])):
                res += mat[i][j]
            res += "\n"
        print(res)

    def evaluate_board(self, tup):
        x, o = tup
        self.x_pos = set(x)
        self.o_pos = set(o)
        moves_list = []
        for i in range(1, 10):
            if i not in self.x_pos and i not in self.o_pos:
                moves_list.append(i)
        self.moves = set(moves_list)
        _, res = self.check_victory()
        return res
        


            


### Creating the Agents
We start by creating a random agent, that plays a random move from the available ones.

In [4]:
def random_agent(game):
    return choice(tuple(game.moves))

game = TicTacToe()
while not game.check_victory()[0]:
    game.play(random_agent(game))
print(game.check_victory()[0])
game.board()

x wins
xxo
oxo
xox



### Markov Decision Problem
We can model the problem as a Markov Decision Problem, to do so, we must define some key elements:
- **States**: they represent the possible list of states that our agent might have to face, in our case it's the list of board states in which it's our turn;
- **Actions**: this is a list of all the possible moves we can employ, in this case the list of empty squares in a state;
- **Transition model**: it's the way to get a new state from a previous one and a given action, in this case we expect that after making a move a different random empty space is filled by the opponent;
- **Rewards**: these should be the rewards to give our agent when a certain breakthrough is reached for example 1 for winning and -1 for losing and -0.5 for tying;
- **Discount**: it's an hyperparameter that influences how much our agent prefers instant rewards over delayed ones.

In [5]:
#utility functions, the mdp implementation is in the next cell

def list_to_state(myList): #given a list or a tuple, it splits it in half and transforms it into a string
    if isinstance(myList, list):
        cut = ceil(len(myList)/2)
        x = myList[:cut]
        if x is None:
            x = []
        o = myList[cut:]
        if o is None:
            o = []
        x.sort()
        o.sort()
    elif isinstance(myList, tuple):
        x, o = myList
        if x is None:
            x = []
        if o is None:
            o = []
    else:
        assert('Wrong type of variable passed to list_to_state')
    xstring = ' '.join(map(lambda e: str(e), x))
    ostring = ' '.join(map(lambda e: str(e), o))
    return xstring+','+ostring

def state_to_tuple(state): #converts a string back to a tuple containing the positions of xs and os
    halves = state.split(',')
    x = list(map(lambda e: int(e), halves[0].split()))
    o = list(map(lambda e: int(e), halves[1].split()))
    return tuple((x, o))

In [6]:
class mdp:
    def __init__(self, symbol='x'):
        self.symbol = symbol # this is the symbol we are playing as
        self.states = self.generate_states()
        self.discount = 1

    def generate_states(self): #generate all states in which is our turn
        all_states = list()
        squares = range(1, 10)
        for p in permutations(squares, 9):
            for taken in range(10):
                all_states.append(p[:taken])
        actual_states = set()
        evaluator = TicTacToe()
        for s in tqdm(all_states):
            if(len(s)%2 == 0  and self.symbol == 'x') or (len(s)%2 == 1 and self.symbol == 'o'): #only states in which is our turn
                state = list_to_state(list(s))
                evaluation = evaluator.evaluate_board(state_to_tuple(state))
                if evaluation != 3: #ignore invalid states
                    actual_states.add(state)
        return actual_states
      

    def generate_actions(self, state):
        actions = set()
        x, o = state_to_tuple(state)
        for move in range(1, 10):
            if move not in x and move not in o:
                actions.add(move)
        return actions

    def reward(self, state):
        evaluator = TicTacToe()
        winner = evaluator.evaluate_board(state_to_tuple(state))
        if winner == 0:
            return 0
        if (winner==1 and self.symbol=='x') or (winner==2 and self.symbol=='o'):
            return 1
        elif winner==4:
            return -0.5
        else:
            return -1
    
    def transition(self, state, actions, action): #returns the possible following states given a starting one and a move
        x, o = state_to_tuple(state)
        evaluator = TicTacToe()
        enemy = actions.copy()
        enemy.remove(action)
        if evaluator.evaluate_board(state_to_tuple(state)) != 0: #check if state is terminal
            return [state], 0 #return back the state and set the coefficient to 0
        following = []
        if self.symbol == 'x':
            x.append(action)
            x.sort()
        else:
            o.append(action)
            o.sort()
        if evaluator.evaluate_board(tuple((x, o))) != 0: #check if state is terminal
            following.append(list_to_state(tuple((x, o))))
        if len(following)!=0:
            return following, 1
        base_x = x.copy()
        base_o = o.copy()
        for e in enemy:
            x = base_x.copy()
            o = base_o.copy()
            if self.symbol == 'x':
                o.append(e)
                o.sort()
            else:
                x.append(e)
                x.sort()
            if evaluator.evaluate_board(tuple((x, o))) != 3:      #discard impossible states
                following.append(list_to_state(tuple((x, o))))
        return following, 1/len(following)

            
        
    def q_value(self, state, utility):
        actions = self.generate_actions(state)
        res = []
        for a in actions:
            possible, prob = self.transition(state, actions, a)
            v=0
            for p in possible:
                try:
                    v += prob * (self.reward(p) + self.discount*utility[p][0])
                except KeyError:        #this should happen if the move found is terminal
                    v += self.reward(p)
            res.append(tuple((v, a)))
        if len(res)==0:
            res.append(tuple((self.reward(state), None)))
        return res
            

### Bellman Equation
We want to find an agent able to perform the optimal policy, that is, a way to choose a good move in a given state. A good way to estimate such a policy is to iteratively calculate the Bellman Equation for each state, that is:
$$U(s) = max_{a\in A(s)}\sum_{s'}P(s'|s, a)[R(s, a, s')+\gamma U(s')]$$
Where:
- $a \in A(s)$ denotes the possible actions in state $s$;
- $s'$ is one of the possible states generated by each action;
- $P(s'|s, a)$ is the probability of reaching state $s'$ performing action $a$ in state $s$;
- $R(s, a, s')$ is the reward calculated on the transition, _in this case it only depends on the resulting state $s'$ unless $s$ is already a terminal state_;
- $\gamma$ is the discount parameter defined in `mdp.discount`;
- $U(s)$ is the utility associated to each state, notice that with each iteration the utility of each state becomes more accurate.

In [7]:
def value_iteration(problem:TicTacToe, epsilon):
    utilities_prime = dict()
    print("threshold:" ,str(epsilon * (1 - problem.discount)/problem.discount))
    epoch=0
    for s in problem.states:
        utilities_prime[s] = tuple((0, 0))
    while True:
        delta = 0
        epoch += 1
        utilities = utilities_prime.copy()
        for s in tqdm(problem.states):
            utilities_prime[s] = max(problem.q_value(s, utilities), key = lambda x: x[0])
            delta = max(delta, abs(utilities_prime[s][0] - utilities[s][0]))
        print('epoch:', epoch, 'delta:', delta)
        if delta <= epsilon * (1 - problem.discount)/problem.discount:
            break
    return utilities_prime


This function should be able to return a _dictionary_ containing, for each state, the evaluation of how "good" it is and the best move to use in that case. We can try it against the random agent and see if it's able to win.

In [8]:
#utility function
def game_to_state(game:TicTacToe):
    x = list(game.x_pos)
    o = list(game.o_pos)
    x.sort()
    o.sort()
    return list_to_state(tuple((x, o)))

In [9]:
#These are the dictionaries containing the policies
policy_x = value_iteration(mdp(symbol='x'), 0.0001)
policy_o = value_iteration(mdp(symbol='o'), 0.0001)

100%|██████████| 3628800/3628800 [00:09<00:00, 379434.83it/s]


threshold: 0.0


100%|██████████| 3055/3055 [00:00<00:00, 8848.09it/s]


epoch: 1 delta: 1


100%|██████████| 3055/3055 [00:00<00:00, 8918.83it/s]


epoch: 2 delta: 1.0


100%|██████████| 3055/3055 [00:00<00:00, 8479.35it/s]


epoch: 3 delta: 0.8333333333333331


100%|██████████| 3055/3055 [00:00<00:00, 8851.87it/s]


epoch: 4 delta: 0.16145833333333348


100%|██████████| 3055/3055 [00:00<00:00, 8549.11it/s]


epoch: 5 delta: 0.0026041666666667407


100%|██████████| 3055/3055 [00:00<00:00, 8041.60it/s]


epoch: 6 delta: 0


100%|██████████| 3628800/3628800 [00:10<00:00, 331432.16it/s]


threshold: 0.0


100%|██████████| 2835/2835 [00:00<00:00, 9180.66it/s]


epoch: 1 delta: 1


100%|██████████| 2835/2835 [00:00<00:00, 9415.64it/s]


epoch: 2 delta: 1.0


100%|██████████| 2835/2835 [00:00<00:00, 9148.01it/s]


epoch: 3 delta: 0.8


100%|██████████| 2835/2835 [00:00<00:00, 8166.92it/s]


epoch: 4 delta: 0.16666666666666669


100%|██████████| 2835/2835 [00:00<00:00, 8495.21it/s]


epoch: 5 delta: 0.033333333333333326


100%|██████████| 2835/2835 [00:00<00:00, 8595.40it/s]

epoch: 6 delta: 0





In [13]:

#evaluate policy for X
wins = 0
for _ in range(10_000):
    game = TicTacToe()
    while not game.check_victory()[0]:
        if game.next == 'x':
            game.play(policy_x[game_to_state(game)][1])
        else:
            game.play(random_agent(game))
    if game.check_victory()[1] == 1:
        wins += 1
print('The agent has a winrate of ' + str(wins/100) +'% when playing X against a random opponent')

#evaluate policy for O
wins = 0
for _ in range(10_000):
    game = TicTacToe()
    while not game.check_victory()[0]:
        if game.next == 'o':
            game.play(policy_o[game_to_state(game)][1])
        else:
            game.play(random_agent(game))
    if game.check_victory()[1] == 2:
        wins += 1
print('The agent has a winrate of ' + str(wins/100) +'% when playing O against a random opponent')


The agent has a winrate of 99.41% when playing X against a random opponent
The agent has a winrate of 93.81% when playing O against a random opponent


### Try it yourself

| 2     | 9     | 4     |
|-------|-------|-------|
| **7** | **5** | **3** |
| **6** | **1** | **8** |

Here you can try to challenge the agent at tic tac toe. (fancy interface not included)

In [11]:
s = None
game = TicTacToe()
while s != 'x' and s!='o':
    s = input('Choose which symbol to play as (x or o):')

print('You are playing as '+ s)

while not game.check_victory()[0]:
    if game.next==s:
        game.board()
        a = 0
        while not game.play(int(a)):
            a = input('Choose which move to play, refer to the table above:')
    elif game.next=='o':
        game.play(policy_o[game_to_state(game)][1])
    elif game.next=='x':
        game.play(policy_x[game_to_state(game)][1])
game.board()
print(game.check_victory()[0])




You are playing as x
---
---
---

--o
---
-x-

x-o
--o
-x-

x-o
-xo
-xo

o wins


## References
[Giovanni Squillero](https://github.com/squillero/computational-intelligence) for the original code<br>
[Artificial Intelligence: A Modern Approach 4th Edition by Russel, Norvig](https://www.google.it/books/edition/Artificial_Intelligence_A_Modern_Approac/cb0qEAAAQBAJ?hl=it) to implement Reinforcement Learning and the Bellman Equation <br>
[This medium article](https://medium.com/@nour.oulad.moussa/tic-tac-toe-with-reinforcement-learning-part-i-markov-decision-process-value-policy-iteration-c4bcbb0b9fbe) for some ideas.