# LAB10

The goal of this lab is to use Reinforcement Learning to create an agent capable of playing tic tac toe.

In [3]:
from itertools import permutations
from random import choice
from math import ceil
from tqdm import tqdm

## Defining the game
We save the positions of the Xs and Os in separate sets and we map the board as a magic-square:

| 2     | 9     | 4     |
|-------|-------|-------|
| **7** | **5** | **3** |
| **6** | **1** | **8** |

To check if any player has won we check if exists a permutation of 3 cells with their symbol whose sum is 15.

In [4]:
class TicTacToe:
    def __init__(self):
        self.x_pos = set()
        self.o_pos = set()
        self.moves = set(range(1, 10)) #available moves
        self.next = 'x'

    def play(self, pos): #Returns True if the move was accepted and played, False otherwise
        if pos>9 or pos<1 or pos not in self.moves: #square non-existent or already taken
            return False
        if self.next == 'x':
            self.x_pos.add(pos)
            self.moves.remove(pos)
            self.next = 'o'
        else:
            self.o_pos.add(pos)
            self.moves.remove(pos)
            self.next = 'x'
        return True
    
    def check_victory(self):
        states = [None, 'x wins', 'o wins', 'invalid state', 'the game was a tie']
        res = 0
        for t in permutations(self.x_pos, 3):
            if sum(t) == 15:
                res += 1
                break
        for t in permutations(self.o_pos, 3):
            if sum(t) == 15:
                res += 2
                break
        if res == 0 and not self.moves:
            res = 4
        return states[res], res
    
    def board(self):  #shows the board state
        coord = {
        1: (2, 1),
        2: (0, 0),
        3: (1 ,2),
        4: (0, 2),
        5: (1, 1),
        6: (2, 0),
        7: (1, 0),
        8: (2, 2),
        9: (0, 1),
        }
        mat = [["-" for _ in range(3)] for _ in range(3)]
        for i in range(1, 10):
            if i in self.x_pos:
                mat[coord[i][0]][coord[i][1]] = 'x'
            elif i in self.o_pos:
                mat[coord[i][0]][coord[i][1]] = 'o'
        res = ""
        for i in range(len(mat)):
            for j in range(len(mat[0])):
                res += mat[i][j]
            res += "\n"
        print(res)

    def evaluate_board(self, tup):
        self.x_pos = set(tup[0])
        self.o_pos = set(tup[1])
        moves_list = []
        for i in range(1, 10):
            if i not in self.x_pos and i not in self.o_pos:
                moves_list.append(i)
        self.moves = set(moves_list)
        _, res = self.check_victory()
        return res
        


            


### Creating the Agents
We start by creating a random agent, that plays a random move from the available ones.

In [5]:
def random_agent(game):
    return choice(tuple(game.moves))

game = TicTacToe()
while not game.check_victory()[0]:
    game.play(random_agent(game))
print(game.evaluate_board(tuple((game.x_pos, game.o_pos))))
game.board()

1
xxx
oox
xoo



### Markov Decision Problem
We can model the problem as a Markov Decision Problem, to do so, we must define some key elements:
- **States**: they represent the possible list of states that our agent might have to face, in our case it's the list of board states in which it's our turn and the game is not already finished;
- **Actions**: this is a list of all the possible moves we can employ, in this case the list of empty squares in a space;
- **Transition model**: it's defined as the way to get a new state from a previous one and a given action;
- **Rewards**: these should be the rewards to give our agent when a certain breakthrough is reached for example 1 for winning and -1 for losing and tying;
- **Discount**: it's an hyperparameter that influences how much our agent prefers instant rewards over delayed ones.

In [10]:
class mdp:
    def __init__(self, symbol='x'):
        self.symbol = symbol # this is the symbol we are playing as
        self.states = self.generate_states()
        self.discount = 0.89

    def generate_states(self): #generate all states in which is our turn
        all_states = list()
        squares = range(1, 10)
        count = 0
        for taken in range(10):
            for p in permutations(squares, 9):
                count += 1
                all_states.append(frozenset(p[:taken]))
        actual_states = list()
        evaluator = TicTacToe()
        for s in all_states:
            if (len(s)%2 == 0 and self.symbol == 'x') or (len(s)%2 == 1 and self.symbol == 'o'): #ignore states in which is not our turn
                cut = ceil(len(s)/2)
                state = tuple((frozenset(list(s)[:cut]), frozenset(list(s)[cut:])))
                evaluation = evaluator.evaluate_board(state)
                if evaluation!=4:
                    actual_states.append(state)
        return tuple(set(actual_states))

    def generate_actions(self, state):
        actions = set()
        x, o = state
        for move in range(1, 10):
            if move not in x and move not in o:
                actions.add(move)
        return actions

    def reward(self, state):
        evaluator = TicTacToe()
        winner = evaluator.evaluate_board(state)
        if winner == 0:
            return 0
        if (winner==1 and self.symbol=='x') or (winner==2 and self.symbol=='o'):
            return 1
        else:
            return -1
    
    def transition(self, state, actions, action): #returns the possible following states given a starting one and a move
        x, o = state
        x = list(x)
        o = list(o)
        enemy = actions.copy()
        enemy.remove(action)
        following = []
        if self.symbol == 'x':
            x.append(action)
        else:
            o.append(action)
        base_x = x.copy()
        base_o = o.copy()
        for s in self.states:
            x_prime, o_prime = s
            x_prime = list(x_prime)
            o_prime = list(o_prime)
            for e in enemy:
                x = base_x.copy()
                o = base_o.copy()
                if self.symbol == 'x':
                    o.append(e)
                else:
                    x.append(e)
                if set(x) == set(x_prime) and set(o) == set(o_prime):
                    following.append(s)
        return following, len(following)

            
        
    def q_function(self, state, utility):
        actions = self.generate_actions(state)
        res = []
        for a in actions:
            possible, den = self.transition(state, actions, a)
            v=0
            for p in possible:
                v += 1/den * (self.reward(p) + self.discount*utility[p])
            res.append(v)
        return res
            



def value_iteration(problem:TicTacToe, epsilon):
    utilities_prime = dict()
    print("threshold:" ,str(epsilon * (1 - problem.discount)/problem.discount))
    epoch=0
    for s in problem.states:
        utilities_prime[s] = 0
    delta = 0
    while True:
        epoch += 1
        print('epoch ', epoch, 'delta ', delta)
        utilities = utilities_prime.copy()
        for s in tqdm(problem.states):
            utilities_prime[s] = max(problem.q_function(s, utilities))
            if abs(utilities_prime[s] - utilities[s]) > delta:
                delta = utilities_prime[s] - utilities[s]
        if delta > epsilon * (1 - problem.discount)/problem.discount or epoch>9:
            break
                
    return utilities_prime

p = mdp()
value_iteration(p, 0.1)   


        



threshold: 0.012359550561797751
epoch  1 delta  0


  0%|          | 0/281 [03:07<?, ?it/s]


KeyboardInterrupt: 

### Bellman Equation
We want to find an agent able to perform the optimal policy, that is, a way to choose a good move in a given state. A good way to estimate such a policy is using the Bellman Equation.

In [314]:
def value_iteration(problem:TicTacToe, epsilon):
    utilities_prime = dict()
    print("threshold:" ,str(epsilon * (1 - problem.discount)/problem.discount))
    epoch=0
    for s in problem.states:
        utilities_prime[s] = 0
    delta = 0
    while True:
        epoch += 1
        print('epoch ', epoch, 'delta ', delta)
        utilities = utilities_prime.copy()
        for s in tqdm(problem.states):
            utilities_prime[s] = max(problem.q_function(s, utilities))
            if abs(utilities_prime[s] - utilities[s]) > delta:
                delta = utilities_prime[s] - utilities[s]
        if delta > epsilon * (1 - problem.discount)/problem.discount or epoch>9:
            break
                
    return utilities_prime

p = mdp()
value_iteration(p, 0.1)     
    

threshold: 0.012359550561797751
epoch  1 delta  0


  0%|          | 0/281 [00:00<?, ?it/s]

100%|██████████| 281/281 [00:02<00:00, 120.59it/s]


epoch  2 delta  0


 64%|██████▎   | 179/281 [00:01<00:00, 117.14it/s]


KeyboardInterrupt: 

In [297]:
a = [1, 2, 3, 3, 1, 2, 2, 1, 3]
b = [3, 1, 2]

print(set(a)==set(b))

True
