# LAB10

The goal of this lab is to use Reinforcement Learning to create an agent capable of playing tic tac toe.

In [3]:
from itertools import permutations
from random import choice
from math import ceil
from tqdm import tqdm

## Defining the game
We save the positions of the Xs and Os in separate sets and we map the board as a magic-square:

| 2     | 9     | 4     |
|-------|-------|-------|
| **7** | **5** | **3** |
| **6** | **1** | **8** |

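To check whether a player has won, we check whether there exists a group of 3 cells holding their symbol whose values sum to 15: in this magic square, the triples of distinct cells that sum to 15 are exactly the rows, columns and diagonals.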

In [10]:
class TicTacToe:
    """Tic-tac-toe game whose cells are numbered 1-9 as a 3x3 magic square.

    With this numbering, three distinct cells form a row, column or diagonal
    exactly when their numbers sum to 15, which is how wins are detected.

    Attributes:
        x_pos: set of cells occupied by 'x'.
        o_pos: set of cells occupied by 'o'.
        moves: set of cells that are still free.
        next:  symbol ('x' or 'o') that `play` will place next.
    """

    # Result labels, indexed by the numeric code returned by check_victory.
    _STATES = [None, 'x wins', 'o wins', 'invalid state', 'the game was a tie']

    # Magic-square cell number -> (row, column) on the displayed board.
    _COORD = {
        1: (2, 1),
        2: (0, 0),
        3: (1, 2),
        4: (0, 2),
        5: (1, 1),
        6: (2, 0),
        7: (1, 0),
        8: (2, 2),
        9: (0, 1),
    }

    def __init__(self):
        self.x_pos = set()
        self.o_pos = set()
        self.moves = set(range(1, 10))  # available moves
        self.next = 'x'

    def play(self, pos):
        """Place the symbol in `self.next` on cell `pos`.

        Returns True if the move was accepted and played, False if the cell
        does not exist or is already taken.
        """
        # `moves` only ever holds free cells in 1..9, so this single
        # membership test also rejects out-of-range positions.
        if pos not in self.moves:
            return False
        if self.next == 'x':
            self.x_pos.add(pos)
            self.next = 'o'
        else:
            self.o_pos.add(pos)
            self.next = 'x'
        self.moves.remove(pos)
        return True

    @staticmethod
    def _has_line(cells):
        """Return True if any 3 of `cells` sum to 15 (a completed line)."""
        return any(sum(t) == 15 for t in permutations(cells, 3))

    def check_victory(self):
        """Evaluate the current position.

        Returns:
            (label, code) where code is
            0 - game still open (label is None),
            1 - 'x wins',
            2 - 'o wins',
            3 - 'invalid state' (both players own a line),
            4 - 'the game was a tie' (no line and no free cell).
        """
        res = 0
        if self._has_line(self.x_pos):
            res += 1
        if self._has_line(self.o_pos):
            res += 2
        if res == 0 and not self.moves:
            res = 4
        return self._STATES[res], res

    def board(self):
        """Print the board, one row per line ('-' marks a free cell)."""
        mat = [["-" for _ in range(3)] for _ in range(3)]
        for cell, (row, col) in self._COORD.items():
            if cell in self.x_pos:
                mat[row][col] = 'x'
            elif cell in self.o_pos:
                mat[row][col] = 'o'
        print("".join("".join(row) + "\n" for row in mat))

    def evaluate_board(self, tup):
        """Load the position `tup = (x_cells, o_cells)` and evaluate it.

        The game state is overwritten with the given position and the numeric
        code of `check_victory` is returned.
        """
        x, o = tup
        self.x_pos = set(x)
        self.o_pos = set(o)
        self.moves = set(range(1, 10)) - self.x_pos - self.o_pos
        # Keep the turn marker consistent with the loaded stones: 'x' opens
        # the game, so 'o' is to move only when 'x' has placed more stones.
        # Without this, a later play() would use the turn of the old position.
        self.next = 'o' if len(self.x_pos) > len(self.o_pos) else 'x'
        _, res = self.check_victory()
        return res
        


            


### Creating the Agents
We start by creating a random agent, that plays a random move from the available ones.

In [13]:
def random_agent(game):
    """Return one of the moves still available in `game`, picked at random."""
    candidates = tuple(game.moves)
    return choice(candidates)

# Play one full game in which both sides use the random agent, then show
# the numeric result code followed by the final board.
game = TicTacToe()
while game.check_victory()[0] is None:
    move = random_agent(game)
    game.play(move)
final_position = (game.x_pos, game.o_pos)
print(game.evaluate_board(final_position))
game.board()

2
xx-
ooo
oxx



### Markov Decision Problem
We can model the problem as a Markov Decision Problem, to do so, we must define some key elements:
- **States**: they represent the possible list of states that our agent might have to face, in our case it's the list of board states in which it's our turn;
- **Actions**: the moves available to our agent; in this case, the empty squares of the board in a given state;
- **Transition model**: it's defined as the way to get a new state from a previous one and a given action;
- **Rewards**: the values given to our agent when a certain outcome is reached, for example 1 for winning and -1 for losing or tying;
- **Discount**: it's a hyperparameter that influences how much our agent prefers instant rewards over delayed ones.

In [32]:
def list_to_state(myList):
    """Encode a position as the string key ``"<x cells>,<o cells>"``.

    Args:
        myList: either
            * a list of cells, split in half: the first ``ceil(len/2)``
              entries are the x cells, the remaining ones the o cells; or
            * a 2-tuple ``(x_cells, o_cells)``; either element may be None,
              which is treated as "no cells".

    Returns:
        The cells of each player in ascending order, space separated, with
        the two players separated by a comma (e.g. ``"1 3,2"``).

    Raises:
        TypeError: if `myList` is neither a list nor a tuple.
    """
    if isinstance(myList, list):
        cut = ceil(len(myList) / 2)
        x, o = myList[:cut], myList[cut:]
    elif isinstance(myList, tuple):
        x, o = myList
    else:
        # The previous `assert('...')` could never fail (a non-empty string
        # is truthy), so bad input surfaced later as an UnboundLocalError.
        raise TypeError('Wrong type of variable passed to list_to_state')
    # Sort in both branches so that a position always maps to the same key,
    # whatever order the caller listed the cells in.
    xstring = ' '.join(str(e) for e in sorted(x or ()))
    ostring = ' '.join(str(e) for e in sorted(o or ()))
    return xstring + ',' + ostring

def state_to_tuple(state):
    """Decode a state string back into ``(x_cells, o_cells)``.

    The text before the first comma holds the x cells and the text after it
    the o cells, each as space-separated integers. Both are returned as lists.
    """
    fields = state.split(',')
    x_cells = [int(token) for token in fields[0].split()]
    o_cells = [int(token) for token in fields[1].split()]
    return (x_cells, o_cells)

In [77]:
class mdp:
    """Tic-tac-toe seen as a Markov Decision Problem by one of the players.

    States are the string keys produced by `list_to_state`; an action is the
    number of a free cell.

    Args:
        symbol:   the symbol we are playing as, 'x' or 'o'.
        discount: discount factor applied to the utility of successor states
                  (defaults to the previously hard-coded 0.89).

    Raises:
        ValueError: if `symbol` is neither 'x' nor 'o'.
    """

    def __init__(self, symbol='x', discount=0.89):
        if symbol not in ('x', 'o'):
            # Any other value used to produce an empty state set silently.
            raise ValueError("symbol must be 'x' or 'o'")
        self.symbol = symbol  # this is the symbol we are playing as
        self.states = self.generate_states()
        self.discount = discount

    def generate_states(self):
        """Return the set of state keys in which it is our turn.

        A board is kept when the number of occupied cells has the parity of
        our turn (even for 'x', odd for 'o') and `evaluate_board` does not
        report a tie (code 4). Boards on which a line is already complete are
        NOT filtered out.
        """
        if self.symbol == 'x':
            first_count = 0
        elif self.symbol == 'o':
            first_count = 1
        else:
            return set()
        evaluator = TicTacToe()
        seen = set()
        actual_states = set()
        # The k-long prefixes of all 9-cell permutations are exactly the
        # k-permutations, so enumerate those directly, and only for the stone
        # counts of our turn, instead of storing every prefix of every
        # full permutation.
        for taken in range(first_count, 10, 2):
            for cells in permutations(range(1, 10), taken):
                state = list_to_state(list(cells))
                if state in seen:  # many orderings share one key
                    continue
                seen.add(state)
                if evaluator.evaluate_board(state_to_tuple(state)) != 4:
                    actual_states.add(state)
        return actual_states

    def generate_actions(self, state):
        """Return the set of free cells (legal moves) in `state`."""
        x, o = state_to_tuple(state)
        return {move for move in range(1, 10) if move not in x and move not in o}

    def reward(self, state):
        """Reward of `state` from our point of view.

        0 while the game is open, 1 if our symbol owns a line, and -1 for
        every other result code (opponent wins, both own a line, tie).
        """
        evaluator = TicTacToe()
        winner = evaluator.evaluate_board(state_to_tuple(state))
        if winner == 0:
            return 0
        if (winner == 1 and self.symbol == 'x') or (winner == 2 and self.symbol == 'o'):
            return 1
        return -1

    def transition(self, state, actions, action):
        """Successor states after we play `action` and the opponent replies.

        Args:
            state:   current state key.
            actions: set of legal moves in `state` (left unmodified).
            action:  the move we play; must be a member of `actions`.

        Returns:
            (following, count): one state key per possible opponent reply,
            and how many there are. The list is empty when our move fills
            the last free cell.
        """
        x, o = state_to_tuple(state)
        x = list(x)
        o = list(o)
        enemy = actions.copy()
        enemy.remove(action)
        if self.symbol == 'x':
            x = sorted(x + [action])
        else:
            o = sorted(o + [action])
        following = []
        for e in enemy:
            if self.symbol == 'x':
                successor = (x, sorted(o + [e]))
            else:
                successor = (sorted(x + [e]), o)
            following.append(list_to_state(successor))
        return following, len(following)

    def q_function(self, state, utility):
        """Q-value of every legal action in `state`.

        Each opponent reply is weighted equally (1/number of replies).
        Successor states that have no entry in `utility` (the tie boards
        excluded by `generate_states`) contribute a utility of 0 instead of
        raising KeyError.

        Returns:
            A list with one value per action, in the iteration order of
            `generate_actions(state)`; empty when there is no legal action.
        """
        actions = self.generate_actions(state)
        res = []
        for a in actions:
            possible, den = self.transition(state, actions, a)
            v = 0
            for p in possible:
                v += 1/den * (self.reward(p) + self.discount*utility.get(p, 0))
            res.append(v)
        return res
            

        



### Bellman Equation
We want to find an agent able to perform the optimal policy, that is, a way to choose a good move in a given state. A good way to estimate such a policy is using the Bellman Equation.

In [97]:
def value_iteration(problem: "mdp", epsilon):
    """Estimate the utility of every state of `problem` by value iteration.

    Args:
        problem: object exposing `states` (iterable of state keys),
                 `discount` and `q_function(state, utilities)`.
        epsilon: tolerance; sweeps stop once the largest change in a sweep
                 is below ``epsilon * (1 - discount) / discount``.

    Returns:
        dict mapping each state to its estimated utility.

    Progress (threshold, then epoch number and delta per sweep) is printed.
    """
    # The stopping threshold does not change between sweeps: compute it once.
    threshold = epsilon * (1 - problem.discount) / problem.discount
    print("threshold:", str(threshold))
    utilities_prime = {s: 0 for s in problem.states}
    epoch = 0
    while True:
        delta = 0
        epoch += 1
        # Every update of a sweep reads the utilities of the previous sweep.
        utilities = utilities_prime.copy()
        for s in tqdm(problem.states):
            # A state without legal actions yields no Q-values; give it
            # utility 0 instead of letting max() fail on an empty list.
            utilities_prime[s] = max(problem.q_function(s, utilities), default=0)
            delta = max(delta, abs(utilities_prime[s] - utilities[s]))
        print('epoch:', epoch, 'delta:', delta)
        if delta < threshold:
            break
    return utilities_prime


In [93]:
# Build the decision problem with the default symbol ('x'); the constructor
# enumerates the state set, so this cell is the expensive one to re-run.
p = mdp()
  

In [98]:
# Run value iteration with epsilon = 0.1. The result is not assigned, so the
# notebook displays the whole returned state -> utility dictionary.
value_iteration(p, 0.1) 

threshold: 0.012359550561797751


100%|██████████| 3139/3139 [00:00<00:00, 14227.61it/s]


epoch: 1 delta: 1.0


100%|██████████| 3139/3139 [00:00<00:00, 12716.27it/s]


epoch: 2 delta: 0.8900000000000001


100%|██████████| 3139/3139 [00:00<00:00, 13309.49it/s]


epoch: 3 delta: 0.7590958333333334


100%|██████████| 3139/3139 [00:00<00:00, 12698.13it/s]


epoch: 4 delta: 0.6462215833333335


100%|██████████| 3139/3139 [00:00<00:00, 13557.24it/s]

epoch: 5 delta: 0





{'2 3 6 9,4 5 7 8': 0,
 '5,4': 1.4347541666666666,
 '1 2 4 5,3 7 8 9': 0,
 '6 7 8,2 4 5': 0.0,
 '1 3 6,7 8 9': 0.0,
 '1 4 5,2 6 8': 0.0,
 '3 4 9,2 7 8': -0.5,
 '1 2 7 9,3 4 5 8': 0,
 '2 4 7 9,1 3 5 8': 0,
 '1 3 5 6,2 4 7 8': 0,
 '3 5 7,1 6 9': 1.0,
 '3 5 9,1 2 4': 1.0,
 '1 6,3 7': 0.9450000000000001,
 '5 6 7 8,1 3 4 9': 0,
 '1 3 4,2 7 9': 0.0,
 '1 4,2 3': 0.6675,
 '2 3 5,6 7 8': 0.0,
 '4 6 8 9,1 2 3 7': 0,
 '2 3 5 6,1 4 8 9': 0,
 '3 5 9,1 6 8': -1.0,
 '2 3 4 6,5 7 8 9': 0,
 '3 7,4 5': 0.11125,
 '6 8,2 5': 1.8900000000000001,
 '1 6 8,2 4 9': -1.0,
 '3 6 7,2 4 5': -0.5,
 '2 6 9,3 4 7': -0.5,
 '2 9,1 6': 0.945,
 '2 3 4,1 8 9': -0.5,
 '2 5,4 9': 1.8900000000000001,
 '4 9,3 6': 1.8900000000000001,
 '1 2 8,3 4 6': 1.0,
 '1 2 3 4,5 7 8 9': 0,
 '3 4 8,1 5 6': 1.0,
 '1 2 3,7 8 9': 0.0,
 '2 3 6 8,1 4 5 7': 0,
 '4 9,6 8': 0.9450000000000001,
 '1 6 7 8,2 3 4 5': 0,
 '2 3 4,5 6 7': 1.0,
 '1 2,3 9': 0.6675,
 '5 6 7 8,1 2 3 9': 0,
 '2 4 6,5 7 8': 0.0,
 '2 8,1 3': 1.8900000000000001,
 '4 5 7 8,1 3 6 9