Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: 25/12 ([CET](https://www.timeanddate.com/time/zones/cet))
* Reviews: 06/01 ([CET](https://en.wikipedia.org/wiki/Sol_Invictus))

Notes:

* Reviews will be assigned  on Monday, December 4
* You need to commit in order to be selected as a reviewer (ie. better to commit an empty work than not to commit)

You can see it this way: picking 3 numbers whose sum is 15

In [709]:
from itertools import combinations
from collections import namedtuple, defaultdict
from random import choice, random, randint
from copy import deepcopy
from tqdm.auto import tqdm
import numpy as np

In [710]:
any(sum(c)==15 for c in combinations({1,2,3,4,5,6},3))

True

In [711]:
State= namedtuple('State',['x','o'])

In [712]:
MAGIC=[2,7,6,
       9,5,1,
       4,3,8]

In [713]:
def win(elements):
    """ Checks if elements is winning"""
    return any(sum(c)==15 for c in combinations(elements,3))

def state_value(pos:State):
    """Evaluate position: +1 first player wins"""
    if win(pos.x):
        return 1
    elif win(pos.o):
        return -1
    else:
        return 0

In [714]:
def print_board(pos):
    """ Nicely prints the board"""
    for r in range(3):
        for c in range(3):
            index=r*3+c
            if MAGIC[index] in pos.x:
                print('X', end='')
            elif MAGIC[index] in pos.o:
                print('O', end='')
            else:
                print('-', end='')
        print("\n")

print("\n")





In [715]:
def random_game():
    #starts with an empty position and keeps on adding one element in both side until someone wins
    trajectory=list()
    state=State(set(), set())
    available=set(range(1,9+1))
    while True:
        
        x= choice(list(available))
        state.x.add(x)
        trajectory.append(deepcopy(state))
        available.remove(x) 
        
        
        if(win(state.x)) or not available:
            break
               
        o = choice(list(available))
        state.o.add(o)
        trajectory.append(deepcopy(state))
        available.remove(o)
        if(win(state.o)):
            break

    return trajectory 


In [716]:
class RandomPlayer:
    def __init__(self, symbol) -> None:
        self._symbol=symbol
        
    @property
    def symbol(self)-> int:
        return self._symbol
    
    def move(self,available_moves, state=None):
        return choice(list(available_moves))

In [717]:
class Agent:
    def __init__(self, symbol) -> None:
        self._symbol=symbol
        self._dict_moves=defaultdict(float)
        self._winning_games=0

    @property
    def symbol(self)-> int:
        return self._symbol
    
    @property
    def dict_moves(self)-> int:
        return self._dict_moves
    
    def move(self,available_moves, state)->int:

        if random()<0.05:
            best_move = choice(list(available_moves))
            temp_state = deepcopy(state) 
            temp_state.x.add(best_move)
            if (frozenset(state.x), frozenset(state.o)) not in self.dict_moves.keys():
                self.dict_moves[(frozenset(temp_state.x), frozenset(temp_state.o))]=0.001
            
        else:
            
            #keys= [k for k in self.dict_moves.keys() if (k[1] == state.o and len(k[0])==len(state.x)+1 )]
            keys= [k for k in self.dict_moves.keys() if (k[1] == state.o and k[0]==state.x )]

            if keys:
                #new_state = sorted(keys, key=lambda k: self.dict_moves[k], reverse=True)

                """ best_move=list(new_state[0][0]-state.x)[0]

                while(best_move not in available_moves and i<len(new_state)):

                    best_move=list(new_state[i][0]-state.x)[0]
                    #print(best_move)
                    i+=1
                    if (best_move not in available_moves):
                        best_move=None
                
                if(best_move is None):
                    best_move = choice(list(available_moves))
                    state.x.add(best_move)
                    
                    self.dict_moves[(frozenset(state.x), frozenset(state.o))]=0.01
                print("best move", best_move)
                """  
                best_move = None
                max_value = float('-inf')

                for move in available_moves:
                    temp_state = deepcopy(state)  
                    temp_state.x.add(move)
                    hashable_state = (frozenset(temp_state.x), frozenset(temp_state.o))

                    if hashable_state in self.dict_moves.keys():
                        move_value = self.dict_moves[hashable_state]
                        if move_value > max_value:
                            max_value = move_value
                            best_move = move

                if best_move is None:
                    best_move = choice(list(available_moves))
                    #state.x.add(best_move)
                    temp_state = deepcopy(state) 
                    temp_state.x.add(best_move)
                    self.dict_moves[(frozenset(temp_state.x), frozenset(temp_state.o))]=0.001
            else:
                best_move = choice(list(available_moves))
                #state.x.add(best_move)
                temp_state = deepcopy(state) 
                temp_state.x.add(best_move)
                self.dict_moves[(frozenset(temp_state.x), frozenset(temp_state.o))]=0.001
        return best_move
    
    def add_winning(self):
        self._winning_games+=1

In [718]:

def random_game_1(p1,p2):
    
    trajectory=list()
    state=State(set(), set())
    available=set(range(1,9+1))


    players=[p1,p2]
    index=choice([0,1])
    #index=1
    #index=0
    while True:
        
        current_player=players[index]

        move=current_player.move(list(available),state)

        if(current_player.symbol == -1):
            state.o.add(move)
            trajectory.append(deepcopy(state))
            available.remove(move) 
            if(win(state.o)) or not available:
                break
        
        else:
            state.x.add(move)
            trajectory.append(deepcopy(state))
            available.remove(move) 
            if(win(state.x)) or not available:
                current_player.add_winning()
                break
    
        index=1-index

    return trajectory
    

In [719]:
value_dictionary=defaultdict(float)
epsilon=0.001


#o
p1=RandomPlayer(-1)
#x
p2=Agent(1)

num_iterations=50_000

for steps in tqdm(range(num_iterations)):
    trajectory=random_game_1(p1,p2)
   
    # i compute the final reward
    final_reward=state_value(trajectory[-1])
    #print(final_reward)
    #update all the state according to this reward
    for s in trajectory:
        hashable_state=(frozenset(s.x),frozenset(s.o))
        p2.dict_moves[hashable_state]= p2.dict_moves[hashable_state]+epsilon*(final_reward-p2.dict_moves[hashable_state])


print("Winning percentage of the agent ",p2._winning_games/num_iterations*100)

100%|██████████| 50000/50000 [01:54<00:00, 436.79it/s]

Winning percentage of the agent  74.47





In [720]:
sorted(p2.dict_moves.items(), key=lambda e: e[1], reverse=True)[:10]

[((frozenset({5}), frozenset({3})), 0.5875470604494527),
 ((frozenset({5}), frozenset({1})), 0.5459017636752646),
 ((frozenset({5}), frozenset({8})), 0.5434493075985732),
 ((frozenset(), frozenset({1})), 0.5374438690564269),
 ((frozenset(), frozenset({3})), 0.5287986272934934),
 ((frozenset(), frozenset({8})), 0.48199871248632936),
 ((frozenset({5}), frozenset({6})), 0.4758250132154504),
 ((frozenset({2}), frozenset({4})), 0.47425306526850397),
 ((frozenset({4}), frozenset({9})), 0.4567156884306328),
 ((frozenset({6}), frozenset({7})), 0.4463894541918245)]

Sets aren't hashable but frozen sets are

In [721]:
x=frozenset({2,3,4})
y={x:'yess'}