Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

Notes:

* Reviews will be assigned on Monday, December 4
* You need to commit in order to be selected as a reviewer (i.e., it is better to commit an empty work than not to commit at all)

In [2]:
import math
import pickle
from collections import namedtuple
from copy import deepcopy
from itertools import combinations
from itertools import permutations
from itertools import product

import numpy as np
import tqdm
from tqdm import tqdm
from treelib import Node, Tree


In [4]:
# Magic-square relabelling of the 3x3 board: every row, column and diagonal
# of this grid sums to 12.
TICTACTOE_MAP = np.array([[1, 6, 5], [8, 4, 0], [3, 2, 7]])


def tictactoe_map(set_pos):
    """Translate board positions (0..8, row-major) into magic-square values.

    Returns a set holding the ``TICTACTOE_MAP`` entry of every position in
    ``set_pos``.
    """
    return {TICTACTOE_MAP[divmod(cell, 3)] for cell in set_pos}

def display(x, o):
    """Print the board as three rows of ``X``, ``O`` and ``.`` marks.

    ``x`` and ``o`` are collections of occupied positions (0..8, row-major).
    A position present in both is shown as ``X``.
    """
    for row_start in (0, 3, 6):
        marks = []
        for cell in range(row_start, row_start + 3):
            if cell in x:
                marks.append("X")
            elif cell in o:
                marks.append("O")
            else:
                marks.append(".")
        # Every mark is followed by one space, the last one included.
        print("".join(mark + " " for mark in marks))

def won(cells):
    """Return True when ``cells`` contains a complete row, column or diagonal.

    Positions are relabelled through ``tictactoe_map``; a winning line is any
    three relabelled values whose sum is 12.  The sum does not depend on the
    order of the three values, so unordered combinations are enough (the
    ordered permutations used previously tested every triple six times).
    """
    return any(sum(line) == 12 for line in combinations(tictactoe_map(cells), 3))

def my_print(s):
    """Print ``s`` followed by a newline and flush the output stream at once."""
    print(s, end="\n", flush=True)

In [5]:
# Game state: `x` and `o` are the sets of positions (0..8) held by each side,
# `next_turn` is the player to move (0 = X, 1 = O).
St = namedtuple('State', ['x', 'o','next_turn']) # x={}, o={}
# A single move: `turn` is the player making it (0 = X, 1 = O), `pos` the cell (0..8).
Ply = namedtuple('Ply', ['turn','pos'])

def static_eval(state: St):
    """Score a position: 1 if X has a line, -1 if O has one, 0 otherwise.

    X is checked first, so a state in which both sides have a line scores 1.
    """
    for cells, score in ((state.x, 1), (state.o, -1)):
        if won(cells):
            return score
    return 0
        

In [6]:
states_map={} # state key -> value; key_mapping stores one key per class of symmetric boards

action_map={} # (state key, ply key) -> action value; not present in lab10_b

BIG_INT = 1_000_000_000 # bound used to seed the best-value search when picking a move
DEBUG = False # NOTE(review): not read anywhere in the visible code
# A state key packs the X positions in bits 0-8, the O positions in bits 9-17
# and the player to move in bit 18.
#FROM PAPER: "quixo is solved"
def from_xo_to_key(state: St) -> int:
    """Pack a state into a single integer key.

    Bit ``p`` is set for every X position ``p``, bit ``p + 9`` for every O
    position ``p``, and bit 18 carries ``next_turn``.
    """
    x_bits = sum(2**pos for pos in state.x)
    o_bits = sum(2**(pos + 9) for pos in state.o)
    turn_bit = state.next_turn * (2**18)
    return x_bits + o_bits + turn_bit

def from_key_to_xo(key: int) -> St:
    """Unpack an integer key into a state.

    Bits 0-8 give the X positions, bits 9-17 the O positions and bit 18 the
    player to move.
    """
    x, o = set(), set()
    for cell in range(9):
        if (key >> cell) & 1:
            x.add(cell)
        if (key >> (cell + 9)) & 1:
            o.add(cell)
    next_turn = (key >> 18) & 1
    return St(x, o, next_turn)

def from_board_to_xo(board: np.array((3,3), np.int8)) -> (set,set):
    """Split a board into the sets of X and O positions.

    Cells equal to 0 belong to X and cells equal to 1 belong to O; any other
    value is treated as empty.  The position of cell ``(r, c)`` is ``r*3 + c``.
    """
    n_rows, n_cols = board.shape
    coords = [(r, c) for r in range(n_rows) for c in range(n_cols)]
    x = {r * 3 + c for r, c in coords if board[r][c] == 0}
    o = {r * 3 + c for r, c in coords if board[r][c] == 1}
    return (x, o)

def from_xo_to_board(x: set, o:set) -> np.array((3,3), np.int8):
    """Build a 3x3 ``int8`` board from the X and O position sets.

    Empty cells hold -1, X cells 0 and O cells 1.  O is written last, so a
    position present in both sets ends up as 1.
    """
    board = np.full((3, 3), -1, dtype=np.int8)
    for mark, cells in ((0, x), (1, o)):
        for pos in cells:
            row, col = divmod(pos, 3)
            board[row][col] = mark
    return board

def calculate_equivalent(key: int) -> [int]:
    """Return the keys of the 8 symmetric variants of the state ``key``.

    The list is ordered so that index ``2*i + j`` is the board after ``i``
    applications of ``np.rot90`` followed by a transpose when ``j == 1``;
    index 0 is therefore ``key`` itself.  Callers rely on this ordering to
    recover the transformation from the index.
    """
    x, o, next_turn = from_key_to_xo(key)

    # Build the signed int8 board through the shared helper.  The previous
    # `np.ones(..., dtype=np.uint8) * -1` only worked through an implicit
    # upcast and raises OverflowError under NumPy 2 promotion rules.
    board = from_xo_to_board(x, o)

    equiv_state_ids = []
    for _ in range(4):
        for variant in (board, board.T):
            variant_x, variant_o = from_board_to_xo(variant)
            equiv_state_ids.append(from_xo_to_key(St(variant_x, variant_o, next_turn)))
        board = np.rot90(board)

    return equiv_state_ids

def key_mapping(state: St) -> None:
    """Register ``state`` in ``states_map`` unless a symmetric variant is already there.

    A newly registered state is stored under its own key with value 0.
    """
    own_key = from_xo_to_key(state)
    already_known = any(variant in states_map for variant in calculate_equivalent(own_key))
    if not already_known:
        states_map[own_key] = 0 #counter

def generate_all_states() -> None:
    """Enumerate every board filling and register the plausible ones.

    Each of the 9 cells is tried as empty (-1), X (0) or O (1).  A filling is
    kept only when X has as many marks as O or exactly one more; that
    difference is also the player to move (0 = X, 1 = O).  Kept states go
    through ``key_mapping``, which stores one representative per symmetry
    class.
    """
    for cells in product([-1, 0, 1], repeat=9):
        lead = cells.count(0) - cells.count(1)
        if lead < 0 or lead > 1:
            continue
        x = {i for i, mark in enumerate(cells) if mark == 0}
        o = {i for i, mark in enumerate(cells) if mark == 1}
        key_mapping(St(x, o, lead))

def from_ply_to_key(ply: Ply) -> int :
    """Pack a ply into an integer: the position in the low 4 bits, the turn above them."""
    turn, pos = ply
    return (turn << 4) + pos

def from_key_to_ply(key: int) -> Ply:
    """Unpack a ply key: the low 4 bits are the position, the remaining bits the turn."""
    turn, pos = divmod(key, 16)
    return Ply(turn, pos)

def make_ply(state_xo: St, ply: Ply):
    """Return the state reached by playing ``ply`` on ``state_xo``.

    The input state is not modified: the mover's set is replaced by a new set
    that also contains ``ply.pos``, and the turn passes to the other player.
    Fails an assertion when ``ply.turn`` is not the player to move.
    """
    assert state_xo.next_turn == ply.turn, "error: wrong turn"

    following_turn = (ply.turn + 1) % 2
    x_cells, o_cells = state_xo.x, state_xo.o
    if ply.turn == 0:
        x_cells = x_cells | {ply.pos}
    else:
        o_cells = o_cells | {ply.pos}
    return St(x_cells, o_cells, following_turn)
    
def generate_all_actions() -> None:
    """Fill ``action_map`` with every legal move from every mapped state.

    Each ``(state key, ply key)`` entry starts at the static evaluation of the
    state the move leads to, so a move that completes a line already carries
    +1 (X) or -1 (O) before any training.
    """
    all_cells = set(range(9))
    for state_key in states_map:
        state_xo = from_key_to_xo(state_key)
        free_cells = all_cells - state_xo.x - state_xo.o

        for pos_ply in free_cells:
            ply = Ply(state_xo.next_turn, pos_ply)
            outcome = static_eval(make_ply(state_xo, ply))
            action_map[(state_key, from_ply_to_key(ply))] = outcome

# Build the state table first: the action table is derived from its keys.
generate_all_states() #4s 850 states
my_print(f"states generated: {len(states_map)}")

generate_all_actions()
my_print(f"actions generated: {len(action_map)}")


states generated: 850
actions generated: 2702


In [10]:
N_GAMES = 2_500

# Reset every state value to the static evaluation of its own board.
for key in states_map:
    states_map[key] = static_eval(from_key_to_xo(key)) #avoid overflow

# Per-action accumulator used by the reward update; starts at zero for every
# known (state, ply) pair.
hit_map_action = dict.fromkeys(action_map, 0)

# Reset every action value to the static evaluation of the state it leads to.
for key in action_map:
    action_map[key] = static_eval(make_ply(from_key_to_xo(key[0]), from_key_to_ply(key[1])))

def softmax_stable(x: np.array) -> np.array: #takes np.array return np.array
    """Return the softmax of ``x``.

    The maximum is subtracted before exponentiating so that large inputs do
    not overflow.
    """
    shifted = np.exp(x - np.max(x))
    return shifted / shifted.sum()


def policy_ply(state: int, player: int, learned = False, stochastic = False) -> (int, int): 
    """Choose a move for ``player`` in ``state``.

    Three policies are available:

    * ``learned=False``: uniform random choice among the legal moves;
    * ``learned=True, stochastic=True``: sample a move with probability given
      by the softmax of the action values (negated for player 1, who prefers
      low values);
    * ``learned=True, stochastic=False``: pick the move with the highest
      (player 0) or lowest (player 1) action value.

    Returns ``(equiv_state_key, ply_key)``: the key of the symmetric variant
    of ``state`` that is stored in ``states_map`` and the chosen ply expressed
    on that variant.

    Raises ``AssertionError`` when the board is full, when someone has already
    won, or when no symmetric variant of ``state`` is mapped.  The last case
    used to fall through and return ``None``.
    """
    state_xo = from_key_to_xo(state)

    assert (len(state_xo.x) + len(state_xo.o)) < 9, "game already ended"
    assert static_eval(state_xo) == 0, "someone already won"

    # Only one representative per symmetry class is stored, so look for the
    # first of the 8 variants that is actually mapped.
    for n_transform, equiv_state_key in enumerate(calculate_equivalent(state)):

        if states_map.get(equiv_state_key) is None:
            continue

        # The variant has its own free positions, so moves are enumerated on it.
        equiv_state_xo = from_key_to_xo(equiv_state_key)
        possible_pos = set(range(9)) - equiv_state_xo.x - equiv_state_xo.o

        ply_keys = []
        ply_values = []
        for ply_pos in possible_pos:
            ply_key = from_ply_to_key(Ply(equiv_state_xo.next_turn, ply_pos))
            ply_keys.append(ply_key)
            ply_values.append(action_map.get((equiv_state_key, ply_key)))

        equiv_state_ply = (None, None)

        if not learned:
            # Random: uniform probability over the legal moves.
            equiv_state_ply = (equiv_state_key, np.random.choice(np.array(ply_keys)))

        elif stochastic:
            values = np.array(ply_values)
            if player != 0:
                # Player 1 minimises, so low values must get high probability.
                values = -values
            chosen = np.random.choice(np.array(ply_keys), p=softmax_stable(values))
            equiv_state_ply = (equiv_state_key, chosen)

        else:
            # Deterministic: best value for the mover, first one wins ties.
            best = -BIG_INT if player == 0 else BIG_INT
            for ply_key, value in zip(ply_keys, ply_values):
                if (value > best and player == 0) or (value < best and player == 1):
                    best = value
                    equiv_state_ply = (equiv_state_key, ply_key)

        assert equiv_state_ply != (None, None), "best move must exist"

        if DISPLAY_GAME:
            # The board is printed in the orientation of the stored variant;
            # the message only reports which transformation selected it.
            rotations, transposed = divmod(n_transform, 2)
            if n_transform != 0:
                my_print(f"rotation:{-rotations*90}°, transpose: {transposed}")

            my_print(from_xo_to_board(equiv_state_xo.x,equiv_state_xo.o))

        return equiv_state_ply

    raise AssertionError("best move must exist")


γ = 0.83  # discount applied once per ply counted back from the end of the game


def assign_rewards_monte_carlo(winner: int, ply_played_keys: [(int, int)]) -> None:
    """Update the action values of every ply of a finished game.

    ``winner`` is 0 (reward +1), 1 (reward -1) or -1 for a draw (reward 0).
    Plies are visited from the last to the first; the ply ``i`` steps before
    the end uses the discount ``γ**i``.  For each ply the discount is added to
    ``hit_map_action`` and the action value moves towards the discounted
    reward by a step of ``1 / hit_map_action``.
    """
    if winner == -1:
        reward = 0
    else:
        reward = 1-winner*2 #map 0 to 1 and 1 to -1

    for distance_from_end, state_ply_key in enumerate(reversed(list(ply_played_keys))):
        # The game only pays at the end, so the return is reward * γ**distance.
        total_discount = γ**distance_from_end
        hit_map_action[state_ply_key] += total_discount

        weight = hit_map_action[state_ply_key]
        current = action_map[state_ply_key]
        action_map[state_ply_key] = current + 1/(weight)*(reward*total_discount-current)


assign_rewards=assign_rewards_monte_carlo

DISPLAY_GAME = False # useful only to display automatic plays

custom_bar_format = "{l_bar}{bar:50}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]"


def _never(player, game):
    """Policy switch that is off for every ply."""
    return False


def _always(player, game):
    """Policy switch that is on for every ply."""
    return True


def _one_side(player, game):
    """Policy switch that is on for the player whose index equals the game parity."""
    return player == (game % 2)


def _play_training_game(game, learned_for, stochastic_for):
    """Play one game from the empty board; return ``(winner, plies played)``.

    ``learned_for`` and ``stochastic_for`` are called with ``(player, game)``
    before each ply and give the ``learned`` / ``stochastic`` arguments passed
    to ``policy_ply``.  ``winner`` is -1 when nobody completes a line.
    """
    player = 0
    ply_played_keys = []
    winner = -1
    state_prev = from_xo_to_key(St(set(),set(),0))

    for _ in range(9): # a game lasts 9 plies at most
        state_ply = policy_ply(
            state_prev,
            player,
            learned=learned_for(player, game),
            stochastic=stochastic_for(player, game),
        )
        ply_played_keys.append(state_ply)
        state_xo = make_ply(from_key_to_xo(state_ply[0]), from_key_to_ply(state_ply[1]))
        state_prev = from_xo_to_key(state_xo)

        if static_eval(state_xo) != 0:
            winner = player
            break

        player = (player+1)%2

    return winner, ply_played_keys


def _run_training_phase(label, learned_for, stochastic_for):
    """Announce a phase, play ``N_GAMES`` games with its policies and learn from each."""
    my_print(label)
    progress_bar = tqdm(range(N_GAMES),dynamic_ncols=True,desc="Game",colour="green",total=N_GAMES,mininterval=0.5,bar_format=custom_bar_format,ncols=100)

    for game in progress_bar:
        winner, ply_played_keys = _play_training_game(game, learned_for, stochastic_for)
        assign_rewards(winner, ply_played_keys)


# (label, when to use the learned policy, when to sample it stochastically).
# The phases run in this order; the last one matches the test situation.
TRAINING_PHASES = (
    ("random/random training", _never, _never),
    ("semi-trained/random training", _one_side, _always),
    ("semi-trained/semi-trained training", _always, _always),
    ("trained/semi-trained training", _always, _one_side),
    ("trained/trained training", _always, _never),
    ("trained/random training", _one_side, _never),
)

for phase_label, phase_learned, phase_stochastic in TRAINING_PHASES:
    _run_training_phase(phase_label, phase_learned, phase_stochastic)


# TEST AGENT
# The agent plays its deterministic learned policy against a uniformly random
# opponent.  It moves first on even games and second on odd games.
DISPLAY_GAME = True
MAX_PRINT = 10
N_GAMES = 10_000

wins = draws = lose = 0

for game in range(N_GAMES):
    winner = -1
    player = 0
    state_prev = from_xo_to_key(St(set(),set(),0))

    # Only the first MAX_PRINT games are echoed.
    if game >= MAX_PRINT:
        DISPLAY_GAME = False

    if DISPLAY_GAME:
        my_print("GAME START")

    for turn in range(9): # a game lasts 9 plies at most

        state_ply = policy_ply(state_prev, player, player==(game%2))
        state_xo = make_ply(from_key_to_xo(state_ply[0]), from_key_to_ply(state_ply[1]))
        state_prev = from_xo_to_key(state_xo)

        if DISPLAY_GAME:
            my_print("AI turn" if player == (game%2) else "random turn")

        if static_eval(state_xo) != 0:
            winner = player
            break

        player = (player+1)%2

    if DISPLAY_GAME:
        my_print("FINAL BOARD")
        my_print(from_xo_to_board(state_xo.x, state_xo.o))

    # Tally the result from the agent's point of view.
    if winner == -1:
        draws += 1
    elif winner == (game%2):
        wins += 1
    else:
        lose += 1

for ratio_line in (
    f"victories ratio: {wins/N_GAMES}",
    f"draw ratio: {draws/N_GAMES}",
    f"defeats ratio: {lose/N_GAMES}",
):
    my_print(ratio_line)

    

random/random training


Game: 100%|[32m██████████████████████████████████████████████████[0m| 2500/2500 [00:10<00:00]

semi-trained/random training



Game: 100%|[32m██████████████████████████████████████████████████[0m| 2500/2500 [00:11<00:00]

semi-trained/semi-trained training



Game: 100%|[32m██████████████████████████████████████████████████[0m| 2500/2500 [00:10<00:00]

trained/semi-trained training



Game: 100%|[32m██████████████████████████████████████████████████[0m| 2500/2500 [00:08<00:00]

trained/trained training



Game: 100%|[32m██████████████████████████████████████████████████[0m| 2500/2500 [00:10<00:00]

trained/random training



Game: 100%|[32m██████████████████████████████████████████████████[0m| 2500/2500 [00:07<00:00]

GAME START
[[-1 -1 -1]
 [-1 -1 -1]
 [-1 -1 -1]]
AI turn
[[-1 -1 -1]
 [-1  0 -1]
 [-1 -1 -1]]
random turn
rotation:0°, transpose: 1
[[-1 -1 -1]
 [-1  0 -1]
 [-1  1 -1]]
AI turn
[[-1 -1 -1]
 [-1  0 -1]
 [-1  1  0]]
random turn
rotation:-90°, transpose: 1
[[-1 -1 -1]
 [-1  0 -1]
 [ 0  1  1]]
AI turn
FINAL BOARD
[[-1 -1  0]
 [-1  0 -1]
 [ 0  1  1]]
GAME START
[[-1 -1 -1]
 [-1 -1 -1]
 [-1 -1 -1]]
random turn
[[-1 -1 -1]
 [-1 -1 -1]
 [-1  0 -1]]
AI turn
[[-1 -1 -1]
 [-1  1 -1]
 [-1  0 -1]]
random turn
rotation:-90°, transpose: 0
[[-1 -1 -1]
 [-1  1  0]
 [ 0 -1 -1]]
AI turn
rotation:-180°, transpose: 0
[[-1 -1  0]
 [ 0  1 -1]
 [-1  1 -1]]
random turn
[[-1 -1  0]
 [ 0  1 -1]
 [-1  1  0]]
AI turn
FINAL BOARD
[[-1  1  0]
 [ 0  1 -1]
 [-1  1  0]]
GAME START
[[-1 -1 -1]
 [-1 -1 -1]
 [-1 -1 -1]]
AI turn
[[-1 -1 -1]
 [-1  0 -1]
 [-1 -1 -1]]
random turn
rotation:-90°, transpose: 0
[[-1 -1 -1]
 [-1  0 -1]
 [-1  1 -1]]
AI turn
[[-1 -1 -1]
 [-1  0 -1]
 [-1  1  0]]
random turn
rotation:-90°, transpose: 1




rotation:-180°, transpose: 0
[[-1 -1 -1]
 [-1  0 -1]
 [-1  1 -1]]
AI turn
[[-1 -1 -1]
 [-1  0 -1]
 [-1  1  0]]
random turn
rotation:0°, transpose: 1
[[-1 -1 -1]
 [-1  0  1]
 [ 1 -1  0]]
AI turn
FINAL BOARD
[[ 0 -1 -1]
 [-1  0  1]
 [ 1 -1  0]]
GAME START
[[-1 -1 -1]
 [-1 -1 -1]
 [-1 -1 -1]]
random turn
rotation:-270°, transpose: 0
[[-1 -1 -1]
 [-1 -1 -1]
 [-1 -1  0]]
AI turn
[[-1 -1 -1]
 [-1  1 -1]
 [-1 -1  0]]
random turn
rotation:-90°, transpose: 1
[[-1 -1 -1]
 [-1  1  0]
 [ 0 -1 -1]]
AI turn
rotation:-180°, transpose: 0
[[-1 -1  0]
 [ 0  1 -1]
 [-1  1 -1]]
random turn
rotation:-180°, transpose: 1
[[-1 -1  0]
 [ 1  1 -1]
 [-1  0  0]]
AI turn
FINAL BOARD
[[-1 -1  0]
 [ 1  1  1]
 [-1  0  0]]
GAME START
[[-1 -1 -1]
 [-1 -1 -1]
 [-1 -1 -1]]
AI turn
[[-1 -1 -1]
 [-1  0 -1]
 [-1 -1 -1]]
random turn
[[-1 -1 -1]
 [-1  0 -1]
 [-1  1 -1]]
AI turn
[[-1 -1 -1]
 [-1  0 -1]
 [-1  1  0]]
random turn
rotation:0°, transpose: 1
[[-1 -1 -1]
 [-1  0  1]
 [ 1 -1  0]]
AI turn
FINAL BOARD
[[ 0 -1 -1]
 [-1  

In [11]:
#TEST AGENT ON LOT OF GAMES
# Same match-up as the short test (deterministic learned policy against a
# uniformly random opponent, sides alternating every game), with a progress
# bar and no board output.

N_GAMES = 100_000
DISPLAY_GAME = False

wins = draws = lose = 0

my_print("trained/random testing")
custom_bar_format = "{l_bar}{bar:50}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]"
progress_bar = tqdm(range(N_GAMES),dynamic_ncols=True,desc="Game",colour="green",total=N_GAMES,mininterval=0.5,bar_format=custom_bar_format,ncols=100)

for game in progress_bar:
    winner = -1
    player = 0
    state_prev = from_xo_to_key(St(set(),set(),0))

    if game >= MAX_PRINT:
        DISPLAY_GAME = False

    if DISPLAY_GAME:
        my_print("GAME START")

    for turn in range(9): # a game lasts 9 plies at most

        state_ply = policy_ply(state_prev, player, player==(game%2))
        state_xo = make_ply(from_key_to_xo(state_ply[0]), from_key_to_ply(state_ply[1]))
        state_prev = from_xo_to_key(state_xo)

        if DISPLAY_GAME:
            my_print("AI turn" if player == (game%2) else "random turn")

        if static_eval(state_xo) != 0:
            winner = player
            break

        player = (player+1)%2

    if DISPLAY_GAME:
        my_print("FINAL BOARD")
        my_print(from_xo_to_board(state_xo.x, state_xo.o))

    # Tally the result from the agent's point of view.
    if winner == -1:
        draws += 1
    elif winner == (game%2):
        wins += 1
    else:
        lose += 1

for ratio_line in (
    f"victories ratio: {wins/N_GAMES}",
    f"draw ratio: {draws/N_GAMES}",
    f"defeats ratio: {lose/N_GAMES}",
):
    my_print(ratio_line)


trained/random testing


Game: 100%|[32m██████████████████████████████████████████████████[0m| 100000/100000 [06:16<00:00]

victories ratio: 0.95302
draw ratio: 0.04698
defeats ratio: 0.0





In [12]:
# SAVE AGENT Q-Values

# Save to file, suggested if lose rate == 0.0
# Only action_map is written to 'data.pkl' (relative to the current working
# directory); states_map and hit_map_action are not saved.
with open('data.pkl', 'wb') as pickle_file:
    pickle.dump(action_map, pickle_file)


In [14]:
# LOADING AGENT Q-Values
# Replaces the in-memory action_map with the content of 'data.pkl'.
# NOTE(security): unpickling can execute arbitrary code, so only load a file
# you produced yourself.
with open('data.pkl', 'rb') as pickle_file:
    action_map = pickle.load(pickle_file)



In [19]:
# RUN PLAY AGAINST REAL PLAYER (use transform_V)
# transform_V records, in order, every (rotations, transpose) pair that moved
# the game from one stored board orientation to the next.  It is what lets the
# board be shown, and the human's moves be read, in the original orientation.
transform_V = [] 
#lab10_b use a direct mapping between (state,action)->next_state->state_value, easier to understand, 
#but since only state has value that is independent by the previous state it gets worse results

def rl_agent_ply(state: int) -> (int, int):
    """Pick the agent's best move in ``state`` and record the orientation change.

    The first symmetric variant of ``state`` stored in ``states_map`` is used.
    Among its legal moves the one with the highest action value is chosen when
    X is to move, the lowest when O is to move.  When the variant is not
    ``state`` itself, the ``(rotations, transpose)`` pair that produced it is
    appended to ``transform_V``.

    Returns ``(equiv_state_key, ply_key)`` expressed on the stored variant.
    Raises ``AssertionError`` when no variant is mapped or no move has a value.
    """
    # The mover is read from the state itself; the function used to depend on
    # a module-level `player` variable set by the calling loop.
    player = from_key_to_xo(state).next_turn

    # Not every orientation is stored, so look for the first mapped variant.
    for n_transform, equiv_state_key in enumerate(calculate_equivalent(state)):
        if states_map.get(equiv_state_key) is None:
            continue

        # The variant has its own free positions, so moves are enumerated on it.
        equiv_state_xo = from_key_to_xo(equiv_state_key)
        possible_pos = set(range(9)) - equiv_state_xo.x - equiv_state_xo.o

        best = -BIG_INT if player == 0 else BIG_INT
        equiv_state_best_ply = (None, None)
        for ply_pos in possible_pos:
            ply_on_equiv_key = from_ply_to_key(Ply(equiv_state_xo.next_turn, ply_pos))
            value = action_map.get((equiv_state_key, ply_on_equiv_key))
            if value is None:
                continue
            if (value > best and player == 0) or (value < best and player == 1):
                best = value
                equiv_state_best_ply = (equiv_state_key, ply_on_equiv_key)

        assert equiv_state_best_ply != (None, None), "best move must exist"

        # Index 2*i + j means i rotations then a transpose if j == 1;
        # the identity (index 0) is not recorded.
        if n_transform != 0:
            transform_V.append(divmod(n_transform, 2))

        return equiv_state_best_ply

    assert False,"best move must exist"

def human_ply(state: int, pos: int) -> (int, int):
    """Translate the human's move ``pos`` onto the stored variant of ``state``.

    ``pos`` is given in the orientation shown to the player.  The first
    symmetric variant of ``state`` stored in ``states_map`` is selected, its
    transformation is appended to ``transform_V`` (unless it is the identity)
    and ``pos`` is mapped through all recorded transformations.

    Returns ``(equiv_state_key, ply_key)`` on success, or ``(None, None)``
    after printing a message when the target cell is already taken.  In that
    case the transformation recorded by this call is removed again, so that a
    retry on the same state does not record it twice.
    Raises ``AssertionError`` when no variant of ``state`` is mapped.
    """
    turn = from_key_to_xo(state).next_turn

    for n_transform, equiv_state_key in enumerate(calculate_equivalent(state)):
        if states_map.get(equiv_state_key) is None:
            continue

        # The rotation has to be recorded before the move is mapped, because
        # equivalent_move reads transform_V.
        recorded = n_transform != 0
        if recorded:
            transform_V.append(divmod(n_transform, 2))

        ply_on_equiv_key = equivalent_move(from_ply_to_key(Ply(turn, pos)))

        equiv_state_xo = from_key_to_xo(equiv_state_key)
        target = from_key_to_ply(ply_on_equiv_key).pos
        if target in equiv_state_xo.x or target in equiv_state_xo.o:
            if recorded:
                # The move is rejected and the state does not change: undo the
                # bookkeeping so the next attempt starts from the same history.
                transform_V.pop()
            my_print("invalid move, position already taken")
            return (None, None)

        return (equiv_state_key, ply_on_equiv_key )

    assert False, "invalid move, unrecognized"

def display_board(board: np.array, transforms=transform_V) -> None:
    """Print ``board`` in the orientation the game started with.

    ``board`` is in the orientation reached after applying every entry of
    ``transforms`` (each one: rotate, then optionally transpose).  The entries
    are undone from the last to the first, each one inverted (transpose first,
    then rotate back), before the result is handed to ``display``.
    """
    restored = deepcopy(board)

    for rot, transposed in reversed(transforms):
        if transposed == 1:
            restored = restored.T
        restored = np.rot90(restored, k=-rot)

    x_orig, o_orig = from_board_to_xo(restored)
    display(x_orig, o_orig)

def equivalent_move(ply_key: int) -> int:
    """Map a ply from the original orientation onto the current one.

    The position of the ply is marked on an empty 3x3 grid, every entry of
    ``transform_V`` is applied in order (rotate, then transpose when flagged),
    and the cell where the mark ends up becomes the position of the returned
    ply key.  The player is carried over unchanged.
    """
    player, pos = from_key_to_ply(ply_key)

    marker = np.zeros((3,3),dtype=np.uint8)
    marker[pos//3][pos%3] = 1

    for rot, transposed in transform_V:
        marker = np.rot90(marker, k=rot)
        if transposed == 1:
            marker = marker.T

    # Row-major scan for the single marked cell.
    new_pos = next((idx for idx, val in enumerate(marker.flatten()) if val == 1), None)
    if new_pos is not None:
        return from_ply_to_key(Ply(player, new_pos))

    assert False, "not here"
        

DISPLAY_GAME = False

# Positions are numbered row by row, so the corners are 0, 2, 6 and 8.
POSITION_PROMPT = "CHOOSE A POSITION 0 TO 8 in which you want to play (0=top-left, 2=top-right, 6=bottom-left, 8=bottom-right)"


def _ask_position():
    """Prompt until the player enters an integer between 0 and 8 and return it.

    A non-integer entry makes ``int`` raise ``ValueError``, which is how the
    player leaves the game.
    """
    pos = int(input(POSITION_PROMPT))
    while pos not in range(9):
        my_print("invalid position")
        pos = int(input(POSITION_PROMPT))
    return pos


my_print("you can exit game by entering any not integer character")

your_turn = int(input("Do you choose X(player 0) or O(player 1)? type 0/1"))

winner = -1
state_xo = St(set(),set(),0)
player = 0

my_print("game start")

for turn in range(9): # a game lasts 9 plies at most

    display_board(from_xo_to_board(state_xo.x, state_xo.o))

    if player == your_turn:
        my_print("your turn")
        state_ply = human_ply(from_xo_to_key(state_xo), _ask_position())
        # human_ply answers (None, None) when the chosen cell is occupied.
        while state_ply == (None, None):
            state_ply = human_ply(from_xo_to_key(state_xo), _ask_position())
    else:
        my_print("RL agent turn")
        state_ply = rl_agent_ply(from_xo_to_key(state_xo))

    # Both players answer on a stored board orientation; the game continues
    # from that orientation.
    state_xo = make_ply(from_key_to_xo(state_ply[0]), from_key_to_ply(state_ply[1]))

    if static_eval(state_xo) != 0:
        winner = player
        break

    player = (player+1)%2

display_board(from_xo_to_board(state_xo.x, state_xo.o))

if winner == -1:
    my_print("\nDRAW")
elif winner == your_turn:
    my_print("\nYOU WON")
else:
    my_print("\nI WON")



you can exit game by entering any not integer character


game start
. . . 
. . . 
. . . 
your turn
. X . 
. . . 
. . . 
RL agent turn
. X . 
. O . 
. . . 
your turn
. X X 
. O . 
. . . 
RL agent turn
O X X 
. O . 
. . . 
your turn
O X X 
X O . 
. . . 
RL agent turn
O X X 
X O . 
. . O 

I WON
