Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

In [1]:
from itertools import combinations,product,cycle
from collections import namedtuple, defaultdict
from random import choice
from copy import deepcopy,copy

from tqdm.auto import tqdm
import numpy as np

In [2]:
State = namedtuple('State', ['x', 'o'])
MAGIC = [2, 7, 6, 9, 5, 1, 4, 3, 8]

In [3]:
def print_board(pos):
    """Nicely prints the board"""
    for r in range(3):
        for c in range(3):
            i = r * 3 + c
            if MAGIC[i] in pos.x:
                print('X', end='')
            elif MAGIC[i] in pos.o:
                print('O', end='')
            else:
                print('.', end='')
        print()
    print()


In [4]:
def win(elements):
    """Checks if elements is winning"""
    return any(sum(c) == 15 for c in combinations(elements, 3))

def state_value(pos: State): 
    """Evaluate state: +1 first player wins"""
    if win(pos.x):
        return 1
    elif win(pos.o):
        return -1
    else:
        return 0


In [7]:
def random_game():
    #returns the history (trajectory) of the game played!
    trajectory = list()
    state = State(set(), set())
    available = set(range(1, 9+1))
    while available:
        x = choice(list(available))
        state.x.add(x)
        trajectory.append(deepcopy(state))
        available.remove(x)
        if win(state.x) or not available:
            break

        o = choice(list(available))
        state.o.add(o)
        trajectory.append(deepcopy(state))
        available.remove(o)
        if win(state.o):
            break
    return trajectory


In [9]:
# state-value table
value_dictionary = defaultdict(float)
# keep counts of all the possible states
hit_state = defaultdict(int)
# learning rate
epsilon = 0.001



# generate all possible states of the board
all_combinations = [frozenset(item) for r in range(4) for item in combinations(MAGIC,r)]
global_states = list(product(all_combinations,all_combinations))
# 16900 total states
# set the initial value of all the states to 0.5 (50% chance of winning for all states.)
for state in global_states:     
    value_dictionary[state] = 0.5


for steps in tqdm(range(500_000)):
    trajectory = random_game()
    # final reward of the game
    final_reward = state_value(trajectory[-1])
    for state in trajectory:
        hashable_state = (frozenset(state.x), frozenset(state.o))
        hit_state[hashable_state] += 1
        value_dictionary[hashable_state] = value_dictionary[
            hashable_state
        ] + epsilon * (final_reward - value_dictionary[hashable_state])


  0%|          | 0/500000 [00:00<?, ?it/s]

In [10]:

best_value_states = [(key,value) for key,value in sorted(value_dictionary.items(), key=lambda e: e[1], reverse=True)[:30] if len(key[0]) <= 2 and len(key[1]) <= 2]

for item in best_value_states:
    print(item)

In [14]:

def available_moves(state):
    '''returns available moves on the board'''
    return set(MAGIC) - (state.x | state.o)

def random_move(player, board):
    '''do a random move'''
    available = available_moves(board)
    rand_move = choice(list(available))
    cstate = deepcopy(board)
    cstate.x.add(rand_move) if player == 'x' else cstate.o.add(rand_move)
    return cstate,None
 

def rl_move(player, board, policy=value_dictionary):
    '''RL Agent plays moves based on given policy, returns board after move'''
    # set of available moves on the board
    available = available_moves(board)
    # to keep record of possible states
    possible_states = []
    best_state = None
    max_value = 0
    
    for move in available:
        cstate = deepcopy(board)
        cstate.x.add(move) if player == 'x' else cstate.o.add(move)
        possible_states.append(cstate)
        
    for state in possible_states:
        hashable_state = (frozenset(state.x), frozenset(state.o))
        if policy[hashable_state] > max_value:
            max_value = policy[hashable_state]
            best_state = state
    # in case no move in policy, choose a random available move
    if best_state is None:
        rand_move = choice(list(available))
        cstate = deepcopy(board)
        cstate.x.add(move) if player == 'x' else cstate.o.add(move)
        best_state = cstate
        max_value = -1
                
    return best_state,max_value


def play(player1,player2,number_games): 
    '''play number of games between two strategies and return results'''  
    players = [player1,player2]
    marks = ['x','o']
    wins = [0,0,0]
    turn = cycle([0,1])
    
    for _ in range(number_games):
        board = State(set(), set())
        while True:
            this_turn = next(turn)
            board, _ = players[this_turn](marks[this_turn],board)
            if win(board.x) or win(board.o):
                wins[this_turn] += 1
                break
            elif len(available_moves(board)) == 0:
                # draw
                wins[2] += 1
                break

    return wins
        
player1 = rl_move
player2 = random_move
number_games = 1000
results = play(player1,player2,number_games)
print(f"{player1} won {results[0]/number_games*100}% of games, {player2} won {results[1]/number_games*100}% and the number of draws is {results[2]/number_games*100}% of games!")



<function rl_move at 0x10dd8f560> won 56.89999999999999% of games, <function random_move at 0x10dd8f1a0> won 19.0% and the number of draws is 24.099999999999998% of games!
