Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: Dies Natalis Solis Invicti ([CET](https://en.wikipedia.org/wiki/Sol_Invictus))
* Reviews: Befana

Notes:

* Reviews will be assigned on Monday, December 4
* You need to commit in order to be selected as a reviewer (i.e., it is better to commit an empty work than not to commit at all)

In [1]:
from itertools import combinations
from collections import namedtuple, defaultdict
from random import choice
import numpy as np
from copy import deepcopy
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# A game position: `x` and `o` hold the magic-square numbers taken by each player.
# NOTE: the namedtuple typename is 'Position', so instances print as Position(x=..., o=...).
State = namedtuple('Position',['x','o'])

# 3x3 magic square laid out row by row: every row, column and diagonal sums to 15,
# which is why a winning line can be detected as "three owned numbers summing to 15".
MAGIC = [2,7,6,
         9,5,1,
         4,3,8]

# The seven non-identity rotations/reflections of MAGIC, each listed row by row.
SYMMETRIES = [[4,9,2,3,5,7,8,1,6],  # rotation by 90°
              [8,3,4,1,5,9,6,7,2],  # rotation by 180°
              [6,1,8,7,5,3,2,9,4],  # rotation by 270°
              [2,9,4,7,5,3,6,1,8],  # reflection across the main diagonal
              [8,1,6,3,5,7,4,9,2],  # reflection across the anti-diagonal
              [4,3,8,9,5,1,2,7,6],  # reflection across the middle row
              [6,7,2,1,5,9,8,3,4]]  # reflection across the middle column

In [3]:
def possible_plys(pos: State):
    """Return the free moves of ``pos``, one representative per symmetry class.

    A layout from SYMMETRIES is kept when rendering ``pos`` with it gives the
    same board as rendering it with MAGIC, i.e. the position looks identical
    under that rotation/reflection. Moves that such a layout maps onto an
    already selected move are dropped from the result.

    The returned list holds magic-square numbers (1..9), in the order in
    which they are popped from the internal set.
    """
    remaining = (set(range(1,10)) - pos.x) - pos.o
    reference = create_board(pos, MAGIC)
    # Layouts under which the current position is indistinguishable from itself.
    invariant = [layout for layout in SYMMETRIES
                 if create_board(pos, layout) == reference]

    representatives = []
    while remaining:
        move = remaining.pop()
        representatives.append(move)
        cell = MAGIC.index(move)
        # Remove every move that is the image of `move` under an invariant layout.
        for layout in invariant:
            remaining.discard(layout[cell])
    return representatives



def print_board(pos):
    """Print ``pos`` as three text rows followed by an empty line.

    Each cell is shown as 'X', 'O' or '_' depending on whether its MAGIC
    number belongs to ``pos.x``, to ``pos.o`` or to neither.
    """
    for row_start in range(0, 9, 3):
        row = ''
        for number in MAGIC[row_start:row_start + 3]:
            if number in pos.x:
                row += 'X'
            elif number in pos.o:
                row += 'O'
            else:
                row += '_'
        print(row)
    print()

def create_board(pos,magic):
    """Render ``pos`` as a flat list of nine cells using the layout ``magic``.

    ``magic`` lists, row by row, the number assigned to each cell. A cell is
    1 when its number is in ``pos.x``, -1 when it is in ``pos.o`` and 0
    otherwise (``pos.x`` is checked first).
    """
    def owner(number):
        if number in pos.x:
            return 1
        if number in pos.o:
            return -1
        return 0

    return [owner(magic[cell]) for cell in range(9)]

def win(elements):
    """Return True when any three distinct items of ``elements`` sum to 15."""
    for triple in combinations(elements, 3):
        if sum(triple) == 15:
            return True
    return False

def win_in_one(elements, possible_plys):
    """Return a move that wins immediately, or -1 when there is none.

    Args:
        elements: set (or frozenset) of numbers already owned by the player.
        possible_plys: iterable of candidate moves; they are tried in order.
            (The parameter name shadows the module-level function of the
            same name inside this function only.)

    Returns:
        The first candidate ``ply`` for which ``elements`` plus ``ply`` is a
        winning hand according to ``win``, or -1 if no candidate wins.
    """
    for ply in possible_plys:
        # A set union builds the candidate hand without deep-copying and
        # leaves the caller's set untouched.
        if win(elements | {ply}):
            return ply
    return -1

def state_value(pos: State):
    """Score a position: 1 if x has a winning line, -1 if o has one, else 0.

    x is checked first, so a position in which both sides would "win"
    is scored 1.
    """
    if win(pos.x):
        return 1
    if win(pos.o):
        return -1
    return 0


In [4]:
def random_game():
    """Play one self-play game and return the sequence of visited states.

    Players alternate starting with x. On its turn a player takes an
    immediately winning move when ``win_in_one`` finds one, otherwise a
    uniformly random move among ``possible_plys``. A deep copy of the state
    is recorded after every move. The game stops as soon as the mover wins
    or no moves are left.
    """
    state = State(set(), set())
    sides = (state.x, state.o)
    history = []
    turn = 0  # index into `sides`: 0 -> x to move, 1 -> o to move
    moves = possible_plys(state)
    while moves:
        mover = sides[turn]
        winning = win_in_one(mover, moves)
        if winning == -1:
            mover.add(choice(moves))
        else:
            mover.add(winning)
        history.append(deepcopy(state))
        moves = possible_plys(state)
        if win(mover):
            break
        turn = 1 - turn
    return history

In [5]:
# Learn a value for every visited position from self-play games.
# value_dictionary maps (frozenset of x numbers, frozenset of o numbers) -> estimated value;
# unseen positions default to 0.0.
value_dictionary = defaultdict(float)
# Number of times each position has been visited during training.
hit_state = defaultdict(int)
# Step size of the incremental update below (used as a learning rate, not as an exploration rate).
epsilon = .01

for steps in tqdm(range(150_000)):
    trajectory = random_game()
    # Outcome of the finished game: 1 if x won, -1 if o won, 0 otherwise.
    final_reward = state_value(trajectory[-1])
    for state in trajectory:
        # Sets are not hashable, so freeze them to build a dictionary key.
        hashable_state = (frozenset(state.x), frozenset(state.o))
        hit_state[hashable_state] += 1
        # Move the stored estimate a fraction `epsilon` towards the game outcome.
        value_dictionary[hashable_state] = value_dictionary[hashable_state] + epsilon * (final_reward - value_dictionary[hashable_state]) 
        

  0%|          | 112/150000 [00:00<02:15, 1109.05it/s]

100%|██████████| 150000/150000 [01:31<00:00, 1647.41it/s]


In [13]:
# Keep only the learned positions in which neither player has a winning line,
# ordered from the highest to the lowest estimated value.
a = [entry for entry in value_dictionary.items()
     if state_value(State(entry[0][0], entry[0][1])) == 0]
a.sort(key=lambda entry: entry[1], reverse=True)

# Show the entry at index 1 (second from the top) and then the one at index -1 (the last).
for ind in (1, -1):
    pos = State(*a[ind][0])
    print(a[ind])
    print_board(pos)


((frozenset({1, 5}), frozenset({2, 6})), 0.9999997880601903)
O_O
_XX
___

((frozenset({1, 2, 9}), frozenset({3, 5})), -0.9995373115349785)
X__
XOX
_O_



In [7]:
def optimal_action_x(state):  #Agent for x
    """Pick x's move leading to the position with the highest learned value.

    Candidates come from ``possible_plys(state)``; positions missing from
    ``value_dictionary`` count as 0.0. Ties go to the earliest candidate.
    """
    mine, theirs = frozenset(state.x), frozenset(state.o)

    def estimate(action):
        # Value of the position reached after x plays `action`.
        return value_dictionary.get((mine | {action}, theirs), 0.0)

    return max(possible_plys(state), key=estimate)

def optimal_action_o(state):    #Agent for o
    """Pick o's move leading to the position with the lowest learned value.

    Candidates come from ``possible_plys(state)``; positions missing from
    ``value_dictionary`` count as 0.0. Ties go to the earliest candidate.
    """
    theirs, mine = frozenset(state.x), frozenset(state.o)

    def estimate(action):
        # Value (from x's point of view) of the position after o plays `action`.
        return value_dictionary.get((theirs, mine | {action}), 0.0)

    return min(possible_plys(state), key=estimate)

def play_game_o():
    """Play an interactive game: the learned agent is X, the human is O.

    The agent moves first using ``optimal_action_x``. On the human's turn the
    moves returned by ``possible_plys`` are printed and one of them must be
    typed in; anything else (including non-numeric text) is rejected and the
    prompt is repeated. The board is printed after every move and the game
    ends on a win for either side or when all nine cells are taken.
    """
    state = State(set(), set())
    while True:
        # Agent's turn (X).
        action = optimal_action_x(state)
        state.x.add(action)
        print_board(state)
        if win(state.x):
            print(state)
            print("You lose!")
            break
        elif len(state.x) + len(state.o) == 9:
            print("Draw!")
            break

        # Human's turn (O). The state does not change while prompting, so the
        # list of moves is computed once.
        plys = possible_plys(state)
        while True:
            print(f"possible moves: {plys}")
            try:
                action = int(input("Insert your move: "))
            except ValueError:
                # Non-numeric input: treat it as an invalid move and ask again
                # instead of aborting the game.
                action = None
            if action in plys:
                break
            print("You need to choose one of the possible moves")
        state.o.add(action)
        print_board(state)
        if win(state.o):
            print("You win!")
            break

def play_game_x():
    """Play an interactive game: the human is X, the learned agent is O.

    The human moves first. The moves returned by ``possible_plys`` are printed
    and one of them must be typed in; anything else (including non-numeric
    text) is rejected and the prompt is repeated. The agent answers using
    ``optimal_action_o``. The board is printed after every move and the game
    ends on a win for either side or when all nine cells are taken.
    """
    state = State(set(), set())
    while True:
        # Human's turn (X). The state does not change while prompting, so the
        # list of moves is computed once.
        plys = possible_plys(state)
        while True:
            print(f"possible moves: {plys}")
            try:
                action = int(input("Insert your move: "))
            except ValueError:
                # Non-numeric input: treat it as an invalid move and ask again
                # instead of aborting the game.
                action = None
            if action in plys:
                break
            print("You need to choose one of the possible moves")
        state.x.add(action)
        print_board(state)
        if win(state.x):
            print("You win!")
            break
        elif len(state.x) + len(state.o) == 9:
            print("Draw!")
            break

        # Agent's turn (O).
        action = optimal_action_o(state)
        state.o.add(action)
        print_board(state)
        if win(state.o):
            print(state)
            print("You lose!")
            break

In [18]:
# Interactive session: the learned agent plays X and moves first, the human plays O.
play_game_o()

___
_X_
___

possible moves: [1, 2]
___
_XO
___

__X
_XO
___

possible moves: [2, 3, 4, 7, 8, 9]
__X
_XO
O__

X_X
_XO
O__

possible moves: [8, 9, 3, 7]
XOX
_XO
O__

XOX
_XO
O_X

Position(x={8, 2, 5, 6}, o={1, 4, 7})
You lose!


In [57]:
# Interactive session: the human plays X and moves first, the learned agent plays O.
play_game_x()

possible moves: [1, 2, 5]
___
_X_
___

O__
_X_
___

possible moves: [1, 4, 7, 8]
O__
_X_
__X

O__
_X_
O_X

possible moves: [1, 3, 6, 7, 9]
O__
XX_
O_X

O__
XXO
O_X

possible moves: [3, 6, 7]
OX_
XXO
O_X

OX_
XXO
OOX

possible moves: [6]
OXX
XXO
OOX

Draw!
