### Reinforcement Learning example by the Teacher

In [9]:
from itertools import combinations
from collections import namedtuple, defaultdict

import numpy as np

In [10]:
Position = namedtuple('Position', ['x', 'o'])

In [11]:
MAGIC = [
    2, 7, 6,
    9, 5, 1,
    4, 3, 8,
]

In [12]:
def print_board(pos: Position):
    """ Print a tic-tac-toe board """
    for r in range(3):
        for c in range(3):
            index = r * 3 + c
            if MAGIC[index] in pos.x:
                print('x', end='')
            elif MAGIC[index] in pos.o:
                print('o', end='')
            else:
                print('-', end='')
        print()
    print()

In [13]:
def win(squares):
    """ Check if a player has won"""
    return any(sum(square) == 15 for square in combinations(squares, 3))

def state_value(pos: Position):
    """ Evaluate position: +1 first player wins """
    if win(pos.x):
        return 1
    elif win(pos.o):
        return -1
    else:
        return 0

In [14]:
from random import choice
from copy import deepcopy

def random_game():
    state = Position(set(), set())
    available = set(range(1, 9+1))
    trajectory = list()
    while available:
        x = choice(list(available))
        state.x.add(x)
        trajectory.append(deepcopy(state))
        available.remove(x)
        if win(state.x) or not available:
            break
        # print("Player X's turn")
        # print_board(state)
        y = choice(list(available))
        state.o.add(y)
        trajectory.append(deepcopy(state))
        available.remove(y)
        if win(state.o) or not available:
            break
        # print("Player O's turn")
        # print_board(state)
    return trajectory

In [15]:
trajectory = random_game()
print_board(trajectory[-1])
player = state_value(trajectory[-1])
if player == 1:
    print("Player X wins!")
elif player == -1:
    print("Player O wins!")
else:
    print("Draw!")

xx-
-xo
oox

Player X wins!


In [16]:
from tqdm import tqdm

value_dict = defaultdict(int)
epsilon = 0.001

for steps in tqdm(range(100000)):
    trajectory = random_game()
    for state in trajectory:
        hashable_state = (frozenset(state.x), frozenset(state.o))
        value_dict[hashable_state] += epsilon * (state_value(state) - value_dict[hashable_state])


100%|██████████| 100000/100000 [00:10<00:00, 9982.59it/s]


In [17]:
sorted(value_dict.items(), key=lambda x: x[1], reverse=True)[:10]

[((frozenset({1, 3, 6, 8, 9}), frozenset({2, 4, 5, 7})), 0.40145685685590804),
 ((frozenset({1, 2, 3, 4, 8}), frozenset({5, 6, 7, 9})), 0.3960429350695001),
 ((frozenset({1, 5, 7, 8, 9}), frozenset({2, 3, 4, 6})), 0.3954383734429431),
 ((frozenset({1, 6, 7, 8, 9}), frozenset({2, 3, 4, 5})), 0.39483320664959265),
 ((frozenset({2, 3, 5, 8, 9}), frozenset({1, 4, 6, 7})), 0.39483320664959265),
 ((frozenset({2, 3, 6, 7, 9}), frozenset({1, 4, 5, 8})), 0.3930140692080231),
 ((frozenset({3, 4, 6, 8, 9}), frozenset({1, 2, 5, 7})), 0.39058004346455016),
 ((frozenset({1, 2, 4, 5, 6}), frozenset({3, 7, 8, 9})), 0.39058004346455016),
 ((frozenset({1, 2, 4, 7, 9}), frozenset({3, 5, 6, 8})), 0.3899700134780282),
 ((frozenset({1, 2, 3, 6, 8}), frozenset({4, 5, 7, 9})), 0.3887481209718509)]