### Reinforcement Learning example by the Teacher

In [None]:
from itertools import combinations
from collections import namedtuple, defaultdict

import numpy as np

In [None]:
# A game state: `x` and `o` are the collections of MAGIC numbers (squares)
# claimed so far by the first and the second player respectively.
Position = namedtuple('Position', ['x', 'o'])

In [None]:
# 3x3 magic square stored row by row: every row, column and diagonal sums
# to 15. Board cell (r, c) is identified by MAGIC[r * 3 + c], so a player
# owns a full line exactly when three of their numbers add up to 15.
MAGIC = [
    2, 7, 6,
    9, 5, 1,
    4, 3, 8,
]

In [None]:
def print_board(pos: Position):
    """Write the board to stdout, one text row per board row.

    Each cell is shown as 'x' or 'o' when its MAGIC number belongs to the
    corresponding player and as '-' when it is still free. A blank line
    follows the board.
    """
    for row_start in range(0, 9, 3):
        cells = []
        for square in MAGIC[row_start:row_start + 3]:
            if square in pos.x:
                cells.append('x')
            elif square in pos.o:
                cells.append('o')
            else:
                cells.append('-')
        print(''.join(cells))
    print()

In [None]:
def win(squares):
    """Return True when any three of `squares` add up to 15.

    With squares numbered as in a 3x3 magic square, three numbers summing
    to 15 are exactly the numbers lying on one winning line.
    """
    for triple in combinations(squares, 3):
        if sum(triple) == 15:
            return True
    return False

def state_value(pos: Position):
    """Score a position from the first player's point of view.

    Returns 1 if X holds a winning line, -1 if O does, and 0 otherwise
    (game still open or drawn). X is checked before O.
    """
    if win(pos.x):
        return 1
    if win(pos.o):
        return -1
    return 0

In [None]:
from random import choice
from copy import deepcopy

def random_game():
    """Play one game with uniformly random moves and return its history.

    X moves first and the players alternate. After every move a deep copy
    of the position is appended to the trajectory, so each entry is an
    independent snapshot. The game ends as soon as the player who just
    moved has won or no free square is left.

    Returns:
        list of Position snapshots, one per move, in playing order.
    """
    state = Position(set(), set())
    available = set(range(1, 9 + 1))
    trajectory = []
    turn = 0
    while available:
        # Even turns belong to X, odd turns to O.
        mover = state.x if turn % 2 == 0 else state.o
        square = choice(list(available))
        mover.add(square)
        # Snapshot before moving on: `state` keeps being mutated in place.
        trajectory.append(deepcopy(state))
        available.remove(square)
        if win(mover):
            break
        turn += 1
    return trajectory

In [None]:
# Play a single random game, show the final board and announce the result.
trajectory = random_game()
print_board(trajectory[-1])
player = state_value(trajectory[-1])
# +1 -> X won, -1 -> O won, anything else is a draw.
print({1: "Player X wins!", -1: "Player O wins!"}.get(player, "Draw!"))

In [None]:
from tqdm import tqdm

# Monte Carlo estimate of the value of every position met during random
# self-play. Keys are (frozenset(x), frozenset(o)) pairs, values are running
# averages of the final outcome (+1 X wins, -1 O wins, 0 draw).
value_dict = defaultdict(float)
epsilon = 0.001  # step size of the incremental average

for steps in tqdm(range(100000)):
    trajectory = random_game()
    # Every state of the game is credited with the outcome of the WHOLE game.
    # Using the state's own immediate value would leave all non-terminal
    # positions stuck at 0, since they contain no completed line yet.
    final_reward = state_value(trajectory[-1])
    for state in trajectory:
        # Sets are unhashable; freeze them so the position can be a dict key.
        hashable_state = (frozenset(state.x), frozenset(state.o))
        value_dict[hashable_state] += epsilon * (final_reward - value_dict[hashable_state])


In [None]:
# Display the ten (state, value) entries with the largest stored value.
sorted(
    value_dict.items(),
    key=lambda item: item[1],
    reverse=True,
)[:10]