In [2]:
from build.player.qlearner.QLearner import QLearner
from build.Board import Board, Result

import numpy as np
import random

In [37]:
class QTableLearner(QLearner):
    """
    Q-learning player that keeps its action values in a lookup table.

    Terms:
        state
            the snapshot of the board fields
            example: [[ 'x',  'o',  'o'],
                      [None,  'x', None],
                      [None, None, None]]

        action
            equals an index of a chosen Board.POSSIBLE_ACTION
            example: 8

        q_table
            mapping from ``np.array_str(state)`` to a 2-D array of action
            values; the value of action ``a`` lives at ``[a // 3, a % 3]``
    """

    def __init__(self, q_table=None, learn_rate: float = 0.1,
                 discount_factor: float = 0.4, theta: float = 0.1):
        """
        :param q_table: table to start from; when None, the table returned by
            Board.get_default_dict() is used
        :param learn_rate: learning rate of this q learner
        :param discount_factor: discount factor of this q learner
        :param theta: exploration rate; for values in [0, 1] it is the
            probability that select_move picks a random action
        """
        if q_table is None:
            q_table = Board.get_default_dict()

        self.q_table = q_table
        self.learn_rate = learn_rate
        self.discount_factor = discount_factor
        self.theta = theta

    # NOTE(review): a @save_q_table decorator was commented out on this method;
    # confirm whether persisting the table after each update should come back.
    def update(self, prev_state, state, prev_action, result) -> None:
        """
        Updates the Q-Table for future best move evaluations

        Applies Q(s, a) <- Q(s, a) + learn_rate * (r + discount * max Q(s') - Q(s, a)),
        where the future term is dropped once the game has ended.

        :param prev_state: last state of the board
        :param state: new state of the board, including the new action
        :param prev_action: the index of a Board.POSSIBLE_ACTION
        :param result: reward for chosen action; it is added to the discounted
            future value, so it has to behave like a number
        """
        prev_key, next_key = np.array_str(prev_state), np.array_str(state)

        # The flat action index maps onto the (row, column) of the value grid.
        cell = divmod(prev_action, 3)
        q_values = self.q_table[prev_key]
        old_value = q_values[cell]

        if result in (Result.GAME_WON, Result.GAME_LOST, Result.GAME_DRAW):
            # Game is over: there is no follow-up state to take value from.
            max_q = 0
        else:
            max_q = np.max(self.q_table[next_key])

        target = result + self.discount_factor * max_q
        temporal_difference = target - old_value

        q_values[cell] = old_value + self.learn_rate * temporal_difference

    def select_move(self, state, theta=None) -> int:
        """
        see QLearner

        :param state: current state of the board
        :param theta: exploration rate for this call only; falls back to
            self.theta when None
        :return: the chosen action
        """
        if theta is None:
            theta = self.theta

        # NOTE(review): neither branch checks here whether the chosen field is
        # still free - presumably the caller or the table values handle that.
        if np.random.uniform(0, 1) > theta:
            # Exploit: take the action with the highest known value
            # (argmax yields the flat index into the value grid).
            return int(np.argmax(self.q_table[np.array_str(state)]))

        # Explore: sample uniformly from all of Board.POSSIBLE_ACTIONS.
        return random.choice(list(Board.POSSIBLE_ACTIONS))

    def is_known_state(self, state) -> bool:
        """
        see QLearner

        :return: True when the q-table has an entry for the given state
        """
        return np.array_str(state) in self.q_table