In [None]:
from collections import defaultdict
import numpy as np
from functools import partial
import random
from scipy.special import softmax

In [None]:
# Game result codes and the reward r paired with each one:
#   NO_RESULT    = 0  ->  r = 1
#   GAME_WON     = 1  ->  r = 10
#   GAME_LOST    = 2  ->  r = -10
#   GAME_DRAW    = 3  ->  r = -7
#   INVALID_MOVE = 4  ->  r = -5

class QLearner():
    """
    Tabular Q-learning agent.

    Q-values are stored per state in ``q_table``; a state is keyed by the raw
    bytes of its array (``s.tobytes()``) and maps to a 1-D float array holding
    one Q-value per action index.
    """

    def __init__(self, alpha=0.1, gamma=0.8, n_actions=None):
        """
        :param alpha: learning rate used to blend the old Q-value with the new target
        :param gamma: discount factor applied to the best Q-value of the next state
        :param n_actions: number of actions per state. When None, a state's row
            is created with one entry per element of the state array
            (assumes one action per board cell -- TODO confirm against the environment)
        """
        self.alpha = alpha
        self.gamma = gamma
        self.n_actions = n_actions
        # key = board (hashed via tobytes()), value = 1-D array of Q-values.
        # np.zeros (not np.ndarray) so that rows are always initialised; the
        # partial keeps the factory picklable.
        self.q_table = defaultdict(
            partial(np.zeros, 0 if n_actions is None else n_actions)
        )

    def _q_row(self, s) -> np.ndarray:
        """
        Return the (mutable) Q-value row of state s, creating it on first use.

        :param s: the state; must provide ``tobytes()``
        :return: 1-D array of Q-values, indexed by action
        """
        key = s.tobytes()
        row = self.q_table[key]
        if row.size == 0:
            # The default factory cannot see the state, so an empty row means
            # "not sized yet": allocate a zero row of the right length now.
            size = self.n_actions if self.n_actions is not None else np.size(s)
            row = np.zeros(size)
            self.q_table[key] = row
        return row

    def update(self, s, a, r, s_next) -> None:
        """
        update the q value of a (state, action) pair when the action was rewarded with r
        :param s: the state
        :param a: the action (index into the state's Q-row) that was carried out from state s
        :param r: the reward gained from the environment after action in state s
        :param s_next: state the environment changed into after action a in state s
        """
        row = self._q_row(s)
        next_row = self._q_row(s_next)
        # An unseen or action-less next state contributes no future value.
        best_next = next_row.max() if next_row.size else 0.0
        row[a] = row[a] * (1 - self.alpha) + self.alpha * (r + self.gamma * best_next)

    def select_action(self, s, theta=1) -> int:
        """
        Choose an action index for state s.

        Note: despite the parameter name, no softmax sampling is performed.
        ``theta`` acts as a hard switch: above 0.9 the action with the highest
        Q-value is returned (first one on ties), otherwise an action is drawn
        uniformly at random.

        :param s: state of the environment
        :param theta: "temperature" parameter; > 0.9 selects greedily, else randomly
        :return: selected action (index into the state's Q-row)
        :raises ValueError: if the state has no actions
        """
        row = self._q_row(s)
        if row.size == 0:
            raise ValueError("state has no actions to select from")

        if theta > 0.9:
            return int(np.argmax(row))
        return random.randrange(row.size)

