# In [4]:
from collections import defaultdict
import numpy as np
from functools import partial
import random
from scipy.special import softmax

# In [5]:
class QLearner:
    """Tabular Q-learning agent with softmax (Boltzmann) action selection.

    Q-values are kept per state in a NumPy array indexed by integer action id
    (``0 .. n_actions - 1``).  A state's row is created lazily, filled with
    zeros, the first time the state is touched.

    :param alpha: learning rate of this q learner
    :param gamma: discount factor of this q learner
    :param n_actions: number of actions known up front.  May be left at 0;
        the action space then grows automatically to cover the largest
        action id passed to :meth:`update`.
    :raises ValueError: if ``n_actions`` is negative
    """

    def __init__(self, alpha: float = 0.1, gamma: float = 0.8,
                 n_actions: int = 0) -> None:
        if n_actions < 0:
            raise ValueError("n_actions must be non-negative")
        self._n_actions = int(n_actions)
        # defaultdict with a NumPy-array factory, see
        # https://stackoverflow.com/questions/25014298/creating-a-defaultdict-with-empty-numpy-array
        # The lambda reads self._n_actions at call time, so rows created
        # after the action space has grown already have the right size.
        self._qtable = defaultdict(lambda: np.zeros(self._n_actions))
        self._gamma = gamma
        self._alpha = alpha

    def _row(self, s) -> np.ndarray:
        """Return the Q-value row of state ``s``, zero-padded to the current
        number of known actions (the padded row is stored back)."""
        row = self._qtable[s]
        missing = self._n_actions - row.size
        if missing > 0:
            row = np.concatenate((row, np.zeros(missing)))
            self._qtable[s] = row
        return row

    def update(self, s, a, r, s_next) -> None:
        """Update the q value of a (state, action) pair after a reward.

        Applies the Q-learning rule

            Q(s, a) := (1 - alpha) * Q(s, a)
                       + alpha * (r + gamma * max_a' Q(s_next, a'))

        :param s: the state
        :param a: the action (integer id) that was carried out from state s
        :param r: the reward gained from the environment after action a in
            state s
        :param s_next: state the environment changed into after action a in
            state s
        :raises ValueError: if ``a`` is negative
        """
        if a < 0:
            # A negative index would silently wrap around to another action.
            raise ValueError("action id must be non-negative")
        if a >= self._n_actions:
            # First time this action id is seen: enlarge the action space.
            self._n_actions = a + 1

        row = self._row(s)
        # Greedy value of the successor state, taken over ALL its actions.
        best_next = float(self._row(s_next).max())
        target = r + self._gamma * best_next
        row[a] = (1 - self._alpha) * row[a] + self._alpha * target

    def select_action(self, s, theta=1):
        """Choose an action according to the softmax function in state s.

        Action ``a`` is drawn with probability proportional to
        ``exp(Q(s, a) / theta)``.  A high temperature makes the choice close
        to uniform, a low one close to greedy; ``theta <= 0`` is treated as
        the greedy limit (plain argmax).

        Background on exploration strategies:
        https://www.baeldung.com/cs/epsilon-greedy-q-learning

        :param s: state of the environment
        :param theta: "temperature" parameter
        :return: selected action (integer id)
        :raises ValueError: if no action is known yet (``n_actions`` was not
            given and :meth:`update` has never been called)
        """
        row = self._row(s)
        if row.size == 0:
            raise ValueError(
                "no actions known; pass n_actions or call update() first"
            )
        if theta <= 0:
            return int(np.argmax(row))
        # scipy's softmax subtracts the maximum internally, so large
        # Q / theta ratios do not overflow.
        probabilities = softmax(row / theta)
        return random.choices(range(row.size),
                              weights=probabilities.tolist())[0]
        