# In [21]:
from collections import defaultdict
from player.Player import Player
from Decorators import debug, log
import numpy as np
import random

class QLearner(Player):
    """
    Tabular Q-learning player.

    Keeps a Q-table that maps the string form of a board state
    (``np.array_str(state)``) to an array of ``env_action_space`` Q-values,
    and picks moves epsilon-greedily: with probability ``exp_factor`` a random
    available move is played, otherwise the Q-table is consulted.
    """

    def __init__(self, representationChar, exp_factor=0.9):
        """
        :param representationChar: forwarded unchanged to ``Player.__init__``
        :param exp_factor: initial exploration rate (probability of playing a
            random available move instead of consulting the Q-table)
        """
        super().__init__(representationChar)
        self.states_value = []  # record all positions taken
        self.exp_factor = exp_factor

        # Q-learning hyper-parameters.
        self.learning_rate = 0.1
        self.discount_rate = 0.99

        # Bounds and decay speed used by update_exp_factor().
        self.max_exploration_rate = 1
        self.min_exploration_rate = 0.01
        self.exploration_decay_rate = 0.01

        # Number of Q-values stored per state.
        self.env_action_space = 7
        # Unseen states start with all-zero Q-values.
        self.q_table = defaultdict(lambda: np.zeros(self.env_action_space))

        # Reward bookkeeping: one total per finished episode plus a running sum.
        self.rewards_all_episodes = []
        self.rewards_current_episode = 0
        # Number of distinct states in the Q-table as of the last episode end.
        self.q_table_state_space = None
        # Moves found by the most recent get_ava_moves() call.
        self.ava_moves = []

    def update_exp_factor(self, episode):
        """
        Closes the current episode: decays the exploration rate and stores
        the episode statistics.

        The exploration rate decays exponentially with the episode number,
        from ``max_exploration_rate`` towards ``min_exploration_rate``.

        :param episode: number of the episode that just finished
        """
        rate_span = self.max_exploration_rate - self.min_exploration_rate
        self.exp_factor = self.min_exploration_rate + rate_span * np.exp(
            -self.exploration_decay_rate * episode
        )
        self.rewards_all_episodes.append(self.rewards_current_episode)
        self.rewards_current_episode = 0
        self.q_table_state_space = len(self.q_table)

    def get_ava_moves(self, state):
        """
        Collects every empty cell (value ``None``) of the given state.

        The result is rebuilt from scratch on every call, so it only ever
        reflects ``state`` and never contains moves from earlier boards.

        :param state: two-level iterable of cells; ``None`` marks an empty cell
        :return: list of 1-based ``[outer index, inner index]`` pairs, also
            stored in ``self.ava_moves``
        """
        moves = []
        for ridx, row in enumerate(state):
            for cidx, cell in enumerate(row):
                if cell is None:
                    moves.append([ridx + 1, cidx + 1])
        self.ava_moves = moves
        return moves

    def update_qtable(self, prev_state, new_state, action, reward):
        """
        Applies one Q-learning update for the transition
        ``prev_state --action--> new_state`` and adds ``reward`` to the
        running reward of the current episode.

        :param prev_state: state before the move
        :param new_state: state after the move
        :param action: index into the Q-value array of ``prev_state``
        :param reward: reward received for the transition
        """
        # NOTE(review): choose_move() can return a two-element move from
        # get_ava_moves(); confirm what callers pass as `action` here.
        prev_key, new_key = np.array_str(prev_state), np.array_str(new_state)

        old_value = self.q_table[prev_key][action]
        # Best value reachable from the new state, discounted by one step.
        learned_value = reward + self.discount_rate * np.max(self.q_table[new_key])
        self.q_table[prev_key][action] = (
            old_value * (1 - self.learning_rate)
            + self.learning_rate * learned_value
        )
        self.rewards_current_episode += reward

    @staticmethod
    def _is_available(action, moves):
        """Returns True if ``action`` equals one of ``moves`` (scalars or sequences)."""
        # np.array_equal yields a plain bool even when shapes differ, unlike
        # `action in moves`, which raises ValueError when a NumPy scalar is
        # compared against a multi-element list.
        return any(np.array_equal(action, move) for move in moves)

    def choose_move(self, state):
        """
        Picks a move epsilon-greedily.

        :param state: current board state
        :return: the greedy Q-table action if it is one of the available
            moves, otherwise a random element of the available moves
        :raises IndexError: if the state has no empty cell
        """
        exploration_rate_threshold = np.random.uniform(0, 1)
        ava_moves = self.get_ava_moves(state)
        if not ava_moves:
            raise IndexError("no available moves: the state has no empty cell")

        if exploration_rate_threshold > self.exp_factor:
            # Exploit: take the action with the highest Q-value for this state.
            action = np.argmax(self.q_table[np.array_str(state)])
            # NOTE(review): the Q-table action is an integer index while the
            # available moves are two-element lists, so this check currently
            # never matches and the random fallback is always used. Confirm
            # how an action index is meant to map onto a board move.
            if not self._is_available(action, ava_moves):
                action = random.choice(ava_moves)
        else:
            # Explore: sample uniformly from the available moves.
            action = random.choice(ava_moves)
        return action

    @debug
    @log(aiReadable=True)
    def makeMove(self, board):
        """
        Chooses a move for the given board by delegating to ``choose_move``
        :param board: current state of the board
        :return: the move selected by ``choose_move``
        """
        return self.choose_move(board)

    @debug
    def giveResult(self, result):
        """
        Intentionally does nothing: the result is ignored here. No method of
        this class calls ``update_qtable``; the Q-table only changes when a
        caller invokes it.
        :param result: outcome reported to the player (unused)
        """
        pass