# In [None]:
import os
from Board import Result
from collections import defaultdict
from player.Player import Player
from Decorators import debug, log, save_q_table
import numpy as np
import random
import dill

class QLearner(Player):
    """
    Tabular Q-learning player for a 3x3 board.

    Q-values are kept per board state in a dictionary that maps the string
    form of the board (``np.array_str``) to an array with one value per cell.
    Cells are numbered row by row, so the 1-based move ``[row, col]`` owns
    slot ``(row - 1) * 3 + (col - 1)`` of that array.
    """

    # Edge length of the board; the action space is _BOARD_SIZE ** 2 cells.
    _BOARD_SIZE = 3

    def __init__(self, representationChar, exp_factor=0.9):
        """
        :param representationChar: forwarded unchanged to ``Player.__init__``
        :param exp_factor: initial exploration rate; a move is random whenever
            a uniform draw from [0, 1) is not greater than this value
        """
        super().__init__(representationChar)
        self.states_value = []  # record all positions taken
        self.exp_factor = exp_factor

        # Q-learning hyper-parameters.
        self.learning_rate = 0.1
        self.discount_rate = 0.99

        # Bounds and decay speed used by update_exp_factor.
        self.max_exploration_rate = 1
        self.min_exploration_rate = 0.01
        self.exploration_decay_rate = 0.01

        # One Q-value per board cell; unseen states start as all zeros.
        self.env_action_space = 9
        self.q_table = defaultdict(lambda: np.zeros(self.env_action_space))

        # Per-episode reward bookkeeping.
        self.rewards_all_episodes = []
        self.rewards_current_episode = 0
        self.q_table_state_space = None  # number of known states, set per episode

        # Scratch state shared between makeMove / choose_move / giveResult.
        self.ava_moves = []
        self.prev_state = []
        self.reward = 0
        self.last_action = None
        self.state = None

    def update_exp_factor(self, episode):
        """
        Decays the exploration factor and closes the reward bookkeeping of the
        finished episode.

        The factor falls exponentially from ``max_exploration_rate`` towards
        ``min_exploration_rate``, so the player explores less (and exploits
        more) the further training has progressed.

        :param episode: current episode
        """
        exploration_span = self.max_exploration_rate - self.min_exploration_rate
        decay = np.exp(-self.exploration_decay_rate * episode)
        self.exp_factor = self.min_exploration_rate + exploration_span * decay

        self.rewards_all_episodes.append(self.rewards_current_episode)
        self.rewards_current_episode = 0
        self.q_table_state_space = len(self.q_table)

    def get_ava_moves(self, state):
        """
        Calculates all available moves of current state

        A cell counts as available when it holds ``None``. The result is also
        stored in ``self.ava_moves``.

        :param state: current state of the board (iterable of rows)
        :return: list of available moves as 1-based ``[row, column]`` pairs
        """
        self.ava_moves = [
            [ridx + 1, cidx + 1]
            for ridx, row in enumerate(state)
            for cidx, cell in enumerate(row)
            if cell is None
        ]
        return self.ava_moves

    def _action_index(self, action):
        """
        Maps a move to its slot in a Q-value array.

        :param action: 1-based ``[row, column]`` pair, or an already flat index
        :return: flat 0-based index into a Q-value array
        """
        if np.ndim(action) == 0:
            return int(action)
        row, col = action
        return (int(row) - 1) * self._BOARD_SIZE + (int(col) - 1)

    @save_q_table
    def update_qtable(self, prev_state, new_state, action, reward):
        """
        Calculates the new q value from the new state and action pair

        The table is read from ``qtable.pkl`` when that file exists and is not
        empty, otherwise an empty table is used. The updated table is returned
        (presumably so that ``save_q_table`` can persist it - confirm in
        Decorators) and its entries are merged into ``self.q_table`` so that
        ``choose_move`` can exploit what has been learned.

        :param prev_state: last state of the board
        :param new_state: new state of the board, including the new action
        :param action: chosen action, a 1-based ``[row, column]`` pair
        :param reward: reward for chosen action
        :return: updated q table with the new value
        """
        prev_key = np.array_str(np.asarray(prev_state))
        new_key = np.array_str(np.asarray(new_state))
        q_table = defaultdict(lambda: np.zeros(self.env_action_space))

        if os.path.isfile("qtable.pkl") and os.path.getsize("qtable.pkl") > 0:
            # NOTE(security): dill/pickle executes arbitrary code while
            # loading; only ever read a qtable.pkl this program wrote itself.
            with open("qtable.pkl", "rb") as qtablefile:
                q_table = dill.load(qtablefile)

        # Indexing the row with the [row, column] pair itself would be numpy
        # advanced indexing and would touch two unrelated slots, so the move
        # is first converted to its single flat slot.
        slot = self._action_index(action)
        q_row = q_table[prev_key]
        best_next_value = np.max(q_table[new_key])
        q_row[slot] = q_row[slot] * (1 - self.learning_rate) + self.learning_rate * (
            reward + self.discount_rate * best_next_value
        )
        self.rewards_current_episode += reward

        # Keep the in-memory table used by choose_move in step with the
        # learned values; update() keeps self.q_table's default factory.
        self.q_table.update(q_table)

        return q_table

    def choose_move(self, state):
        """
        Choose best move from q table or explore with random moves.

        Every cell of the board is a candidate, including occupied ones;
        ``giveResult`` assigns a reward of -100 for ``Result.INVALID_MOVE``.

        :param state: current state of the board (3x3 matrix with marks and/or open fields)
        :return: chosen move (a numpy array like [1, 1] -> row and column of
            the field, starting by 1)
        """
        exploration_rate_threshold = np.random.uniform(0, 1)

        # Row-major list of all cells, so position i is the move of flat slot i.
        size = self._BOARD_SIZE
        candidate_moves = np.array(
            [[row, col] for row in range(1, size + 1) for col in range(1, size + 1)]
        )

        state = np.array(state)
        self.prev_state = state

        if exploration_rate_threshold > self.exp_factor:
            # Exploit: translate the best flat slot back into a [row, col] move.
            best_slot = int(np.argmax(self.q_table[np.array_str(state)]))
            return candidate_moves[best_slot]

        # Explore: uniform random cell. Indexing with randrange avoids relying
        # on how random.choice treats numpy arrays.
        return candidate_moves[random.randrange(len(candidate_moves))]

    @debug
    @log(aiReadable=True)
    def makeMove(self, board):
        """
        Chooses a move for the given board and remembers it for the next
        ``giveResult`` call.

        :param board: current state of the board
        :return: the chosen move as returned by ``choose_move``
        """
        self.last_action = self.choose_move(board)
        self.state = board
        return self.last_action

    @debug
    def giveResult(self, result):
        """
        Set own reward for given result and updates the q table

        :param result: outcome reported for the last move (a ``Result`` value)
        """
        # NOTE(review): a result matching none of these leaves the reward of
        # the previous call in place - confirm that this is intended.
        reward_by_result = (
            (Result.INVALID_MOVE, -100),
            (Result.GAME_LOST, -10),
            (Result.GAME_WON, 100),
            (Result.GAME_DRAW, -1),
        )
        for outcome, reward in reward_by_result:
            if outcome == result:
                self.reward = reward

        self.update_qtable(self.prev_state, self.state.field, self.last_action, self.reward)