# Content

In [1]:
import os
import random
from abc import ABC
from abc import abstractmethod
from collections import defaultdict

import dill
import numpy as np

from Board import Result
from Decorators import debug
from player.Player import Player

In [2]:
class QPlayer(Player):
    """
    Player that asks a Q-learner for its next move and reports game results
    back to that learner as rewards.
    """

    def __init__(self, representationChar, q_learner):
        """
        :param representationChar: forwarded unchanged to Player.__init__
        :param q_learner: object providing select_move(), update() and possible_actions
        """
        Player.__init__(self ,representationChar)
        self.q_learner = q_learner
        # Latest observation/decision ...
        self.state = None
        self.action = None
        # ... and the one recorded by the previous giveResult call.
        self.prev_state = None
        self.prev_action = None
        # Reward earned by prev_action; it is handed to the learner one call later.
        self.prev_reward = 0

    @debug
    #@log(aiReadable=True)
    def makeMove(self, board):
        """
        Asks the q-learner for a move on the given board
        :param board: current board; its `field` attribute is used as the state
        :return: the action returned by q_learner.select_move
        """
        current_field = board.field
        self.state = current_field
        chosen_action = self.q_learner.select_move(state=current_field)
        self.action = chosen_action

        return chosen_action

    def giveResult(self, result):
        """
        Translates the result into a reward, updates the win/loss/draw stats and
        lets the q-learner learn from the previously stored transition
        :param result: a Result value describing the outcome of the last move
        """
        if Result.INVALID_MOVE == result:
            reward = -100
        elif Result.GAME_LOST == result:
            reward = -10
            self.stats.incrLost()
        elif Result.GAME_WON == result:
            reward = 100
            self.stats.incrWon()
        elif Result.GAME_DRAW == result:
            reward = -1
            self.stats.incrDraw()
        else:
            # Any other result carries no reward.
            reward = 0

        if self.prev_action is not None:
            # Reverse lookup: find the key whose action equals the previous move.
            actions = self.q_learner.possible_actions
            action_keys = list(actions.keys())
            action_values = list(actions.values())
            action_idx = action_keys[action_values.index(self.prev_action)]

            # The update uses the reward stored on the previous call, not the new one.
            self.q_learner.update(self.prev_state, self.state, action_idx, self.prev_reward)

        # Shift the current step into the "previous" slots for the next call.
        self.prev_action, self.prev_state, self.prev_reward = self.action, self.state, reward

In [3]:
class QLearner(ABC):
    """
    The abstract class generalizing all q-learning methods.

    Deriving from ABC makes the @abstractmethod markers effective: the class
    itself cannot be instantiated and every concrete learner has to implement
    both update() and select_move().
    """

    @abstractmethod
    def update(self, state, new_state, action, result):
        """
        Learns from one observed transition
        :param state: last state of the board
        :param new_state: new state of the board, including the new action
        :param action: chosen action
        :param result: reward for chosen action
        :return: implementation specific (see the concrete learner)
        """

    @abstractmethod
    def select_move(self, state, theta=0.9):
        """
        Chooses the next action for the given state
        :param state: state of the environment
        :param theta: threshold that a uniform random draw from [0, 1) is compared
                      against; the learners in this file exploit their learned
                      values when the draw exceeds theta and pick a random action
                      otherwise
        :return: the chosen action
        """


In [4]:
class QTableLearner(QLearner):
    """
    QLearner specification for q-table.

    The table maps the string form of a board state (np.array_str) to a 3x3
    array holding one q-value per action; action index k is stored at
    [k // 3, k % 3].
    """

    def __init__(self, q_table = None, learn_rate=0.1, discount_factor=0.8):
        """
        :param q_table: existing q-table to continue learning with; a new empty
                        table is created when None
        :param learn_rate: learning rate of this q learner
        :param discount_factor: discount factor of this q learner
        """
        self.possible_actions = {0:[1, 1], 1:[2, 1], 2:[3, 1], 3:[1, 2], 4:[2, 2], 5:[3, 2], 6:[1, 3], 7:[2, 3], 8:[3, 3]}
        if q_table is not None:
            # Keep the caller's table (same object) so learned values persist.
            self.q_table = q_table
        else:
            # Zero-initialised entries: np.empty would seed q-values with
            # whatever happens to be in memory.
            self.q_table = defaultdict(lambda: np.zeros([3, 3]))
        self.learn_rate = learn_rate
        self.discount_factor = discount_factor

    def _q_values(self, key):
        """Returns the 3x3 q-value array for key, creating a zero entry if missing."""
        # Explicit check so that a plain dict works as well as a defaultdict.
        if key not in self.q_table:
            self.q_table[key] = np.zeros([3, 3])
        return self.q_table[key]

    def update(self, state, new_state, action, result):
        """
        see QLearner
        :param action: index 0..8 of the chosen action in possible_actions
        :return: the updated q-table
        """
        state_key, new_state_key = np.array_str(state), np.array_str(new_state)
        row, col = divmod(action, 3)

        current = self._q_values(state_key)
        best_next = np.max(self._q_values(new_state_key))

        # Blend the old estimate with the discounted best value of the new state.
        current[row, col] = current[row, col] * (1 - self.learn_rate) \
                            + self.learn_rate * (result + self.discount_factor * best_next)

        return self.q_table

    def select_move(self, state, theta=0.9):
        """
        see QLearner

        With the default theta of 0.9 the table is only consulted when the
        random draw exceeds 0.9; otherwise a random action is returned.
        """
        if np.random.uniform(0, 1) > theta:
            # exploit: take the action with the highest q-value for this state
            idx = int(np.argmax(self._q_values(np.array_str(state))))
            return self.possible_actions[idx]

        # explore: sample uniformly from all actions (not only the free cells)
        return random.choice(list(self.possible_actions.values()))

In [5]:
class QNNLearner(QLearner):
    """
    QLearner specification backed by a value model (neural network).

    Transitions are buffered in memory and handed to model.learn_batch once
    batch_size of them have been collected.
    """

    def __init__(self, model = None, learn_rate=0.1, discount_factor=0.8, batch_size=2):
        """
        :param model: object providing calc_value(state, move) and learn_batch(memory)
        :param learn_rate: learning rate of this q learner
        :param discount_factor: discount factor of this q learner
        :param batch_size: number of buffered transitions that triggers a training run
        """
        self.possible_actions = {0:[1, 1], 1:[2, 1], 2:[3, 1], 3:[1, 2], 4:[2, 2], 5:[3, 2], 6:[1, 3], 7:[2, 3], 8:[3, 3]}
        self.learn_rate = learn_rate
        self.model = model
        self.discount_factor = discount_factor
        self.batch_size = batch_size
        self.memory = []
        self.count_memory = 0

    def update(self, state, new_state, action, result):
        """
        see QLearner

        Stores the transition and trains the model once enough were collected.
        """
        self.load_to_memory(state, action, new_state, result)
        self.count_memory += 1

        if self.count_memory >= self.batch_size:
            self.count_memory = 0
            # Offline training on the buffered transitions, then start a new buffer.
            self.model.learn_batch(self.memory)
            self.memory = []

    def load_to_memory(self, prev_state, prev_move, state, reward):
        """Appends one [prev_state, prev_move, state, reward] transition to the buffer."""
        self.memory.append([prev_state, prev_move, state, reward])

    def select_move(self, state, theta=0.1):
        """
        see QLearner

        The model is consulted when the random draw exceeds theta; otherwise a
        random action is returned.
        """
        if random.uniform(0, 1) > theta:
            idx = self.choose_optimal_move(state)
            return self.possible_actions[idx]

        # Sample from the action values explicitly instead of relying on the
        # dict keys happening to be 0..n-1.
        return random.choice(list(self.possible_actions.values()))

    def choose_optimal_move(self, state):
        """
        Evaluates every action with the model and returns the best one
        :param state: state of the environment
        :return: key of the highest valued action; ties are broken randomly
        """
        best_value = -float('Inf')
        best_moves = []
        for move in self.possible_actions:
            value = self.model.calc_value(state, move)

            if value > best_value:
                best_value = value
                best_moves = [move]
            elif best_value == value:
                best_moves.append(move)

        return random.choice(best_moves)


In [6]:
class QUtils:
    """
    Static helpers for printing, persisting and merging q-tables.
    """

    @staticmethod
    def pretty_print_q_table(dict_q_table):
        """
        prints a human readable representation of the given q_table
        :param dict_q_table: mapping of state key to a 3x3 array of q-values
        """
        str_field = "{a1} | {b1} | {c1}\n{a2} | {b2} | {c2}\n{a3} | {b3} | {c3}"

        for key, values in dict_q_table.items():
            # np.round instead of np.round_: the alias was removed in NumPy 2.0.
            values = np.round(values, 2)

            # The first index selects the printed column, the second the printed row.
            result = str_field.format(a1=values[0][0], b1=values[1][0], c1=values[2][0],
                                      a2=values[0][1], b2=values[1][1], c2=values[2][1],
                                      a3=values[0][2], b3=values[1][2], c3=values[2][2])

            pretty_key = str(key).replace("None", "' '").replace('[[', ' [').replace(']]', ']')
            print(f"{pretty_key}\n\n{result}\n\n-------------------------\n")

    @staticmethod
    def get_dict_from_file(filepath):
        """
        uses the given filepath to read a dict from the specified file
        :param filepath: the file to read in
        :return: the dict read from the file, or None when the file is missing or empty
        """
        if os.path.isfile(filepath) and os.path.getsize(filepath) > 0:
            # NOTE(security): dill.load can execute arbitrary code while
            # deserialising; only load files from a trusted source.
            with open(filepath, "rb") as file:
                return dill.load(file)

        return None

    @staticmethod
    def save_dict_to_file(filepath, dict):
        """
        save a dict to a specified file
        :param filepath: the file to write to
        :param dict: the savable dict (the name shadows the builtin; kept so
                     keyword callers keep working)
        """
        with open(filepath, 'wb') as file:
            dill.dump(dict, file)

    @staticmethod
    def merge_dicts(d0:dict, d1:dict):
        """
        merges 2 given dicts; on duplicate keys the value of d0 wins
        :param d0: first dict
        :param d1: second dict
        :return: a new dict (shallow copy of d0 extended by the missing keys of d1)
        """
        merged = d0.copy()
        for key, value in d1.items():
            merged.setdefault(key, value)

        return merged

In [7]:
from abc import abstractmethod
import os
from pathlib import Path
import keras.models as Km
import keras as K
import numpy as np
import time


class Model:
    """
    Wrapper around a Keras value network that scores (state, move) pairs.

    Subclasses provide create_model() and state_to_tensor(). The class does
    not derive from ABC (it is instantiated directly elsewhere in this file),
    so the @abstractmethod markers are documentation only and the base
    implementations simply return None.
    """

    # Shared file name used by earlier versions; still accepted when loading.
    _LEGACY_MODEL_FILE = 'model_values.h5'

    def __init__(self, tag):
        """
        :param tag: player tag; 1 selects the '_first' model file, anything
                    else the '_second' one
        """
        self.tag = tag
        self.epsilon = 0.1  # not read anywhere in this class
        self.alpha = 0.5    # step size used in calc_target
        self.gamma = 1      # discount applied to the best next value
        self.model = self.load_model()

    def _model_filename(self):
        """Returns the tag specific file name shared by load_model and save_model."""
        suffix = '_first' if self.tag == 1 else '_second'
        return 'model_values' + suffix + '.h5'

    def load_model(self):
        """
        Loads the stored network or builds a new one.

        The tag specific file written by save_model is tried first, then the
        legacy shared file; create_model() is used when neither exists.
        :return: the loaded or newly created model
        """
        for candidate in (self._model_filename(), self._LEGACY_MODEL_FILE):
            if Path(candidate).is_file():
                print('load model')
                model = Km.load_model(candidate)
                print('load model: ' + candidate)
                return model

        return self.create_model()

    @abstractmethod
    def create_model(self):
        """Builds and returns a new, untrained network."""
        pass

    @abstractmethod
    def state_to_tensor(self, state, move):
        """Converts a (state, move) pair into the input tensor of the network."""
        pass

    def calc_value(self, state, move):
        """
        :return: the network's prediction for playing move in state
        """
        tensor = self.state_to_tensor(state, move)
        value = self.model.predict(tensor)
        return value

    def calc_target(self, prev_state, prev_move, state, reward):
        """
        Computes the training target for the transition
        :param prev_state: state before the move
        :param prev_move: move that was played in prev_state
        :param state: state reached afterwards
        :param reward: reward observed for the transition
        :return: the reward itself when it is non-zero, otherwise the old value
                 moved by alpha towards gamma * (best predicted value in state)
        """
        if reward != 0:
            # A non-zero reward is used as the target directly, so no
            # predictions are needed.
            return reward

        qvalue = self.calc_value(prev_state, prev_move)
        tensor = self.state_to_tensor(prev_state, prev_move)
        # The width of the first plane of the tensor is the number of moves.
        n_moves = len(tensor[:,0][0])
        next_values = [self.calc_value(state, move) for move in range(n_moves)]

        v_s_tag = self.gamma * np.max(next_values)
        return np.array(qvalue + self.alpha * (reward + v_s_tag - qvalue))

    def train_model(self, prev_state, prev_move, target, epochs):
        """
        Fits the network on a single (state, move) -> target sample
        :param target: value to fit; nothing happens when it is None
        :param epochs: number of epochs passed to fit
        """
        tensor = self.state_to_tensor(prev_state, prev_move)

        if target is not None:

            if self.tag == 1:
                print('value before training:', self.model.predict(tensor))
            self.model.fit(tensor, target, epochs=epochs, verbose=0)

            if self.tag == 1:
                print('target:', target)
                print('value after training:', self.model.predict(tensor))

    def save_model(self):
        """Writes the network to the tag specific file, replacing an older copy."""
        s = self._model_filename()

        # Best effort removal of a previous file; only file system errors
        # (e.g. the file not existing yet) are ignored.
        try:
            os.remove(s)
        except OSError:
            pass

        self.model.save(s)

    def learn_batch(self, memory):
        """
        Trains the network on the collected transitions and saves it
        :param memory: list of transitions; every entry starts with
                       prev_state, prev_move, state and ends with the reward
        """
        print('start learning player', self.tag)
        print('data length:', len(memory))

        if not memory:
            # Nothing to fit on.
            return

        # build x_train
        x_train = np.zeros((len(memory), 2, 9))
        for ind, entry in enumerate(memory):
            prev_state, prev_move = entry[0], entry[1]
            x_train[ind, :, :] = self.state_to_tensor(prev_state, prev_move)

        # train with planning: targets are recomputed with the updated network
        # before every round, for at most 10 rounds or until the loss is small
        loss = 20
        count = 0
        while loss > 0.02 and count < 10:
            y_train = self.create_targets(memory)
            self.model.fit(x_train, y_train, epochs=5, batch_size=256, verbose=0)
            loss = self.model.evaluate(x_train, y_train, batch_size=256, verbose=0)[0]
            count += 1
            print('planning number:', count, 'loss', loss)

        loss = self.model.evaluate(x_train, y_train, batch_size=256, verbose=0)
        print('player:', self.tag, loss, 'loops', count)

        self.save_model()

    def create_targets(self, memory):
        """
        Builds the target column for learn_batch
        :param memory: list of transitions (see learn_batch); both the 4 element
                       form [prev_state, prev_move, state, reward] and the
                       5 element form with an extra entry before the reward
                       are accepted
        :return: array of shape (len(memory), 1) with one target per transition
        """
        y_train_ = np.zeros((len(memory), 1))
        for row, entry in enumerate(memory):
            prev_state_, prev_move_, state_ = entry[0], entry[1], entry[2]
            reward_ = entry[-1]
            y_train_[row, :] = self.calc_target(prev_state_, prev_move_, state_, reward_)

        return y_train_

In [8]:
import numpy as np
import keras.layers as kl
import keras.models as km
import keras.optimizers as ko


class TicTacToeModel(Model):
    """
    Model for a 3x3 tic-tac-toe board: the network input is a (2, 9) tensor
    made of a move plane and a board plane.
    """

    def __init__(self, tag):
        super().__init__(tag)

    def create_model(self):
        """
        Builds and compiles the dense value network
        :return: the compiled model
        """
        print('new model')

        model = Km.Sequential()
        model.add(kl.Flatten(input_shape=(2, 9)))
        # Seven hidden blocks of width 18 followed by one of width 9, each a
        # Dense layer with a LeakyReLU activation.
        for width in (18,) * 7 + (9,):
            model.add(kl.Dense(width))
            model.add(kl.LeakyReLU(alpha=0.3))
        model.add(kl.Dense(1, activation='linear'))

        model.compile(optimizer='Adam', loss='mean_absolute_error', metrics=['accuracy'])

        model.summary()

        return model

    def state_to_tensor(self, state, move):
        """
        :param state: board cells (flattened to 9 entries)
        :param move: index of the move to mark in the move plane
        :return: float32 tensor of shape (1, 2, 9): move plane first, board plane second
        """
        # flatten() copies, so the caller's state is not modified by the encoding.
        cells = self.one_hot_encode_state(np.array(state).flatten())

        move_plane = np.asarray(np.zeros(9)).astype('float32')
        move_plane[move] = 1

        board_plane = np.asarray(cells).astype('float32')

        return np.array((move_plane, board_plane)).reshape((1, 2, 9))

    def one_hot_encode_state(self, state):
        """
        Replaces the cell markers in place: None -> 0, 'x' -> 1, 'o' -> -1;
        any other value is left as it is
        :return: the same (modified) state object
        """
        for i, cell in enumerate(state):
            if cell is None:
                state[i] = 0
            elif cell == 'x':
                state[i] = 1
            elif cell == 'o':
                state[i] = -1

        return state

In [9]:
from abc import abstractmethod
import random
import numpy as np
import pickle

class Agent:
    """
    Game playing agent that picks moves with a value Model and collects the
    observed transitions for batch training.

    Subclasses have to provide ava_moves(state); choose_move additionally
    calls self.reward(winner), which is not defined in this class.
    """

    def __init__(self, tag, exploration_factor=1):
        """
        :param tag: player tag, passed on to the Model
        :param exploration_factor: a uniform draw below this value selects the
                                   model's best move, otherwise a random one
        """
        self.tag = tag
        self.exp_factor = exploration_factor
        self.prev_state = np.zeros(9)
        self.prev_move = -1  # -1 marks "no move played yet"
        self.state = None
        self.move = None
        self.print_value = False
        self.model = Model(self.tag)
        self.memory = []
        self.count_memory = 0
        self.winner_flag = False

    def choose_move(self, state, winner, learn):
        """
        Records the last transition and chooses the next move
        :param state: current state
        :param winner: None while the game is running
        :param learn: batch training only runs when this is exactly True
        :return: the chosen move, or None when the game is over
        """
        self.load_to_memory(self.prev_state, self.prev_move, state, self.ava_moves(state), self.reward(winner))

        if winner is not None:
            # Game finished: count it and reset the per-game bookkeeping.
            self.count_memory += 1

            self.prev_state = np.zeros(9)
            self.prev_move = -1
            print(self.count_memory)
            if learn is True and self.count_memory == 2:
                self.count_memory = 0
                # Offline training
                self.model.learn_batch(self.memory)
                self.memory = []
            return None

        p = random.uniform(0, 1)

        if p < self.exp_factor:
            idx = self.choose_optimal_move(state)
        else:
            idx = random.choice(self.ava_moves(state))

        self.prev_state = state
        self.prev_move = idx

        return idx

    def choose_optimal_move(self, state):
        """
        :return: the available move with the highest model value; ties are
                 broken randomly
        """
        best_value = -float('Inf')
        best_moves = []
        for move in self.ava_moves(state):
            value = self.model.calc_value(state, move)

            if value > best_value:
                best_value = value
                best_moves = [move]
            elif best_value == value:
                best_moves.append(move)

        return random.choice(best_moves)

    def learn(self, prev_state, prev_move, state, ava_moves, move, reward):
        """
        Online training on a single transition; skipped while no move was
        played yet (prev_move == -1). ava_moves and move are not used.
        """
        if prev_move != -1:
            target = self.model.calc_target(prev_state, prev_move, state, reward)
            self.model.train_model(prev_state, prev_move, target, 1)

    @abstractmethod
    def ava_moves(self, state):
        """Returns the moves available in state (to be provided by subclasses)."""
        pass

    def load_to_memory(self, prev_state, prev_move, state, ava_moves, reward):
        """Appends one [prev_state, prev_move, state, ava_moves, reward] entry."""
        self.memory.append([prev_state, prev_move, state, ava_moves, reward])

    def save_memory(self):
        """
        Pickles the memory to the first unused
        data4/value_list_<tag>_<n>.pkl file, creating the directory if needed.
        """
        count = 1
        s = 'data4/value_list_' + str(self.tag) + '_' + str(count) + '.pkl'
        while Path(s).is_file():
            count += 1
            s = 'data4/value_list_' + str(self.tag) + '_' + str(count) + '.pkl'

        # open() fails when the target directory is missing.
        Path(s).parent.mkdir(parents=True, exist_ok=True)

        with open(s, 'wb') as output:
            pickle.dump(self.memory, output)

In [48]:
import numpy as np

model = TicTacToeModel(1)
# Use the network the constructor already loaded instead of reloading it from
# disk for every move; it is bound to `net` so the `km` alias of keras.models
# is not overwritten.
net = model.model
# Zero-initialised result grids (np.ndarray(shape=...) leaves them uninitialised).
value = np.zeros((3, 3))
pstate = np.zeros((3, 3))
state = [-1, 0, -1, 0, 1, 0, 0, 1, 0]
for i in range(9):
    row, col = divmod(i, 3)
    tensor = model.state_to_tensor(state, i)
    # The prediction holds a single element; store it as a scalar.
    value[row][col] = np.asarray(net.predict(tensor)).item()
    pstate[row][col] = state[i]



load model
load model: model_values_first.h5
load model
load model: model_values_first.h5


ValueError: in user code:

    C:\Users\ke-ch\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\training.py:1478 predict_function  *
        return step_function(self, iterator)
    C:\Users\ke-ch\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\training.py:1468 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\ke-ch\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\distribute\distribute_lib.py:1259 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\ke-ch\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\distribute\distribute_lib.py:2730 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\ke-ch\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\distribute\distribute_lib.py:3417 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\ke-ch\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\training.py:1461 run_step  **
        outputs = model.predict_step(data)
    C:\Users\ke-ch\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\training.py:1434 predict_step
        return self(x, training=False)
    C:\Users\ke-ch\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\base_layer.py:998 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs, self.name)
    C:\Users\ke-ch\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\input_spec.py:234 assert_input_compatibility
        raise ValueError('Input ' + str(input_index) + ' of layer ' +

    ValueError: Input 0 of layer sequential_7 is incompatible with the layer: : expected min_ndim=4, found ndim=3. Full shape received: (None, 2, 9)


In [47]:
# Copy the flat state into the 3x3 grid, row by row.
for i in range(9):
    row, col = divmod(i, 3)
    pstate[row][col] = state[i]
print(value)
print(pstate)
best = np.argmax(value)
print(f"Optimal: {best}")
# newstate is another name for pstate, so marking the move changes pstate too.
newstate = pstate
best_row, best_col = divmod(int(best), 3)
newstate[best_row][best_col] = 1
print(newstate)

[[-0.35171202 -0.39011419 -0.36668181]
 [-0.3843089  -0.35754976 -0.40643951]
 [-0.26949695 -0.39932159 -0.33464077]]
[[-1.  0. -1.]
 [ 0.  1.  0.]
 [ 0.  1.  0.]]
Optimal: 6
[[-1.  0. -1.]
 [ 0.  1.  0.]
 [ 1.  1.  0.]]


In [13]:
from Game import Game
from tqdm.notebook import trange

In [None]:
for j in trange(100):
    # New players (and therefore new model wrappers) for every outer round.
    learner_x = QNNLearner(model=TicTacToeModel(1))
    player1 = QPlayer('x', learner_x)
    learner_o = QNNLearner(model=TicTacToeModel(-1))
    player2 = QPlayer('o', learner_o)
    players = [player1, player2]

    game = Game(players)

    # 1000 games per round with the same pair of players.
    for _ in range(1000):
        game.run()

  0%|          | 0/100 [00:00<?, ?it/s]

new model
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 18)                0         
_________________________________________________________________
dense (Dense)                (None, 18)                342       
_________________________________________________________________
leaky_re_lu (LeakyReLU)      (None, 18)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 18)                342       
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 18)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 18)                342       
_________________________________________________________________
leaky_re_lu_2 (LeakyReLU)    (None, 18)       