# Задание 1. Табличное Q-обучение

In [1]:
import numpy as np
import pickle
from tqdm.notebook import tqdm
import random
import matplotlib.pyplot as plt
from numba import njit

In [2]:
@njit(cache=True)
def availablePositions(board):
    """Return the coordinates of every cell of ``board`` that equals 0.

    The result is a list of ``(row, col)`` index tuples, in the order
    produced by ``np.where`` on the 2-D board.
    """
    empty_rows, empty_cols = np.where(board == 0)
    return [(r, c) for r, c in zip(empty_rows, empty_cols)]

In [3]:
@njit('i8[:](i8[:,:],i8, i8, i8)', cache=True)
def who_winner(board, board_rows, board_cols, num_counts):
    """Report whether the game on ``board`` is over and who won.

    A side wins when it owns ``num_counts`` consecutive cells along a row,
    a column, or either diagonal direction. Cells hold 1 (first player),
    -1 (second player) or 0 (empty).

    Unlike a whole-line sum check, this scan works for rectangular boards
    and for ``num_counts`` smaller than the board side. For a square board
    with ``num_counts`` equal to the side it reports the same outcomes as a
    full-row / full-column / main-diagonal check.

    Parameters:
        board: 2-D int64 array of shape (board_rows, board_cols).
        board_rows: number of rows to scan.
        board_cols: number of columns to scan.
        num_counts: run length required to win; values below 1 mean that
            no win can be detected.

    Returns:
        int64 array ``[is_over, winner]``:
        ``[1, 1]`` / ``[1, -1]`` when the corresponding mark has a winning
        run, ``[1, 0]`` when the board is full without a winner (draw),
        ``[0, 0]`` when the game is still in progress.
    """
    has_empty = False
    for row in range(board_rows):
        for col in range(board_cols):
            mark = board[row, col]
            if mark == 0:
                has_empty = True
                continue
            if num_counts < 1:
                continue
            # Try to start a run at (row, col) in each of the four forward
            # directions: right, down, down-right, down-left. Scanning only
            # "forward" is enough because every run has a unique start cell.
            for direction in range(4):
                if direction == 0:
                    d_row = 0
                    d_col = 1
                elif direction == 1:
                    d_row = 1
                    d_col = 0
                elif direction == 2:
                    d_row = 1
                    d_col = 1
                else:
                    d_row = 1
                    d_col = -1

                # Skip runs that would leave the board; this bounds check is
                # what keeps indexing safe on non-square boards.
                last_row = row + (num_counts - 1) * d_row
                last_col = col + (num_counts - 1) * d_col
                if last_row >= board_rows or last_col < 0 or last_col >= board_cols:
                    continue

                complete = True
                for step in range(1, num_counts):
                    if board[row + step * d_row, col + step * d_col] != mark:
                        complete = False
                        break
                if complete:
                    return np.array([1, mark])

    if not has_empty:
        # Board is full and nobody completed a run: draw.
        return np.array([1, 0])
    return np.array([0, 0])

In [4]:
class TicTacToe:
    """Game environment that lets two agents play, train and be evaluated.

    ``p1`` always moves first and places the mark 1; ``p2`` moves second and
    places the mark -1. Empty cells hold 0. Both agents are expected to
    provide ``chooseAction``, ``addState``, ``feedReward``, ``reset``,
    ``get_exp_rate`` and ``set_exp_rate``.
    """

    def __init__(self, p1, p2, board_rows, board_cols, num_counts):
        """Create an empty board of ``board_rows`` x ``board_cols`` cells.

        ``num_counts`` is forwarded to ``who_winner`` as the run length
        needed to win.
        """
        # int64 explicitly: ``who_winner`` is compiled for an ``i8[:,:]``
        # board, and the platform default ``int`` is not always 64-bit.
        self.board = np.zeros((board_rows, board_cols), dtype=np.int64)
        self.p1 = p1  # player 1 (mark 1, moves first)
        self.p2 = p2  # player 2 (mark -1)
        self.isEnd = False
        self.playerName = 1  # mark of the side whose turn it is
        self.board_rows = board_rows
        self.board_cols = board_cols
        self.num_counts = num_counts

    def getHash(self):
        """Return the raw bytes of the board, used as a state key."""
        return self.board.tobytes()

    def winner(self):
        """Update ``isEnd`` and return the outcome of the current board.

        Returns ``None`` while the game is in progress, otherwise 1 (p1 won),
        -1 (p2 won) or 0 (draw).
        """
        outcome = who_winner(self.board, self.board_rows, self.board_cols, self.num_counts)
        self.isEnd = bool(outcome[0])
        if not self.isEnd:
            return None
        return int(outcome[1])

    def updateState(self, position):
        """Place the current side's mark at ``position`` and switch sides."""
        self.board[position] = self.playerName
        self.playerName = -1 if self.playerName == 1 else 1

    def giveReward(self):
        """Feed the final reward to both agents: +1 winner, -1 loser, 0 otherwise."""
        result = self.winner()
        if result == 1:
            self.p1.feedReward(1)
            self.p2.feedReward(-1)
        elif result == -1:
            self.p1.feedReward(-1)
            self.p2.feedReward(1)
        else:
            self.p1.feedReward(0)
            self.p2.feedReward(0)

    # board reset
    def reset(self):
        """Clear the board and give the first move back to player 1."""
        self.board = np.zeros((self.board_rows, self.board_cols), dtype=np.int64)
        self.isEnd = False
        self.playerName = 1

    def fit(self, rounds=1000, delta_i_valid=100, n_test=100,
            set_change_expl=(0, 1), is_change_expl=(True, True)):
        """Train both agents by self-play, periodically evaluating them.

        Parameters:
            rounds: number of training games.
            delta_i_valid: an evaluation is run every ``delta_i_valid``
                rounds (must be non-zero).
            n_test: number of evaluation games per evaluation (must be
                non-zero, the mean outcome is ``sum / n_test``).
            set_change_expl: exploration rates passed to ``set_exp_rate`` of
                (p1, p2) for the duration of each evaluation game.
            is_change_expl: per-agent flags telling whether the matching
                entry of ``set_change_expl`` is applied.

        Each evaluation appends a point to the live plot and prints the mean
        outcome and the win / draw / loss counts from p1's point of view.
        """
        self._fig, self._ax = plt.subplots(nrows=2, figsize=(8, 12))
        dict_data = {-1: [], 0: [], 1: []}
        mean_data = []
        data_x = []
        for i in tqdm(range(rounds)):
            if i % delta_i_valid == 0:
                counts = 0
                dict_res = {-1: 0, 0: 0, 1: 0}
                for _ in range(n_test):
                    res = self.play_one_game(set_change_expl=set_change_expl,
                                             is_change_expl=is_change_expl)
                    dict_res[res] += 1
                    counts += res

                data_x.append(i)
                mean_data.append(counts / n_test)
                dict_data[1].append(dict_res[1])
                dict_data[0].append(dict_res[0])
                dict_data[-1].append(dict_res[-1])
                self._draw_plot(data_x, mean_data, dict_data[1], dict_data[0], dict_data[-1])

                print("Rounds {}, mean ={}, win = {}, draw = {}, loose = {}".format(i,
                                                                                    counts / n_test,
                                                                                    dict_res[1],
                                                                                    dict_res[0],
                                                                                    dict_res[-1]))

            # Reset AFTER the evaluation: evaluation games leave the board in
            # a finished state, which would otherwise make the training game
            # of this round a no-op.
            self.reset()
            self._train_one_game()

    def _train_one_game(self):
        """Play one self-play game, recording visited states and feeding rewards."""
        while not self.isEnd:
            for player in (self.p1, self.p2):
                positions = availablePositions(self.board)
                action = player.chooseAction(positions, self.board, self.playerName)
                self.updateState(action)
                player.addState(self.getHash())
                if self.winner() is not None:
                    self.giveReward()
                    self.p1.reset()
                    self.p2.reset()
                    self.reset()
                    return

    def play_one_game(self, set_change_expl=(0, 1), is_change_expl=(False, False)):
        """Play one game without learning and return its outcome.

        For each agent whose flag in ``is_change_expl`` is true, the
        exploration rate is temporarily replaced by the matching entry of
        ``set_change_expl`` and restored afterwards, even if the game raises.

        Returns 1 if p1 won, -1 if p2 won, 0 for a draw. The board is left in
        its final state.
        """
        self.reset()
        players = (self.p1, self.p2)
        overridden = []  # (player, previous exploration rate)
        try:
            for idx in range(2):
                if is_change_expl[idx]:
                    player = players[idx]
                    overridden.append((player, player.get_exp_rate()))
                    player.set_exp_rate(set_change_expl[idx])

            # A freshly reset board is never finished, so the loop can only
            # end by returning an outcome.
            while True:
                for player in players:
                    positions = availablePositions(self.board)
                    action = player.chooseAction(positions, self.board, self.playerName)
                    self.updateState(action)
                    result = self.winner()
                    if result is not None:
                        return result
        finally:
            for player, previous_rate in overridden:
                player.set_exp_rate(previous_rate)

    def _draw_plot(self, data_x, data_y, data_y1, data_y2, data_y3):
        """Redraw the mean-outcome plot and the win / draw / loss counts plot."""
        mean_ax, counts_ax = self._ax[0], self._ax[1]

        mean_ax.clear()
        mean_ax.plot(data_x, data_y)
        mean_ax.set_title('Зависимость среднего выигрыша от итерации обучения')
        mean_ax.set_ylabel('Средний выигрыш')
        mean_ax.set_xlabel('Итерация обучения')
        mean_ax.grid()

        counts_ax.clear()
        counts_ax.plot(data_x, data_y1, label="Выигрыш")
        counts_ax.plot(data_x, data_y2, label="Ничья")
        counts_ax.plot(data_x, data_y3, label="Поражение")
        counts_ax.legend(loc='best', bbox_to_anchor=(0.95, 0.1, 0.05, 0.3))

        counts_ax.set_title('Зависимость числа исходов событий для стратегии "X" от итерации обучения')
        counts_ax.set_ylabel('Число игр')
        counts_ax.set_xlabel('Итерация обучения')
        counts_ax.grid()
        self._fig.canvas.draw()

    def showBoard(self):
        """Print the board as text; p1 is shown as 'x', p2 as 'o'."""
        tokens = {1: 'x', -1: 'o', 0: ' '}
        # Each cell prints as "x | " after a leading "|", so the rule is
        # 4 characters per column plus one (13 dashes for 3 columns).
        separator = '-' * (4 * self.board_cols + 1)
        for i in range(self.board_rows):
            print(separator)
            out = '| '
            for j in range(self.board_cols):
                out += tokens.get(int(self.board[i, j]), '?') + ' | '
            print(out)
        print(separator)

In [5]:
class Player:
    """Tabular value-learning agent with epsilon-greedy move selection.

    The agent keeps a table ``states_value`` mapping board hashes (the bytes
    of the board array) to estimated values, and the list ``states`` of
    hashes visited in the current game.
    """

    def __init__(self, name, board_cols, board_rows, exp_rate=0.3, lr=0.05, decay_gamma=0.9):
        """Create an agent.

        Parameters:
            name: identifier, used in the file name written by ``savePolicy``.
            board_cols, board_rows: board dimensions (stored only).
            exp_rate: probability threshold for picking a random move.
            lr: learning rate of the value update.
            decay_gamma: discount applied to the propagated reward.
        """
        self.name = name
        self.states = []  # hashes of states visited during the current game
        self.lr = lr
        self.exp_rate = exp_rate
        self.decay_gamma = decay_gamma
        self.states_value = {}  # state hash -> estimated value
        self.board_cols = board_cols
        self.board_rows = board_rows

    def getHash(self, board):
        """Return the raw bytes of ``board``, used as a key in ``states_value``."""
        return board.tobytes()

    def get_exp_rate(self):
        """Return the current exploration rate."""
        return self.exp_rate

    def set_exp_rate(self, exp_rate):
        """Replace the exploration rate."""
        self.exp_rate = exp_rate

    def chooseAction(self, positions, current_board, symbol):
        """Pick one of ``positions`` for the side playing ``symbol``.

        With probability governed by ``exp_rate`` a uniformly random position
        is returned. Otherwise each candidate move is applied to a copy of
        ``current_board`` and the move leading to the highest-valued state is
        chosen; states missing from the table get a random value in [0, 1)
        for this comparison only. Ties go to the later position.

        Raises:
            ValueError: if ``positions`` is empty.
        """
        # len() rather than truthiness so any indexable sequence is accepted.
        if len(positions) == 0:
            raise ValueError("chooseAction called with no available positions")

        if random.random() <= self.exp_rate:
            return random.choice(positions)

        # -inf guarantees that the first candidate is always accepted.
        best_value = float('-inf')
        best_action = None
        for candidate in positions:
            next_board = current_board.copy()
            next_board[candidate] = symbol
            value = self.states_value.get(self.getHash(next_board))
            if value is None:
                value = random.random()
            if value >= best_value:
                best_value = value
                best_action = candidate
        return best_action

    def addState(self, state):
        """Record ``state`` as visited in the current game."""
        self.states.append(state)

    def feedReward(self, reward):
        """Propagate the final ``reward`` backwards through the visited states.

        Going from the last visited state to the first, each value moves
        towards ``decay_gamma * reward`` by a fraction ``lr``; the updated
        value then becomes the reward for the preceding state. States not yet
        in the table are initialised with a random value in [0, 1).
        """
        for state in reversed(self.states):
            if self.states_value.get(state) is None:
                self.states_value[state] = random.random()
            self.states_value[state] += self.lr * (self.decay_gamma * reward - self.states_value[state])
            reward = self.states_value[state]

    def reset(self):
        """Forget the states visited in the current game."""
        self.states = []

    def savePolicy(self):
        """Pickle the value table to the file ``policy_<name>`` in the working directory."""
        with open('policy_' + str(self.name), 'wb') as fw:
            pickle.dump(self.states_value, fw)

    def loadPolicy(self, file):
        """Replace the value table with the one pickled in ``file``."""
        # NOTE(security): unpickling can execute arbitrary code; only load
        # policy files that come from a trusted source.
        with open(file, 'rb') as fr:
            self.states_value = pickle.load(fr)

# Игра 3x3

In [9]:
%matplotlib notebook
board_cols = 3 
board_rows = 3
num_counts = 3

p1 = Player("p1", board_cols, board_rows)
p2 = Player("p2", board_cols, board_rows)

game = TicTacToe(p1, p2, board_cols, board_rows, num_counts)
print("training...")
game.fit(rounds=50000, delta_i_valid = 5000, n_test=50000)

training...


<IPython.core.display.Javascript object>

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=50001.0), HTML(value='')))

Rounds 0, mean =0.30362, win = 29343, draw = 6495, loose = 14162
Rounds 5000, mean =0.51188, win = 35739, draw = 4116, loose = 10145
Rounds 10000, mean =0.83502, win = 44347, draw = 3057, loose = 2596
Rounds 15000, mean =0.83854, win = 44008, draw = 3911, loose = 2081
Rounds 20000, mean =0.90184, win = 46584, draw = 1924, loose = 1492
Rounds 25000, mean =0.98878, win = 49439, draw = 561, loose = 0
Rounds 30000, mean =0.98684, win = 49342, draw = 658, loose = 0
Rounds 35000, mean =0.98946, win = 49473, draw = 527, loose = 0
Rounds 40000, mean =0.98892, win = 49446, draw = 554, loose = 0
Rounds 45000, mean =0.98916, win = 49458, draw = 542, loose = 0
Rounds 50000, mean =0.98694, win = 49347, draw = 653, loose = 0



# Игра 4x4

In [7]:
%matplotlib notebook
board_cols = 4 
board_rows = 4
num_counts = 4

p1 = Player("p1", board_cols, board_rows)
p2 = Player("p2", board_cols, board_rows)

game = TicTacToe(p1, p2, board_cols, board_rows, num_counts)
print("training...")
game.fit(rounds=12000000, delta_i_valid = 1000000, n_test=100000)

training...


<IPython.core.display.Javascript object>

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12000000.0), HTML(value='')))

Rounds 0, mean =0.04944, win = 31693, draw = 41558, loose = 26749
Rounds 1000000, mean =0.26162, win = 42793, draw = 40576, loose = 16631
Rounds 2000000, mean =0.39812, win = 52483, draw = 34846, loose = 12671
Rounds 3000000, mean =0.48611, win = 59338, draw = 29935, loose = 10727
Rounds 4000000, mean =0.51007, win = 60494, draw = 30019, loose = 9487
Rounds 5000000, mean =0.45048, win = 56692, draw = 31664, loose = 11644
Rounds 6000000, mean =0.58737, win = 66148, draw = 26441, loose = 7411
Rounds 7000000, mean =0.54614, win = 62322, draw = 29970, loose = 7708
Rounds 8000000, mean =0.57241, win = 64751, draw = 27739, loose = 7510
Rounds 9000000, mean =0.59391, win = 66519, draw = 26353, loose = 7128
Rounds 10000000, mean =0.74652, win = 78512, draw = 17628, loose = 3860
Rounds 11000000, mean =0.78085, win = 81022, draw = 16041, loose = 2937

