In [None]:
import numpy as np
import itertools
import random
import math
from typing import Union
from datetime import timedelta, datetime

import import_ipynb
from index import Game, TreeGame, TreeStatsGame

# Implementation of the agents

This notebook relies on the [implementation of the game rules](./rules.ipynb) to implement agents that play Awalé.
These agents range from very simple (the random agent) to quite sophisticated (greedy-weighted UCT).

In [1]:
def max_rand(iterable, key=lambda x: x):
    '''Get the greatest element of an iterable and solve ties by a coin flip.

    Parameters:
        iterable: any iterable, including one-shot iterators and generators.
        key: one-argument function returning the value used for comparison.

    Returns:
        One of the elements whose key is maximal, picked uniformly at random
        among the tied elements.

    Raises:
        ValueError: if ``iterable`` is empty.
    '''
    # Materialize (score, element) pairs once: the input may be a one-shot
    # iterator that cannot be traversed twice, and this also guarantees that
    # `key` is evaluated exactly once per element.
    scored = [(key(x), x) for x in iterable]
    if not scored:
        raise ValueError("max_rand() arg is an empty iterable")

    maximum_value = max(score for score, _ in scored)
    keep = [x for score, x in scored if score == maximum_value]
    return random.choice(keep)

importing Jupyter notebook from index.ipynb
importing Jupyter notebook from /home/nikita/Code/Thesis/source/lib/draw.ipynb


# Player class

First we implement a player class. A player keeps track of the game state internally. At each turn of the game, a player is called with the method `play()` to receive the action played by the opponent (and thus update its internal state). It then chooses an action with `get_action()`, updates its internal state once more, and outputs its action for the other player.

In [2]:
class Player:
    '''Base class of all agents.

    Subclasses are expected to set ``self.root`` (the current game state) and
    ``self.player_id`` and to implement ``get_action()``.
    '''

    def play(self, their_action):
        '''Apply the opponent's move, choose our own move, apply it and return it.

        ``their_action`` is ``-1`` when there is no previous move, which is only
        valid for the first player (``player_id == 0``).
        '''
        if their_action == -1:
            # No previous action: we must be the player opening the game
            assert self.player_id == 0, "Only the first player can have their_action=-1"
        else:
            # Replay the opponent's move on our own copy of the state
            state_after_theirs, _, _ = self.root.step(their_action)
            self.root = state_after_theirs

        chosen = self.get_action()
        state_after_ours, _, _ = self.root.step(chosen)
        self.root = state_after_ours

        return chosen


## Simple agents
The random player just chooses an action at random among all the legal actions.

In [3]:
class RandomPlayer(Player):
    '''Agent that picks uniformly at random among the legal actions.'''

    def __init__(self, player_id):
        self.player_id = player_id
        self.root = Game()

    def get_action(self):
        '''Return one legal action of the current state, chosen at random.'''
        candidates = self.root.legal_actions
        return random.choice(candidates)

The greedy player tries all legal actions and chooses the one that gives it the best immediate reward: the number of seeds captured at that turn.

In [4]:
class GreedyPlayer(Player):
    '''Agent that plays the move with the best immediate outcome.

    With probability ``eps`` a random legal move is played instead.
    '''

    def __init__(self, player_id, eps=0):
        self.player_id = player_id
        self.eps = eps
        self.root = Game()

    def get_action(self):
        '''Evaluate every legal move one step ahead and return the chosen one.'''
        candidates = []
        for move in self.root.legal_actions:
            outcome, seeds_captured, _ = self.root.step(move)
            victor = outcome.winner
            if victor is None:
                result = 0
            else:
                result = 1 if victor == self.player_id else -1
            candidates.append((move, seeds_captured, result))

        def rank(candidate):
            # Winning moves first, then most captures, then a random tie-break
            _, seeds_captured, result = candidate
            return (-result, -seeds_captured, random.random())

        ranked = sorted(candidates, key=rank)

        explore = random.random() < self.eps
        if explore:
            return random.choice(self.root.legal_actions)
        return ranked[0][0]

## MCTS

Now come the agents using the MCTS algorithm.

The MCTS first chooses a node to expand with `tree_policy()`. Once that node is found, a playout is run from it with `default_policy()`. When a terminal node is reached, the counters are updated. This is repeated `BUDGET` times, and the final action is then chosen as the action that has the highest number of wins.

Both policies in this implementation are random walks.

In [5]:
class MCTSPlayer(Player):
    '''Monte Carlo tree search agent whose tree policy and playouts are random walks.

    ``budget`` is either an ``int`` (number of ``explore_tree()`` iterations
    run before each move) or a ``timedelta`` (wall-clock time spent exploring
    before each move).
    '''

    def __init__(self, player_id, budget: Union[int, timedelta]):
        self.root = TreeStatsGame()
        self.player_id = player_id
        self.budget = budget

    def tree_policy(self, node):
        '''Walk down from ``node`` until a node with ``is_leaf_game`` set is reached.

        Fully expanded nodes are left through a random expanded child; other
        nodes are left by stepping through a random unvisited legal action.
        '''
        while not node.is_leaf_game:
            if node.is_fully_expanded:
                node = random.choice(node.expanded_children)
            else:
                action = random.choice(node.legal_unvisited_actions)
                node, _, _ = node.step(action)
        return node

    def explore_tree(self):
        '''Run one iteration: select a node, play it out, update the statistics.'''
        # Choose a starting node
        node = self.tree_policy(self.root)

        # Run a simulation on that node
        finished = node.game_finished
        while not finished:
            action = self.default_policy(node)
            node, _, finished = node.step(action)

        # Backtrack stats
        node.update_stats(node.winner)

    def default_policy(self, node):
        '''Return the action to play from ``node`` during a playout.'''
        # Random walk
        return random.choice(node.legal_actions)

    def action_score(self, x):
        '''Score action ``x`` of the root by the wins recorded for this player.

        Actions whose child is ``None`` get a random score in ``(-1, 0]``, so
        they never outrank a child with a non-negative win count.
        '''
        node = self.root.children[x]
        if node is None:
            return -random.random()

        assert self.root.current_player == self.player_id
        assert node.current_player != self.player_id

        return node.wins[self.player_id]

    def get_action(self):
        '''Spend the budget exploring the tree, then return the best-scored legal action.

        Raises:
            TypeError: if ``self.budget`` is neither an ``int`` nor a ``timedelta``.
        '''
        if isinstance(self.budget, int):
            for _ in range(self.budget):
                self.explore_tree()
        elif isinstance(self.budget, timedelta):
            start = datetime.now()
            end = start + self.budget
            while datetime.now() < end:
                self.explore_tree()
        else:
            # Must read the attribute: a bare `budget` is not defined in this
            # scope and would raise NameError instead of the intended TypeError.
            raise TypeError("budget should be Union[int, timedelta], not %s" % type(self.budget))

        possible_actions = self.root.legal_actions
        return max(possible_actions, key=self.action_score)

## Upper confidence for trees UCT

`UCTPlayer` reuses the MCTS agent but overrides `tree_policy` to select nodes with UCT.

In [6]:
class UCTPlayer(MCTSPlayer):
    '''MCTS agent whose tree policy picks children by their UCT score.

    ``c`` is the weight of the exploration term in ``node_score()``.
    '''

    def __init__(self, player_id, budget: Union[int, timedelta], c: float):
        super().__init__(player_id, budget)
        self.c = c

    def node_score(self, node):
        '''Return the win-rate term of ``node`` plus ``c`` times its exploration term.'''
        # Exploitation: share of playouts won by the opponent of the node's
        # current player (1 is added to the playout count in the denominator)
        win_rate = node.wins[node.current_opponent] / (node.n_playouts + 1)
        # Exploration: grows with the parent's playouts, shrinks with the node's own
        parent_playouts = node.parent().n_playouts
        exploration_bonus = math.sqrt(math.log(parent_playouts) / (node.n_playouts + 1))
        return win_rate + self.c * exploration_bonus

    def tree_policy(self, node):
        '''Walk down from ``node``, following the best-scored child of fully expanded nodes.'''
        current = node
        while not current.is_leaf_game:
            if not current.is_fully_expanded:
                # Some actions were never tried here: expand one at random
                untried = random.choice(current.legal_unvisited_actions)
                current, _, _ = current.step(untried)
                continue
            current = max_rand(current.expanded_children, key=self.node_score)
        return current

`GreedyUCTPlayer` subclasses `UCTPlayer` and overrides `default_policy` to give more weight to the actions that yield a larger immediate reward.

In [7]:
class GreedyUCTPlayer(UCTPlayer):
    '''UCT agent whose playouts favour actions with larger immediate captures.'''

    def default_policy(self, node):
        '''Sample a legal action of ``node`` with weight ``captures + 1``.'''
        assert len(node.legal_actions) != 0

        # The +1 keeps every weight positive, so non-capturing moves stay possible
        weights = []
        for action in node.legal_actions:
            _, captured, _ = node.step(action)
            weights.append(captured + 1)

        picked = random.choices(node.legal_actions, weights=weights)
        return picked[0]

## Human player

As a bonus, we implement a player that uses terminal/notebook inputs to choose its actions from a human.
This enables us to play against the machine.

In [8]:
 class HumanPlayer(Player):
    '''Agent whose actions are typed in by a human on standard input.'''

    def __init__(self, player_id):
        self.root = Game()
        self.player_id = player_id

    def get_action(self):
        '''Show the current state and prompt until a legal action is entered.

        Input that is not an integer is reported as an illegal move and the
        prompt is repeated, instead of aborting the game with a ValueError.
        '''
        self.root.show_state()
        action = -1
        while action not in self.root.legal_actions:
            try:
                action = int(input("Input move [0-5]: "))
            except ValueError:
                # Not a number (e.g. empty line or a typo): reset to the
                # initial placeholder so it is handled like any illegal move
                action = -1
            if action not in self.root.legal_actions:
                print("Illegal move")
            print("\n")
        return action