# Lab 2: ES

## Task

Write agents able to play [*Nim*](https://en.wikipedia.org/wiki/Nim), with an arbitrary number of rows and an upper bound $k$ on the number of objects that can be removed in a turn (a.k.a., *subtraction game*).

The goal of the game is to **avoid** taking the last object.

* Task 2.1: An agent using fixed rules based on *nim-sum* (i.e., an *expert system*)
* Task 2.2: An agent using evolved rules using ES

In [99]:
import logging
from pprint import pprint, pformat
from collections import namedtuple
from random import random, choice, randint
from copy import deepcopy
from typing import Callable, Literal
from dataclasses import dataclass, field
from tqdm.notebook import trange
import pickle
import numpy as np

## The *Nim* and *Nimply* classes

In [2]:
# named tuple to indicate a possible Nim ply
Nimply = namedtuple("Nimply", "row, num_objects")

In [3]:
class Nim:
    """
    Class implementing the Nim game.
    """

    def __init__(self, num_rows: int, k: int = None) -> None:
        """
        Game constructor.

        Args:
            num_rows: number of rows (piles);
            k: maximum number of objects you can nim from a row.

        Returns:
            None.
        """
        self._rows = [i * 2 + 1 for i in range(num_rows)]
        self._k = k

    def __bool__(self):
        return sum(self._rows) > 0

    def __str__(self):
        return "<" + " ".join(str(_) for _ in self._rows) + ">"

    def __repr__(self):
        return "<" + " ".join(str(_) for _ in self._rows) + ">"

    @property
    def rows(self) -> tuple:
        return tuple(self._rows)

    @property
    def k(self) -> int:
        return self._k

    def nimming(self, ply: Nimply) -> None:
        """
        Update the game by performing a ply.

        Args:
            ply: ply to perform.

        Returns:
            None.
        """
        row, num_objects = ply
        assert self._rows[row] >= num_objects
        assert self._k is None or num_objects <= self._k, f"{num_objects=}, {self._k=}"
        self._rows[row] -= num_objects

## Game Definition

In [4]:
def play_game(nim: Nim, strategy1: Callable[[Nim], Nimply], strategy2: Callable[[Nim], Nimply]) -> Literal[0, 1]:
    """
    Play a Nim game using the given strategies.

    Args:
        nim: Nim game instance;
        strategy1: Player 0 strategy;
        strategy2: Player 1 strategy.

    Returns:
        player: the winning player.
    """
    logging.getLogger().setLevel(logging.WARN)

    strategy = (strategy1, strategy2)

    logging.info(f"init : {nim}")
    player = 0
    while nim:
        ply = strategy[player](nim)
        logging.info(f"ply: player {player} plays {ply}, {nim_sum(nim)}")
        nim.nimming(ply)
        logging.info(f"status: {nim}")
        player = 1 - player
    logging.info(f"status: Player {player} won!")

    return player


def play_games(
    nim: Nim,
    player: int,
    player_strategy: Callable[[Nim], Nimply],
    opponent_strategy: Callable[[Nim], Nimply],
    n_matches: int,
) -> list[int]:
    """
    Play a given number of matches on a Nim game instance.

    Args:
        nim: Nim game instance;
        player: choose if your strategy is played by the first or second player;
        player_strategy: your player strategy;
        opponent_strategy: your opponent strategy;
        n_matches: number of matchers to play.

    Returns:
        List history of the winning players.
    """
    return [
        play_game(deepcopy(nim), player_strategy, opponent_strategy)
        if player == 0
        else play_game(deepcopy(nim), opponent_strategy, player_strategy)
        for _ in range(n_matches)
    ]

In [114]:
def streak(player_strategy, opponent_strategy, n_matches):
    """
    Play a given number of random matches between two strategies.

    Args:
        player_strategy: your player strategy;
        opponent_strategy: your opponent strategy;
        n_matches: number of matchers to play.

    Returns:
        ercentage of wins.
    """
    wins = 0
    for _ in range(n_matches):
        random_size = randint(4, 10)
        random_k = choice([None, None, *[randint(2, random_size * 2 + 1) for _ in range(2)]])
        nim = Nim(random_size, random_k)
        player = choice([0, 1])
        wins += 1 if play_games(nim, player, player_strategy, opponent_strategy, 1)[0] == player else 0
    return wins / n_matches

## Rule-based strategies 

In [5]:
def pure_random(state: Nim) -> Nimply:
    """
    Perform a completely random move.

    Args:
        state: Nim game instance.

    Returns:
        A ply is returned.
    """
    row = choice([r for r, c in enumerate(state.rows) if c > 0])
    num_objects = randint(1, state.rows[row] if state.k is None else min(state.rows[row], state.k))
    return Nimply(row, num_objects)

In [6]:
def gabriele(state: Nim) -> Nimply:
    """
    Pick always the maximum possible number of the lowest row.

    Args:
        state: Nim game instance.

    Returns:
        A ply is returned.
    """
    possible_moves = [
        (r, o) for r, c in enumerate(state.rows) for o in range(1, c + 1 if state.k is None else min(c + 1, state.k))
    ]
    return Nimply(*max(possible_moves, key=lambda m: (-m[0], m[1])))

In [7]:
def nim_sum(state: Nim) -> int:
    """
    Compute nim-sum value on a Nim game instance.

    Args:
        state: Nim game instance.

    Returns:
        The nim-sum value of the current game is returned.
    """
    tmp = np.array([tuple(int(x) for x in f"{c:032b}") for c in state.rows])
    xor = tmp.sum(axis=0) % 2
    return int("".join(str(_) for _ in xor), base=2)


def generate_all_plies(state: Nim) -> list[Nimply]:
    """
    Generate all possible plies on the current game.

    Args:
        state: Nim game instance.

    Returns:
        A list of plies is returned.
    """
    return [
        Nimply(r, o)
        for r, c in enumerate(state.rows)
        for o in range(1, c + 1 if state.k is None else min(c + 1, state.k))
    ]


def analize(state: Nim) -> dict:
    """
    Given a Nim game instance, this function computes all the possible plies
    and the corresponding nim-sum values.

    Args:
        state: Nim game instance.

    Returns:
        Plies and corresponding nim-sum values are returned as a dict.
    """
    cooked = dict()
    cooked["possible_moves"] = dict()
    for ply in generate_all_plies(state):
        tmp = deepcopy(state)
        tmp.nimming(ply)
        cooked["possible_moves"][ply] = nim_sum(tmp)
    return cooked


def optimal(state: Nim) -> Nimply:
    """
    If possible, this function returns a move which leads to a nim-sum value not equal to zero,
    otherwise a random move among all the possible moves.

    Args:
        state: Nim game instance.

    Returns:
        A ply is returned.
    """
    analysis = analize(state)
    logging.debug(f"analysis:\n{pformat(analysis)}")
    spicy_moves = [ply for ply, ns in analysis["possible_moves"].items() if ns != 0]
    if not spicy_moves:
        spicy_moves = list(analysis["possible_moves"].keys())
    logging.debug(pformat(f"{analysis['possible_moves']}"))
    ply = choice(spicy_moves)
    return ply

### Task 2.1

Without considering the value of $k$, we can win in a Nim game if we always play a move in order to have a nim-sum value of $0$.

At some point, we end up in a position that has only one row of size 2 or more. \
In such a position the nim-sum value is not equal to 0. To win we must reduce this to size 0 or 1, leaving an odd number of rows with size 1. From that point on, all moves are forced.

_*Reference: https://en.wikipedia.org/wiki/Nim#Proof_of_the_winning_formula*_.

The following function implements this winning strategy.

In [8]:
def expert_system(state: Nim) -> Nimply:
    """
    This function implement an expert system which beats the strategies defined above.
    Details on how to win are available at https://en.wikipedia.org/wiki/Nim#Proof_of_the_winning_formula.

    Args:
        state: Nim game instance.

    Returns:
        A ply is returned.
    """
    analysis = analize(state)
    logging.debug(f"analysis:\n{pformat(analysis)}")
    not_zero_rows = len(state.rows) - state.rows.count(0)
    one_count_rows = state.rows.count(1)
    if one_count_rows == not_zero_rows - 1:
        is_odd = (one_count_rows % 2) == 1
        row, objects = [(row, objects) for row, objects in enumerate(state.rows) if objects > 1][0]
        if is_odd:
            return Nimply(row, objects if state.k is None else min(objects, state.k))
        return Nimply(row, objects - 1 if state.k is None else min(objects - 1, state.k))
    spicy_moves = [ply for ply, ns in analysis["possible_moves"].items() if ns == 0]
    if not spicy_moves:
        spicy_moves = list(analysis["possible_moves"].keys())
    logging.debug(pformat(f"{analysis['possible_moves']}"))
    ply = choice(spicy_moves)
    return ply

## Adaptive Strategy

### Task 2.2

Our idea is to implement an adaptive strategy that learns how to play depending on the current game phase. 

Suppose that we have $3$ different phases:
- _early game_;
- _mid game_;
- _end game_.

We decide in which phase we are in based on the formula $n_\textrm{phase} = \dfrac{\textrm{\# remaining plies}}{\textrm{\# possible plies from the beginning}} \in [0, 1]$.
 
We define $t_1, t_2$, two thresholds learnt by the _ES_ strategy, that divide the interval $[0, 1]$ in three parts: $i_1 = [0, t_1)$, $i_2 = [t_1, t_2]$ and $i_3 = (t_2, 1]$. \
If $n_\textrm{phase} \in i_1$ we are in _early game_, $n_\textrm{phase} \in i_2$ we are in _mid game_, otherwise we are in _end game_.

Each phase has its own set of weights. One weight is associated to one of the strategies defined above. The probability of picking a strategy for a ply is directly proportional to its weight value.

The _ES_ algorithm learns which strategy is best to play in each phase.

In [115]:
def get_phase_ratio(state: Nim) -> float:
    """
    Get the n_phase value.

    Args:
        state: Nim game instance.

    Returns:
        The n_phase value is returned.
    """
    all_plys_new_game = len(generate_all_plies(Nim(len(state.rows), state.k)))
    all_plys_current_game = len(generate_all_plies(state))
    return all_plys_current_game / all_plys_new_game

In [116]:
@dataclass(init=False)
class Individual:
    """
    Class that represents an individual.
    """

    strategies: list[Callable[[Nim], Nimply]]
    strategy_weights: list[list[float]]
    phase_thresholds: list[float]

    def __init__(
        self, strategies: int = None, strategy_weights: list[list[float]] = None, phase_thresholds: list[float] = None
    ):
        if strategies is None:
            strategies = [pure_random, gabriele, optimal, expert_system]
        if strategy_weights is None:
            strategy_weights = np.random.randint(low=1, high=10, size=(3, len(strategies)))
        if phase_thresholds is None:
            phase_thresholds = sorted([random(), random()])
        else:
            phase_thresholds = sorted(phase_thresholds)

        self.strategies = strategies
        self.strategy_weights = strategy_weights
        self.phase_thresholds = phase_thresholds

    def _softmax(self, values) -> list[list[float]]:
        tmp = np.exp(values)
        return (tmp / np.sum(tmp)).tolist()

    def mutate(ind: "Individual") -> "Individual":
        global mutation_rate
        strategy_weights = np.clip(np.random.normal(loc=ind.strategy_weights, scale=mutation_rate[0]), 1, 10).tolist()
        phase_thresholds = np.clip(np.random.normal(loc=ind.phase_thresholds, scale=mutation_rate[1]), 0, 1).tolist()
        return Individual(
            strategies=ind.strategies, strategy_weights=strategy_weights, phase_thresholds=phase_thresholds
        )

    def __call__(self, state: Nim) -> Nimply:
        phase_ratio = get_phase_ratio(state)
        phase_index = (
            0 if phase_ratio < self.phase_thresholds[0] else (2 if phase_ratio > self.phase_thresholds[1] else 1)
        )
        weights = self.strategy_weights[phase_index]
        strategy = np.random.choice(self.strategies, p=self._softmax(weights))
        return strategy(state)

### (1 + $\lambda$)-ES

In [117]:
LAMBDA = 30
N_MATCHES = 10
N_ITERS = 1_000
FACTOR = 1.5
OPPONENT = expert_system
mutation_rate: tuple[float] = (2.5, 0.1)

In [118]:
parent = Individual()
parent_result = streak(parent, OPPONENT, N_MATCHES)
print('-- First Individual')
pprint(parent)

pbar = trange(0, N_ITERS // LAMBDA)
for _ in pbar:
    pbar.set_description(f'Parent Accuracy: {parent_result:.2%}')
    offspring = [parent.mutate() for _ in range(LAMBDA)]
    results = [streak(ind, OPPONENT, N_MATCHES) for ind in offspring]

    if np.sum(np.array(results) > parent_result) / LAMBDA > 1 / 5:
        mutation_rate = (mutation_rate[0] * FACTOR, mutation_rate[1] * FACTOR)
    else:
        mutation_rate = (mutation_rate[0] / FACTOR, mutation_rate[1] / FACTOR)

    solution_index = np.argmax(results)
    if parent_result < results[solution_index]:
        parent = offspring[solution_index]
        parent_result = results[solution_index]

    if parent_result >= 0.999:
        break

best_plus = parent
print('-- Best Individual')
pprint(best_plus)
print(f'Best Individual Accuracy: {streak(best_plus, OPPONENT, 100):.2%}')

-- First Individual
Individual(strategies=[<function pure_random at 0x109a894e0>,
                       <function gabriele at 0x109a896c0>,
                       <function optimal at 0x109a89800>,
                       <function expert_system at 0x109a898a0>],
           strategy_weights=array([[3, 6, 8, 8],
       [8, 1, 2, 6],
       [8, 8, 3, 9]]),
           phase_thresholds=[0.46903820024313614, 0.7188280047144948])


  0%|          | 0/33 [00:00<?, ?it/s]

-- Best Individual
Individual(strategies=[<function pure_random at 0x109a894e0>,
                       <function gabriele at 0x109a896c0>,
                       <function optimal at 0x109a89800>,
                       <function expert_system at 0x109a898a0>],
           strategy_weights=[[4.4186130102164345,
                              3.14106918754073,
                              3.613729594039401,
                              9.996039924069041],
                             [2.765247277444072,
                              1.4815181759801608,
                              2.9565777956431263,
                              8.316687001063416],
                             [5.030018689369344,
                              10.0,
                              4.326676846039679,
                              4.542936608647975]],
           phase_thresholds=[0.5398362109851288, 1.0])
Best Individual Accuracy: 46.00%


In [127]:
# serialize the best individual for the plus strategy
with open('best_plus.pkl', 'wb') as f:
    pickle.dump(best_plus, f)

In [128]:
# load the best individual for the plus strategy
with open('best_plus.pkl', 'rb') as f:
    best_plus = pickle.load(f)

### (1, $\lambda$)-ES

In [131]:
LAMBDA = 30
N_MATCHES = 10
N_ITERS = 1_000
FACTOR = 1.1
OPPONENT = expert_system
mutation_rate: tuple[float] = (1.5, 0.1)

In [133]:
parent = Individual()
parent_result = streak(parent, OPPONENT, N_MATCHES)
best_comma = parent
best_comma_result = parent_result
print('-- First Individual')
pprint(parent)

pbar = trange(0, N_ITERS // LAMBDA)
for _ in pbar:
    pbar.set_description(f'Parent Accuracy: {parent_result:.2%}')
    offspring = [parent.mutate() for _ in range(LAMBDA)]
    results = [streak(ind, OPPONENT, N_MATCHES) for ind in offspring]

    if np.sum(np.array(results) > parent_result) / LAMBDA > 1 / 5:
        mutation_rate = (mutation_rate[0] * FACTOR, mutation_rate[1] * FACTOR)
    else:
        mutation_rate = (mutation_rate[0] / FACTOR, mutation_rate[1] / FACTOR)

    solution_index = np.argmax(results)
    parent = offspring[solution_index]
    parent_result = results[solution_index]

    if best_comma_result < parent_result:
        best_comma = parent
        best_comma_result = parent_result

    if parent_result >= 0.999:
        break

print('-- Best Individual')
pprint(best_comma)
print(f'Best Individual Accuracy: {streak(best_comma, OPPONENT, 100):.2%}')

-- First Individual
Individual(strategies=[<function pure_random at 0x109a894e0>,
                       <function gabriele at 0x109a896c0>,
                       <function optimal at 0x109a89800>,
                       <function expert_system at 0x109a898a0>],
           strategy_weights=array([[5, 6, 7, 2],
       [3, 6, 7, 5],
       [7, 1, 4, 5]]),
           phase_thresholds=[0.28615992897279696, 0.33743931022473583])


  0%|          | 0/33 [00:00<?, ?it/s]

-- Best Individual
Individual(strategies=[<function pure_random at 0x109a894e0>,
                       <function gabriele at 0x109a896c0>,
                       <function optimal at 0x109a89800>,
                       <function expert_system at 0x109a898a0>],
           strategy_weights=[[3.0889994342396667,
                              6.7601427417936595,
                              4.777263698916828,
                              1.1973202646743928],
                             [5.498704729810203,
                              5.497292084432263,
                              1.903679679994468,
                              10.0],
                             [3.9647858584752957,
                              2.0616940692904815,
                              3.0956824908195477,
                              10.0]],
           phase_thresholds=[0.0, 0.5251936958805217])
Best Individual Accuracy: 42.00%


In [134]:
# serialize the best individual for the comma strategy
with open('best_comma.pkl', 'wb') as f:
    pickle.dump(best_comma, f)

In [135]:
# load the best individual for the comma strategy
with open('best_comma.pkl', 'rb') as f:
    best_comma = pickle.load(f)

## Assess Strategies

In [15]:
def assess_strategy(
    nim: Nim,
    player: int,
    player_strategy: Callable[[Nim], Nimply],
    opponent_strategies: list[Callable[[Nim], Nimply]],
    n_matches: int,
) -> None:
    """
    This function prints how many times the given player strategy
    wins against the opponent strategies.

    Args:
        nim: Nim game instance;
        player: which player to play;
        player_strategy: which strategy to play;
        opponent_strategies: which strategies to play against;
        n_matches: number of matches to assess the quality of the strategy.

    Returns:
        None.
    """
    for opponent_strategy in opponent_strategies:
        games = play_games(nim, player, player_strategy, opponent_strategy, n_matches)
        accuracy = games.count(player) / len(games)
        print(
            f"-- Player {player} ({player_strategy.__qualname__}) against {opponent_strategy.__qualname__}: {accuracy:.2%} wins"
        )

In [16]:
nim = Nim(5, 3)

### Expert System

In [17]:
assess_strategy(
    nim=nim,
    player=0,
    player_strategy=expert_system,
    opponent_strategies=[pure_random, gabriele, optimal],
    n_matches=1000,
)

-- Player 0 (expert_system) against pure_random: 84.10% wins
-- Player 0 (expert_system) against gabriele: 94.70% wins
-- Player 0 (expert_system) against optimal: 92.80% wins


In [18]:
assess_strategy(
    nim=nim,
    player=1,
    player_strategy=expert_system,
    opponent_strategies=[pure_random, gabriele, optimal],
    n_matches=1000,
)

-- Player 1 (expert_system) against pure_random: 83.90% wins
-- Player 1 (expert_system) against gabriele: 93.30% wins
-- Player 1 (expert_system) against optimal: 91.60% wins


### (1 + $\lambda$)-ES

In [136]:
best_plus.__qualname__ = "adaptive strategy (1 + 𝜆)-ES"

assess_strategy(
    nim=nim,
    player=0,
    player_strategy=best_plus,
    opponent_strategies=[pure_random, gabriele, optimal, expert_system],
    n_matches=1000,
)

-- Player 0 (adaptive strategy (1 + 𝜆)-ES) against pure_random: 84.80% wins
-- Player 0 (adaptive strategy (1 + 𝜆)-ES) against gabriele: 94.00% wins
-- Player 0 (adaptive strategy (1 + 𝜆)-ES) against optimal: 91.80% wins
-- Player 0 (adaptive strategy (1 + 𝜆)-ES) against expert_system: 56.20% wins


In [137]:
best_plus.__qualname__ = "adaptive strategy (1 + 𝜆)-ES"

assess_strategy(
    nim=nim,
    player=1,
    player_strategy=best_plus,
    opponent_strategies=[pure_random, gabriele, optimal, expert_system],
    n_matches=1000,
)

-- Player 1 (adaptive strategy (1 + 𝜆)-ES) against pure_random: 84.40% wins
-- Player 1 (adaptive strategy (1 + 𝜆)-ES) against gabriele: 92.60% wins
-- Player 1 (adaptive strategy (1 + 𝜆)-ES) against optimal: 91.30% wins
-- Player 1 (adaptive strategy (1 + 𝜆)-ES) against expert_system: 42.20% wins


### (1, $\lambda$)-ES

In [138]:
best_comma.__qualname__ = "adaptive strategy (1, 𝜆)-ES"

assess_strategy(
    nim=nim,
    player=0,
    player_strategy=best_comma,
    opponent_strategies=[pure_random, gabriele, optimal, expert_system],
    n_matches=1000,
)

-- Player 0 (adaptive strategy (1, 𝜆)-ES) against pure_random: 81.80% wins
-- Player 0 (adaptive strategy (1, 𝜆)-ES) against gabriele: 93.40% wins
-- Player 0 (adaptive strategy (1, 𝜆)-ES) against optimal: 91.70% wins
-- Player 0 (adaptive strategy (1, 𝜆)-ES) against expert_system: 56.50% wins


In [139]:
best_comma.__qualname__ = "adaptive strategy (1, 𝜆)-ES"

assess_strategy(
    nim=nim,
    player=1,
    player_strategy=best_comma,
    opponent_strategies=[pure_random, gabriele, optimal, expert_system],
    n_matches=1000,
)

-- Player 1 (adaptive strategy (1, 𝜆)-ES) against pure_random: 83.30% wins
-- Player 1 (adaptive strategy (1, 𝜆)-ES) against gabriele: 91.30% wins
-- Player 1 (adaptive strategy (1, 𝜆)-ES) against optimal: 89.90% wins
-- Player 1 (adaptive strategy (1, 𝜆)-ES) against expert_system: 42.70% wins
