# Lab 3: Policy Search

## Task

Write agents able to play [*Nim*](https://en.wikipedia.org/wiki/Nim), with an arbitrary number of rows and an upper bound $k$ on the number of objects that can be removed in a turn (a.k.a., *subtraction game*).

The player **taking the last object wins**.

* Task3.1: An agent using fixed rules based on *nim-sum* (i.e., an *expert system*)
* Task3.2: An agent using evolved rules
* Task3.3: An agent using minmax
* Task3.4: An agent using reinforcement learning

## Instructions

* Create the directory `lab3` inside the course repo 
* Put a `README.md` and your solution (all the files, code and auxiliary data if needed) inside that directory

## Notes

* Working in groups is not only allowed, but recommended (see: [Ubuntu](https://en.wikipedia.org/wiki/Ubuntu_philosophy) and [Cooperative Learning](https://files.eric.ed.gov/fulltext/EJ1096789.pdf)). Collaborations must be explicitly declared in the `README.md`.
* [Yanking](https://www.emacswiki.org/emacs/KillingAndYanking) from the internet is allowed, but sources must be explicitly declared in the `README.md`.

## Deadlines ([AoE](https://en.wikipedia.org/wiki/Anywhere_on_Earth))

* Sunday, December 4th for Task3.1 and Task3.2
* Sunday, December 11th for Task3.3 and Task3.4
* Sunday, December 18th for all reviews

In [73]:
import logging
from collections import namedtuple
import random
from typing import Callable
from copy import deepcopy
from itertools import accumulate, product
from operator import xor

## The *Nim* and *Agent* classes

In [74]:
# A single move: take `num_objects` objects from the row at index `row`.
Nimply = namedtuple("Nimply", ["row", "num_objects"])

In [75]:
class Nim:
    """Mutable Nim board: a list of row sizes plus an optional per-move cap.

    A fresh board has ``num_rows`` rows, row ``i`` holding ``2 * i + 1``
    objects. When ``k`` is not ``None`` it is the largest number of objects
    a single move may remove.
    """

    def __init__(self, num_rows: int, k: int = None) -> None:
        self._rows = [i * 2 + 1 for i in range(num_rows)]
        self._k = k

    def __bool__(self):
        """Return True while at least one object is left on the board."""
        return sum(self._rows) > 0

    def __str__(self):
        """Render the row sizes as ``<a b c ...>``."""
        return "<" + " ".join(str(_) for _ in self._rows) + ">"

    def __hash__(self) -> int:
        """Hash the current row sizes (order-sensitive, ``k`` is ignored).

        The board is mutable, so the hash changes with every move; a board
        must not be mutated while it is stored as a dict key or set member.
        """
        return hash(tuple(self._rows))

    def __eq__(self, __o: object) -> bool:
        """Two boards are equal when their row sizes match position by position."""
        # Compare the rows themselves rather than their hashes: a hash
        # comparison can report unrelated objects as equal and fails outright
        # on unhashable operands such as lists.
        if not isinstance(__o, Nim):
            return NotImplemented
        return self._rows == __o._rows

    def assign_rows(self, rows):
        """Replace the row sizes with a copy of ``rows`` and return ``self``."""
        self._rows = list(rows)
        return self

    @property
    def rows(self) -> tuple:
        """Current row sizes as an immutable tuple."""
        return tuple(self._rows)

    @property
    def k(self) -> int:
        """Maximum objects removable per move, or ``None`` for no limit."""
        return self._k

    def nimming(self, ply: "Nimply") -> None:
        """Apply a move in place, removing ``num_objects`` from row ``row``.

        Raises:
            AssertionError: if the row holds fewer objects than requested or
                the request exceeds ``k``.
        """
        row, num_objects = ply
        assert self._rows[row] >= num_objects
        assert self._k is None or num_objects <= self._k
        self._rows[row] -= num_objects

    def is_game_over(self):
        """Return True when exactly one row still holds objects.

        A completely empty board reports False here.
        """
        return sum(o > 0 for o in self._rows) == 1

    def get_state_and_reward(self):
        """Return ``(self, reward)``; the state is this live board, not a copy."""
        return self, self.get_reward()

    def get_reward(self):
        """Return 0 when ``is_game_over()`` is True, otherwise -1."""
        return 0 if self.is_game_over() else -1

class Agent:
    """Table-based learning agent that keeps one value ``G[state]`` per board.

    Attributes:
        G: maps a board state to its current value estimate.
        alpha: step size used when moving an estimate toward its target.
        random_factor: probability of playing a random move instead of the
            best-valued one (epsilon-greedy exploration).
        state_history: ``(state, reward)`` pairs recorded during the current
            episode; consumed and cleared by ``learn``.
    """

    def __init__(self, num_rows: int, alpha=0.15, random_factor=0.2) -> None:
        self.G = {}
        self.alpha = alpha
        self.random_factor = random_factor
        self.state_history = []

        # Row i of a fresh board holds 2*i+1 objects, so it can take any
        # value in 0..2*i+1; give every combination a random initial value.
        row_ranges = [range(0, i * 2 + 2) for i in range(num_rows)]
        for rows in product(*row_ranges):
            self.G[Nim(num_rows).assign_rows(rows)] = random.random()

    def choose_action(self, board: "Nim"):
        """Choose a ``(row, num_objects)`` move with an epsilon-greedy policy.

        With probability ``random_factor`` a random legal move is returned;
        otherwise the move whose resulting board has the highest ``G`` value.
        Moves larger than ``board.k`` are never offered.
        """
        max_take = board.k
        possible = [
            (row, count)
            for row, size in enumerate(board.rows)
            for count in range(1, size + 1)
            if max_take is None or count <= max_take
        ]

        if random.random() < self.random_factor:  # explore
            return random.choice(possible)

        def value_after(ply):
            # Evaluate the move on a copy so the real board is left untouched.
            successor = deepcopy(board)
            successor.nimming(ply)
            return self.G[successor]

        # Exploit: first move (in enumeration order) with the highest value.
        return max(possible, key=value_after)

    def update_state_history(self, state, reward):
        """Record ``(state, reward)`` for the current episode.

        A deep copy of ``state`` is stored so that later in-place changes to
        the caller's object cannot alter what was recorded.
        """
        self.state_history.append((deepcopy(state), reward))

    def learn(self):
        """Update ``G`` from the recorded episode, then reset for the next one.

        The history is walked from the last entry to the first. Each state's
        value moves toward ``target`` by a fraction ``alpha``, where
        ``target`` is the sum of the rewards recorded after that state.
        """
        target = 0

        for prev, reward in reversed(self.state_history):
            self.G[prev] += self.alpha * (target - self.G[prev])
            target += reward

        self.state_history = []

        # Explore a little less after every episode, but never below zero.
        self.random_factor = max(0.0, self.random_factor - 10e-5)
        

## Sample Strategies 

In [76]:
def pure_random(state: Nim) -> tuple:
    """Pick a random non-empty row and remove a random legal amount from it.

    The amount never exceeds ``state.k`` when a per-move cap is set.

    Returns:
        A pair ``(Nimply(row, num_objects), 0)``.

    Raises:
        IndexError: if every row is empty.
    """
    row = random.choice([r for r, c in enumerate(state.rows) if c > 0])
    available = state.rows[row]
    # Respect the per-move cap so the move is accepted by Nim.nimming.
    limit = available if state.k is None else min(available, state.k)
    num_objects = random.randint(1, limit)
    return Nimply(row, num_objects), 0

In [77]:
def gabriele(state: Nim) -> tuple:
    """Take as many objects as allowed from the lowest-index non-empty row.

    Moves larger than ``state.k`` are excluded when a per-move cap is set.

    Returns:
        A pair ``(Nimply(row, num_objects), 0)``.

    Raises:
        ValueError: if no move is possible.
    """
    possible_moves = [
        (r, o)
        for r, c in enumerate(state.rows)
        for o in range(1, c + 1)
        if state.k is None or o <= state.k
    ]
    # Lowest row index first (hence the negation), then the largest amount.
    best = max(possible_moves, key=lambda m: (-m[0], m[1]))
    return Nimply(*best), 0

In [78]:
def franchino(state: Nim) -> Nimply:
    """Take a single object from the row that currently holds the most.

    Ties go to the lowest row index. The result is the pair
    ``(Nimply(row, 1), 0)``.
    """
    sizes = state.rows
    longest = sizes.index(max(sizes))
    return Nimply(longest, 1), 0

## P1: Expert Player (same as Professor's)

In [79]:
def nim_sum(state: "Nim") -> int:
    """Return the bitwise XOR of all row sizes.

    A board with no rows yields 0 (the XOR identity) instead of raising.
    """
    result = 0
    for count in state.rows:
        result ^= count
    return result


def cook_status(state: "Nim", nimSum=False) -> dict:
    """Summarise a board into a dict of derived facts.

    Keys always present:
        possible_moves: every ``(row, num_objects)`` allowed by the row
            sizes and by ``state.k``.
        active_rows_number: how many rows are non-empty.
        shortest_row: index of the smallest non-empty row, or ``None`` when
            every row is empty.
        longest_row: index of the largest row, or ``None`` when there are no
            rows at all.

    Extra keys when ``nimSum`` is true:
        nim_sum: nim-sum of the current board.
        brute_force: list of ``(move, nim_sum_after_move)`` for each
            possible move.
    """
    rows = state.rows
    max_take = state.k

    cooked = dict()
    cooked["possible_moves"] = [
        (r, o) for r, c in enumerate(rows) for o in range(1, c + 1) if max_take is None or o <= max_take
    ]
    cooked["active_rows_number"] = sum(o > 0 for o in rows)
    # `default` keeps a finished board from raising ValueError on an empty sequence.
    cooked["shortest_row"] = min(
        (x for x in enumerate(rows) if x[1] > 0), key=lambda y: y[1], default=(None, 0)
    )[0]
    cooked["longest_row"] = max(enumerate(rows), key=lambda y: y[1], default=(None, 0))[0]

    if nimSum:
        cooked["nim_sum"] = nim_sum(state)

        brute_force = []
        for m in cooked["possible_moves"]:
            # Try each move on a copy so the caller's board is untouched.
            tmp = deepcopy(state)
            tmp.nimming(m)
            brute_force.append((m, nim_sum(tmp)))
        cooked["brute_force"] = brute_force
    return cooked

In [80]:
def optimal_startegy(state: Nim) -> Nimply:
    """Play a move that leaves nim-sum zero, or a random move if none does.

    The result is the pair ``(move, 0)``, where ``move`` is a
    ``(row, num_objects)`` tuple taken from the brute-force table.
    """
    candidates = cook_status(state, True)["brute_force"]
    # Draw the fallback before scanning, so a random pick is made on every call.
    fallback = random.choice(candidates)
    for candidate in candidates:
        if candidate[1] == 0:
            return candidate[0], 0
    return fallback[0], 0

## P4: Reinforcement Learning

In [81]:
logging.getLogger().setLevel(logging.DEBUG)


NUM_ROWS = 7
NUM_EPISODES = 10000
REPORT_EVERY = 50  # number of games summarised by each win-ratio log line
MAX_STEPS = 100  # safety cap on moves per game
opponent = gabriele

board = Nim(NUM_ROWS)
logging.debug(f"status: Initial board  -> {board}")

jack = Agent(NUM_ROWS, alpha=0.3, random_factor=0.2)
# Not used in this cell; kept because other cells may still read them.
moveHistory = []
indices = []
win = 0

for i in range(NUM_EPISODES):
    steps = 0
    player = 0  # 0 = the learning agent, 1 = the fixed opponent
    while not board.is_game_over():
        steps += 1
        if player == 0:
            # choose an action (explore or exploit)
            action = jack.choose_action(board)
            board.nimming(action)
            reward = board.get_reward()  # reward of the position just reached
            # Pass a snapshot so the recorded state cannot change when the
            # board is mutated in place by the following moves.
            jack.update_state_history(deepcopy(board), reward)
        else:
            ply = opponent(board)[0]
            board.nimming(ply)

        if steps > MAX_STEPS:
            # stop the episode if it takes too long to finish
            break

        player = 1 - player

    # NOTE(review): the loop stops once a single row is left, and `winner`
    # is the player who made that last move. Confirm this matches the
    # intended "taking the last object wins" rule.
    winner = 1 - player
    if winner == 0:
        win += 1

    # Report after every full window of REPORT_EVERY games so the divisor
    # matches the number of games actually counted.
    if (i + 1) % REPORT_EVERY == 0:
        win_ratio = win / REPORT_EVERY
        win = 0
        logging.info(f"Win ration {win_ratio} won!")

    jack.learn()  # jack should learn after every episode

    board = Nim(NUM_ROWS)
    

DEBUG:root:status: Initial board  -> <1 3 5 7 9 11 13>
INFO:root:Win ration 0.12 won!
INFO:root:Win ration 0.02 won!
INFO:root:Win ration 0.06 won!
INFO:root:Win ration 0.1 won!
INFO:root:Win ration 0.02 won!
INFO:root:Win ration 0.08 won!
INFO:root:Win ration 0.02 won!
INFO:root:Win ration 0.02 won!
INFO:root:Win ration 0.04 won!
INFO:root:Win ration 0.04 won!
INFO:root:Win ration 0.0 won!
INFO:root:Win ration 0.06 won!
INFO:root:Win ration 0.02 won!
INFO:root:Win ration 0.04 won!
INFO:root:Win ration 0.0 won!
INFO:root:Win ration 0.04 won!
INFO:root:Win ration 0.0 won!
INFO:root:Win ration 0.04 won!
INFO:root:Win ration 0.0 won!
INFO:root:Win ration 0.02 won!
INFO:root:Win ration 0.0 won!
INFO:root:Win ration 0.02 won!
INFO:root:Win ration 0.0 won!
INFO:root:Win ration 0.02 won!
INFO:root:Win ration 0.08 won!
INFO:root:Win ration 0.02 won!
INFO:root:Win ration 0.0 won!
INFO:root:Win ration 0.02 won!
INFO:root:Win ration 0.0 won!
INFO:root:Win ration 0.0 won!
INFO:root:Win ration 0.0 