Copyright **`(c)`** 2022 Giovanni Squillero `<squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  


In [197]:
import logging
from collections import namedtuple
import random
from typing import Callable
from copy import deepcopy
from itertools import accumulate
from operator import xor

## The *Nim* and *Nimply* classes

In [198]:
Nimply = namedtuple("Nimply", "row, num_objects")

In [199]:
class Nim:
    def __init__(self, num_rows: int, k: int = None) -> None: #k is the max number of object that can be removed from a line
        self._rows = [i * 2 + 1 for i in range(num_rows)]
        self._k = k

    def __bool__(self):
        return sum(self._rows) > 0

    def __str__(self):
        return "<" + " ".join(str(_) for _ in self._rows) + ">"

    @property
    def rows(self) -> tuple:
        return tuple(self._rows)

    @property
    def k(self) -> int:
        return self._k

    def print_state(self):
        index = 0
        res = ""
        for row in self.rows:
            res = res + str(index) + ":" + str(row) + " "
        return res
        
    def nimming(self, ply: Nimply) -> None:
        row, num_objects = ply
        assert self._rows[row] >= num_objects
        assert self._k is None or num_objects <= self._k
        self._rows[row] -= num_objects

In [200]:
def nim_sum(state: Nim) -> int:
    *_, result = accumulate(state.rows, xor)
    return result


def cook_status(state: Nim) -> dict:
    cooked = dict()
    cooked["possible_moves"] = [
        (r, o) for r, c in enumerate(state.rows) for o in range(1, c + 1) if state.k is None or o <= state.k
    ]
    cooked["active_rows_number"] = sum(o > 0 for o in state.rows)
    cooked["shortest_row"] = min((x for x in enumerate(state.rows) if x[1] > 0), key=lambda y: y[1])[0]
    cooked["longest_row"] = max((x for x in enumerate(state.rows)), key=lambda y: y[1])[0]
    cooked["nim_sum"] = nim_sum(state)

    brute_force = list()
    for m in cooked["possible_moves"]:
        tmp = deepcopy(state)
        tmp.nimming(m)
        brute_force.append((m, nim_sum(tmp)))
    cooked["brute_force"] = brute_force

    return cooked

## Sample (and silly) startegies 

In [201]:
def pure_random(state: Nim) -> Nimply:
    row = random.choice([r for r, c in enumerate(state.rows) if c > 0])
    num_objects = random.randint(1, state.rows[row])
    return Nimply(row, num_objects)

In [202]:
def gabriele(state: Nim) -> Nimply:
    """Pick always the maximum possible number of the lowest row"""
    possible_moves = [(r, o) for r, c in enumerate(state.rows) for o in range(1, c + 1)]
    return Nimply(*max(possible_moves, key=lambda m: (-m[0], m[1])))

In [203]:
def optimal_strategy(state: Nim) -> Nimply:
    data = cook_status(state)
    return next((bf for bf in data["brute_force"] if bf[1] == 0), random.choice(data["brute_force"]))[0]

In [204]:
def enrico(state: Nim) -> Nimply: #my strategy
    data = cook_status(state)
    if data["active_rows_number"] == 0:
        return (data["longest_row"], state._rows[data["longest_row"]])
    else:
        return (data["shortest_row"], state._rows[data["shortest_row"]])     

<h1>agent using reinforcement learning</h1>

In [205]:
class Agent():
    def __init__(self, nim: Nim, num_tot_matches):
        self.nim = nim          #to be update at each game
        self.random_factor = 1  #at the begining is set to 1 -> 100 explore
        self.learned = dict()    #key is the nim status (nim._rows) the value is a dict of ( key: ply, value: score) from previus games
        self.current_move = dict()  #key is the nim status, value is the ply performed in the current game
        self.num_matches = 0
        self.num_tot_matched = num_tot_matches

    def play(self) -> Nimply:  #return a move
        selected_ply = None
        if random.random() > self.random_factor: # exploitation: select best move in same status situation if exists(score must be grater that 0)
            if self.nim.print_state() in self.learned.keys():
                moves = self.learned[self.nim.print_state()]
                best = None
                max = 0
                for move, score in moves.items():
                    if score > max:
                        
                        best = move
                        max = score
                        
                if best == None:
                    selected_ply = pure_random(self.nim)
                else:
                    selected_ply = best
                    
            else:
                selected_ply = pure_random(self.nim)       
        else:   #exploration
            selected_ply = pure_random(self.nim) 
        
        self.current_move[self.nim.print_state()] = selected_ply
        
        return selected_ply

    def update_score(self, win):    #in learned update score +1 if agent wins or -1 if loses
        self.random_factor = 1 -2*(self.num_matches/self.num_tot_matched) #update random factor for encrease the exploitation the matches prograssion

        for nim_state, move in self.current_move.items():
            if nim_state in self.learned.keys():    
                if move in self.learned[nim_state].keys():
                    if win:
                        self.learned[nim_state][move]+=1
                    else:
                        self.learned[nim_state][move]-=1
                else:
                    if win:
                        self.learned[nim_state][move] = 1
                    else:
                        self.learned[nim_state][move] = -1
            else:
                if win:
                    self.learned[nim_state] = {move: 1}
                else:
                    self.learned[nim_state] = {move: -1}
                   


In [206]:
NUM_MATCHES_EVAL = 100
NIM_SIZE = 5
NUM_MATCHES_TRAINING = 5000
OPPONENT_TRAIN = [pure_random, enrico, gabriele, optimal_strategy]
OPPONENT_EVAL = enrico

In [207]:
def training(robot):
    won=0
    last_player_start = 1
    for i in range(NUM_MATCHES_TRAINING):
        nim = Nim(NIM_SIZE)
        robot.nim = nim
        robot.num_matches = i
        player = 1 - last_player_start  #for switching the starter
        last_player_start = player
        while nim:
            if player == 0:
                ply = OPPONENT_TRAIN[int(i//(NUM_MATCHES_TRAINING/len(OPPONENT_TRAIN)))](nim)  #select opponent starting from the silliest
            else:
                ply = robot.play()
            nim.nimming(ply)
            player = 1 - player
        if player == 0: #robot win
            won+=1
            robot.update_score(1)
        else:   #robot lose
            robot.update_score(0)
    print("won in training: ", won)        

In [208]:
def evaluate(robot) -> float:
    won = 0
    last_player_start = 1
    for m in range(NUM_MATCHES_EVAL):
        nim = Nim(NIM_SIZE)
        robot.nim = nim
        player = 1 - last_player_start
        last_player_start = player
        while nim:
            if player == 0:
                ply = OPPONENT_EVAL(nim)
            else:
                ply = robot.play()
            nim.nimming(ply)
            player = 1 - player
        if player == 0:
            won += 1

    return won / NUM_MATCHES_EVAL #percentage of match won agaist the opponent

In [209]:


def main():
    robot = Agent(None, NUM_MATCHES_TRAINING)

    training(robot)
    
    robot.random_factor = 0 #only exploitation
    res = evaluate(robot)
    print(res)        



In [210]:
main()

won in training:  1168
0.14
