# Libraries

In [22]:
__author__ = "Gabriele Greco"
import random
import logging
from collections import namedtuple
from copy import deepcopy
from itertools import accumulate
from itertools import combinations_with_replacement
from operator import xor
from matplotlib import pyplot as plt
import numpy as np

# Nim Class

In [23]:
Nimply = namedtuple("Nimply", "row, num_objects")

In [24]:
class Nim:
    def __init__(self, num_rows: int, k: int = None) -> None:
        self._rows = [i * 2 + 1 for i in range(num_rows)]
        self._k = k
        self._end = [i*0 for i in range(num_rows)] # end of nim

    def __bool__(self):
        return sum(self._rows) > 0

    def __str__(self):
        return "<" + " ".join(str(_) for _ in self._rows) + ">"

    @property
    def rows(self) -> tuple:
        return tuple(self._rows)

    @property
    def k(self) -> int:
        return self._k
    
    def possible_moves(self):
        return [(r, o) for r, c in enumerate(self._rows) for o in range(1, c + 1) if self._k is None or o <= self._k]

    def nimming(self, ply: Nimply) -> None: # this is update states
        row, num_objects = ply
        assert self._rows[row] >= num_objects
        assert self._k is None or num_objects <= self._k
        self._rows[row] -= num_objects

    def get_reward(self): # 1 if I have won, -1 lost, 0 not won or lost
        if (self._rows == self._end):
            return 1
        else:
            return -1 * int(not self._rows == self._end) # -1 or 0

### Strategies

In [5]:
def simple_strategy(state: Nim) -> Nimply:
    row = random.choice([r for r, c in enumerate(state.rows) if c > 0])
    return Nimply(row, 1)

def pure_random(state: Nim) -> Nimply: # take a random row and select random elements (always < k)
    row = random.choice([r for r, c in enumerate(state.rows) if c > 0])
    if(state.rows[row] > state.k):
        num_objects = random.randint(1, state.k)
    else:
        num_objects = random.randint(1, state.rows[row])
    return Nimply(row, num_objects)

def shortest_row(state: Nim) -> Nimply: # take the shortest row and select random elements if the matches > k otherwise close the row
    row = min((x for x in enumerate(state.rows) if x[1] > 0), key=lambda y: y[1])[0]
    if(state.rows[row] > state.k):
       num_objects = random.randint(1, state.k)
    else:
       num_objects = state.rows[row]
    return Nimply(row, num_objects)

# algorithm taken from professor's code
def nim_sum(state: Nim) -> int:
    *_, result = accumulate(state.rows, xor)
    return result

def cook_status(state: Nim) -> dict:
    cooked = dict()
    cooked["possible_moves"] = state.possible_moves()
    cooked["active_rows_number"] = sum(o > 0 for o in state.rows)
    cooked["shortest_row"] = min((x for x in enumerate(state.rows) if x[1] > 0), key=lambda y: y[1])[0]
    cooked["longest_row"] = max((x for x in enumerate(state.rows)), key=lambda y: y[1])[0]
    cooked["nim_sum"] = nim_sum(state)

    brute_force = list()
    for m in cooked["possible_moves"]:
        tmp = deepcopy(state)
        tmp.nimming(m)
        brute_force.append((m, nim_sum(tmp)))
    cooked["brute_force"] = brute_force

    return cooked

def optimal_strategy(state: Nim) -> Nimply:
    data = cook_status(state)
    return next((bf for bf in data["brute_force"] if bf[1] == 0), random.choice(data["brute_force"]))[0]

# Task 4: Reinforcement Learning

##### Agent

In [27]:
class Agent(object):
    def __init__(self, state, alpha = 0.15, random_factor = 0.2): # we can modify these values
        self.state_history = [((0, 0), 0)] # row, k taken, reward
        self.alpha = alpha
        self.random_factor = random_factor
        self.G = {} # Initialization is done when he meet a new state
        self.init_reward(state)
            
    # G = a set of positions. Distance from our move to our target
    def init_reward(self, state):
        self.G[0, 0] = 0
        for i in state.possible_moves():
               self.G[i[0], i[1]] = np.random.uniform(low = 0.1, high = 1.0)

    def choose_action(self, state: Nim, possibleMoves):
        maxG = -10e15 # very low value
        next_move = None
        if random.random() < self.random_factor:
            next_move = random.choice(possibleMoves)
        else:
            for action in possibleMoves:
                new_state = tuple([action[0], action[1]]) # nuovo stato
                if self.G[new_state] >= maxG:
                    next_move = action
                    maxG = self.G[new_state]
                    
        return next_move

    def update_state_history(self, state, reward): # state = rows
        self.state_history.append((state, reward))

    def learn(self):
        target = 0

        for prev, reward in reversed(self.state_history): # travel the history backwards
            self.G[prev] = self.G[prev] + self.alpha * (target - self.G[prev])
            target += reward
        
        self.state_history = []
        self.random_factor -= 10e-5 # decrease random factor each episode of play

### Main

In [57]:
logging.getLogger().setLevel(logging.DEBUG)

countwin = 0 
countgames = 0
rows = 6
k = 3
nim = Nim(rows, k)
robot = Agent(nim, alpha=0.1, random_factor=0.4)

for n in range(1000):
    player = 0 # first player
    nim = Nim(rows, k) # initializing nim

    while(nim):

        if(player == 0):
            #ply = optimal_strategy(nim)
            #ply = pure_random(nim)
            #ply = shortest_row(nim)
            ply = simple_strategy(nim)
            nim.nimming(ply)
        else:
            action = robot.choose_action(nim, nim.possible_moves()) # chose the action
            nim.nimming(action)  # update nim according to the action
            reward = nim.get_reward()  # get the new reward
            robot.update_state_history(action, reward) # update the robot memory with state and reward
        player = 1 - player


    if(1 - player == 1): # winner = 1 - player
        countwin += 1
    else:
         robot.update_state_history(action, -1) # update the robot memory with state and reward

    # end of game, let's learn
    robot.learn()
    countgames += 1
    
    if n  % 100 == 0:
        logging.info(f"Game played = {n}: " f"Winrate = {(countwin/(countgames))*100}% ")
        countwin = 0
        countgames = 0
    

INFO:root:Game played = 0: Winrate = 100.0% 
INFO:root:Game played = 100: Winrate = 60.0% 
INFO:root:Game played = 200: Winrate = 59.0% 
INFO:root:Game played = 300: Winrate = 49.0% 
INFO:root:Game played = 400: Winrate = 49.0% 
INFO:root:Game played = 500: Winrate = 51.0% 
INFO:root:Game played = 600: Winrate = 55.00000000000001% 
INFO:root:Game played = 700: Winrate = 59.0% 
INFO:root:Game played = 800: Winrate = 56.00000000000001% 
INFO:root:Game played = 900: Winrate = 63.0% 
