## Imports & Global Vars

In [None]:
import numpy as np
from Nim import *

nrows = 5  # board size: passed to Nim(...) whenever a new game is created
nepochs = 50000  # number of training episodes (full games) the training loop plays

## Reinforcement Learning Agent

In [None]:
class Agent(object):
    """Value-learning agent for Nim with epsilon-greedy move selection.

    The agent keeps a table ``G`` mapping a board key (the board's ``rows``)
    to an estimated value. During an episode the visited boards and their
    rewards are recorded with ``update_state_history``; ``learn`` then walks
    the episode backwards and nudges each visited board's value towards the
    reward accumulated after it.
    """

    def __init__(self, states, alpha=0.15, random_factor=0.2):
        """Create an agent with an empty value table.

        Args:
            states: Unused; accepted only so existing callers keep working.
            alpha: Learning rate applied in ``learn``.
            random_factor: Probability of picking a random move (exploring).
                The default 0.2 means 20% explore, 80% exploit.
        """
        self.state_history = []  # (board key, reward) pairs of the current episode
        self.alpha = alpha
        self.random_factor = random_factor
        self.G = {}  # board key -> estimated value

    def choose_action(self, state):
        """Pick a move for the board ``state``.

        With probability ``random_factor`` a uniformly random legal move is
        returned. Otherwise every legal move is simulated on a copy of the
        board: if any resulting board has no entry in ``G`` yet, one of those
        unexplored moves is chosen at random; else the move leading to the
        highest-valued board is returned (ties go to the later move).

        Returns:
            The chosen ``Nimply``, or ``None`` when exploiting and there is no
            legal move.
        """
        # Local import: no `import random` is visible at file level, so do not
        # rely on it leaking in through `from Nim import *`.
        import random

        explore_roll = np.random.random()
        # Every (row, count) pair that removes 1..c objects from a row holding c.
        allowed_moves = [Nimply(r, o) for r, c in enumerate(state.rows) for o in range(1, c + 1)]

        if explore_roll < self.random_factor:
            return random.choice(allowed_moves)

        size = len(state.rows)
        best_value = float("-inf")
        best_move = None
        uncharted = []
        for action in allowed_moves:
            # Simulate the move on a fresh copy so `state` itself is untouched.
            successor = Nim(size).fromRows(state.rows)
            successor.nimming(action)
            key = successor.rows
            if key not in self.G:
                uncharted.append(action)
            elif self.G[key] >= best_value:
                best_value = self.G[key]
                best_move = action

        # Unexplored successors take priority over the best known one.
        if uncharted:
            return random.choice(uncharted)
        return best_move

    def update_state_history(self, state, reward):
        """Record one step of the current episode.

        ``state`` may be a board object exposing ``rows`` or an already
        extracted key. Boards are stored by their ``rows`` so that the keys
        written by ``learn`` are the same ones ``choose_action`` looks up, and
        so that later mutation of a live board object does not rewrite
        history. (Assumes ``rows`` is an immutable, hashable snapshot, which
        its use as a dict key in ``choose_action`` implies.)
        """
        key = getattr(state, "rows", state)
        self.state_history.append((key, reward))

    def learn(self):
        """Update ``G`` from the recorded episode, then reset for the next one.

        The history is traversed from the last step to the first; ``target``
        is the sum of the rewards recorded after the step being updated.
        """
        target = 0

        for key, reward in reversed(self.state_history):
            if key not in self.G:
                # Random initial estimate; bounds must satisfy low <= high.
                self.G[key] = np.random.uniform(low=0.1, high=1.0)
            self.G[key] += self.alpha * (target - self.G[key])
            target += reward

        self.state_history = []

        # Decay exploration after each episode; a negative probability is
        # equivalent to 0, so clamp to keep the value meaningful.
        self.random_factor = max(0.0, self.random_factor - 1e-4)

In [None]:
def evaluate(gamesize: int, robot: Agent, neval: int = 100) -> float:
    """Estimate the robot's win rate against the `pure_random` opponent.

    Plays ``neval`` games on boards built with ``Nim(gamesize)``. The robot
    moves first in every game and plays greedily.

    Side effect: ``robot.random_factor`` is set to 0 and left that way, so the
    robot stays in pure-exploitation mode after this call.

    Args:
        gamesize: Argument passed to ``Nim(...)`` for each evaluation game.
        robot: The agent under evaluation.
        neval: Number of games to play (default 100).

    Returns:
        Fraction of games counted as robot wins. A game is counted as a win
        when the opponent made the final move.
        NOTE(review): this matches a "last mover loses" rule — confirm against
        the Nim implementation's win condition.

    Raises:
        ValueError: If ``neval`` is not positive.
    """
    if neval <= 0:
        raise ValueError("neval must be a positive integer")

    robot.random_factor = 0  # disable exploration for the evaluation
    win_count = 0
    for _ in range(neval):
        game = Nim(gamesize)
        # Reset per game: `turn` is flipped to 0 before the first move, so the
        # robot always opens. Without the reset the opening player of each
        # game depended on who moved last in the previous one.
        turn = 1
        while not game.endTest():
            turn = 1 - turn
            if not turn:
                game.nimming(robot.choose_action(game))
            else:
                game.nimming(pure_random(game))
        if turn:
            win_count += 1
    return win_count / neval

## Training

In [None]:
# Train the agent over `nepochs` games; the agent always moves first and the
# `expert` opponent replies whenever the game is not already over.
game = Nim(nrows)
robot = Agent(game, alpha=0.1, random_factor=0.4)

for i in range(nepochs):
    while not game.endTest():
        action = robot.choose_action(game)
        game.nimming(action)
        if not game.endTest():
            game.nimming(expert(game))
        reward = game.get_reward()
        # Record the board's rows rather than the live `game` object: the
        # object keeps mutating, so every history entry would otherwise alias
        # the same (final) board, and `choose_action` looks values up by rows.
        robot.update_state_history(game.rows, reward)
    robot.learn()
    game = Nim(nrows)  # fresh board for the next episode

# Report the win rate returned by `evaluate` for the trained agent.
print(evaluate(nrows, robot))

## Play

In [None]:
# Start from a fresh board of the configured size.
game = Nim(nrows)

# Hand the board and the trained agent to `sandbox` (defined outside this
# file; presumably an interactive play loop — TODO confirm).
sandbox(game, robot)