# Libraries

In [2]:
__author__ = "Gabriele Greco"
import random
import logging
from collections import namedtuple
from copy import deepcopy
from itertools import accumulate
from operator import xor

# Nim Class

In [None]:
# A single move: remove `num_objects` objects from the row at index `row`.
Nimply = namedtuple("Nimply", ["row", "num_objects"])

In [None]:
class Nim:
    """Board of a Nim game with an optional cap on objects removed per move.

    Row ``i`` (0-based) starts with ``2 * i + 1`` objects, so ``Nim(3)`` is
    printed as ``<1 3 5>``.  An instance is truthy while at least one object
    is still on the board.
    """

    def __init__(self, num_rows: int, k: "int | None" = None) -> None:
        """Create a board with *num_rows* rows.

        *k* is the largest number of objects one move may remove; ``None``
        means there is no cap.
        """
        self._rows = [i * 2 + 1 for i in range(num_rows)]
        self._k = k

    def __bool__(self) -> bool:
        return sum(self._rows) > 0

    def __str__(self) -> str:
        return "<" + " ".join(str(_) for _ in self._rows) + ">"

    @property
    def rows(self) -> tuple:
        """Snapshot (as a tuple) of the number of objects left in each row."""
        return tuple(self._rows)

    @property
    def k(self) -> "int | None":
        """Per-move cap given at construction, or ``None`` when uncapped."""
        return self._k

    def nimming(self, ply: "Nimply") -> None:
        """Apply the move *ply* = ``(row, num_objects)`` to the board in place.

        Raises ``AssertionError`` when the move removes fewer than one object,
        more objects than the row holds, or more than the cap ``k``.
        """
        row, num_objects = ply
        # A move has to remove something: zero would let a player pass and a
        # negative count would put objects back on the board.
        assert num_objects >= 1, "a move must remove at least one object"
        assert self._rows[row] >= num_objects
        assert self._k is None or num_objects <= self._k
        self._rows[row] -= num_objects

### Strategies

In [None]:
def pure_random(state: Nim) -> Nimply:
    """Pick a random non-empty row and remove a random legal number of objects.

    The number removed is drawn uniformly from 1 up to the row size, or up to
    ``state.k`` when a cap is set and is smaller than the row size.
    ``random.choice`` raises ``IndexError`` when every row is empty.
    """
    row = random.choice([r for r, c in enumerate(state.rows) if c > 0])
    available = state.rows[row]
    # `state.k` is None when moves are uncapped; comparing an int with None
    # raises TypeError, so the cap is applied only when it is set.
    limit = available if state.k is None else min(available, state.k)
    return Nimply(row, random.randint(1, limit))

def shortest_row(state: Nim) -> Nimply:
    """Play on the shortest non-empty row.

    When the row holds more objects than the cap ``state.k`` a random amount
    between 1 and ``k`` is removed; otherwise the whole row is taken.
    ``min`` raises ``ValueError`` when every row is empty.
    """
    row, available = min(
        ((r, c) for r, c in enumerate(state.rows) if c > 0),
        key=lambda pair: pair[1],
    )
    # `state.k` is None when moves are uncapped; guard before comparing so
    # an uncapped board simply clears the row instead of raising TypeError.
    if state.k is not None and available > state.k:
        num_objects = random.randint(1, state.k)
    else:
        num_objects = available
    return Nimply(row, num_objects)

# algorithm taken from professor's code
def nim_sum(state: "Nim") -> int:
    """Return the bitwise XOR of all row sizes in ``state.rows``.

    A board without rows yields 0 (the XOR identity) instead of raising.
    """
    result = 0
    for count in state.rows:
        result ^= count
    return result

def cook_status(state: Nim) -> dict:
    """Collect facts about *state* into a dictionary.

    Keys, in insertion order:
      * ``possible_moves``: every ``(row, num_objects)`` pair allowed by the
        row sizes and by the cap ``state.k`` (no cap when it is ``None``).
      * ``active_rows_number``: how many rows still hold objects.
      * ``shortest_row``: index of the smallest non-empty row.
      * ``longest_row``: index of the largest row.
      * ``nim_sum``: ``nim_sum(state)``.
      * ``brute_force``: ``(move, nim_sum after the move)`` for each possible
        move, computed on a deep copy so *state* is left untouched.
    """
    counts = state.rows
    cap = state.k

    moves = []
    for row_index, count in enumerate(counts):
        for taken in range(1, count + 1):
            if cap is None or taken <= cap:
                moves.append((row_index, taken))

    non_empty = [(index, count) for index, count in enumerate(counts) if count > 0]

    summary = {
        "possible_moves": moves,
        "active_rows_number": len(non_empty),
        "shortest_row": min(non_empty, key=lambda pair: pair[1])[0],
        "longest_row": max(enumerate(counts), key=lambda pair: pair[1])[0],
        "nim_sum": nim_sum(state),
    }

    outcomes = []
    for move in moves:
        # Try the move on a copy and record the nim-sum it leaves behind.
        trial = deepcopy(state)
        trial.nimming(move)
        outcomes.append((move, nim_sum(trial)))
    summary["brute_force"] = outcomes

    return summary

def optimal_strategy(state: Nim) -> Nimply:
    """Return a move that leaves a zero nim-sum, or a random legal move.

    The first entry of ``cook_status(state)["brute_force"]`` whose resulting
    nim-sum is 0 is chosen; when there is none, one entry is picked at random.
    ``random.choice`` raises ``IndexError`` when no move is possible.

    NOTE(review): a zero nim-sum is the winning target for uncapped Nim; with
    a per-move cap ``k`` the usual criterion works on row sizes modulo
    ``k + 1`` -- confirm whether the plain nim-sum is intended here.
    """
    outcomes = cook_status(state)["brute_force"]
    for move, resulting_nim_sum in outcomes:
        if resulting_nim_sum == 0:
            return Nimply(*move)
    # Draw the random fallback only when it is needed, so the random stream
    # is not consumed on turns where a zero-nim-sum move exists.
    move, _ = random.choice(outcomes)
    return Nimply(*move)

# Agent

In [None]:
import numpy as np

# Maps each action label to the (row delta, column delta) added to a state.
ACTIONS = {'U': (-1, 0), 'D': (1, 0), 'L': (0, -1), 'R': (0, 1)}


class Agent(object):
    """Tabular learner that keeps one value estimate ``G[state]`` per grid cell.

    States are ``(row, column)`` tuples indexing the 2-D ``states`` grid that
    is passed to the constructor.
    """

    def __init__(self, states, alpha=0.15, random_factor=0.2):  # 20% explore, 80% exploit
        """Set up the value table for every cell of *states*.

        *alpha* is the step size used by ``learn``; *random_factor* is the
        probability that ``choose_action`` picks a random move.
        """
        self.state_history = [((0, 0), 0)]  # state, reward
        self.alpha = alpha
        self.random_factor = random_factor
        self.G = {}
        self.init_reward(states)

    def init_reward(self, states):
        """Give every cell of the 2-D *states* grid a random value in [0.1, 1.0)."""
        for i, row in enumerate(states):
            for j, _ in enumerate(row):
                # Key by (row, column) so it matches the states looked up in
                # choose_action, also when the grid is not square.
                self.G[(i, j)] = np.random.uniform(low=0.1, high=1.0)

    def choose_action(self, state, allowedMoves):
        """Return one of *allowedMoves* for *state*.

        With probability ``random_factor`` a random move is returned;
        otherwise the move whose destination has the highest ``G`` value
        (the last one on ties).  Returns ``None`` when exploiting with no
        allowed moves.
        """
        if np.random.random() < self.random_factor:
            # explore: pick any allowed move
            return np.random.choice(allowedMoves)

        # exploit: look one step ahead and keep the best-valued destination
        best_value = -10e15
        next_move = None
        for action in allowedMoves:
            new_state = tuple(pos + delta for pos, delta in zip(state, ACTIONS[action]))
            if self.G[new_state] >= best_value:
                next_move = action
                best_value = self.G[new_state]
        return next_move

    def update_state_history(self, state, reward):
        """Record a visited *state* and the *reward* received there."""
        self.state_history.append((state, reward))

    def learn(self):
        """Update ``G`` from the recorded episode, then clear the history.

        The history is walked backwards; each state's value is moved a
        fraction ``alpha`` towards the sum of the rewards recorded after it.
        """
        target = 0

        for prev, reward in reversed(self.state_history):
            self.G[prev] = self.G[prev] + self.alpha * (target - self.G[prev])
            target += reward

        self.state_history = []

        # decrease random factor each episode of play, but never below zero
        self.random_factor = max(0.0, self.random_factor - 10e-5)


# Maze environment

In [None]:
import numpy as np

# Maps each action label to the (row delta, column delta) applied to a position.
ACTIONS = {'U': (-1, 0), 'D': (1, 0), 'L': (0, -1), 'R': (0, 1)}


class Maze(object):
    """6x6 grid world with a robot starting at (0, 0) and a goal at (5, 5).

    Cell codes stored in ``self.maze``: 0 empty, 1 wall, 2 robot, -1 goal.
    Positions are ``(row, column)`` tuples.
    """

    def __init__(self):
        self.end = (5, 5)
        self.maze = np.zeros((6, 6))
        self.maze[0, 0] = 2
        self.maze[5, :5] = 1
        self.maze[:4, 5] = 1
        self.maze[2, 2:] = 1
        self.maze[3, 2] = 1
        self.maze[self.end] = -1
        self.robot_position = (0, 0)
        self.steps = 0
        self.construct_allowed_states()

    def print_maze(self):
        """Print the grid, one row per line with a blank line after each row."""
        print('---------------------------------')
        for row in self.maze:
            for col in row:
                if col == 0:
                    print(' 0 ', end="")  # empty space
                elif col == 1:
                    print(' X ', end="")  # walls
                elif col == 2:
                    print(' R ', end="")  # robot position
                elif col == -1:
                    print(' E ', end="")  # goal
            print("\n")
        print('---------------------------------')

    def is_allowed_move(self, state, action):
        """Return True when *action* taken from *state* stays on the board
        and lands on a cell that is not a wall."""
        y, x = state
        y += ACTIONS[action][0]
        x += ACTIONS[action][1]
        rows, cols = self.maze.shape
        if not (0 <= y < rows and 0 <= x < cols):
            # the robot would move off the board
            return False
        # empty space and the goal are <= 0; 2 is the robot's own marker
        cell = self.maze[y, x]
        return bool(cell <= 0 or cell == 2)

    def construct_allowed_states(self):
        """Precompute ``self.allowed_states``: for every non-wall cell, the
        list of actions accepted by ``is_allowed_move``, so callers can look
        the moves up instead of re-checking them each time."""
        allowed_states = {}
        for y, row in enumerate(self.maze):
            for x, cell in enumerate(row):
                if cell == 1:
                    # walls are never occupied, so they get no entry
                    continue
                allowed_states[(y, x)] = [
                    action for action in ACTIONS
                    if self.is_allowed_move((y, x), action)
                ]
        self.allowed_states = allowed_states

    def update_maze(self, action):
        """Move the robot one step in the direction of *action* and count
        the step. The move is not validated here."""
        y, x = self.robot_position  # get current position
        self.maze[y, x] = 0  # clear the cell the robot leaves
        y += ACTIONS[action][0]  # get new position
        x += ACTIONS[action][1]
        self.robot_position = (y, x)
        self.maze[y, x] = 2  # mark the robot's new cell
        self.steps += 1

    def is_game_over(self):
        """Return True when the robot is on the goal position."""
        return self.robot_position == self.end

    def get_state_and_reward(self):
        """Return ``(robot_position, reward)`` for the current position."""
        return self.robot_position, self.give_reward()

    def give_reward(self):
        """Return 0 when the robot is at the goal and -1 otherwise."""
        return 0 if self.robot_position == self.end else -1

# Maze training script

In [None]:
# Brutally stolen and corrected from
# https://towardsdatascience.com/hands-on-introduction-to-reinforcement-learning-in-python-da07f7aaca88

# ... and a little bit modified

# Same goes for Maze and RLAgent, obviously

from Maze import Maze
from RLAgent import Agent
import matplotlib.pyplot as plt

# Train the agent on the maze for 5000 episodes and plot how the number of
# steps per episode evolves (sampled every 50 episodes).
if __name__ == '__main__':
    maze = Maze()
    robot = Agent(maze.maze, alpha=0.1, random_factor=0.4)
    moveHistory = []
    indices = []

    maze.print_maze()

    for i in range(5000):

        while not maze.is_game_over():
            state, _ = maze.get_state_and_reward()  # get the current state
            # choose an action (explore or exploit)
            action = robot.choose_action(state, maze.allowed_states[state])
            maze.update_maze(action)  # update the maze according to the action
            state, reward = maze.get_state_and_reward()  # get the new state and reward
            # update the robot memory with state and reward
            robot.update_state_history(state, reward)
            if maze.steps > 1000:
                # end the episode if it takes too long: placing the robot on
                # the goal makes is_game_over() true
                maze.robot_position = maze.end
        robot.learn()  # robot should learn after every episode
        # get a history of number of steps taken to plot later
        if i % 50 == 0:
            print(f"{i}: {maze.steps}")
            moveHistory.append(maze.steps)
            indices.append(i)
        maze = Maze()  # reinitialize the maze

    # Plot inside the guard: `indices` and `moveHistory` only exist when the
    # training loop above has run.
    plt.semilogy(indices, moveHistory, "b")
    plt.show()


# Task 4: Reinforcement Learning

##### Agent

In [None]:
class Agent(object):
    """Placeholder agent for Task 4; constructing it stores nothing."""

    def __init__(self, states, alpha=0.15, random_factor=0.2):
        # Not implemented yet: the arguments are accepted and ignored.
        pass

In [None]:
def play(N, turn):
    """Play *N* games of Nim and return how many of them player 1 won.

    Each game uses a random board of 3 or 4 rows and a random per-move cap
    ``k`` between 1 and 3.  Player 0 moves with ``optimal_strategy`` and
    player 1 with ``find_best_move``.  ``turn == 0`` lets player 1 move
    first; any other value lets player 0 move first.  The player who removes
    the last object is counted as the winner.  A non-positive *N* plays no
    games and returns 0.
    """
    countwin = 0
    for _ in range(N):
        rows = random.randint(3, 4)  # number of rows
        k = random.randint(1, 3)  # upper bound of matches taken per move
        player = 1 if turn == 0 else 0  # who starts first
        nim = Nim(rows, k)

        while nim:
            if player == 0:
                # opponent; pure_random or shortest_row can be swapped in here
                ply = optimal_strategy(nim)
            else:
                ply = find_best_move(nim)
            nim.nimming(ply)
            logging.info("Board after player %s -> %s", player, nim)
            player = 1 - player

        # `player` was flipped after the final move, so the one who emptied
        # the board is 1 - player.
        winner = 1 - player
        if winner == 1:
            countwin += 1

    return countwin

# Main

In [None]:
logging.getLogger().setLevel(logging.DEBUG)

# Each pair is (turn argument passed to play(), seat label used in the log line).
for turn, seat in ((0, "1°"), (1, "2°")):
    for N in [1]:  # number of games to play from this seat
        countwin = play(N, turn)
        win_rate = (countwin / N) * 100
        logging.info(f"Game played = {N}: Winrate {seat} player = {win_rate}% ")