# Reinforcement Learning over Simplified Neutreeko

This work consists of creating an agent capable of playing a simplified version of the game Neutreeko (https://www.neutreeko.net/neutreeko.htm).

![alt text](https://i.imgur.com/qeEH8e2.jpg)

In the simplified version, the game starts with a randomly generated 5x5 board with 3 black pieces. The agent can move any piece in one of the 4 possible directions: UP, DOWN, LEFT, RIGHT. After choosing a direction and a piece, the selected piece will move in that direction until colliding with an edge of the board or another piece.

The game ends when 200 rounds pass or when the player is able to place 3 pieces together in a row, column or diagonally.



## Necessary imports
- gym
- numpy
- matplotlib
- typing

In [None]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from typing import Tuple, List, Union

## Game Logic and Engine

For better organization, we separated some logic of the game into the class `NeutreekoUtils` and created some universal constants

In [None]:
# Unit step applied to a piece for each direction name.
# The first component is added to the first board index and the second
# component to the second board index (see `check_direction`).
EASY_ACTIONS_DICT = {
    'UP': (-1, 0),
    'DOWN': (+1, 0),
    'LEFT': (0, -1),
    'RIGHT': (0, +1),
}

# Upper bound (exclusive) used by the bounds checks on both board indices.
BOARD_SIZE = 5

In [None]:
class NeutreekoUtils:
    """
    Stateless helpers to search and edit a board stored as a 2D NumPy array.

    All bounds are derived from the board that is passed in, so the helpers
    work for any board shape, not only the 5x5 one used by the game.
    """

    @staticmethod
    def search_sequence_numpy(arr, seq) -> bool:
        """
        Find sequence in an array using NumPy only.

        :param arr: input 1D array
        :param seq: input 1D array
        :return: True if the seq appears as a contiguous run inside arr
        """
        arr = np.asarray(arr)
        seq = np.asarray(seq)
        n_arr, n_seq = arr.size, seq.size

        # A sequence longer than the array can never fit in it
        if n_seq > n_arr:
            return False

        # Row k of `windows` holds the indices k, k+1, ..., k+n_seq-1, i.e. one
        # candidate placement of the sequence inside the array.
        windows = np.arange(n_arr - n_seq + 1)[:, None] + np.arange(n_seq)

        # A placement matches when every element of the window equals the sequence
        return bool((arr[windows] == seq).all(axis=1).any())

    @staticmethod
    def find_sequence_board(board: np.array, sequence) -> bool:
        """
        Given a board, attempts to find a sequence in all possible directions
        (rows, columns, diagonals and anti-diagonals).

        :param board: 2D array holding the board
        :param sequence: 1D array with the values to look for
        :return: True if the sequence is in the board, False otherwise
        """
        board = np.asarray(board)
        n_rows, n_cols = board.shape
        search = NeutreekoUtils.search_sequence_numpy

        # Check in lines
        for row in range(n_rows):
            if search(board[row, :], sequence):
                return True

        # Check in columns
        for col in range(n_cols):
            if search(board[:, col], sequence):
                return True

        # Check in diagonals. Flipping the board left-to-right turns its
        # anti-diagonals into diagonals. Every offset is visited; diagonals
        # shorter than the sequence are rejected by the search itself.
        flipped_board = np.fliplr(board)
        for offset in range(1 - n_rows, n_cols):
            if search(np.diagonal(board, offset=offset), sequence):
                return True
            if search(np.diagonal(flipped_board, offset=offset), sequence):
                return True

        return False

    @staticmethod
    def _in_bounds(board, coords: Tuple[int, int]) -> bool:
        """
        Tells whether coords address a cell inside the board.

        :param board: 2D NumPy array
        :param coords: The two indices of a spot
        :return: True if both indices are non-negative and within the board shape
        """
        n_rows, n_cols = board.shape
        return bool(0 <= coords[0] < n_rows and 0 <= coords[1] < n_cols)

    @staticmethod
    def value_in_board(board, coords: Tuple[int, int]) -> Union[int, bool]:
        """
        Returns the value in the board

        :param board: 2D NumPy array holding the board
        :param coords: The two indices of a spot
        :return: The value stored in the board, or False if the coords are
            outside the board. Note that False compares equal to 0, so callers
            that test for an empty cell must check the bounds themselves.
        """
        if not NeutreekoUtils._in_bounds(board, coords):
            return False
        return board[coords[0], coords[1]]

    @staticmethod
    def replace_in_board(board, coords: Tuple[int, int], value: int) -> Union[None, bool]:
        """
        Replaces a value in the board (the board is modified in place)

        :param board: 2D NumPy array holding the board
        :param coords: The two indices of a spot
        :param value: the value to be in the board
        :return: None or False if the coords are not valid
        """
        if not NeutreekoUtils._in_bounds(board, coords):
            return False
        board[coords[0], coords[1]] = value

### Game Logic 

For the game logic, we created the class `NeutreekoEasyGame` where we can create a new random board, check if a move is valid, retrieve a list of all possible moves, perform a move and update the board.

In [None]:
class NeutreekoEasyGame:
    """
    Engine of the simplified, single-player Neutreeko game.

    The board is a square NumPy array where 0 marks an empty cell and 1 marks
    a piece. `reset()` must be called before any other method, since the
    constructor leaves every attribute set to None.
    """

    def __init__(self):
        self.board = None
        self.current_player = None
        self.game_over = None
        self.turns_count = None

    def reset(self) -> None:
        """
        Resets the game, with a new board, turns count to 0 and designates the first player.

        The new board is not checked for an existing line: `game_over` always
        starts as False and is only re-evaluated after a move.
        :return:
        """
        self.board = self.new_board()
        self.current_player = 1
        self.game_over = False
        self.turns_count = 0

    @staticmethod
    def new_board() -> np.array:
        """
        Returns a random starting board, each element is a numpy.int8 (-128, 127)

        Three distinct cells are drawn uniformly at random and set to 1; every
        other cell is 0.

        :return: numpy.array
        """
        board = np.zeros((BOARD_SIZE, BOARD_SIZE), dtype=np.int8)
        # Sampling flat cell indices without replacement guarantees that the
        # three pieces never share a cell, with no need for retry loops.
        cells = np.random.choice(BOARD_SIZE * BOARD_SIZE, size=3, replace=False)
        board[np.unravel_index(cells, board.shape)] = 1
        return board

    def value_in_board(self, position: Tuple[int, int]) -> int:
        """
        Returns the value in a position of the board

        :param position: Tuple with 2 ints representing the coordinates of a cell
        :return: The int value
        """
        return NeutreekoUtils.value_in_board(self.board, position)

    def replace_in_board(self, position: Tuple[int, int], value: int) -> None:
        """
        Replaces a value in a board position

        :param position: Tuple with 2 ints representing the coordinates of a cell
        :param value: Value of a player
        :return: None
        """
        NeutreekoUtils.replace_in_board(self.board, position, value)

    def free_cell(self, coords: Tuple[int, int]) -> bool:
        """
        Checks if a cell is within bounds of the board and if it is free (value is 0)

        :param coords: Tuple with 2 ints representing the coordinates of a cell
        :return: True if the cell equals 0 and is within bounds
        """
        row, col = coords
        # The explicit bounds check is required: the board lookup answers False
        # for an out-of-bounds cell, and False == 0 would read as "free".
        if not (0 <= row < BOARD_SIZE and 0 <= col < BOARD_SIZE):
            return False
        return bool(self.value_in_board(coords) == 0)

    def check_direction(self, coords: Tuple[int, int], direction: str) -> Union[None, Tuple[int, int]]:
        """
        Returns the resulting position given a starting position and a direction.
        If the direction is not valid (can't make progress in that direction), returns None

        :param coords: Coordinates of initial point
        :param direction: String representation of the direction to take
        :return: None if direction is not valid OR tuple with new coords of resulting positions
        """
        d_row, d_col = EASY_ACTIONS_DICT[direction]
        row, col = coords
        moved = False
        # Slide one cell at a time until the next cell is the end of the board
        # or is occupied by another piece.
        while self.free_cell((row + d_row, col + d_col)):
            row, col = row + d_row, col + d_col
            moved = True
        if not moved:
            return None
        return row, col

    def available_directions(self, coords: Tuple[int, int]) -> List[str]:
        """
        Finds which directions are available for a piece on the coords tuple

        :param coords: The coordinates of a piece
        :return: A list of directions
        """
        return [
            action_name
            for action_name in EASY_ACTIONS_DICT
            if self.check_direction(coords, action_name) is not None
        ]

    def get_possible_moves(self, player: int, only_valid: bool = False) -> List[int]:
        """
        Return all the possible moves for a given player with the current board

        A move is encoded as `4 * piece_index + direction_index`, where pieces
        are numbered in the order `np.where` reports them.

        :param player: Integer representing the player
        :param only_valid: returns only the valid moves
        :return: A list of ints representing possible actions
        """
        dirs_value = {
            'UP': 0,
            'DOWN': 1,
            'LEFT': 2,
            'RIGHT': 3
        }
        possible_moves = []

        # Find player piece positions
        rows, cols = np.where(self.board == player)

        # for each player piece
        for piece_value, pos in enumerate(zip(rows, cols)):
            # either only the directions in which the piece can actually move,
            # or every direction
            names = self.available_directions(pos) if only_valid else dirs_value
            possible_moves.extend(4 * piece_value + dirs_value[name] for name in names)
        return possible_moves

    def action_handler(self, pos, dir) -> Union[None, Tuple[tuple, str]]:
        """
        Effectuates the movement of the piece in pos, in the direction dir

        An invalid move leaves the board and the turn counter untouched.

        :param pos: The position of the piece that will be moved
        :param dir: The direction that the piece will be moved to
        :return: A tuple with the resulting position and the move type. None if the move is not valid
        """
        result = self.check_direction(pos, dir)
        if result is None:
            return None

        self.update_game(pos, result)
        self.turns_count += 1

        # The game is won as soon as three pieces are lined up
        self.game_over = NeutreekoUtils.find_sequence_board(self.board, np.array([1, 1, 1]))

        move_type = "win" if self.game_over else "default"
        return result, move_type

    def update_game(self, pos, result) -> None:
        """
        Replaces the piece in the board
        :param pos: initial position of the piece
        :param result: final position of the piece
        """
        self.replace_in_board(result, 1)
        self.replace_in_board(pos, 0)

    def render(self) -> None:
        """
        Renders the game on the screen
        """
        print(self.board)

## Environments

After having the game engine correctly working, we can implement the simplified Neutreeko environment as the `NeutreekoEasyEnv` class with the methods `init`, `step`, `render`, `close` and `reset`. The Environment also needs a helper class `Reward` that holds the reward for each type of move.

In [None]:
class Reward:
    """Translates the type of a move into the numeric reward given to the agent."""

    @staticmethod
    def get(move_type):
        """
        Looks up the reward associated with a kind of move
        :param move_type: label describing the move that was just played
        :return: 20 for a winning move, -1 for any other (known or unknown) label
        """
        rewards_by_move = {
            # completing the line ends the game and is the only positive reward
            "win": 20,
            # any ordinary move costs a little, so that shorter games score higher
            "default": -1,
        }
        if move_type in rewards_by_move:
            return rewards_by_move[move_type]
        # labels without a dedicated entry are treated like an ordinary move
        return -1

In [None]:
class NeutreekoEasyEnv(gym.Env):
    """
    Description:
       In a 5x5 board there are 3 black pieces in a random formation
       The black pieces can move in any direction, they can move until
       they collide with another piece or the edge of the board
       The game ends when the 3 pieces make a line in any direction
    Source:
       This environment corresponds to a simplified version of the Neutreeko game
       Specified here: https://www.neutreeko.net/neutreeko.htm
    Observation:
       Type: Discrete(2300)

        [[0, 1, 0, 1, 0],
         [0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0],
         [0, 0, 1, 0, 0],
         [0, 0, 0, 0, 0]]

       All possible board combinations (3 pieces on 25 cells: 2300 boards).
       Note that `observation`, `reset` and `step` hand out the board itself
       as a NumPy array, not an index into this space.
    Actions:
       Type: Discrete(12)

       Num   Action
       0      0-UP
       1      0-DOWN
       2      0-LEFT
       3      0-RIGHT
       4      1-UP
       5      1-DOWN
       6      1-LEFT
       7      1-RIGHT
       8      2-UP
       9      2-DOWN
       10     2-LEFT
       11     2-RIGHT

          UP   DOWN  LEFT  RIGHT
       0  0     1     2     3
       1  4     5     6     7
       2  8     9     10    11

       The piece 0 is the one with the lowest index. For a piece in coords (x, y), its index is 5*x + y.
    Reward:
       Reward class
    Starting State:
        A randomly generated board
    Episode Termination:
       The player makes 3 in a row
       Episode length is greater than 200.
   """
    metadata = {
        'render.modes': ['terminal']
    }

    def __init__(self, render_mode='terminal', max_turns=200):
        """
        Creates the environment; `reset()` must be called before the first `step()`
        :param render_mode: stored on the instance as `render_mode`
        :param max_turns: the episode is done once the turn count exceeds this value
        """
        super(NeutreekoEasyEnv, self).__init__()

        # 3 pieces and 4 directions possible
        self.action_space = gym.spaces.Discrete(3*4)
        self.observation_space = gym.spaces.Discrete(2300)

        self.render_mode = render_mode
        self.max_turns = max_turns

        self.game = NeutreekoEasyGame()

    def step(self, action: int) -> Tuple[object, float, bool, dict]:
        """
        Performs an action on the game and returns info

        A move that is not valid leaves the board unchanged and yields a
        reward of 0; `info['direction']` then stays None and
        `info['new_position']` is absent.

        :param action: integer between 0 and 11, see the class description
        :return: observation, reward, done, info
        """
        reward = 0
        info = {
            'old_state': np.copy(self.game.board),
            'turn': self.game.turns_count,
            'action': action,
            'direction': None,
        }

        assert not self.done, "step() called on a finished episode, call reset() first"

        pos, direction = self.process(action)
        move_check = self.game.action_handler(pos, direction)
        if move_check is not None:
            new_pos, move_type = move_check
            reward = Reward.get(move_type)
            info['direction'] = direction
            info['new_position'] = new_pos

        return self.observation, reward, self.done, info

    def reset(self) -> np.array:
        """
        Resets the game
        :return: the initial observation, matching the Gym API that `step` follows
        """
        self.game.reset()
        return self.observation

    def render(self, mode='terminal') -> None:
        """
        Renders the game according to the mode
        :param mode: only 'terminal' is handled; any other value renders nothing
        :return:
        """
        if mode == 'terminal':
            self.game.render()

    def close(self):
        """
        Closes the environment and terminates anything if necessary
        """
        pass

    @property
    def done(self) -> bool:
        """
        Checks if the game is done or the max turns were reached
        :return: True if the game is done, false otherwise
        """
        game_over = self.game.game_over
        # strictly greater: the episode still runs while turns_count == max_turns
        too_many_turns = (self.game.turns_count > self.max_turns)
        return game_over or too_many_turns

    @property
    def observation(self) -> np.array:
        """
        Returns the game board
        :return: A copy of the board as a numpy array
        """
        return np.copy(self.game.board)

    def process(self, action: int) -> Tuple[tuple, str]:
        """
        Convert a action into a position and direction
        :param action: A integer between 0 and 11 representing an action
        :return: A tuple with a position (tuple) and a direction
        """
        directions = ['UP', 'DOWN', 'LEFT', 'RIGHT']
        # pieces are numbered in the order np.where reports them
        rows, cols = np.where(self.game.board == 1)
        list_of_coordinates = list(zip(rows, cols))
        return list_of_coordinates[action // 4], directions[action % 4]


## Agent Analysis
### Random Agent

To test if the environment and engine are working correctly, we can use an agent that chooses random moves. The unused methods will be used for other agents

In [None]:
class RandomAgent:
    """Baseline agent: plays a random valid move and never learns anything."""

    def __init__(self):
        pass

    def choice(self, env) -> int:
        """
        Picks, at random, one of the moves that are currently valid
        :param env: A Game environment exposing `game.current_player` and `game.get_possible_moves`
        :return: the action to take
        """
        game = env.game
        candidates = game.get_possible_moves(game.current_player, only_valid=True)
        return candidates[np.random.randint(len(candidates))]

    def update(self, obs, reward, done, info, env):
        """
        Nothing to learn from a transition; present so every agent offers the same methods
        """
        pass

    def episode_update(self, episode):
        """
        Nothing to adjust between episodes; present so every agent offers the same methods
        """
        pass

With the Agent implemented, we can test run the agent, environment and engine with a game loop

In [None]:
# Game loop: plays NB_EPISODES full episodes with the random agent and records
# the total reward collected in each one.
env = NeutreekoEasyEnv(render_mode='terminal')
agent = RandomAgent()

# One entry per episode: the sum of the rewards received during that episode
training_rewards = []

NB_EPISODES = 1000
for episode in range(1, NB_EPISODES + 1):
    # A fresh random board is needed at the start of every episode
    env.reset()
    # Reward accumulated during the current episode
    total_training_rewards = 0

    done = False
    while not done:
        # choose a move, apply it, then let the agent learn from the outcome
        action = agent.choice(env)
        obs, reward, done, info = env.step(action)
        agent.update(obs, reward, done, info, env)
        total_training_rewards += reward
    print(f"Episode {episode: <4} finished after {env.game.turns_count} turns")

    # per-episode hook of the agent (does nothing for the random agent)
    agent.episode_update(episode)

    # Adding the total reward
    training_rewards.append(total_training_rewards)
env.close()

In [None]:
# Visualizing results and total reward over all episodes
# x is 0-based (0 .. NB_EPISODES - 1) while episodes were numbered from 1;
# it must have as many entries as training_rewards for the plot to work
x = range(NB_EPISODES)
plt.plot(x, training_rewards)
plt.xlabel('Episode')
plt.ylabel('Training total reward')
plt.title('Total rewards over all episodes in training')
plt.show()

With the Random Agent there isn't any kind of learning and the total reward for each episode is mostly negative.

### Q-Learning Agent

In [None]:
class QAgent:
    """
    Tabular Q-learning agent with an epsilon-greedy policy.

    Boards are mapped to consecutive row indices of the Q-table the first time
    they are seen; `board_dict` holds that mapping and `lastID` the highest
    index handed out so far.
    """

    def __init__(self, observation_space=2300, action_space=12):
        """
        Initialize an agent and its parameters

        :param observation_space: How many possible states there are
        :param action_space: How many actions there are
        """
        self.Q = np.zeros((observation_space, action_space))

        self.alpha = 0.7  # learning rate
        self.discount_factor = 0.618
        self.epsilon = 1
        self.max_epsilon = 1
        self.min_epsilon = 0.01
        self.decay = 0.001

        self.board_dict = {}
        self.lastID = None

    def _state_id(self, board) -> int:
        """
        Returns the Q-table row of a board, registering the board if it is new

        :param board: The board as a numpy array
        :return: The row index of the board in the Q-table
        """
        key = repr(board)
        state = self.board_dict.get(key)
        if state is None:
            # `is None` matters here: id 0 is a valid id, testing the
            # truthiness of lastID would hand out id 0 more than once
            state = 0 if self.lastID is None else self.lastID + 1
            self.board_dict[key] = state
            self.lastID = state
        return state

    def choice(self, env) -> int:
        """
        Given the environment, choose the action to take

        :param env: A Game environment
        :return: the action to take
        """
        # Choosing an action given the states based on a random number
        exp_exp_tradeoff = np.random.uniform(0, 1)

        state = self._state_id(env.observation)
        possible_moves = env.game.get_possible_moves(1, only_valid=True)

        if exp_exp_tradeoff > self.epsilon:
            # Exploit: best action among the valid ones only. Invalid actions
            # are never played, so their Q-value stays at its initial 0 and
            # would otherwise beat every valid move with a negative value.
            best = int(np.argmax(self.Q[state, possible_moves]))
            return int(possible_moves[best])

        # Explore: random valid action
        i_random = np.random.randint(len(possible_moves))
        return int(possible_moves[i_random])

    def update(self, obs, reward: int, done: bool, info: dict, env) -> None:
        """
        Updates the Q-table after performing an action

        :param obs: New state that resulted from a action
        :param reward: The reward returned from applying a action to a state
        :param done: Boolean representing if the episode is finished
        :param info: Additional info from performing an action
        :param env: The environment, expected to be in the state described by obs
        """
        state = self._state_id(info["old_state"])
        new_state = self._state_id(obs)
        action = info['action']

        if done:
            # Nothing can be earned after the end of the episode
            future_value = 0.0
        else:
            # Only the actions that can really be played count for the
            # value of the new state
            valid_next = env.game.get_possible_moves(1, only_valid=True)
            future_value = np.max(self.Q[new_state, valid_next]) if valid_next else 0.0

        target = reward + self.discount_factor * future_value
        self.Q[state, action] = self.Q[state, action] + self.alpha * (target - self.Q[state, action])

    def episode_update(self, episode: int) -> None:
        """
        Update internals after each episode

        :param episode: Finished episode id
        """
        # Cutting down on exploration by reducing the epsilon
        self.epsilon = self.min_epsilon + (self.max_epsilon - self.min_epsilon)*np.exp(-self.decay*episode)

With the new agent, now we try the game loop

In [None]:
# Game loop: trains the Q-learning agent for NB_EPISODES episodes and records,
# per episode, the total reward and the exploration rate that follows it.
env = NeutreekoEasyEnv(render_mode='terminal')
agent = QAgent()

# Creating lists to keep track of reward and epsilon values
training_rewards = []
epsilons = []

NB_EPISODES = 1000
for episode in range(1, NB_EPISODES + 1):
    # A fresh random board is needed at the start of every episode
    env.reset()
    # Reward accumulated during the current episode
    total_training_rewards = 0

    done = False
    while not done:
        # choose a move, apply it, then let the agent learn from the outcome
        action = agent.choice(env)
        obs, reward, done, info = env.step(action)
        agent.update(obs, reward, done, info, env)
        total_training_rewards += reward
    print(f"Episode {episode: <4} finished after {env.game.turns_count} turns")

    # lets the agent recompute its epsilon from the episode number
    agent.episode_update(episode)

    # Adding the total reward and reduced epsilon values
    training_rewards.append(total_training_rewards)
    epsilons.append(agent.epsilon)
    
# lastID is the highest Q-table row assigned to a board during training
print(f'Highest board id -> {agent.lastID}')
print(f'Q-table -> {agent.Q}')
env.close()

In [None]:
# Visualizing results and total reward over all episodes
# x is 0-based (0 .. NB_EPISODES - 1) while episodes were numbered from 1;
# it must have as many entries as training_rewards for the plot to work
x = range(NB_EPISODES)
plt.plot(x, training_rewards)
plt.xlabel('Episode')
plt.ylabel('Training total reward')
plt.title('Total rewards over all episodes in training')
plt.show()



In [None]:
# Visualizing the epsilons over all episodes
# No x values are given, so the x axis is the position in the list
# (entry 0 holds the epsilon computed after the first episode)
plt.plot(epsilons)
plt.xlabel('Episode')
plt.ylabel('Epsilon')
plt.title("Epsilon for episode")
plt.show()


### SARSA Agent



In [None]:
class SARSAAgent:
    """
    Tabular SARSA agent with an epsilon-greedy policy.

    Boards are mapped to consecutive row indices of the Q-table the first time
    they are seen; `board_dict` holds that mapping and `lastID` the highest
    index handed out so far.

    SARSA is on-policy: the action used in the update for the new state has to
    be the action that is played next. `update` therefore remembers the action
    it picked and the following call to `choice` returns that same action.
    """

    def __init__(self, observation_space=2300, action_space=12):
        """
        Initialize an agent and its parameters
        :param observation_space: How many possible states there are
        :param action_space: How many actions there are
        """
        self.Q = np.zeros((observation_space, action_space))

        self.alpha = 0.7  # learning rate
        self.discount_factor = 0.618
        self.epsilon = 1
        self.max_epsilon = 1
        self.min_epsilon = 0.01
        self.decay = 0.005

        self.board_dict = {}
        self.lastID = None

        # (board key, action) picked by update() for the new state, waiting to
        # be played by the next choice(); None when nothing is pending
        self._next_choice = None

    def _state_id(self, board) -> int:
        """
        Returns the Q-table row of a board, registering the board if it is new
        :param board: The board as a numpy array
        :return: The row index of the board in the Q-table
        """
        key = repr(board)
        state = self.board_dict.get(key)
        if state is None:
            # `is None` matters here: id 0 is a valid id, testing the
            # truthiness of lastID would hand out id 0 more than once
            state = 0 if self.lastID is None else self.lastID + 1
            self.board_dict[key] = state
            self.lastID = state
        return state

    def _select_action(self, env) -> int:
        """
        Draws an action for the current state of the environment, epsilon-greedy
        :param env: A Game environment
        :return: a valid action
        """
        # Choosing an action given the states based on a random number
        exp_exp_tradeoff = np.random.uniform(0, 1)

        state = self._state_id(env.observation)
        possible_moves = env.game.get_possible_moves(1, only_valid=True)

        if exp_exp_tradeoff > self.epsilon:
            # Exploit: best action among the valid ones only. Invalid actions
            # are never played, so their Q-value stays at its initial 0 and
            # would otherwise beat every valid move with a negative value.
            best = int(np.argmax(self.Q[state, possible_moves]))
            return int(possible_moves[best])

        # Explore: random valid action
        i_random = np.random.randint(len(possible_moves))
        return int(possible_moves[i_random])

    def choice(self, env) -> int:
        """
        Given the environment, choose the action to take
        :param env: A Game environment
        :return: the action to take
        """
        planned = self._next_choice
        self._next_choice = None
        # Play the action the last update was based on, as long as it was
        # picked for the board the environment is showing right now
        if planned is not None and planned[0] == repr(env.observation):
            return planned[1]
        return self._select_action(env)

    def update(self, obs, reward: int, done: bool, info: dict, env):
        """
        Updates the Q-table after performing an action, using the action that will be played in the new state
        :param obs: New state that resulted from a action
        :param reward: The reward returned from applying a action to a state
        :param done: Boolean representing if the episode is finished
        :param info: Additional info from performing an action
        :param env: The environment, expected to be in the state described by obs
        """
        state = self._state_id(info["old_state"])
        new_state = self._state_id(obs)
        action = info['action']

        if done:
            # No action follows the end of the episode, nothing can be earned
            self._next_choice = None
            future_value = 0.0
        else:
            new_action = self._select_action(env)
            # remember it, so that it is really the next action played
            self._next_choice = (repr(obs), new_action)
            future_value = self.Q[new_state, new_action]

        target = reward + self.discount_factor * future_value
        self.Q[state, action] = self.Q[state, action] + self.alpha * (target - self.Q[state, action])

    def episode_update(self, episode: int) -> None:
        """
        Update internals after each episode
        :param episode: Finished episode id
        """
        # Cutting down on exploration by reducing the epsilon
        self.epsilon = self.min_epsilon + (self.max_epsilon - self.min_epsilon)*np.exp(-self.decay*episode)

Final game Loop

In [None]:
# Game loop: trains the SARSA agent for NB_EPISODES episodes and records,
# per episode, the total reward and the exploration rate that follows it.
env = NeutreekoEasyEnv(render_mode='terminal')
agent = SARSAAgent()

# Creating lists to keep track of reward and epsilon values
training_rewards = []
epsilons = []

NB_EPISODES = 1000
for episode in range(1, NB_EPISODES + 1):
    # A fresh random board is needed at the start of every episode
    env.reset()
    # Reward accumulated during the current episode
    total_training_rewards = 0

    done = False
    while not done:
        # choose a move, apply it, then let the agent learn from the outcome
        action = agent.choice(env)
        obs, reward, done, info = env.step(action)
        agent.update(obs, reward, done, info, env)
        total_training_rewards += reward
    print(f"Episode {episode: <4} finished after {env.game.turns_count} turns")

    # lets the agent recompute its epsilon from the episode number
    agent.episode_update(episode)

    # Adding the total reward and reduced epsilon values
    training_rewards.append(total_training_rewards)
    epsilons.append(agent.epsilon)
    
# lastID is the highest Q-table row assigned to a board during training
print(f'Highest board id -> {agent.lastID}')
print(f'Q-table -> {agent.Q}')
env.close()

In [None]:
# Visualizing results and total reward over all episodes
# x is 0-based (0 .. NB_EPISODES - 1) while episodes were numbered from 1;
# it must have as many entries as training_rewards for the plot to work
x = range(NB_EPISODES)
plt.plot(x, training_rewards)
plt.xlabel('Episode')
plt.ylabel('Training total reward')
plt.title('Total rewards over all episodes in training')
plt.show()

In [None]:
# Visualizing the epsilons over all episodes
# No x values are given, so the x axis is the position in the list
# (entry 0 holds the epsilon computed after the first episode)
plt.plot(epsilons)
plt.xlabel('Episode')
plt.ylabel('Epsilon')
plt.title("Epsilon for episode")
plt.show()
