In [1]:
from kaggle_environments import make
from kaggle_environments.envs.hungry_geese.hungry_geese import Observation, Configuration, Action, row_col

import numpy as np

Loading environment football failed: No module named 'gfootball'


In [2]:
def geese_heads(obs_dict, config_dict):
    """
    Return the position of the geese's heads
    """
    configuration = Configuration(config_dict)

    observation = Observation(obs_dict)
    player_index = observation.index
    player_goose = observation.geese[player_index]
    player_head = player_goose[0]
    player_row, player_column = row_col(player_head, configuration.columns)
    positions = []
    for geese in observation.geese:
        if len(geese)>0:
            geese_head = geese[0]
            row, column = row_col(geese_head, configuration.columns)
        else:
            row = None
            column = None
        positions.append((row, column))
    return positions

def get_last_actions(previous_geese_heads, heads_positions):

    def get_last_action(prev, cur):
        last_action = None

        prev_row = prev[0]
        prev_col = prev[1]
        cur_row = cur[0]
        cur_col = cur[1]

        if cur_row is not None:
            if (cur_row-prev_row == 1) | ((cur_row==0) & (prev_row==6)):
                last_action = Action.SOUTH.name
            elif (cur_row-prev_row == -1) | ((cur_row==6) & (prev_row==0)):
                last_action = Action.NORTH.name
            elif (cur_col-prev_col == 1) | ((cur_col==0) & (prev_col==10)):
                last_action = Action.EAST.name
            elif (cur_col-prev_col == -1) | ((cur_col==10) & (prev_col==0)):
                last_action = Action.WEST.name

        return last_action

    if len(previous_geese_heads) == 0:
        actions = [Action.SOUTH.name, Action.NORTH.name, Action.EAST.name, Action.WEST.name]
        nb_geeses = len(heads_positions)
        last_actions = ["None" for _ in range(nb_geeses)]
    else:   
        last_actions = [get_last_action(*pos) for pos in zip(previous_geese_heads, heads_positions)]

    return last_actions
    
def central_state_space(obs_dict, config_dict, last_actions):
    """
    Recreating a board where my agent's head in the middle of the board 
    (position (4,5)), and creating features accordingly
    """
    
    last_actions_dict = {
        Action.WEST.name: 1,
        Action.EAST.name: 2,
        Action.NORTH.name: 3,
        Action.SOUTH.name: 4,
        "None": 16
    }
    
    configuration = Configuration(config_dict)

    observation = Observation(obs_dict)
    player_index = observation.index
    player_goose = observation.geese[player_index]
    player_head = player_goose[0]
    player_row, player_column = row_col(player_head, configuration.columns)
    row_offset = player_row - 3
    column_offset = player_row - 5

    foods = observation['food']

    def centralize(row, col):
        if col > player_column:
            new_col = (5 + col - player_column) % 11
        else:
            new_col = 5 - (player_column - col)
            if new_col < 0:
                new_col += 11

        if row > player_row:
            new_row = (3 + row - player_row) % 7
        else:
            new_row = 3 - (player_row - row)
            if new_row < 0:
                new_row += 7
        return new_row, new_col

    food1_row, food1_column = centralize(*row_col(foods[0], configuration.columns))
    food2_row, food2_column = centralize(*row_col(foods[1], configuration.columns))

    food1_row_feat = float(food1_row - 3)/5 if food1_row>=3 else float(food1_row - 3)/5
    food2_row_feat = float(food2_row - 3)/5 if food2_row>=3 else float(food2_row - 3)/5

    food1_col_feat = float(food1_column - 5)/5 if food1_column>=5 else float(food1_column - 5)/5
    food2_col_feat = float(food2_column - 5)/5 if food2_column>=5 else float(food2_column - 5)/5

    # Create the grid
    board = np.zeros([7, 11])
    # Add food to board
    board[food1_row, food1_column] = 15
    board[food2_row, food2_column] = 15

    for geese_id, geese in enumerate(observation.geese):
        nb_blocks = len(geese)
        if nb_blocks > 0:
            for i, pix in enumerate(geese[::-1]):
                if ((i+1) == nb_blocks): #This is the head
                    idx = last_actions_dict[last_actions[geese_id]] #head
                else:
                    idx = (i+5) if (i+1)<15 else 14
                row, col = centralize(*row_col(pix, configuration.columns))
                board[row, col] = idx
            
    return board, len(player_goose), food1_row_feat, food1_col_feat, food2_row_feat, food2_col_feat

In [3]:
class RuleBasedAgent:
    """
    Rule based agent - 
    We will use this rule-based agent to collect state-space data and the actions to take.
    An initial neural network will be trained to learn this rule-based policy.
    The neural network will then be improved using RL methods.
    """
    def __init__(self):
        self.last_action = None
        self.last_heads_positions = []
        self.stateSpace = None
        
    def getStateSpace(self, obs_dict, config_dict):
        heads_positions = geese_heads(obs_dict, config_dict)
        last_actions = get_last_actions(self.last_heads_positions, heads_positions)
        
        board, player_goose_len, food1_row_feat, food1_col_feat, food2_row_feat, food2_col_feat = central_state_space(obs_dict, config_dict, last_actions)
        
        cur_obs = {}
        cur_obs['food1_col'] = food1_col_feat
        cur_obs['food2_col'] = food2_col_feat
        cur_obs['food1_row'] = food1_row_feat
        cur_obs['food2_row'] = food2_row_feat
        cur_obs['goose_size'] = (player_goose_len - 7) / 14
        cur_obs['board'] = board
        cur_obs['hunger'] = -1 + (float(obs_dict['step']%40)/20)
        cur_obs['step'] = (float(obs_dict['step'])/100) - 1
        
        return cur_obs, heads_positions, last_actions
    def __call__(self, obs_dict, config_dict):
        cur_obs, heads_positions, last_actions = self.getStateSpace(obs_dict, config_dict)
        
        food1_col_feat = cur_obs['food1_col'] 
        food2_col_feat = cur_obs['food2_col'] 
        food1_row_feat = cur_obs['food1_row']
        food2_row_feat = cur_obs['food2_row']
        player_goose_len = cur_obs['goose_size']
        board = cur_obs['board'] 
        cur_obs['hunger'] = -1 + (float(obs_dict['step']%40)/20)
        cur_obs['step'] = (float(obs_dict['step'])/100) - 1

        self.stateSpace = cur_obs
        
        # Prioritize food that is closer
        if (abs(food1_row_feat) + abs(food1_col_feat)) <= (abs(food2_row_feat) + abs(food2_col_feat)):
            p1_food_row_feat = food1_row_feat
            p1_food_col_feat = food1_col_feat
            p2_food_row_feat = food2_row_feat
            p2_food_col_feat = food2_col_feat
        else:
            p1_food_row_feat = food2_row_feat
            p1_food_col_feat = food2_col_feat
            p2_food_row_feat = food1_row_feat
            p2_food_col_feat = food1_col_feat
            

        action = None
        
        
        action_dict = {}

        # For each possible action, we create a value using the following logic:
            # Is action eligible? If yes, +10 000 points
            # Will the action kill us right away? if no, +1000 points
            # Is there a possibility that any other player 
                # move to that same box at that same step? If no, +100 points
            # Is this action getting us closer to the nearest food? If yes, +10 points
            # Is this action getting us closer to the other food? If yes, +1 points
            
        # We then take the action with the most points (won't kill us and
        # brings us toward food if possible)
        
        
        action_dict[Action.WEST.name] = 0
        # Is action eligible?
        if (self.last_action is None) | (self.last_action != Action.EAST.name):
            action_dict[Action.WEST.name] += 1E4
        # Will the action kill us right away?
        if (board[3, 4] == 0) | (board[3, 4] == 15):
            action_dict[Action.WEST.name] += 1E3
        # Will the action kill us on the subsequent step?:
        if (board[3, 4] == 0) | (board[3, 4] == 15):
            action_dict[Action.WEST.name] += 1E3
        # Is there a possibility that any other player 
        # move to that same box at that same step?
        if (board[3, 3] in [0, 1, 15]) & (board[4, 4] in [0, 4, 15]) & (board[2, 4] in [0, 3, 15]):
            action_dict[Action.WEST.name] += 1E2
        # Is this action getting us closer to the nearest food?
        if p1_food_col_feat < 0:
            action_dict[Action.WEST.name] += 1E1
        # Is this action getting us closer to the other food?
        if p2_food_col_feat < 0:
            action_dict[Action.WEST.name] += 1E0
            
        action_dict[Action.EAST.name] = 0
        if (self.last_action is None) | (self.last_action != Action.WEST.name):
            action_dict[Action.EAST.name] += 1E4
        if (board[3, 6] == 0) | (board[3, 6] == 15):
            action_dict[Action.EAST.name] += 1E3
        if (board[3, 7] in [0, 2, 15]) & (board[4, 6] in [0, 4, 15]) & (board[2, 6] in [0, 3, 15]):
            action_dict[Action.EAST.name] += 1E2
        if p1_food_col_feat > 0:
            action_dict[Action.EAST.name] += 1E1
        if p2_food_col_feat > 0:
            action_dict[Action.EAST.name] += 1E0
            

        action_dict[Action.NORTH.name] = 0
        if (self.last_action is None) | (self.last_action != Action.SOUTH.name):
            action_dict[Action.NORTH.name] += 1E4
        if (board[2, 5] == 0) | (board[2, 5] == 15):
            action_dict[Action.NORTH.name] += 1E3
        if (board[1, 5] in [0, 3, 15]) & (board[2, 4] in [0, 1, 15]) & (board[2, 6] in [0, 2, 15]):
            action_dict[Action.NORTH.name] += 1E2
        if p1_food_row_feat < 0:
            action_dict[Action.NORTH.name] += 1E1
        if p2_food_row_feat < 0:
            action_dict[Action.NORTH.name] += 1E0
            
        action_dict[Action.SOUTH.name] = 0
        if (self.last_action is None) | (self.last_action != Action.NORTH.name):
            action_dict[Action.SOUTH.name] += 1E4
        if (board[4, 5] == 0) | (board[4, 5] == 15):
            action_dict[Action.SOUTH.name] += 1E3
        if (board[5, 5] in [0, 4, 15]) & (board[4, 4] in [0, 1, 15]) & (board[4, 6] in [0, 2, 15]):
            action_dict[Action.SOUTH.name] += 1E2
        if p1_food_row_feat > 0:
            action_dict[Action.SOUTH.name] += 1E1
        if p2_food_row_feat > 0:
            action_dict[Action.SOUTH.name] += 1E0
        
        actions = [Action.SOUTH.name, Action.NORTH.name, Action.EAST.name, Action.WEST.name]
        values = [action_dict[action] for action in actions]
        
        action = actions[np.argmax(values)]

        
        self.last_action = action
        self.last_heads_positions = heads_positions
        return action

In [4]:
from random import choice
from copy import deepcopy
from kaggle_environments.envs.hungry_geese.hungry_geese import Observation, Configuration, Action, \
                                                                row_col, adjacent_positions, translate, min_distance
class GreedyAgent:
    def __init__(self):
        
        self.last_action = None
        self.observations = []

    def __call__(self, observation: Observation, configuration: Configuration):
        self.configuration = configuration
        
        board = np.zeros(self.configuration.rows*self.configuration.columns)
        board_shape = (self.configuration.rows, self.configuration.columns)
        
        board_heads = deepcopy(board)
        board_bodies = deepcopy(board)
        board_rewards = deepcopy(board)
        
        
        rows, columns = self.configuration.rows, self.configuration.columns

        food = observation.food
        geese = observation.geese
        
        
        opponents = [
            goose
            for index, goose in enumerate(geese)
            if index != observation.index and len(goose) > 0
        ]

        
        opponent_heads = [opponent[0] for opponent in opponents]
        # Don't move adjacent to any heads
        head_adjacent_positions = {
            opponent_head_adjacent
            for opponent_head in opponent_heads
            for opponent_head_adjacent in adjacent_positions(opponent_head, columns, rows)
        }
        
        tail_adjacent_positions ={
            opponent_tail_adjacent
            for opponent in opponents
            for opponent_tail in [opponent[-1]]
            for opponent_tail_adjacent in adjacent_positions(opponent_tail, columns, rows)
        }
        # Don't move into any bodies
        #bodies, heads = [position for goose in geese for position in goose]
        
        heads = [i[0] for i in geese if len(i)>1]
        bodies = [item for sublist in geese for item in sublist]
        
        board_bodies[list(bodies)] = 1
        board_heads[heads] = 1

        # Move to the closest food
        position = geese[observation.index][0]
        actions = {
            action: min_distance(new_position, food, columns)
            for action in Action
            for new_position in [translate(position, action, columns, rows)]
            if (
                new_position not in head_adjacent_positions and
                new_position not in bodies and
                (self.last_action is None or action != self.last_action.opposite())
            )
        }

        action = min(actions, key=actions.get) if any(actions) else choice([action for action in Action])
        
        
        cur_obs = {}
        cur_obs['head_adjacent_positions'] = head_adjacent_positions
        cur_obs['bodies'] = bodies
        cur_obs['board_bodies'] = board_bodies.reshape(board_shape)
        cur_obs['board_heads'] = board_heads.reshape(board_shape)
        cur_obs['tails'] = tail_adjacent_positions
        cur_obs['actions'] = actions
        cur_obs['action'] = action
        cur_obs['last_action'] = self.last_action
#         cur_obs['goose_size'] = player_goose_len
#         cur_obs['board'] = board
        cur_obs['cur_action'] = action
        self.observations.append(cur_obs)
        
        self.last_action = action
        return action.name


cached_greedy_agents = {}


def greedy_agent(obs, config):
    index = obs["index"]
    if index not in cached_greedy_agents:
        cached_greedy_agents[index] = GreedyAgent(Configuration(config))
    return cached_greedy_agents[index](Observation(obs))

In [5]:
from kaggle_environments import evaluate, make, utils

# Setup a hungry_geese environment.
env = make("hungry_geese", debug = True)
my_agent = RuleBasedAgent()
env.run([my_agent] + ["greedy" for i in range(7)])
env.render(mode="ipython", width=600, height=650)

Opposite action: (1, <Action.NORTH: 1>, <Action.SOUTH: 3>)
Opposite action: (4, <Action.WEST: 4>, <Action.EAST: 2>)
Goose Collision: EAST
Goose Starved: Action.NORTH
Body Hit: (6, <Action.SOUTH: 3>, 72, [61, 60, 71, 72, 6, 17])
Body Hit: (3, <Action.WEST: 4>, 52, [53, 42, 43, 54, 65, 64, 63, 52])
Body Hit: (0, <Action.EAST: 2>, 60, [59, 70, 71, 60, 61, 62, 73, 74, 75, 64])


In [14]:
my_agent.stateSpace

{'food1_col': 0.6,
 'food2_col': -0.6,
 'food1_row': 0.4,
 'food2_row': 0.4,
 'goose_size': 0.5,
 'board': array([[ 0.,  0.,  0.,  0.,  0.,  0.,  9.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  5.,  6.,  7.,  8.,  0.,  0.,  0.,  0.],
        [10., 11., 12., 13., 14., 15., 16.,  0.,  7.,  8.,  9.],
        [ 0.,  5.,  6.,  7.,  8.,  1., 17.,  0.,  6.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  9., 10., 11., 12.,  5.,  0.,  0.],
        [ 0.,  0., 15.,  0.,  0.,  0.,  0.,  4., 15.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.,  0.,  3.,  0.,  0.,  0.,  0.]]),
 'hunger': 0.75,
 'step': -0.25}

In [15]:
actions_list = np.array(['EAST',
                        'WEST',
                        'SOUTH',
                        'NORTH'])
def action_to_target(action):
    pos = np.argmax(actions_list == action)
    target = np.zeros(4)
    target[pos] = 1
    return target

def target_to_action(target):
    pos = np.argmax(target)
    return actions_list[pos]

def pred_to_action(pred):
    pos = np.random.multinomial(1, pred)
    return actions_list[pos]

In [16]:
def add_numerical(steps):
    numerical = ['food1_col',
                'food2_col',
                'food1_row',
                'food2_row',
                'goose_size',
                'hunger',
                'step']
    for step in steps:
        vector = np.zeros(len(numerical))
        for i, nm in enumerate(numerical):
            vector[i] = step['cur_state'][nm]
        step['numerical'] = vector
    return None

In [17]:
def add_embeddings(steps):
    numerical = ['food1_col',
                'food2_col',
                'food1_row',
                'food2_row',
                'goose_size',
                'hunger',
                'step']
    for step in steps:
        #vector = np.zeros(7*11, dtype=int)
        vector = []
        board = step['cur_state']['board']
        for row in range(7):
            for col in range(11):
               #vector[11*row + col] =  np.array(board[row][col], dtype=int)
               vector.append([int(board[row][col])])
        step['embeddings'] = vector
    return None

In [18]:
def add_state_value(discount, steps):
    steps_back = steps[::-1]
    v_prime = 0
    for step in steps_back:
        v = step['reward'] + discount*v_prime
        v_prime = v
        step['v'] = v
    return None

In [19]:
def process(discount, episodes):
    for episode in episodes:
        add_embeddings(episode)
        add_numerical(episode)
        add_state_value(discount, episode)
    return None

In [28]:
step_reward = 1
dying_reward = -10
step_200_reward = lambda l: l
win_game_reward = lambda step, l: (200-step) + step_200_reward(l)

discount = 0.95

nb_opponents = 1

steps_per_ep = 200
num_episodes = 200


env = make("hungry_geese", debug=True)
config = env.configuration

In [29]:
episodes = []
for ep in range(num_episodes):
    
    print('episode number: ', ep)
    steps = []
    my_agent = RuleBasedAgent()
    agents =  [my_agent] + [RuleBasedAgent() for _ in range(nb_opponents)]
    state_dict = env.reset(num_agents=nb_opponents + 1)[0]
    observation = state_dict['observation']
    my_goose_ind = observation['index']
    
    reward = state_dict['reward']
    action = state_dict['action']
    
    
    
    done = False
    for step in range(steps_per_ep):
        actions = []
        
        for i, agent in enumerate(agents):
            obs = deepcopy(observation)
            obs['index'] = i
            action = agent(obs, config)
            actions.append(action)
        
        state_dict = env.step(actions)[0]
        observation = state_dict['observation']
        my_goose_ind = observation['index']
        
        goose_length = len(observation['geese'][my_goose_ind])
        
        #new_state, _, _ = agent.getStateSpace(observation, config)
        
        #reward = state_dict['reward']
        action = state_dict['action']
        status = state_dict['status']
        
        if status != "ACTIVE":
            done = True
            
        # Check if my goose died
        if goose_length == 0:
            done = True
            reward = dying_reward
        elif (step+1) == steps_per_ep:
            reward = step_200_reward(goose_length)
            done = True
        elif status != "ACTIVE":
            reward = win_game_reward(step, goose_length)
        else:
            reward = 1
        
        steps.append({'cur_state': my_agent.stateSpace,
                                'action': action,
                                'reward': reward,
                                'new_state': '',#new_state,
                                'status': status,
                                'done': done})
        if done:
            print('Done, Step: ', step)
            print('status, ', status)
            break

        if step%50 == 0:
            print(f'We survived {step+1} steps')
    episodes.append(steps)

episode number:  0
We survived 0 steps
We survived 50 steps
Body Hit: (1, <Action.NORTH: 1>, 22, [33, 34, 23, 22, 32, 21, 20, 19, 30, 41, 42, 53, 64])
Done, Step:  89
status,  DONE
episode number:  1
We survived 0 steps
We survived 50 steps
We survived 100 steps
Body Hit: (1, <Action.EAST: 2>, 60, [59, 58, 57, 56, 67, 1, 2, 3, 69, 70, 71, 60, 49, 48, 47])
Done, Step:  141
status,  DONE
episode number:  2
We survived 0 steps
We survived 50 steps
Goose Collision: SOUTH
Done, Step:  98
status,  DONE
episode number:  3
We survived 0 steps
We survived 50 steps
Goose Collision: EAST
Done, Step:  56
status,  DONE
episode number:  4
We survived 0 steps
Body Hit: (1, <Action.EAST: 2>, 67, [66, 55, 56, 67, 1, 0, 10, 76])
Done, Step:  44
status,  DONE
episode number:  5
We survived 0 steps
We survived 50 steps
We survived 100 steps
We survived 150 steps
Done, Step:  198
status,  DONE
episode number:  6
We survived 0 steps
Goose Starved: Action.NORTH
Done, Step:  39
status,  DONE
episode number:  

status,  DONE
episode number:  60
We survived 0 steps
We survived 50 steps
Body Hit: (0, <Action.SOUTH: 3>, 15, [4, 5, 16, 15, 14, 3, 69, 68, 57, 56, 55, 66, 0])
Done, Step:  79
status,  DONE
episode number:  61
We survived 0 steps
We survived 50 steps
Goose Collision: NORTH
Done, Step:  78
status,  DONE
episode number:  62
We survived 0 steps
We survived 50 steps
We survived 100 steps
Body Hit: (0, <Action.NORTH: 1>, 14, [25, 36, 35, 24, 13, 14, 3, 2, 1, 0, 11, 12])
Done, Step:  112
status,  DONE
episode number:  63
We survived 0 steps
We survived 50 steps
Goose Collision: EAST
Done, Step:  85
status,  DONE
episode number:  64
We survived 0 steps
We survived 50 steps
We survived 100 steps
Body Hit: (0, <Action.NORTH: 1>, 7, [18, 29, 30, 41, 52, 51, 50, 49, 38, 27, 16, 17, 6, 7, 8, 9])
Done, Step:  116
status,  DONE
episode number:  65
We survived 0 steps
We survived 50 steps
Body Hit: (0, <Action.NORTH: 1>, 13, [24, 23, 12, 13, 14, 25, 36, 35, 46, 45, 44])
Done, Step:  57
status,  DON

status,  DONE
episode number:  119
We survived 0 steps
We survived 50 steps
Goose Collision: SOUTH
Done, Step:  90
status,  DONE
episode number:  120
We survived 0 steps
We survived 50 steps
We survived 100 steps
Goose Starved: Action.NORTH
Done, Step:  119
status,  DONE
episode number:  121
We survived 0 steps
We survived 50 steps
We survived 100 steps
We survived 150 steps
Done, Step:  198
status,  DONE
episode number:  122
We survived 0 steps
We survived 50 steps
Body Hit: (0, <Action.EAST: 2>, 4, [3, 2, 68, 69, 58, 59, 70, 4, 5, 6, 17, 28, 39, 40, 51, 50, 49])
Done, Step:  76
status,  DONE
episode number:  123
We survived 0 steps
Goose Collision: WEST
Goose Collision: SOUTH
Done, Step:  24
status,  DONE
episode number:  124
We survived 0 steps
We survived 50 steps
Goose Collision: SOUTH
Goose Collision: WEST
Done, Step:  86
status,  DONE
episode number:  125
We survived 0 steps
We survived 50 steps
Goose Collision: WEST
Done, Step:  73
status,  DONE
episode number:  126
We survived

We survived 50 steps
We survived 100 steps
Goose Collision: NORTH
Done, Step:  150
status,  DONE
episode number:  179
We survived 0 steps
We survived 50 steps
We survived 100 steps
Body Hit: (0, <Action.NORTH: 1>, 23, [34, 35, 36, 25, 24, 23, 22, 32, 31, 30, 29, 40, 51, 62, 73])
Done, Step:  133
status,  DONE
episode number:  180
We survived 0 steps
We survived 50 steps
We survived 100 steps
Body Hit: (1, <Action.WEST: 4>, 20, [21, 32, 31, 20, 9, 75, 64, 63, 52, 51, 50, 61, 60])
Done, Step:  103
status,  DONE
episode number:  181
We survived 0 steps
We survived 50 steps
Goose Collision: EAST
Done, Step:  72
status,  DONE
episode number:  182
We survived 0 steps
We survived 50 steps
Body Hit: (1, <Action.WEST: 4>, 9, [10, 21, 20, 9, 75, 76, 66, 0, 11, 22, 32, 43, 42])
Done, Step:  99
status,  DONE
episode number:  183
We survived 0 steps
We survived 50 steps
Goose Starved: Action.NORTH
Goose Starved: Action.NORTH
Done, Step:  79
status,  DONE
episode number:  184
We survived 0 steps
We 

In [32]:
process(discount, episodes)

In [36]:
len[episode[-1]['v'] for episode in episodes if episode[-1]['v']  == -10]

[-10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0,
 -10.0]

In [33]:
for episode in episodes:
    print(episode[-1]['v'])

124.0
74.0
-10.0
155.0
161.0
12.0
163.0
-10.0
154.0
106.0
56.0
-10.0
85.0
-10.0
41.0
-10.0
100.0
-10.0
-10.0
-10.0
118.0
-10.0
-10.0
96.0
91.0
-10.0
-10.0
-10.0
-10.0
-10.0
-10.0
-10.0
-10.0
67.0
-10.0
-10.0
-10.0
114.0
123.0
-10.0
-10.0
-10.0
-10.0
39.0
108.0
-10.0
3.0
-10.0
8.0
-10.0
-10.0
84.0
-10.0
116.0
-10.0
146.0
-10.0
-10.0
89.0
-10.0
-10.0
-10.0
-10.0
-10.0
-10.0
-10.0
124.0
-10.0
-10.0
117.0
-10.0
-10.0
-10.0
153.0
142.0
-10.0
-10.0
-10.0
-10.0
107.0
157.0
-10.0
-10.0
162.0
125.0
-10.0
150.0
-10.0
163.0
-10.0
-10.0
103.0
113.0
-10.0
100.0
-10.0
143.0
-10.0
134.0
-10.0
109.0
81.0
124.0
120.0
4.0
101.0
90.0
-10.0
147.0
-10.0
103.0
113.0
145.0
-10.0
-10.0
145.0
-10.0
-10.0
-10.0
119.0
-10.0
3.0
-10.0
-10.0
-10.0
-10.0
134.0
-10.0
88.0
-10.0
-10.0
-10.0
-10.0
-10.0
-10.0
-10.0
-10.0
4.0
118.0
133.0
-10.0
-10.0
-10.0
99.0
121.0
144.0
-10.0
-10.0
-10.0
-10.0
-10.0
123.0
154.0
-10.0
131.0
151.0
-10.0
-10.0
-10.0
63.0
-10.0
40.0
-10.0
140.0
-10.0
-10.0
-10.0
-10.0
-10.0
162.0
128.0
-

In [152]:
episodes[0]

[{'cur_state': {'food1_col': -0.4,
   'food2_col': 0.2,
   'food1_row': -0.2,
   'food2_row': -0.2,
   'goose_size': -0.42857142857142855,
   'board': array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
          [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
          [ 0.,  0.,  0., 15.,  0.,  0., 15.,  0.,  0.,  0.,  0.],
          [ 0.,  0.,  0.,  0.,  0., 16.,  0.,  0.,  0.,  0.,  0.],
          [16.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
          [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
          [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]]),
   'hunger': -1.0,
   'step': -1.0},
  'action': 'NORTH',
  'reward': 1,
  'new_state': '',
  'status': 'ACTIVE',
  'done': False,
  'embeddings': [[0],
   [0],
   [0],
   [0],
   [0],
   [0],
   [0],
   [0],
   [0],
   [0],
   [0],
   [0],
   [0],
   [0],
   [0],
   [0],
   [0],
   [0],
   [0],
   [0],
   [0],
   [0],
   [0],
   [0],
   [0],
   [15],
   [0],
   [0],
   [15],
  