In [13]:
from kaggle_environments import make
from agents.short_sight_agent_no_food import ShortSightAgentNoFood

In [14]:
my_agent = ShortSightAgentNoFood(greedy=True)
my_agent.load_weights('ShortSightAgentNoFood')

In [12]:
env = make("hungry_geese", debug = True)
env.run([my_agent] + ["greedy" for i in range(7)])
env.render(mode="ipython", width=600, height=650)

Opposite action: (3, <Action.SOUTH: 3>, <Action.NORTH: 1>)
Goose Collision: NORTH
Opposite action: (4, <Action.SOUTH: 3>, <Action.NORTH: 1>)
Opposite action: (5, <Action.WEST: 4>, <Action.EAST: 2>)
Opposite action: (1, <Action.SOUTH: 3>, <Action.NORTH: 1>)
Opposite action: (7, <Action.NORTH: 1>, <Action.SOUTH: 3>)
Body Hit: (0, <Action.EAST: 2>, 24, [23, 34, 35, 24, 25])


In [23]:
from kaggle_environments import make
from kaggle_environments.envs.hungry_geese.hungry_geese import Observation, Configuration, Action, \
                                                row_col, adjacent_positions, translate, min_distance
from sklearn.model_selection import train_test_split
from agents.short_sight_agent_no_food import ShortSightAgentNoFood
from agents.greedy_agent import GreedyAgent
from utils.helpers import action_to_target
import numpy as np
from copy import deepcopy

steps_per_ep = 200
nb_opponents = 7

env = make("hungry_geese", debug=False)
config = env.configuration

validation_ratio = 0

initial_learning_rate = 0.1
def food_G(rewards):
    rewards_back = rewards[::-1]
    v_prime = 0
    g = []
    for reward in rewards_back:
        v = reward + 0.75*v_prime
        v_prime = v
        g.append(v)
    return g[::-1]

def step_G(rewards):
    rewards_back = rewards[::-1]
    v_prime = 0
    g = []
    for reward in rewards_back:
        v = reward + 0.2*v_prime
        v_prime = v
        g.append(v)
    return g[::-1]

def compute_G(rewards, discount):
    rewards_back = rewards[::-1]
    v_prime = 0
    g = []
    for reward in rewards_back:
        v = reward + discount*v_prime
        v_prime = v
        g.append(v)
    return g[::-1]

def G(food_rewards, step_rewards):
    return (np.array(food_G(food_rewards)) + step_G(np.array(step_rewards))).reshape(-1, 1)

def transform_sample(samples):
    nb_samples = len(samples)
    if validation_ratio > 0:
        train, test = train_test_split(range(nb_samples), test_size=validation_ratio)
    else:
        train = np.random.choice(nb_samples, nb_samples, replace=False)
        test = np.array([])

    forbidden = np.concatenate([sample['cur_state'][1].reshape(1, 4) for sample in samples], axis=0)
    top = np.concatenate([sample['cur_state'][0][0].reshape(1, 10) for sample in samples], axis=0)
    right = np.concatenate([sample['cur_state'][0][1].reshape(1, 10) for sample in samples], axis=0)
    bottom = np.concatenate([sample['cur_state'][0][2].reshape(1, 10) for sample in samples],
                            axis=0)
    left = np.concatenate([sample['cur_state'][0][3].reshape(1, 10) for sample in samples], axis=0)

    step_reward = [sample['step_reward'] for sample in samples]
    food_reward = [sample['food_reward'] for sample in samples]
    step_G = compute_G(step_reward, 0.2)
    food_G = compute_G(food_reward, 0.2)
    g = np.array(step_G).reshape(-1, 1) + np.array(food_G).reshape(-1, 1)
    g = (g-np.mean(g)) / (np.std(g) + 1E-5)
    y = np.concatenate([sample['action'].reshape(1, 4) for sample in samples], axis=0)
    return [forbidden[train], top[train], right[train], bottom[train], left[train],
            g[train]], y[train],\
           [forbidden[test] if len(test) > 0 else np.array([]),
            top[test] if len(test) > 0 else np.array([]),
            right[test] if len(test) > 0 else np.array([]),
            bottom[test] if len(test) > 0 else np.array([]),
            left[test] if len(test) > 0 else np.array([]),
            g[test] if len(test) > 0 else np.array([])],\
           y[test] if len(test) > 0 else np.array([])


steps = []
agents = [my_agent] + [GreedyAgent() for _ in range(nb_opponents)]

state_dict = env.reset(num_agents=nb_opponents + 1)[0]
observation = state_dict['observation']

done = False
my_agent.last_action = state_dict.action
prev_food_eaten = 1
prev_len = 1
for step in range(1, steps_per_ep):
    actions = []

    for i, agent in enumerate(agents):
        obs = deepcopy(observation)
        obs['index'] = i
        action = agent(obs, config)
        actions.append(action)

    cur_state = agents[0].stateSpace
    state_dict = env.step(actions)[0]
    observation = state_dict['observation']
    my_goose_ind = observation['index']

    my_goose_length = len(observation['geese'][my_goose_ind])

    action = state_dict['action']
    status = state_dict['status']

    if status != "ACTIVE":
        done = True
        next_state = None
    else:
        next_state = agents[0].getStateSpace(observation, action)

    # negative reward for crashing into another goose
    if (my_goose_length == 0):
        done = True
        if (prev_len == 1) and ((observation.step % 40) == 0):
            no_crash_reward = 0
        else:
            no_crash_reward = -3
    else:
        no_crash_reward = 0
    prev_len = my_goose_length
    cur_food_eaten = state_dict.reward % 100
    if cur_food_eaten > prev_food_eaten:
        food_reward = 1
    else:
        food_reward = 0
    prev_food_eaten = cur_food_eaten

    steps.append({'cur_state': cur_state,
                  'action': action_to_target(action),
                  'step_reward': no_crash_reward,
                  'food_reward': food_reward,
                  'next_state': next_state,
                  'done': done})
    if done:
        break


201
{'action': 'EAST', 'reward': 201, 'info': {}, 'observation': {'remainingOverageTime': 60, 'step': 1, 'geese': [[47], [35], [38], [64], [23], [14], [20], [53]], 'food': [5, 32], 'index': 0}, 'status': 'ACTIVE'}
301
{'action': 'EAST', 'reward': 301, 'info': {}, 'observation': {'remainingOverageTime': 60, 'step': 2, 'geese': [[48], [36], [39], [65], [24], [15], [21], [54]], 'food': [5, 32], 'index': 0}, 'status': 'ACTIVE'}
401
{'action': 'EAST', 'reward': 401, 'info': {}, 'observation': {'remainingOverageTime': 60, 'step': 3, 'geese': [[49], [37], [40], [55], [25], [16], [11], [44]], 'food': [5, 32], 'index': 0}, 'status': 'ACTIVE'}
501
{'action': 'EAST', 'reward': 501, 'info': {}, 'observation': {'remainingOverageTime': 60, 'step': 4, 'geese': [[50], [38], [41], [56], [26], [17], [12], [45]], 'food': [5, 32], 'index': 0}, 'status': 'ACTIVE'}
601
{'action': 'EAST', 'reward': 601, 'info': {}, 'observation': {'remainingOverageTime': 60, 'step': 5, 'geese': [[51], [39], [42], [57], [27],

3805
{'action': 'EAST', 'reward': 3805, 'info': {}, 'observation': {'remainingOverageTime': 60, 'step': 37, 'geese': [[48, 47, 46, 45, 44], [58], [61, 60], [10, 9, 8], [], [37, 36, 35], [43], []], 'food': [21, 30], 'index': 0}, 'status': 'ACTIVE'}
3905
{'action': 'EAST', 'reward': 3905, 'info': {}, 'observation': {'remainingOverageTime': 60, 'step': 38, 'geese': [[49, 48, 47, 46, 45], [59], [62, 61], [0, 10, 9], [], [38, 37, 36], [33], []], 'food': [21, 30], 'index': 0}, 'status': 'ACTIVE'}
4005
{'action': 'EAST', 'reward': 4005, 'info': {}, 'observation': {'remainingOverageTime': 60, 'step': 39, 'geese': [[50, 49, 48, 47, 46], [60], [63, 62], [1, 0, 10], [], [39, 38, 37], [34], []], 'food': [21, 30], 'index': 0}, 'status': 'ACTIVE'}
4104
{'action': 'EAST', 'reward': 4104, 'info': {}, 'observation': {'remainingOverageTime': 60, 'step': 40, 'geese': [[51, 50, 49, 48], [], [64], [2, 1], [], [40, 39], [], []], 'food': [21, 30], 'index': 0}, 'status': 'ACTIVE'}
4204
{'action': 'SOUTH', 're

In [17]:
steps

[{'cur_state': ((array([0., 0., 0., 0., 0., 0., 0., 0., 1., 1.]),
    array([0., 1., 0., 0., 0., 0., 1., 0., 0., 0.]),
    array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
    array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0.])),
   array([0., 0., 0., 1.]),
   array([], dtype=float64)),
  'action': array([0., 0., 1., 0.]),
  'step_reward': 0,
  'food_reward': 0,
  'next_state': ((array([0., 1., 0., 0., 0., 0., 0., 4., 0., 0.]),
    array([0., 0., 0., 1., 0., 0., 0., 0., 1., 0.]),
    array([0., 0., 0., 0., 1., 1., 1., 0., 0., 0.]),
    array([1., 0., 0., 0., 0., 0., 0., 0., 0., 1.])),
   array([0., 0., 0., 1.]),
   array([], dtype=float64)),
  'done': False},
 {'cur_state': ((array([0., 1., 0., 0., 0., 0., 0., 4., 0., 0.]),
    array([0., 0., 0., 1., 0., 0., 0., 0., 1., 0.]),
    array([0., 0., 0., 0., 1., 1., 1., 0., 0., 0.]),
    array([1., 0., 0., 0., 0., 0., 0., 0., 0., 1.])),
   array([0., 0., 0., 1.]),
   array([], dtype=float64)),
  'action': array([1., 0., 0., 0.]),
  'step_reward'

In [5]:
board = np.array([np.random.randint(100) for _ in range(77)]).reshape(7,11)

In [6]:
board

array([[21, 44, 25, 46, 27, 11, 45, 89, 32, 41, 10],
       [89, 47, 50, 79, 70,  7,  6, 85, 25, 93, 65],
       [21, 59, 48, 98, 90, 33, 26, 87, 30, 34, 52],
       [29, 18, 10, 26, 73, 83, 91, 97, 46, 36, 90],
       [87, 30, 34, 94, 97, 68,  3, 56,  1, 53, 30],
       [43, 31, 14, 16, 86, 19, 82, 38, 72, 22,  3],
       [ 5, 78, 58,  3, 55, 66, 43, 39, 37, 72, 10]])

In [14]:
top = board[1:3, 3:8].reshape(-1)
right = board[1:6, 7:5:-1].T.reshape(-1)
bottom = board[5:3:-1, 7:2:-1].reshape(-1)
left = board[5:0:-1, 3:5].T.reshape(-1)

In [27]:
top = board[0:3, 2:9].reshape(-1)
right = board[0:7, 8:5:-1].T.reshape(-1)
bottom = board[6:3:-1, 8:1:-1].reshape(-1)
left = board[np.array([6,5,4,3,2,1,0]), 2:5].T.reshape(-1)

In [28]:
left

array([58, 14, 34, 10, 48, 50, 25,  3, 16, 94, 26, 98, 79, 46, 55, 86, 97,
       73, 90, 70, 27])

In [14]:
v = np.array(list(range(10)))

In [62]:
from kaggle_environments import make
from kaggle_environments.envs.hungry_geese.hungry_geese import Observation, Configuration, Action, \
                                                row_col, adjacent_positions, translate, min_distance
from sklearn.model_selection import train_test_split
from agents.short_sight_agent_no_food import ShortSightAgentNoFood
from agents.greedy_agent import GreedyAgent
from utils.helpers import action_to_target
import numpy as np
from copy import deepcopy

steps_per_ep = 200
nb_opponents = 7

env = make("hungry_geese", debug=False)
config = env.configuration

validation_ratio = 0

initial_learning_rate = 0.001
def food_G(rewards):
    rewards_back = rewards[::-1]
    v_prime = 0
    g = []
    for reward in rewards_back:
        v = reward + 0.75*v_prime
        v_prime = v
        g.append(v)
    return g[::-1]

def step_G(rewards):
    rewards_back = rewards[::-1]
    v_prime = 0
    g = []
    for reward in rewards_back:
        v = reward + 0.2*v_prime
        v_prime = v
        g.append(v)
    return g[::-1]

def compute_G(rewards, discount):
    rewards_back = rewards[::-1]
    v_prime = 0
    g = []
    for reward in rewards_back:
        v = reward + discount*v_prime
        v_prime = v
        g.append(v)
    return g[::-1]

def G(food_rewards, step_rewards):
    return (np.array(food_G(food_rewards)) + step_G(np.array(step_rewards))).reshape(-1, 1)

def transform_sample(samples):
    nb_samples = len(samples)
    if validation_ratio > 0:
        train, test = train_test_split(range(nb_samples), test_size=validation_ratio)
    else:
        train = np.random.choice(nb_samples, nb_samples, replace=False)
        test = np.array([])

    food = np.concatenate([sample['cur_state'][2].reshape(1, 4) for sample in samples], axis=0)
    forbidden = np.concatenate([sample['cur_state'][1].reshape(1, 4) for sample in samples], axis=0)
    top = np.concatenate([sample['cur_state'][0][0].reshape(1, 3) for sample in samples], axis=0)
    right = np.concatenate([sample['cur_state'][0][1].reshape(1, 3) for sample in samples], axis=0)
    bottom = np.concatenate([sample['cur_state'][0][2].reshape(1, 3) for sample in samples], axis=0)
    left = np.concatenate([sample['cur_state'][0][3].reshape(1, 3) for sample in samples], axis=0)

    step_reward = [sample['step_reward'] for sample in samples]
    food_reward = [sample['food_reward'] for sample in samples]
    step_G = compute_G(step_reward, 0)
    food_G = 0#compute_G(food_reward, 0.75)
    g = np.array(step_G).reshape(-1, 1) + np.array(food_G).reshape(-1, 1)
    g = (g-np.mean(g)) / (np.std(g) + 1E-5)
    y = np.concatenate([sample['action'].reshape(1, 4) for sample in samples], axis=0)
    return [forbidden[train], top[train], right[train], bottom[train], left[train],
            g[train]], y[train],\
           [forbidden[test] if len(test) > 0 else np.array([]),
            top[test] if len(test) > 0 else np.array([]),
            right[test] if len(test) > 0 else np.array([]),
            bottom[test] if len(test) > 0 else np.array([]),
            left[test] if len(test) > 0 else np.array([]),
            g[test] if len(test) > 0 else np.array([])],\
           y[test] if len(test) > 0 else np.array([])


steps = []
agents = [my_agent] + [GreedyAgent() for _ in range(nb_opponents)]

state_dict = env.reset(num_agents=nb_opponents + 1)[0]
observation = state_dict['observation']

done = False
my_agent.last_action = state_dict.action
prev_food_eaten = 1
prev_len = 1
for step in range(1, steps_per_ep):
    actions = []

    for i, agent in enumerate(agents):
        obs = deepcopy(observation)
        obs['index'] = i
        action = agent(obs, config)
        actions.append(action)

    cur_state = agents[0].stateSpace

    state_dict = env.step(actions)[0]
    observation = state_dict['observation']
    my_goose_ind = observation['index']

    my_goose_length = len(observation['geese'][my_goose_ind])

    action = state_dict['action']
    status = state_dict['status']

    if status != "ACTIVE":
        done = True
        next_state = None
    else:
        next_state = agents[0].getStateSpace(observation, action)

    # negative reward for crashing into another goose
    if (my_goose_length == 0):
        done = True
        if (prev_len == 1) and ((observation.step % 40) == 0):
            no_crash_reward = 0
        else:
            no_crash_reward = -3
    else:
        no_crash_reward = 0
    prev_len = my_goose_length
    cur_food_eaten = state_dict.reward % 100
    if cur_food_eaten > prev_food_eaten:
        food_reward = 1
    else:
        food_reward = 0
    prev_food_eaten = cur_food_eaten

    steps.append({'cur_state': cur_state,
                  'action': action_to_target(action),
                  'step_reward': no_crash_reward,
                  'food_reward': food_reward,
                  'next_state': next_state,
                  'done': done})
    if done:
        break



[[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[0. 0. 1.]
[1. 0. 0.]
[0. 0. 0.]
[0. 0. 0.]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0.]]
[0. 0. 1.]
[1. 0. 0.]
[0. 0. 0.]
[0. 0. 0.]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0.]]
[0. 0. 1.]
[1. 0. 0.]
[0. 0. 0.]
[0. 0. 0.]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 

 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[0. 0. 1.]
[1. 0. 0.]
[0. 0. 0.]
[0. 0. 0.]
[[1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[0. 0. 1.]
[1. 0. 0.]
[0. 0. 0.]
[0. 0. 0.]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[0. 0. 0.]
[0. 0. 0.]
[0. 0. 1.]
[1. 0. 0.]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[0. 0. 0.]
[0. 0. 0.]
[0. 0. 1.]
[1. 0. 0.]
[[0. 0. 0. 0. 0. 0. 0. 0. 0.

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.]]
[0. 0. 0.]
[0. 0. 0.]
[0. 0. 0.]
[0. 1. 0.]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.]]
[0. 0. 0.]
[0. 0. 0.]
[0. 0. 0.]
[0. 1. 0.]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 1. 1. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.]]
[0. 0. 0.]
[0. 0. 0.]
[0. 0. 0.]
[0. 1. 0.]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]]
[0. 0. 0.]
[0. 0. 0.]
[0. 0. 0.]
[0. 1. 0.]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]]
[0. 0. 0.]
[0. 0. 0.]
[0. 0. 0.]
[0. 1. 0.]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[0. 0. 0.]
[0. 0. 0.]
[0. 0. 0.]
[0. 1. 0.]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 

 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[0. 0. 0.]
[0. 1. 1.]
[1. 0. 0.]
[0. 0. 0.]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[0. 0. 0.]
[0. 1. 1.]
[1. 0. 0.]
[0. 0. 0.]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[0. 0. 0.]
[0. 1. 1.]
[1. 0. 0.]
[0. 0. 0.]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[0. 0. 0.]
[0. 1. 1.]
[1. 0. 0.]
[0. 0. 0.]


In [70]:
done

True

In [69]:
my_goose_length

3

In [68]:
(prev_len == 1) and ((observation.step % 40) == 0)

False

In [67]:
observation.step % 40

26

In [65]:
no_crash_reward

0

In [63]:
steps

[{'cur_state': ((array([0., 0., 1.]),
    array([1., 0., 0.]),
    array([0., 0., 0.]),
    array([0., 0., 0.])),
   array([0., 0., 0., 1.]),
   array([-0.33333333, -0.2       ,  0.5       ,  1.        ])),
  'action': array([1., 0., 0., 0.]),
  'step_reward': 0,
  'food_reward': 0,
  'next_state': ((array([0., 0., 1.]),
    array([1., 0., 0.]),
    array([0., 0., 0.]),
    array([0., 0., 0.])),
   array([0., 1., 0., 0.]),
   array([-0.33333333, -0.25      ,  0.5       ,  0.5       ])),
  'done': False},
 {'cur_state': ((array([0., 0., 1.]),
    array([1., 0., 0.]),
    array([0., 0., 0.]),
    array([0., 0., 0.])),
   array([0., 1., 0., 0.]),
   array([-0.33333333, -0.25      ,  0.5       ,  0.5       ])),
  'action': array([1., 0., 0., 0.]),
  'step_reward': 0,
  'food_reward': 0,
  'next_state': ((array([0., 0., 1.]),
    array([1., 0., 0.]),
    array([0., 0., 0.]),
    array([0., 0., 0.])),
   array([0., 1., 0., 0.]),
   array([-0.33333333, -0.33333333,  0.5       ,  0.33333333]))

In [64]:
len(steps)

66

In [14]:
shuffled = np.random.choice(10, 8, replace=False)

In [15]:
shuffled

array([6, 1, 5, 8, 7, 9, 3, 0])

In [16]:
v[shuffled]

array([6, 1, 5, 8, 7, 9, 3, 0])

In [17]:
v[~shuffled]

array([3, 8, 4, 1, 2, 0, 6, 9])

In [1]:
my_agent = ShortSightAgent(greedy=False)

NameError: name 'ShortSightAgent' is not defined

In [11]:
from kaggle_environments import make
from kaggle_environments.envs.hungry_geese.hungry_geese import Observation, Configuration, Action, \
                                                row_col, adjacent_positions, translate, min_distance
from sklearn.model_selection import train_test_split
from agents.short_sight_agent_no_food import ShortSightAgentNoFood
from agents.greedy_agent import GreedyAgent
from utils.helpers import action_to_target
import numpy as np
from copy import deepcopy

steps_per_ep = 200
nb_opponents = 1

env = make("hungry_geese", debug=False)
config = env.configuration

validation_ratio = 0

initial_learning_rate = 0.001
def food_G(rewards):
    rewards_back = rewards[::-1]
    v_prime = 0
    g = []
    for reward in rewards_back:
        v = reward + 0.75*v_prime
        v_prime = v
        g.append(v)
    return g[::-1]

def step_G(rewards):
    rewards_back = rewards[::-1]
    v_prime = 0
    g = []
    for reward in rewards_back:
        v = reward + 0.2*v_prime
        v_prime = v
        g.append(v)
    return g[::-1]

def compute_G(rewards, discount):
    rewards_back = rewards[::-1]
    v_prime = 0
    g = []
    for reward in rewards_back:
        v = reward + discount*v_prime
        v_prime = v
        g.append(v)
    return g[::-1]

def G(food_rewards, step_rewards):
    return (np.array(food_G(food_rewards)) + step_G(np.array(step_rewards))).reshape(-1, 1)

def transform_sample(samples):
    nb_samples = len(samples)
    if validation_ratio > 0:
        train, test = train_test_split(range(nb_samples), test_size=validation_ratio)
    else:
        train = np.random.choice(nb_samples, nb_samples, replace=False)
        test = np.array([])

    food = np.concatenate([sample['cur_state'][1].reshape(1, 4) for sample in samples], axis=0)
    forbidden = np.concatenate([sample['cur_state'][1].reshape(1, 4) for sample in samples], axis=0)
    embedding = np.concatenate([sample['cur_state'][0].reshape(1, 24) for sample in samples],
                               axis=0)
    step_reward = [sample['step_reward'] for sample in samples]
    food_reward = [sample['food_reward'] for sample in samples]
    step_G = compute_G(step_reward, 0.2)
    food_G = compute_G(food_reward, 0.75)
    g = np.array(step_G).reshape(-1, 1) + np.array(food_G).reshape(-1, 1)
    g = (g-np.mean(g)) / (np.std(g) + 1E-5)
    y = np.concatenate([sample['action'].reshape(1, 4) for sample in samples], axis=0)
    return [forbidden[train], embedding[train], food[train], g[train]], y[train],\
           [forbidden[test] if len(test) > 0 else np.array([]),
            embedding[test] if len(test) > 0 else np.array([]),
            food[test] if len(test) > 0 else np.array([]),
            g[test] if len(test) > 0 else np.array([])],\
           y[test] if len(test) > 0 else np.array([])

def run_game(nb_opponents, my_agent):
    steps = []
    agents = [my_agent] + [GreedyAgent() for _ in range(nb_opponents)]

    state_dict = env.reset(num_agents=nb_opponents + 1)[0]
    observation = state_dict['observation']

    done = False
    my_agent.last_action = state_dict.action
    prev_food_eaten = 1
    prev_len = 1
    for step in range(1, steps_per_ep):
        actions = []

        for i, agent in enumerate(agents):
            obs = deepcopy(observation)
            obs['index'] = i
            action = agent(obs, config)
            actions.append(action)

        cur_state = agents[0].stateSpace

        state_dict = env.step(actions)[0]
        observation = state_dict['observation']
        my_goose_ind = observation['index']

        my_goose_length = len(observation['geese'][my_goose_ind])

        action = state_dict['action']
        status = state_dict['status']

        if status != "ACTIVE":
            done = True
            next_state = None
        else:
            next_state = agents[0].getStateSpace(observation, action)

        # negative reward for crashing into another goose
        if (my_goose_length == 0):
            done = True
            if (prev_len == 1) and ((observation.step % 40) == 0):
                no_crash_reward = 0
            else:
                no_crash_reward = -3
        else:
            no_crash_reward = 0
        prev_len = my_goose_length
        cur_food_eaten = state_dict.reward % 100
        if cur_food_eaten > prev_food_eaten:
            food_reward = 1
        else:
            food_reward = 0
        prev_food_eaten = cur_food_eaten

        steps.append({'cur_state': cur_state,
                      'action': action_to_target(action),
                      'step_reward': no_crash_reward,
                      'food_reward': food_reward,
                      'next_state': next_state,
                      'done': done})
        if done:
            break

    return steps



In [15]:
steps = []
agents = [my_agent] + [GreedyAgent() for _ in range(nb_opponents)]

state_dict = env.reset(num_agents=nb_opponents + 1)[0]
observation = state_dict['observation']

done = False
my_agent.last_action = state_dict.action
prev_food_eaten = 1
prev_len = 1
for step in range(1, steps_per_ep):
    actions = []

    for i, agent in enumerate(agents):
        obs = deepcopy(observation)
        obs['index'] = i
        action = agent(obs, config)
        actions.append(action)

    cur_state = agents[0].stateSpace

    state_dict = env.step(actions)[0]
    observation = state_dict['observation']
    my_goose_ind = observation['index']

    my_goose_length = len(observation['geese'][my_goose_ind])

    action = state_dict['action']
    status = state_dict['status']

    if status != "ACTIVE":
        done = True
        next_state = None
    else:
        next_state = agents[0].getStateSpace(observation, action)

    # negative reward for crashing into another goose
    if (my_goose_length == 0):
        done = True
        if (prev_len == 1) and ((observation.step % 40) == 0):
            no_crash_reward = 0
        else:
            no_crash_reward = -3
    else:
        no_crash_reward = 0
    prev_len = my_goose_length
    cur_food_eaten = state_dict.reward % 100
    if cur_food_eaten > prev_food_eaten:
        food_reward = 1
    else:
        food_reward = 0
    prev_food_eaten = cur_food_eaten

    steps.append({'cur_state': cur_state,
                  'action': action_to_target(action),
                  'step_reward': no_crash_reward,
                  'food_reward': food_reward,
                  'next_state': next_state,
                  'done': done,
                 's':state_dict})
    if done:
        break


In [16]:
steps

[{'cur_state': (array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]),
   array([0., 0., 0., 1.]),
   array([-0.33333333, -0.5       ,  0.5       , -1.        ])),
  'action': array([1., 0., 0., 0.]),
  'step_reward': 0,
  'food_reward': 0,
  'next_state': (array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]),
   array([0., 1., 0., 0.]),
   array([ 0.5       ,  0.        , -0.33333333, -1.        ])),
  'done': False,
  's': {'action': 'WEST',
   'reward': 201,
   'info': {},
   'observation': {'remainingOverageTime': 60,
    'step': 1,
    'geese': [[3], [20]],
    'food': [46, 25],
    'index': 0},
   'status': 'ACTIVE'}},
 {'cur_state': (array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]),
   array([0., 1., 0., 0.]),
   array([ 0.5       ,  0.        , -0.33333333, -1.        ])),
  'action': array([0.,

In [8]:
my_agent = ShortSightAgent(greedy=False)

In [9]:
nb_opponents = 1

In [10]:
steps = []
agents = [my_agent] + [GreedyAgent() for _ in range(nb_opponents)]

state_dict = env.reset(num_agents=nb_opponents + 1)[0]
observation = state_dict['observation']

done = False
my_agent.last_action = state_dict.action
prev_food_pos = observation.food
for step in range(1, steps_per_ep):
    actions = []

    for i, agent in enumerate(agents):
        obs = deepcopy(observation)
        obs['index'] = i
        action = agent(obs, config)
        actions.append(action)

    cur_state = agents[0].stateSpace

    state_dict = env.step(actions)[0]
    observation = state_dict['observation']
    my_goose_ind = observation['index']

    my_goose_length = len(observation['geese'][my_goose_ind])

    action = state_dict['action']
    status = state_dict['status']

    if status != "ACTIVE":
        done = True
        next_state = None
    else:
        next_state = agents[0].getStateSpace(observation, action)

    # Check if my goose died
    if my_goose_length == 0:
        done = True
        reward = 0
        food_reward = 0
    else:
        reward = 1
        cur_head_pos = observation['geese'][my_goose_ind][0]
        if cur_head_pos in prev_food_pos:
            food_reward = 2
        else:
            food_reward = 0
    prev_food_pos = observation.food

    steps.append({'cur_state': cur_state,
                  'action': action_to_target(action),
                  'step_reward': reward,
                  'food_reward': food_reward,
                  'next_state': next_state,
                  'done': done})
    if done:
        break

ValueError: too many values to unpack (expected 2)

In [13]:
from kaggle_environments import make
from kaggle_environments.envs.hungry_geese.hungry_geese import Observation, Configuration, Action, \
                                                row_col, adjacent_positions, translate, min_distance
from sklearn.model_selection import train_test_split
from agents.short_sight_agent_no_food import ShortSightAgentNoFood
from agents.greedy_agent import GreedyAgent
from utils.helpers import action_to_target
import numpy as np
from copy import deepcopy

steps_per_ep = 200
nb_opponents = 7

env = make("hungry_geese", debug=False)
config = env.configuration

validation_ratio = 0

initial_learning_rate = 0.01
def food_G(rewards):
    rewards_back = rewards[::-1]
    v_prime = 0
    g = []
    for reward in rewards_back:
        v = reward + 0.75*v_prime
        v_prime = v
        g.append(v)
    return g[::-1]

def step_G(rewards):
    rewards_back = rewards[::-1]
    v_prime = 0
    g = []
    for reward in rewards_back:
        v = reward + 0.1*v_prime
        v_prime = v
        g.append(v)
    return g[::-1]

def G(food_rewards, step_rewards):
    return (np.array(food_G(food_rewards)) + step_G(np.array(step_rewards))).reshape(-1, 1)

def transform_sample(samples):
    nb_samples = len(samples)
    if validation_ratio > 0:
        train, test = train_test_split(range(nb_samples), test_size=validation_ratio)
    else:
        train = np.random.choice(nb_samples, nb_samples, replace=False)
        test = np.array([])

    numerical = np.concatenate([sample['cur_state'][1].reshape(1, 4) for sample in samples], axis=0)
    embedding = np.concatenate([sample['cur_state'][0].reshape(1, 24) for sample in samples],
                               axis=0)
    step_reward = [sample['step_reward'] for sample in samples]
    food_reward = [sample['food_reward'] for sample in samples]
    g = np.array(step_reward).reshape(-1, 1)
    g = (g-np.mean(g)) / (np.std(g) + 1E-5)
    y = np.concatenate([sample['action'].reshape(1, 4) for sample in samples], axis=0)
    return [numerical[train], embedding[train], g[train]], y[train],\
           [numerical[test] if len(test) > 0 else np.array([]),
            embedding[test] if len(test) > 0 else np.array([]),
            g[test] if len(test) > 0 else np.array([])],\
           y[test] if len(test) > 0 else np.array([])

def run_game(nb_opponents, my_agent):
    steps = []
    agents = [my_agent] + [GreedyAgent() for _ in range(nb_opponents)]

    state_dict = env.reset(num_agents=nb_opponents + 1)[0]
    observation = state_dict['observation']

    done = False
    my_agent.last_action = state_dict.action
    prev_food_eaten = 0
    for step in range(1, steps_per_ep):
        actions = []

        for i, agent in enumerate(agents):
            obs = deepcopy(observation)
            obs['index'] = i
            action = agent(obs, config)
            actions.append(action)

        cur_state = agents[0].stateSpace

        state_dict = env.step(actions)[0]
        observation = state_dict['observation']
        my_goose_ind = observation['index']

        my_goose_length = len(observation['geese'][my_goose_ind])

        action = state_dict['action']
        status = state_dict['status']

        if status != "ACTIVE":
            done = True
            next_state = None
        else:
            next_state = agents[0].getStateSpace(observation, action)

        # Check if my goose died
        if my_goose_length == 0:
            done = True
            reward = -1
        else:
            reward = 0

        cur_food_eaten = state_dict.reward % 100
        if cur_food_eaten > prev_food_eaten:
            food_reward = 2
        else:
            food_reward = 0
        prev_food_eaten = cur_food_eaten

        steps.append({'cur_state': cur_state,
                      'action': action_to_target(action),
                      'step_reward': reward,
                      'food_reward': food_reward,
                      'next_state': next_state,
                      'done': done})
        if done:
            break

    return steps






In [25]:
my_agent = ShortSightAgentNoFood(greedy=True)
my_agent.load_weights('ShortSightAgentNoFood')

In [31]:
steps_per_ep = 200

In [14]:
steps = []
agents = [my_agent] + [GreedyAgent() for _ in range(nb_opponents)]

state_dict = env.reset(num_agents=nb_opponents + 1)[0]
observation = state_dict['observation']

done = False
my_agent.last_action = state_dict.action
prev_food_eaten = 1
for step in range(0, steps_per_ep):
    actions = []

    for i, agent in enumerate(agents):
        obs = deepcopy(observation)
        obs['index'] = i
        action = agent(obs, config)
        actions.append(action)

    cur_state = agents[0].stateSpace

    state_dict = env.step(actions)[0]
    observation = state_dict['observation']
    my_goose_ind = observation['index']

    my_goose_length = len(observation['geese'][my_goose_ind])

    action = state_dict['action']
    status = state_dict['status']

    if status != "ACTIVE":
        done = True
        next_state = None
    else:
        next_state = agents[0].getStateSpace(observation, action)

    # Check if my goose died
    if my_goose_length == 0:
        done = True
        reward = -1
    else:
        reward = 0

    cur_food_eaten = state_dict.reward % 100
    if cur_food_eaten > prev_food_eaten:
        food_reward = 2
    else:
        food_reward = 0
    prev_food_eaten = cur_food_eaten

    steps.append({'cur_state': cur_state,
                  'action': action_to_target(action),
                  'step_reward': reward,
                  'food_reward': food_reward,
                  'next_state': next_state,
                  'done': done})
    if done:
        break

In [15]:
steps

[{'cur_state': (array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]),
   array([0., 0., 0., 1.]),
   array([ 0.2, -0.2, -0.2,  0.8])),
  'action': array([1., 0., 0., 0.]),
  'step_reward': 0,
  'food_reward': 0,
  'next_state': (array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]),
   array([0., 1., 0., 0.]),
   array([ 0.2,  0. , -0.2,  1. ])),
  'done': False},
 {'cur_state': (array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]),
   array([0., 1., 0., 0.]),
   array([ 0.2,  0. , -0.2,  1. ])),
  'action': array([0., 0., 0., 1.]),
  'step_reward': 0,
  'food_reward': 2,
  'next_state': (array([0., 0., 0., 0., 0., 0., 0., 3., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]),
   array([0., 0., 1., 0.]),
   array([-0.6,  0. , -0.4,  1. ])),
  'done': False},
 {'cur_state': (array([0

In [34]:
state_dict = env.reset(num_agents=nb_opponents + 1)[0]

In [36]:
env.reset(num_agents=nb_opponents + 1)

[{'action': 'NORTH',
  'reward': 0,
  'info': {},
  'observation': {'remainingOverageTime': 60,
   'step': 0,
   'geese': [[34], [62], [64], [27], [10], [12], [42], [32]],
   'food': [74, 60],
   'index': 0},
  'status': 'ACTIVE'},
 {'action': 'NORTH',
  'reward': 0,
  'info': {},
  'observation': {'remainingOverageTime': 60, 'index': 1},
  'status': 'ACTIVE'},
 {'action': 'NORTH',
  'reward': 0,
  'info': {},
  'observation': {'remainingOverageTime': 60, 'index': 2},
  'status': 'ACTIVE'},
 {'action': 'NORTH',
  'reward': 0,
  'info': {},
  'observation': {'remainingOverageTime': 60, 'index': 3},
  'status': 'ACTIVE'},
 {'action': 'NORTH',
  'reward': 0,
  'info': {},
  'observation': {'remainingOverageTime': 60, 'index': 4},
  'status': 'ACTIVE'},
 {'action': 'NORTH',
  'reward': 0,
  'info': {},
  'observation': {'remainingOverageTime': 60, 'index': 5},
  'status': 'ACTIVE'},
 {'action': 'NORTH',
  'reward': 0,
  'info': {},
  'observation': {'remainingOverageTime': 60, 'index': 6},

In [35]:
state_dict

{'action': 'NORTH',
 'reward': 0,
 'info': {},
 'observation': {'remainingOverageTime': 60,
  'step': 0,
  'geese': [[49], [50], [6], [24], [58], [19], [20], [65]],
  'food': [51, 17],
  'index': 0},
 'status': 'ACTIVE'}

In [33]:
state_dict

{'action': 'NORTH',
 'reward': 2401,
 'info': {},
 'observation': {'remainingOverageTime': 60,
  'step': 24,
  'geese': [[], [44], [73], [], [47], [26, 15], [31, 20, 19], [66, 55]],
  'food': [61, 48],
  'index': 0},
 'status': 'DONE'}

In [5]:
from tensorflow.keras.layers import Dense, Input, Embedding, Flatten, concatenate
from tensorflow.keras import Model, Sequential
import tensorflow as tf

from utils.state_space import *
from utils.helpers import pred_to_action, pred_to_action_greedy, target_to_action
from tensorflow.python.framework.ops import disable_eager_execution

import pickle
disable_eager_execution()

In [8]:
forbidden_action = Input(shape=(4,))
food_pos = Input(shape=(4,))
embedding = Embedding(4, 1, input_length=3)
dense_logit = Dense(1, activation='linear')

top = Input(shape=(4,))
right = Input(shape=(4,))
bottom = Input(shape=(4,))
left = Input(shape=(4,))

top_embeddings = Flatten()(embedding(top))
right_embeddings = Flatten()(embedding(right))
bottom_embeddings = Flatten()(embedding(bottom))
left_embeddings = Flatten()(embedding(left))

top_logit = dense_logit(top_embeddings)
right_logit = dense_logit(right_embeddings)
bottom_logit = dense_logit(bottom_embeddings)
left_logit = dense_logit(left_embeddings)

logits = concatenate([left_logit, right_logit, top_logit, bottom_logit])

In [None]:
forbidden_action = Input(shape=(4,))
food_pos = Input(shape=(4,))
embedding = Embedding(4, 1, input_length=3)
dense_logit = Dense(1, activation='linear')

top = Input(shape=(4,))
right = Input(shape=(4,))
bottom = Input(shape=(4,))
left = Input(shape=(4,))

top_embeddings = Flatten()(embedding(top))
right_embeddings = Flatten()(embedding(right))
bottom_embeddings = Flatten()(embedding(bottom))
left_embeddings = Flatten()(embedding(left))

top_logit = dense_logit(top_embeddings)

top = Sequential()
embedding._name = f'embeddings_1'
top.add(embedding)
top.add(Flatten())
top.add(dense_logit)

right = Sequential()
embedding._name = f'embeddings_2'
right.add(embedding)
right.add(Flatten())
right.add(dense_logit)

bottom = Sequential()
embedding._name = f'embeddings_3'
bottom.add(embedding)
bottom.add(Flatten())
bottom.add(dense_logit)

left = Sequential()
embedding._name = f'embeddings_4'
left.add(embedding)
left.add(Flatten())
left.add(dense_logit)

#m.add(Dense(10, activation='elu'))

food_m = Sequential()
food_m.add(food_pos)
#food_m.add(Dense(4, activation='elu'))


logits = concatenate(top.outputs + right.outputs + bottom.outputs + left.outputs)

# m.add(Dense(4, activation='linear'))

inputs = [forbidden_action, top.input, right.input, bottom.input, left.input]

# c = concatenate(outputs)
# pred = Dense(4, activation='linear')(c)
no_action = tf.math.multiply(forbidden_action, -10000)
pred = tf.math.add(logits, no_action)

G = Input(shape=(1, ))
G_input = [G]

def custom_loss(y_true, y_pred, G, numerical):
    log_softmax = tf.math.log_softmax(y_pred, axis=1)
    selected_action = tf.math.multiply(y_true, log_softmax)
    selected_action_weighted = tf.math.multiply(selected_action, G)
    # selected_action = tf.math.reduce_sum(tf.math.multiply(y_true, log_softmax), axis=1)
    # selected_action_weighted = tf.math.multiply(tf.reshape(selected_action, [-1]),
    #                                             tf.reshape(G, [-1]))
    possible_actions = tf.ones(shape=tf.shape(numerical)) - numerical
    softmax = tf.math.softmax(y_pred)
    entropy = -tf.reduce_mean(tf.math.multiply(tf.math.multiply(log_softmax, softmax),
                                               possible_actions))
    J = tf.math.reduce_mean(selected_action_weighted) + entropy_reg*entropy
    l = -J
    return l

def reinforce_loss(y_true, y_pred):
    return custom_loss(y_true, y_pred, G, forbidden_action)

cur_loss = reinforce_loss

m = Model([inputs] + G_input, pred)

optimizer = tf.keras.optimizers.Adam(lr=lr)
m.compile(optimizer=optimizer,
          loss=cur_loss,
          metrics=[],
          experimental_run_tf_function=False)

In [2]:
numerical = Input(shape=(4,))
embedding = Embedding(2, 1, input_length=8)
m = Sequential()
m.add(embedding)
m.add(Flatten())

In [3]:
m.add(Dense(4, activation='linear'))

In [4]:
no_action = tf.math.multiply(numerical, -1000)
pred = tf.math.add(m.outputs, no_action)

In [5]:
G = Input(shape=(1, ))
G_input = [G]

In [6]:
def custom_loss(y_true, y_pred, G):
    log_softmax = tf.math.log_softmax(y_pred, axis=1)
    selected_action = tf.math.reduce_sum(tf.math.multiply(y_true, log_softmax), axis=1)
    selected_action_weighted = tf.math.multiply(selected_action, G)
    J = tf.math.reduce_mean(selected_action_weighted)
    l = -J
    return l

def reinforce_loss(y_true, y_pred):
    return custom_loss(y_true, y_pred, G)

cur_loss = reinforce_loss

In [7]:
m = Model([numerical, m.input] + G_input, pred)

In [8]:
optimizer = tf.keras.optimizers.Adam()
m.compile(optimizer=optimizer,
          loss=cur_loss,
          metrics=[],
          experimental_run_tf_function=False)

In [15]:
pred = m.predict([np.ones(4).reshape(-1, 4), np.zeros(8).reshape(-1,8), np.zeros(1).reshape(-1,1)])

In [None]:

#c = concatenate(outputs)
logit = Dense(4, activation='linear')(m.outputs)
no_action = tf.math.multiply(numerical, -1000)
pred = tf.math.add(logit, no_action)

G = Input(shape=(1, ))
G_input = [G]

In [21]:
np.argmax(np.random.multinomial(1, pred[0][0]))

ValueError: pvals < 0, pvals > 1 or pvals contains NaNs

In [20]:
pred[0][0]

array([-1000.0321 , -1000.03546,  -999.9579 , -1000.037  ], dtype=float32)

In [22]:
def softmax(x):
    z = x - np.max(x)
    return np.exp(z) / np.sum(np.exp(z))

pred

array([[[-1000.0321 , -1000.03546,  -999.9579 , -1000.037  ]]],
      dtype=float32)

In [23]:
softmax(pred)

array([[[0.24577153, 0.24494785, 0.2647063 , 0.24457435]]], dtype=float32)

In [26]:
np.argmax(np.random.multinomial(1, softmax(pred)[0][0]))

1

In [1]:
from tensorflow.keras.layers import Dense, Input, Embedding, Flatten, concatenate
from tensorflow.keras import Model, Sequential
import tensorflow as tf

from utils.state_space import *
from utils.helpers import pred_to_action, pred_to_action_greedy, target_to_action
from tensorflow.python.framework.ops import disable_eager_execution

import pickle

Loading environment football failed: No module named 'gfootball'


In [64]:

def custom_loss(y_true, y_pred, G):
    log_softmax = tf.math.log_softmax(y_pred, axis=1)
    selected_action = tf.math.multiply(y_true, log_softmax)
    selected_action_weighted = tf.math.multiply(selected_action, G)
    return selected_action_weighted, selected_action, G

In [65]:
y_true = np.zeros(8).reshape(-1, 4)
y_true[0][0] = 1
y_true[1][1] = 1

y_pred = np.array([float(np.random.randint(1,10)) for _ in range(8)]).reshape(-1, 4)

G = np.array([2,3]).reshape(-1, 1)

In [66]:
custom_loss(y_true, y_pred, G)

(<tf.Tensor: shape=(2, 4), dtype=float64, numpy=
 array([[-5.51810126, -0.        , -0.        , -0.        ],
        [-0.        , -0.99423485, -0.        , -0.        ]])>,
 <tf.Tensor: shape=(2, 4), dtype=float64, numpy=
 array([[-2.75905063, -0.        , -0.        , -0.        ],
        [-0.        , -0.33141162, -0.        , -0.        ]])>,
 array([[2],
        [3]]))

In [20]:
np.log(np.exp(y_pred) / np.sum(np.exp(y_pred)))

array([[-6.23331344, -5.23331344, -2.23331344, -0.23331344],
       [-8.23331344, -4.23331344, -3.23331344, -3.23331344]])

In [50]:
log_softmax = tf.math.log_softmax(y_pred, axis=1)

In [51]:
log_softmax

<tf.Tensor: shape=(2, 4), dtype=float64, numpy=
array([[-6.13501328, -5.13501328, -2.13501328, -0.13501328],
       [-5.86483632, -1.86483632, -0.86483632, -0.86483632]])>

In [57]:
selected_action = tf.math.reduce_sum(tf.math.multiply(y_true, log_softmax), axis=1)

In [53]:
tf.math.multiply(y_true, log_softmax)

<tf.Tensor: shape=(2, 4), dtype=float64, numpy=
array([[-6.13501328, -0.        , -0.        , -0.        ],
       [-0.        , -1.86483632, -0.        , -0.        ]])>

In [54]:
tf.math.reduce_sum(tf.math.multiply(y_true, log_softmax), axis=1)

<tf.Tensor: shape=(2,), dtype=float64, numpy=array([-6.13501328, -1.86483632])>

In [27]:
y_true

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.]])

In [29]:
selected_action

<tf.Tensor: shape=(2,), dtype=float64, numpy=array([-6.13501328, -1.86483632])>

In [31]:
tf.math.multiply(y_true, log_softmax)

<tf.Tensor: shape=(2, 4), dtype=float64, numpy=
array([[-6.13501328, -0.        , -0.        , -0.        ],
       [-0.        , -1.86483632, -0.        , -0.        ]])>

In [32]:
tf.math.reduce_sum(tf.math.multiply(y_true, log_softmax), axis=1)

<tf.Tensor: shape=(2,), dtype=float64, numpy=array([-6.13501328, -1.86483632])>

In [34]:
G.reshape(-1)

array([2, 3])

In [41]:
tf.math.multiply(selected_action, G)

<tf.Tensor: shape=(2, 2), dtype=float64, numpy=
array([[-12.27002656,  -3.72967265],
       [-18.40503984,  -5.59450897]])>

In [43]:
selected_action

<tf.Tensor: shape=(2,), dtype=float64, numpy=array([-6.13501328, -1.86483632])>

In [1]:
l = []

In [2]:

l.append(1)

In [3]:

l.append(2)

In [4]:

l

[1, 2]

In [5]:
l.append(3)

In [6]:
l

[1, 2, 3]

In [7]:
l[-2:]

[2, 3]

In [40]:
m.outputs[0]

<tf.Tensor 'dense_15/Elu:0' shape=(None, 10) dtype=float32>

In [41]:
food_m.outputs

[<tf.Tensor 'dense_16/Elu:0' shape=(None, 4) dtype=float32>]

In [43]:
forbidden_action = Input(shape=(4,))
food_pos = Input(shape=(4,))
embedding = Embedding(4, 1, input_length=24)
m = Sequential()
m.add(embedding)
m.add(Flatten())
# m.add(Dense(30, activation='elu'))
# m.add(Dense(20, activation='elu'))
m.add(Dense(10, activation='elu'))

food_m = Sequential()
food_m.add(food_pos)
food_m.add(Dense(4, activation='elu'))

concat = concatenate(m.outputs + food_m.outputs)
d = Dense(10, activation='elu')(concat)
logits = Dense(4, activation='linear')(d)

#m.add(Dense(4, activation='linear'))

inputs = [forbidden_action, m.input, food_m.inputs]

# c = concatenate(outputs)
# pred = Dense(4, activation='linear')(c)
no_action = tf.math.multiply(forbidden_action, -10000)
pred = tf.math.add(logits, no_action)

G = Input(shape=(1, ))
G_input = [G]

def custom_loss(y_true, y_pred, G, numerical):
    log_softmax = tf.math.log_softmax(y_pred, axis=1)
    selected_action = tf.math.multiply(y_true, log_softmax)
    selected_action_weighted = tf.math.multiply(selected_action, G)
    # selected_action = tf.math.reduce_sum(tf.math.multiply(y_true, log_softmax), axis=1)
    # selected_action_weighted = tf.math.multiply(tf.reshape(selected_action, [-1]),
    #                                             tf.reshape(G, [-1]))
    possible_actions = tf.ones(shape=tf.shape(numerical)) - numerical
    softmax = tf.math.softmax(y_pred)
    entropy = -tf.reduce_mean(tf.math.multiply(tf.math.multiply(log_softmax, softmax),
                                               possible_actions))
    J = tf.math.reduce_mean(selected_action_weighted) + entropy_reg*entropy
    l = -J
    return l

def reinforce_loss(y_true, y_pred):
    return custom_loss(y_true, y_pred, G, forbidden_action)

cur_loss = reinforce_loss

m = Model([inputs] + G_input, pred)

optimizer = tf.keras.optimizers.Adam(lr=lr)
m.compile(optimizer=optimizer,
          loss=cur_loss,
          metrics=[],
          experimental_run_tf_function=False)

NameError: name 'lr' is not defined

In [17]:
from kaggle_environments import make
from kaggle_environments.envs.hungry_geese.hungry_geese import Observation, Configuration, Action, \
                                                row_col, adjacent_positions, translate, min_distance
from sklearn.model_selection import train_test_split
from agents.short_sight_agent_no_food import ShortSightAgentNoFood
from agents.greedy_agent import GreedyAgent
from utils.helpers import action_to_target
import numpy as np
from copy import deepcopy

steps_per_ep = 200
nb_opponents = 7

env = make("hungry_geese", debug=False)
config = env.configuration

validation_ratio = 0

initial_learning_rate = 0.01
def food_G(rewards):
    rewards_back = rewards[::-1]
    v_prime = 0
    g = []
    for reward in rewards_back:
        v = reward + 0.75*v_prime
        v_prime = v
        g.append(v)
    return g[::-1]

def step_G(rewards):
    rewards_back = rewards[::-1]
    v_prime = 0
    g = []
    for reward in rewards_back:
        v = reward + 0.1*v_prime
        v_prime = v
        g.append(v)
    return g[::-1]

def G(food_rewards, step_rewards):
    return (np.array(food_G(food_rewards)) + step_G(np.array(step_rewards))).reshape(-1, 1)

def transform_sample(samples):
    nb_samples = len(samples)
    if validation_ratio > 0:
        train, test = train_test_split(range(nb_samples), test_size=validation_ratio)
    else:
        train = np.random.choice(nb_samples, nb_samples, replace=False)
        test = np.array([])

    food = np.concatenate([sample['cur_state'][1].reshape(1, 4) for sample in samples], axis=0)
    forbidden = np.concatenate([sample['cur_state'][1].reshape(1, 4) for sample in samples], axis=0)
    embedding = np.concatenate([sample['cur_state'][0].reshape(1, 24) for sample in samples],
                               axis=0)
    step_reward = [sample['step_reward'] for sample in samples]
    food_reward = [sample['food_reward'] for sample in samples]
    g = np.array(step_reward).reshape(-1, 1) + np.array(food_reward).reshape(-1, 1)
    g = (g-np.mean(g)) / (np.std(g) + 1E-5)
    y = np.concatenate([sample['action'].reshape(1, 4) for sample in samples], axis=0)
    return [forbidden[train], embedding[train], food[train], g[train]], y[train],\
           [forbidden[test] if len(test) > 0 else np.array([]),
            embedding[test] if len(test) > 0 else np.array([]),
            food[test] if len(test) > 0 else np.array([]),
            g[test] if len(test) > 0 else np.array([])],\
           y[test] if len(test) > 0 else np.array([])

def run_game(nb_opponents, my_agent):
    steps = []
    agents = [my_agent] + [GreedyAgent() for _ in range(nb_opponents)]

    state_dict = env.reset(num_agents=nb_opponents + 1)[0]
    observation = state_dict['observation']

    done = False
    my_agent.last_action = state_dict.action
    prev_food_eaten = 0
    prev_len = 1
    for step in range(1, steps_per_ep):
        actions = []

        for i, agent in enumerate(agents):
            obs = deepcopy(observation)
            obs['index'] = i
            action = agent(obs, config)
            actions.append(action)

        cur_state = agents[0].stateSpace

        state_dict = env.step(actions)[0]
        observation = state_dict['observation']
        my_goose_ind = observation['index']

        my_goose_length = len(observation['geese'][my_goose_ind])

        action = state_dict['action']
        status = state_dict['status']

        if status != "ACTIVE":
            done = True
            next_state = None
        else:
            next_state = agents[0].getStateSpace(observation, action)

        # negative reward for crashing into another goose
        if (my_goose_length == 0):
            done = True
            if (prev_len == 1) and ((observation.step % 40) == 0):
                no_crash_reward = 0
            else:
                no_crash_reward = -5
        else:
            no_crash_reward = 0
        prev_len = my_goose_length
        cur_food_eaten = state_dict.reward % 100
        if cur_food_eaten > prev_food_eaten:
            food_reward = 1
        else:
            food_reward = 0
        prev_food_eaten = cur_food_eaten

        steps.append({'cur_state': cur_state,
                      'action': action_to_target(action),
                      'step_reward': no_crash_reward,
                      'food_reward': food_reward,
                      'next_state': next_state,
                      'done': done})
        if done:
            break

    return steps




In [20]:
steps = []
agents = [my_agent] + [GreedyAgent() for _ in range(nb_opponents)]

state_dict = env.reset(num_agents=nb_opponents + 1)[0]
observation = state_dict['observation']

done = False
my_agent.last_action = state_dict.action
prev_food_eaten = 0
prev_len = 1
for step in range(1, steps_per_ep):
    actions = []

    for i, agent in enumerate(agents):
        obs = deepcopy(observation)
        obs['index'] = i
        action = agent(obs, config)
        actions.append(action)

    cur_state = agents[0].stateSpace

    state_dict = env.step(actions)[0]
    observation = state_dict['observation']
    my_goose_ind = observation['index']

    my_goose_length = len(observation['geese'][my_goose_ind])

    action = state_dict['action']
    status = state_dict['status']

    if status != "ACTIVE":
        done = True
        next_state = None
    else:
        next_state = agents[0].getStateSpace(observation, action)

    # negative reward for crashing into another goose
    if (my_goose_length == 0):
        done = True
        if (prev_len == 1) and ((observation.step % 40) == 0):
            no_crash_reward = 0
        else:
            no_crash_reward = -5
    else:
        no_crash_reward = 0
    prev_len = my_goose_length
    cur_food_eaten = state_dict.reward % 100
    if cur_food_eaten > prev_food_eaten:
        food_reward = 1
    else:
        food_reward = 0
    prev_food_eaten = cur_food_eaten

    steps.append({'cur_state': cur_state,
                  'action': action_to_target(action),
                  'step_reward': no_crash_reward,
                  'food_reward': food_reward,
                  'next_state': next_state,
                  'done': done})
    if done:
        break

Goose Collision: EAST


In [21]:
len(steps)

77

In [22]:
state_dict

{'action': 'SOUTH',
 'reward': 7803,
 'info': {},
 'observation': {'remainingOverageTime': 60,
  'step': 77,
  'geese': [[24, 13, 2], []],
  'food': [11, 42],
  'index': 0},
 'status': 'DONE'}

In [23]:
steps

[{'cur_state': (array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]),
   array([0., 0., 0., 1.]),
   array([-0.33333333, -0.25      , -1.        , -1.        ])),
  'action': array([0., 1., 0., 0.]),
  'step_reward': 0,
  'food_reward': 1,
  'next_state': (array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]),
   array([1., 0., 0., 0.]),
   array([-0.33333333, -0.2       , -1.        , -0.5       ])),
  'done': False},
 {'cur_state': (array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]),
   array([1., 0., 0., 0.]),
   array([-0.33333333, -0.2       , -1.        , -0.5       ])),
  'action': array([0., 1., 0., 0.]),
  'step_reward': 0,
  'food_reward': 0,
  'next_state': (array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0.]),
   array([1., 0., 0., 0

In [22]:
step_reward = [sample['step_reward'] for sample in steps]
food_reward = [sample['food_reward'] for sample in steps]

In [23]:
step_reward

[0, 0, 0, 0, -5]

In [24]:
food_reward

[1, 0, 0, 0, 0]

In [25]:
food_G(food_reward)

[1.0, 0.0, 0.0, 0.0, 0.0]

In [26]:
step_G(step_reward)

[-0.0005000000000000001, -0.005000000000000001, -0.05, -0.5, -5.0]

In [26]:
board = np.array([np.random.randint(100) for _ in range (11*7)]).reshape(7,11)

In [27]:
board

array([[63, 42,  2, 78, 89,  2, 10, 38, 13, 54, 44],
       [29, 27, 27, 54, 55, 73, 53, 18,  5, 65, 86],
       [84, 14, 17, 94, 36, 63, 74, 82,  5, 98, 32],
       [81, 62, 85, 87, 99, 53, 12, 28, 36, 49, 76],
       [10,  5, 37, 83, 19, 31, 46,  1, 46, 48, 25],
       [87, 67, 87, 11, 12,  5, 35, 92, 94,  9,  0],
       [65, 32, 72, 24, 71, 39, 90, 38, 18, 31,  8]])

In [36]:
top = board[2:3, 4:7].reshape(-1)
right = np.rot90(board[2:5, 6:7], 1).reshape(-1)
bottom = np.rot90(board[4:5, 4:7], 2).reshape(-1)
left = np.rot90(board[2:5, 4:5], 1).reshape(-1)

In [37]:
top

array([36, 63, 74])

In [38]:
right

array([74, 12, 46])

In [59]:
top = board[2:3, 4:7].reshape(-1)
right = board[2:5, 6:7].reshape(-1)
bottom = board[4:5, 6:3:-1].reshape(-1)
left = board[4:1:-1, 4:5].reshape(-1)

In [60]:
top

array([36, 63, 74])

In [61]:
right

array([74, 12, 46])

In [62]:
bottom

array([46, 31, 19])

In [63]:
left

array([19, 99, 36])

In [40]:
np.rot90(board[2:5, 6:7], 1).reshape(-1)

array([74, 12, 46])

In [41]:
bottom

array([46, 31, 19])

In [42]:
left

array([36, 99, 19])

In [45]:
board[4:2:-1, 4:5]

array([[19],
       [99]])

In [48]:
board[2:5, 6:7].reshape(-1)

array([74, 12, 46])