In [13]:
from magent2.environments import battle_v4, adversarial_pursuit_v4, tiger_deer_v4
from pettingzoo.utils import random_demo
import torch
import time
# import torch.nn as nn
# import torch.optim as optim
# import torch.nn.functional as F
# from dqn_basic import DQN_Basic
from collections import namedtuple, deque
import numpy as np
# import matplotlib.pyplot as plt
# from AnimalLSTM import Animal
from Animal import Animal
from pathlib import Path
from collections import deque
import datetime, os
from metric_logger import MetricLogger
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [14]:
def convert_numpy_binary_to_integer(arr):
    """Collapse a stack of binary bit-planes into one integer per grid cell.

    Args:
        arr: 3-D array laid out as (bit_plane, row, col) with at most eight
            planes. Plane ``i`` contributes bit ``i`` (least-significant
            first). Values are cast to int32 first; any non-zero entry then
            counts as a set bit.

    Returns:
        A (row, col) uint8 array holding the packed value of every cell.
    """
    assert arr.ndim == 3
    assert arr.shape[0] <= 8
    # Put the bit axis last so each cell's bits are packed into a single byte.
    bits_last = np.moveaxis(arr.astype(np.int32), 0, -1)
    packed_bytes = np.packbits(bits_last, axis=-1, bitorder='little')
    return packed_bytes.squeeze(-1)

In [15]:
# --- Environment and observation/action-space configuration -----------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
max_cycles = 300
map_size = 50

# Training environment. render_mode is 'rgb_array' here; 'human' is the other
# mode used in this notebook (see the visualisation cells).
env = tiger_deer_v4.env(
    map_size=map_size,
    minimap_mode=False,
    render_mode='rgb_array',
    max_cycles=max_cycles,
    extra_features=True,
)
env.reset(seed=None)

# Channels of the raw per-agent observation that are kept by handle_observations.
obs_indices_to_keep = [0, 1, 3]

# Population sizes, counted from the agent names.
number_of_deer = sum(1 for name in env.agents if 'deer' in name)
number_of_tigers = sum(1 for name in env.agents if 'tiger' in name)

# Deer: channels-first observation shape. The channel count is the kept raw
# channels plus one one-hot layer per native environment action.
deer_observation_shape = tuple(np.roll(env.observation_spaces['deer_0'].shape, 1))
deer_action_space = 3  # size of the reduced action set the deer network outputs
deer_observation_shape = (
    len(obs_indices_to_keep) + env.action_spaces['deer_0'].n,
    deer_observation_shape[1],
    deer_observation_shape[2],
)

# Tiger: same construction as for the deer.
tiger_observation_shape = tuple(np.roll(env.observation_spaces['tiger_0'].shape, 1))
tiger_action_space = 6  # size of the reduced action set the tiger network outputs
tiger_observation_shape = (
    len(obs_indices_to_keep) + env.action_spaces['tiger_0'].n,
    tiger_observation_shape[1],
    tiger_observation_shape[2],
)

# Summary of the configuration.
print("Number of Tigers:", number_of_tigers)
print("Number of Deer:",number_of_deer)
print("Deer Observations:", deer_observation_shape, "Tiger Observations:",tiger_observation_shape)
assert deer_observation_shape is not None
assert tiger_observation_shape is not None

Number of Tigers: 12
Number of Deer: 125
Deer Observations: (8, 7, 7) Tiger Observations: (16, 10, 10)


## Utility Functions

### Action Mapping
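Maps the reduced, heading-relative actions chosen by the models (forward, left, right) onto the environment's absolute actions (up, left, down, right, ...), using the agent's previous actual action as its current heading. Tigers have three additional relative actions that map onto environment actions 5–8.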

In [16]:
# Lookup table: deer_action_mapping[relative_action][previous_actual_action]
# gives the actual environment action to send. Row 0 keeps the previous
# action unchanged; rows 1 and 2 rotate it in opposite directions.
deer_action_mapping = {
    0: {0: 0, 1: 1, 3: 3, 4: 4},
    1: {0: 1, 1: 4, 3: 0, 4: 3},
    2: {0: 3, 1: 0, 3: 4, 4: 1},
}
def deer_action_mapper(new_unmapped_action, prev_actual_action, size_of_actions=5):
    """Translate a deer's relative action into an actual environment action.

    Args:
        new_unmapped_action: relative action produced by the policy (a key of
            ``deer_action_mapping``).
        prev_actual_action: the actual action the deer sent last time; used as
            its current heading.
        size_of_actions: number of actual actions to draw the random fallback
            from (action index 2 is always excluded from the draw).

    Returns:
        The mapped actual action. When either key is missing from the table
        (e.g. the previous action was 2), a uniformly random action from
        ``range(size_of_actions)`` without 2 is returned instead.
    """
    mapped_action = deer_action_mapping.get(new_unmapped_action, {}).get(prev_actual_action)
    # Compare against None explicitly: 0 is a valid mapped action.
    if mapped_action is not None:
        return mapped_action
    # Only draw a random number when the table has no entry, so successful
    # lookups neither pay for the draw nor advance the global NumPy RNG.
    return np.random.choice(np.delete(np.arange(0, size_of_actions), 2))

# Lookup table: tiger_action_mapping[relative_action][heading] gives the
# actual environment action. Rows 0-2 yield actions 0/1/3/4 and rows 3-5
# yield actions 5-8.
tiger_action_mapping = {
    0: {0: 0, 1: 1, 3: 3, 4: 4},
    1: {0: 1, 1: 4, 3: 0, 4: 3},
    2: {0: 3, 1: 0, 3: 4, 4: 1},
    3: {0: 5, 1: 6, 3: 7, 4: 8},
    4: {0: 6, 1: 8, 3: 5, 4: 7},
    5: {0: 7, 1: 5, 3: 8, 4: 6},
}

# Reduces any actual action to one of the four headings 0/1/3/4
# (actions 5-8 share the headings of 0/1/3/4 respectively).
tiger_direction_mapping = {
    0: 0, 1: 1, 3: 3, 4: 4,
    5: 0, 6: 1, 7: 3, 8: 4,
}

def tiger_action_mapper(new_action, prev_action, size_of_actions=5): # slower
    """Translate a tiger's relative action into an actual environment action.

    Args:
        new_action: relative action produced by the policy (a key of
            ``tiger_action_mapping``).
        prev_action: the actual action the tiger sent last time; it is reduced
            to a heading via ``tiger_direction_mapping`` (heading 0 when the
            action is not in that table).
        size_of_actions: unused; kept for signature parity with the deer mapper.

    Returns:
        The mapped actual action, or 0 when ``new_action`` is not in the table.
    """
    heading = tiger_direction_mapping.get(prev_action, 0)
    options_for_action = tiger_action_mapping.get(new_action, {})
    return options_for_action.get(heading, 0)

def mask_observations(observations, prev_actual_action):
    """Zero out, in place, one half of an observation based on the agent's heading.

    The heading is ``tiger_direction_mapping[prev_actual_action]`` (0 when the
    action is not in that table). With ``r``/``c`` the middle row/column index:

    * heading 0: columns ``c:`` are cleared
    * heading 1: rows ``r:`` are cleared
    * heading 3: rows ``:r + 1`` are cleared
    * heading 4: columns ``:c`` are cleared

    Args:
        observations: array indexed as (channel, row, col); modified in place.
        prev_actual_action: the agent's previous actual environment action.

    Returns:
        None.

    Raises:
        ValueError: if the heading is none of 0, 1, 3, 4.
    """
    mid_row = observations.shape[1] // 2
    mid_col = observations.shape[2] // 2
    heading = tiger_direction_mapping.get(prev_actual_action, 0)

    if heading == 0:
        observations[:, :, mid_col:] = 0
        return
    if heading == 1:
        observations[:, mid_row:, :] = 0
        return
    if heading == 3:
        observations[:, :mid_row + 1, :] = 0
        return
    if heading == 4:
        observations[:, :, :mid_col] = 0
        return
    raise ValueError("Direction mapping issue for mask observations")
    

In [17]:
def get_agent_type_from_agent_name(agent):
    """Return 'tiger' when the agent name contains 'tiger', otherwise 'deer'."""
    return 'tiger' if 'tiger' in agent else 'deer'

In [18]:
def generate_first_action(agentType):
    """Draw a random opening move for an agent that has no history yet.

    A random relative action and a random "previous" actual action (never
    action 2) are drawn, then combined by the species' action mapper.

    Args:
        agentType: any string containing 'tiger' selects the tiger ranges and
            mapper; everything else is handled as a deer.

    Returns:
        Tuple ``(action, actual_action, previous_actual_action)``.
    """
    is_tiger = 'tiger' in agentType
    # (number of relative actions, number of actual actions) per species.
    relative_count, actual_count = (6, 9) if is_tiger else (3, 5)

    action = np.random.choice(np.arange(0, relative_count))
    previous_actual_action = np.random.choice(np.delete(np.arange(0, actual_count), 2))

    mapper = tiger_action_mapper if is_tiger else deer_action_mapper
    actual_action = mapper(action, previous_actual_action)

    return action, actual_action, previous_actual_action

In [19]:
def change_active_model(activeDeer, activeTiger, deerCurrentSteps, tigerCurrentSteps, steps_to_switch_at_deer, steps_to_switch_at_tiger, total_steps_deer, total_steps_tiger):
    """Decide which species trains next.

    Returns:
        Tuple ``(activeDeer, activeTiger)``:

        * a species that has used up its total step budget hands over to the
          other one (the deer budget is checked first);
        * otherwise the species that has completed fewer switch intervals
          (``current_steps // steps_to_switch``) becomes active;
        * if both have completed the same number, the incoming flags are
          returned unchanged.
    """
    # Exhausted budgets take priority over interval balancing.
    if deerCurrentSteps >= total_steps_deer:
        return False, True
    if tigerCurrentSteps >= total_steps_tiger:
        return True, False

    deer_intervals = deerCurrentSteps // steps_to_switch_at_deer
    tiger_intervals = tigerCurrentSteps // steps_to_switch_at_tiger

    if deer_intervals < tiger_intervals:
        return True, False
    if tiger_intervals < deer_intervals:
        return False, True
    return activeDeer, activeTiger

In [20]:
def custom_reward_modifier(agentType, observation, prev_actual_action, reward, start_index_one_hot=3):
    """Add alignment and wall shaping terms to a deer's reward.

    Both terms are inverse-distance weighted sums over one (K, K) layer of the
    observation, with distances measured from the grid center (the observing
    agent's own cell). The center cell itself is excluded.

    * Alignment bonus (+0.005 per unit): uses the one-hot layer of the agent's
      own previous actual action, i.e. layer
      ``prev_actual_action + start_index_one_hot``.
    * Wall penalty (-0.01 per unit): uses layer 0 (named as the wall layer by
      the original code; TODO confirm against the environment's channel order).

    Args:
        agentType: species string; rewards of anything not containing 'deer'
            are returned untouched.
        observation: array indexed as (channel, K, K).
        prev_actual_action: the agent's previous actual environment action.
        reward: the raw environment reward.
        start_index_one_hot: index of the first one-hot action layer.

    Returns:
        The shaped reward for deer, or ``reward`` unchanged for other agents.
    """
    if 'deer' not in agentType:
        return reward

    _, K, _ = observation.shape
    # The agent sits in the middle cell. (The previous code derived the
    # coordinates from K // 2 as if it were a flat index, which yielded
    # (0, K // 2) -- the top-middle cell -- instead of the center.)
    center_coords = np.array([K // 2, K // 2])
    coords = np.indices((K, K)).reshape(2, -1).T
    distances = np.linalg.norm(coords - center_coords, axis=1)
    off_center = distances >= 0.001  # drops the zero-distance center cell

    def inverse_distance_sum(layer):
        # Sum of layer values divided by their distance from the center.
        values = layer.flatten()[off_center] / distances[off_center]
        return np.sum(values[np.isfinite(values)])

    #############################################################
    # alignment reward
    alignment_reward = 0.005
    alignement_result = inverse_distance_sum(observation[prev_actual_action + start_index_one_hot])
    alignement_result *= alignment_reward
    #############################################################
    # wall_penalty
    wall_penalty = -0.01
    wall_result = inverse_distance_sum(observation[0])
    wall_result *= wall_penalty

    return reward + alignement_result + wall_result

In [21]:
# Two-way lookup between agent names and their position in env.agents,
# captured once from the environment as it is when this cell runs.
agent_name_to_id_dict = dict((str(agent_name), position) for position, agent_name in enumerate(env.agents))
agent_id_to_name_list = list(env.agents)
def agent_name_to_id(agent_name):
    """Return the integer id (index in env.agents) of the named agent."""
    return agent_name_to_id_dict[agent_name]

def agent_id_to_name(id):
    """Return the agent name stored at index ``id``."""
    return agent_id_to_name_list[id]

`one_hot_actions` takes the grid of actions observed for nearby agents and converts it into one-hot layers: one layer per action, each with the same grid size as the input.

In [22]:
import numpy as np

def one_hot_actions(array, num_actions):
    """One-hot encode a 2-D grid of action ids into per-action layers.

    Args:
        array: 2-D grid whose cells hold action ids; cells holding anything
            outside ``range(num_actions)`` (e.g. -1) are zero in every layer.
        num_actions: number of layers to produce.

    Returns:
        Float array of shape (num_actions, K, K), where K is the grid's second
        dimension; layer ``a`` is 1 where ``array == a`` and 0 elsewhere.
    """
    _, side = array.shape
    layers = np.zeros((num_actions, side, side))

    # Each `layer` is a view into `layers`, so writing through it fills the
    # stacked result in place.
    for action_id, layer in enumerate(layers):
        layer[...] = (array == action_id)

    return layers

In [23]:
import numpy as np

def get_subsection_of_env_map(array, position, size=(13, 13)):
    """Cut a fixed-size window out of a layered map, padding with -1.

    Args:
        array: map indexed as (layer, row, col).
        position: (row, col) of the cell that sits at window index
            ``size // 2`` along each axis.
        size: (rows, cols) of the window.

    Returns:
        Array of shape (layers, size[0], size[1]) created with
        ``np.full(..., -1)`` (so it has an integer dtype). Window cells that
        fall outside the map keep the value -1.
    """
    window = np.full((array.shape[0], *size), -1)

    # Window bounds in map coordinates; these may extend beyond the map.
    bounds = []
    for axis in range(2):
        first = int(position[axis] - size[axis] // 2)
        bounds.append((first, int(first + size[axis])))

    # For each axis: the part of the map that is covered (source) and where
    # that part lands inside the window (destination).
    source, destination = [], []
    for axis, (first, last) in enumerate(bounds):
        extent = array.shape[axis + 1]
        source.append(slice(max(first, 0), min(last, extent)))
    for axis, (first, _) in enumerate(bounds):
        extent = array.shape[axis + 1]
        destination.append(slice(max(-first, 0), min(extent - first, size[axis])))

    window[:, destination[0], destination[1]] = array[:, source[0], source[1]]

    return window

# # Example usage
# array = np.random.randint(0, 100, (2, 20, 20))
# position = (19, 0)
# sub_array = get_subsection_of_env_map(array, position, size=(11, 11))

# print(sub_array)

Cleans up an observation by dropping the unnecessary layers and appending the one-hot encoded actions of nearby agents.

In [24]:
def handle_observations(observation, obs_indices_to_keep, agent_id, env_map, num_actions):
    """Build the network input for one agent.

    The raw observation is converted to channels-first, reduced to the
    requested channels, and extended with one one-hot layer per action
    describing the last actions recorded around the agent in ``env_map``.

    Args:
        observation: raw observation indexed as (row, col, channel).
        obs_indices_to_keep: channel indices to keep from the raw observation.
        agent_id: id of the observing agent as stored in ``env_map[0]``.
        env_map: array indexed as (layer, row, col); layer 0 holds agent ids
            and layer 1 the action values that get one-hot encoded.
        num_actions: number of one-hot action layers to append.

    Returns:
        float32 array of shape
        (len(obs_indices_to_keep) + num_actions, size, size).

    Raises:
        ValueError: if ``agent_id`` does not occur exactly once in
            ``env_map[0]``.
    """
    observation = np.transpose(observation, (2,0,1))
    observation_size = observation.shape[1]
    observation = observation[obs_indices_to_keep,:,:]

    # Locate the agent and hand plain ints downstream. Passing the raw
    # np.where arrays relied on implicit array-to-int conversion, which is
    # deprecated in recent NumPy and fails opaquely when the id is missing
    # or duplicated.
    rows, cols = np.where(env_map[0,:,:] == agent_id)
    if rows.size != 1:
        raise ValueError(
            f"Expected agent id {agent_id} exactly once in env_map[0], found {rows.size} occurrence(s)"
        )
    position = (int(rows[0]), int(cols[0]))

    sub_env_map = get_subsection_of_env_map(env_map, position, size=(observation_size, observation_size))
    one_hot_actions_arr = one_hot_actions(sub_env_map[1,:,:], num_actions)
    observation = np.concatenate((observation, one_hot_actions_arr), axis=0, dtype=np.float32)
    return observation



## Train Model
### Make Models and Loggers

In [25]:
use_cuda = torch.cuda.is_available()
print(f"Using CUDA: {use_cuda}")
print()

# Take the timestamp once so the deer and tiger checkpoint directories of one
# run always carry the same name (two separate now() calls can straddle a
# second boundary and produce mismatched directory names).
run_timestamp = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")

deer_save_dir = Path("deer_checkpoints") / run_timestamp
deer_save_dir.mkdir(parents=True)

tiger_save_dir = Path("tiger_checkpoints") / run_timestamp
tiger_save_dir.mkdir(parents=True)

# One model and one metric logger per species.
deer = Animal(state_dim=deer_observation_shape, action_dim=deer_action_space, save_dir=deer_save_dir)
tiger = Animal(state_dim=tiger_observation_shape, action_dim=tiger_action_space, save_dir=tiger_save_dir)

# Uncomment to resume from earlier checkpoints:
# deer.load("deer_checkpoints\\2023-04-26T22-43-08\\animal_net_8.chkpt")
# tiger.load("tiger_checkpoints\\2023-04-26T22-43-08\\animal_net_2.chkpt")

deer_logger = MetricLogger(deer_save_dir, animal='deer')
tiger_logger = MetricLogger(tiger_save_dir, animal='tiger')


# Which species is currently being trained.
activeDeer = True
activeTiger = False


# Alternation schedule: the tiger interval and total budget are derived from
# the deer values (2/5 of the switch interval, 1/3 of the total steps).
switching = True
steps_to_switch_at_deer = 15000
steps_to_switch_at_tiger = int((steps_to_switch_at_deer*2)//5)
total_steps_deer = 800000000
total_steps_tiger = int((total_steps_deer//3))

# With switching enabled the two flags must differ; if they are equal, flip
# the tiger flag.
if switching:
    if activeDeer == activeTiger:
        activeTiger = not activeTiger

Using CUDA: True





### Train Basic Model

In [26]:
# Alternating deer/tiger training loop for the non-recurrent models.
# Every pass of the outer `while` plays one full environment episode; the loop
# ends once both models have reached their total step budget (`curr_step` is
# maintained by Animal -- presumably advanced inside act(); TODO confirm).
print("Number of Tigers:", number_of_tigers)
print("Number of Deer:",number_of_deer)

env.reset(seed=None)

# Sizes of the environment's native action spaces (the networks themselves
# output the smaller, heading-relative action sets).
actual_tiger_actions = env.action_spaces['tiger_0'].n
actual_deer_actions = env.action_spaces['deer_0'].n

# Index in env.agents of the first tiger; map ids below it are handled as deer.
indexOfFirstTiger = [index for index, x in enumerate(env.agents) if 'tiger' in x][0]
e = -1
while deer.curr_step < total_steps_deer or tiger.curr_step < total_steps_tiger:
    e = e + 1
# for e in range(total_episodes):
    # print("Episode: ", e)
    env.reset(seed=None)
    state = np.transpose(env.state(),(2,0,1))
    # Per-episode maps, all initialised to -1. Layer 0 holds agent ids and
    # layer 1 the last actual action recorded for the agent in that cell.
    deer_env_map = np.ones((2, state.shape[1], state.shape[2]))*-1
    tiger_env_map = np.ones((2, state.shape[1], state.shape[2]))*-1
    env_map = np.ones((2, state.shape[1], state.shape[2]))*-1
    # Per-agent memory of the previous turn, keyed by agent name.
    deer_prev_data = {}
    tiger_prev_data = {}
    # Play the game!
    # Remembers the agent that acted last so its action can be written into
    # env_map once the next state is available.
    previousAgent = {'id':None, 'name': None, 'actual_action':None, 'prev_ind':None}
    for agent in env.agent_iter():

        agentType = get_agent_type_from_agent_name(agent)
        agent_id = agent_name_to_id(agent)
        # print("Agent id: ", agent_id)
        
        observation, reward, termination, truncation, info = env.last()
        done = termination or truncation

        # Finished agents are stepped with None and otherwise skipped, so no
        # terminal transition is ever cached for them.
        if done:
            env.step(None)
            continue
        
        # Rebuild the id layer from the global state. Channels 5..12 are
        # decoded as the bits of an agent id (presumably the id bit-planes
        # provided by extra_features=True -- TODO confirm); channels 1 and 3
        # are used as deer / tiger presence masks.
        state = np.transpose(env.state(),(2,0,1))
        state_with_ids = convert_numpy_binary_to_integer(state[5:13,:,:])
        env_map[0,:,:] = np.ones_like(env_map[0,:,:])*-1
        deer_mask = state[1,:,:] > 0
        tiger_mask = state[3,:,:] > 0
        mask = deer_mask | tiger_mask
        env_map[0,:,:][mask] = state_with_ids[:,:][mask]
        env_map[0,:,:][~mask] = -1
        env_map[1,:,:][~mask] = -1
        

        # Record the previous agent's action at the cell where it is now.
        if previousAgent['id'] is not None:
            position_indices = np.where(env_map[0,:,:] == previousAgent['id'])
            env_map[1, position_indices[0], position_indices[1]] = previousAgent['actual_action']
            previousAgent['id'] = None

        
        # Split the shared map into a deer-only and a tiger-only view.
        # Empty cells (-1) also satisfy `< indexOfFirstTiger`, so they are
        # part of the deer mask.
        # NOTE(review): layer 1 of deer_env_map / tiger_env_map is never reset
        # at cells outside the respective mask, so an action written earlier
        # can remain visible after the agent has left the cell -- confirm
        # whether that is intended.
        mask = env_map[0,:,:] < indexOfFirstTiger
        deer_env_map[0,:,:][mask] = env_map[0,:,:][mask]
        deer_env_map[0,:,:][~mask] = -1
        deer_env_map[1,:,:][mask] = env_map[1,:,:][mask]

        mask = ~mask
        tiger_env_map[0,:,:][mask] = env_map[0,:,:][mask]
        tiger_env_map[0,:,:][~mask] = -1
        tiger_env_map[1,:,:][mask] = env_map[1,:,:][mask]


        # Select the species-specific model, logger, mapper and map.
        # (`num_actions` is assigned here but not read again in this cell.)
        animal_logger = None
        if 'tiger' in agentType:
            agents_prev_data = tiger_prev_data
            animal = tiger
            animal_logger = tiger_logger
            action_mapper = tiger_action_mapper
            activeAnimal = activeTiger
            num_actions = tiger_action_space
            animal_env_map = tiger_env_map
            actual_num_actions = actual_tiger_actions
        else:
            agents_prev_data = deer_prev_data
            animal = deer
            animal_logger = deer_logger
            action_mapper = deer_action_mapper
            activeAnimal = activeDeer
            num_actions = deer_action_space
            animal_env_map = deer_env_map
            actual_num_actions = actual_deer_actions

        # print("Env map",env_map)
        # print("Animal Env map",animal_env_map)
        observation = handle_observations(observation, obs_indices_to_keep, agent_id, animal_env_map, actual_num_actions)
        
        
        action = None
        actual_action = None

        # instantiate previous data
        if agent not in agents_prev_data.keys():
            # First turn of this agent in the episode: random action, nothing
            # to learn from yet.
            action, actual_action, prev_actual_action = generate_first_action(agentType)
            mask_observations(observation, prev_actual_action)
            hidden = None
            cell = None
        else:
            # prev_observation, prev_action, prev_done, prev_actual_action, hidden, cell = agents_prev_data[agent][0:6]
            prev_observation, prev_action, prev_done, prev_actual_action = agents_prev_data[agent][0:4]
            # The reward is shaped on the unmasked observation; masking is
            # applied afterwards, in place.
            reward = custom_reward_modifier(agentType, observation, prev_actual_action, reward)
            mask_observations(observation, prev_actual_action)
            
            
            # if not prev_done:
            # Only the currently active species stores transitions and learns.
            if activeAnimal:
                # animal.cache(prev_observation, observation, hidden, cell, prev_action, reward, done)
                animal.cache(prev_observation, observation, prev_action, reward, done)
                #learn
                q, loss = animal.learn()
                # logging
                animal_logger.log_step(reward, loss, q)
            # action, (hidden, cell) = animal.act(observation, hidden, cell, activeAnimal)
            action = animal.act(observation, activeAnimal)
            actual_action = action_mapper(action, prev_actual_action)

        # update previous agent
        # ('name' and 'prev_ind' are written here but not read in this cell.)
        previousAgent['id'] = agent_name_to_id(agent)
        previousAgent['name'] = agent
        previousAgent['actual_action'] = actual_action
        previousAgent['prev_ind'] = np.where(env_map[0,:,:] == previousAgent['id'])
        
        # save previous data
        # agents_prev_data[agent] = [observation, action, done, actual_action, hidden, cell, reward,  info]
        agents_prev_data[agent] = [observation, action, done, actual_action, reward, info]

        # step the function to next agent
        env.step(actual_action)
        

    deer_logger.log_episode()
    tiger_logger.log_episode()

    # if e % 5 == 0:
    # Only the species that was training during this episode is recorded.
    if activeDeer:
        # print("Deer - Min Possible Reward ", -number_of_deer)
        deer_logger.record(episode=e, epsilon=deer.exploration_rate, step=deer.curr_step)
    if activeTiger:
        # print("Tiger - Max Possible Reward ", number_of_deer)
        tiger_logger.record(episode=e, epsilon=tiger.exploration_rate, step=tiger.curr_step)
    # Re-evaluate which species trains during the next episode.
    if switching:
        activeDeer, activeTiger = change_active_model(activeDeer, activeTiger, deer.curr_step, tiger.curr_step, steps_to_switch_at_deer, steps_to_switch_at_tiger, total_steps_deer, total_steps_tiger)

# log last episode
deer_logger.record(episode=e, epsilon=deer.exploration_rate, step=deer.curr_step)
tiger_logger.record(episode=e, epsilon=tiger.exploration_rate, step=tiger.curr_step)

#4604 basic with adv reward

Number of Tigers: 12
Number of Deer: 125
Animal saved to deer_checkpoints\2023-04-29T16-21-03\animal_net_0.chkpt at step 0
Episode 0 - deer - Step 28615 - Epsilon 0.9928717762727763 - Mean Reward -94.575 - Mean Length 28615.0 - Mean Loss 0.002 - Mean Q Value 0.066 - Time Delta 45.704 - Time 2023-04-29T16:21:49
Animal saved to tiger_checkpoints\2023-04-29T16-21-03\animal_net_0.chkpt at step 0
Episode 1 - tiger - Step 2207 - Epsilon 0.9994484021170325 - Mean Reward 15.0 - Mean Length 1103.5 - Mean Loss 0.0 - Mean Q Value 0.0 - Time Delta 62.04 - Time 2023-04-29T16:22:06
Episode 2 - tiger - Step 4772 - Epsilon 0.9988077111924818 - Mean Reward 26.333 - Mean Length 1590.667 - Mean Loss 0.0 - Mean Q Value 0.0 - Time Delta 14.941 - Time 2023-04-29T16:22:21
Episode 3 - tiger - Step 7113 - Epsilon 0.998223329927623 - Mean Reward 29.0 - Mean Length 1778.25 - Mean Loss 0.0 - Mean Q Value 0.0 - Time Delta 16.12 - Time 2023-04-29T16:22:37
Episode 4 - tiger - Step 9226 - Epsilon 0.9976961576392559 -

KeyboardInterrupt: 

<Figure size 432x288 with 0 Axes>

In [25]:
# Save both in-memory models under a fixed, named run directory
# (tiger first, then deer).
for _model, _checkpoint_root in ((tiger, "tiger_checkpoints"), (deer, "deer_checkpoints")):
    _model.save_dir = Path(_checkpoint_root) / "Basic_With_AdvReward2"
    _model.save()

Animal saved to tiger_checkpoints\Basic_With_AdvReward2\animal_net_64.chkpt at step 3219036
Animal saved to deer_checkpoints\Basic_With_AdvReward2\animal_net_161.chkpt at step 8056211


### Train LSTM Model

In [None]:
# Alternating deer/tiger training loop for the recurrent (LSTM) models.
# Same structure as the basic training cell, except that a per-agent
# (hidden, cell) pair is passed to and returned by the model.
# NOTE(review): this cell calls animal.cache(...) and animal.act(...) with
# hidden/cell arguments, so it needs an Animal class with that interface
# (cf. the commented-out `from AnimalLSTM import Animal` import) -- confirm
# before running.
print("Number of Tigers:", number_of_tigers)
print("Number of Deer:",number_of_deer)

env.reset(seed=None)

# Sizes of the environment's native action spaces.
actual_tiger_actions = env.action_spaces['tiger_0'].n
actual_deer_actions = env.action_spaces['deer_0'].n

# Index in env.agents of the first tiger; map ids below it are handled as deer.
indexOfFirstTiger = [index for index, x in enumerate(env.agents) if 'tiger' in x][0]
e = -1
while deer.curr_step < total_steps_deer or tiger.curr_step < total_steps_tiger:
    e = e + 1
# for e in range(total_episodes):
    # print("Episode: ", e)
    env.reset(seed=None)
    state = np.transpose(env.state(),(2,0,1))
    # Per-episode maps, all initialised to -1. Layer 0 holds agent ids and
    # layer 1 the last actual action recorded for the agent in that cell.
    deer_env_map = np.ones((2, state.shape[1], state.shape[2]))*-1
    tiger_env_map = np.ones((2, state.shape[1], state.shape[2]))*-1
    env_map = np.ones((2, state.shape[1], state.shape[2]))*-1
    # Per-agent memory of the previous turn, keyed by agent name.
    deer_prev_data = {}
    tiger_prev_data = {}
    # Play the game!
    # Remembers the agent that acted last so its action can be written into
    # env_map once the next state is available.
    previousAgent = {'id':None, 'name': None, 'actual_action':None, 'prev_ind':None}
    for agent in env.agent_iter():

        agentType = get_agent_type_from_agent_name(agent)
        agent_id = agent_name_to_id(agent)
        # print("Agent id: ", agent_id)
        
        observation, reward, termination, truncation, info = env.last()
        done = termination or truncation

        # Finished agents are stepped with None and otherwise skipped.
        if done:
            env.step(None)
            continue
        
        # Rebuild the id layer from the global state. Channels 5..12 are
        # decoded as the bits of an agent id (presumably the id bit-planes
        # provided by extra_features=True -- TODO confirm); channels 1 and 3
        # are used as deer / tiger presence masks.
        state = np.transpose(env.state(),(2,0,1))
        state_with_ids = convert_numpy_binary_to_integer(state[5:13,:,:])
        env_map[0,:,:] = np.ones_like(env_map[0,:,:])*-1
        deer_mask = state[1,:,:] > 0
        tiger_mask = state[3,:,:] > 0
        mask = deer_mask | tiger_mask
        env_map[0,:,:][mask] = state_with_ids[:,:][mask]
        env_map[0,:,:][~mask] = -1
        env_map[1,:,:][~mask] = -1
        

        # Record the previous agent's action at the cell where it is now.
        if previousAgent['id'] is not None:
            position_indices = np.where(env_map[0,:,:] == previousAgent['id'])
            env_map[1, position_indices[0], position_indices[1]] = previousAgent['actual_action']
            previousAgent['id'] = None

        
        # Split the shared map into a deer-only and a tiger-only view.
        # Empty cells (-1) also satisfy `< indexOfFirstTiger`.
        # NOTE(review): layer 1 of deer_env_map / tiger_env_map is never reset
        # at cells outside the respective mask, so an action written earlier
        # can remain visible after the agent has left the cell -- confirm
        # whether that is intended.
        mask = env_map[0,:,:] < indexOfFirstTiger
        deer_env_map[0,:,:][mask] = env_map[0,:,:][mask]
        deer_env_map[0,:,:][~mask] = -1
        deer_env_map[1,:,:][mask] = env_map[1,:,:][mask]

        mask = ~mask
        tiger_env_map[0,:,:][mask] = env_map[0,:,:][mask]
        tiger_env_map[0,:,:][~mask] = -1
        tiger_env_map[1,:,:][mask] = env_map[1,:,:][mask]


        # Select the species-specific model, logger, mapper and map.
        # (`num_actions` is assigned here but not read again in this cell.)
        animal_logger = None
        if 'tiger' in agentType:
            agents_prev_data = tiger_prev_data
            animal = tiger
            animal_logger = tiger_logger
            action_mapper = tiger_action_mapper
            activeAnimal = activeTiger
            num_actions = tiger_action_space
            animal_env_map = tiger_env_map
            actual_num_actions = actual_tiger_actions
        else:
            agents_prev_data = deer_prev_data
            animal = deer
            animal_logger = deer_logger
            action_mapper = deer_action_mapper
            activeAnimal = activeDeer
            num_actions = deer_action_space
            animal_env_map = deer_env_map
            actual_num_actions = actual_deer_actions

        # print("Env map",env_map)
        # print("Animal Env map",animal_env_map)
        observation = handle_observations(observation, obs_indices_to_keep, agent_id, animal_env_map, actual_num_actions)
        
        
        action = None
        actual_action = None

        # instantiate previous data
        if agent not in agents_prev_data.keys():
            # First turn of this agent in the episode: random action and an
            # empty recurrent state.
            action, actual_action, prev_actual_action = generate_first_action(agentType)
            mask_observations(observation, prev_actual_action)
            hidden = None
            cell = None
        else:
            # Restore this agent's previous turn, including its recurrent state.
            prev_observation, prev_action, prev_done, prev_actual_action, hidden, cell = agents_prev_data[agent][0:6]
            # The reward is shaped on the unmasked observation; masking is
            # applied afterwards, in place.
            reward = custom_reward_modifier(agentType, observation, prev_actual_action, reward)
            mask_observations(observation, prev_actual_action)
            
            
            # if not prev_done:
            # Only the currently active species stores transitions and learns.
            if activeAnimal:
                animal.cache(prev_observation, observation, hidden, cell, prev_action, reward, done)
                #learn
                q, loss = animal.learn()
                # logging
                animal_logger.log_step(reward, loss, q)
            action, (hidden, cell) = animal.act(observation, hidden, cell, activeAnimal)
            actual_action = action_mapper(action, prev_actual_action)

        # update previous agent
        # ('name' and 'prev_ind' are written here but not read in this cell.)
        previousAgent['id'] = agent_name_to_id(agent)
        previousAgent['name'] = agent
        previousAgent['actual_action'] = actual_action
        previousAgent['prev_ind'] = np.where(env_map[0,:,:] == previousAgent['id'])
        
        # save previous data
        agents_prev_data[agent] = [observation, action, done, actual_action, hidden, cell, reward,  info]

        # step the function to next agent
        env.step(actual_action)
        

    deer_logger.log_episode()
    tiger_logger.log_episode()

    # if e % 5 == 0:
    # Only the species that was training during this episode is recorded.
    if activeDeer:
        # print("Deer - Min Possible Reward ", -number_of_deer)
        deer_logger.record(episode=e, epsilon=deer.exploration_rate, step=deer.curr_step)
    if activeTiger:
        # print("Tiger - Max Possible Reward ", number_of_deer)
        tiger_logger.record(episode=e, epsilon=tiger.exploration_rate, step=tiger.curr_step)
    # Re-evaluate which species trains during the next episode.
    if switching:
        activeDeer, activeTiger = change_active_model(activeDeer, activeTiger, deer.curr_step, tiger.curr_step, steps_to_switch_at_deer, steps_to_switch_at_tiger, total_steps_deer, total_steps_tiger)

# log last episode
deer_logger.record(episode=e, epsilon=deer.exploration_rate, step=deer.curr_step)
tiger_logger.record(episode=e, epsilon=tiger.exploration_rate, step=tiger.curr_step)

### Visualize Trained Model

In [27]:
# Roll out the in-memory `deer` / `tiger` models in a rendered window.
# No cache()/learn() calls are made here; the models are only asked to act.
# NOTE(review): this cell reuses `indexOfFirstTiger`, `actual_tiger_actions`,
# `actual_deer_actions` and `activeAnimal` left behind by the training cell,
# so it does not run on a fresh kernel. `activeAnimal` in particular holds
# whatever the last training iteration assigned -- confirm which value
# Animal.act should receive during evaluation.
map_size=50
env = tiger_deer_v4.env(map_size=map_size, minimap_mode=False, render_mode='human', max_cycles=max_cycles, extra_features=True)
example_episodes = 20
for e in range(example_episodes):
    print("Episode: ", e)
    env.reset(seed=None)
    state = np.transpose(env.state(),(2,0,1))
    # Per-episode maps, all initialised to -1. Layer 0 holds agent ids and
    # layer 1 the last actual action recorded for the agent in that cell.
    deer_env_map = np.ones((2, state.shape[1], state.shape[2]))*-1
    tiger_env_map = np.ones((2, state.shape[1], state.shape[2]))*-1
    env_map = np.ones((2, state.shape[1], state.shape[2]))*-1
    # Per-agent memory of the previous turn, keyed by agent name.
    deer_prev_data = {}
    tiger_prev_data = {}
    # Play the game!
    previousAgent = {'id':None, 'name': None, 'actual_action':None, 'prev_ind':None}
    for agent in env.agent_iter():

        agentType = get_agent_type_from_agent_name(agent)
        agent_id = agent_name_to_id(agent)

        observation, reward, termination, truncation, info = env.last()
        done = termination or truncation

        # Finished agents are stepped with None and otherwise skipped.
        if done:
            env.step(None)
            continue

        # Rebuild the id layer from the global state (channels 5..12 are
        # decoded as agent-id bits; channels 1 and 3 are used as deer / tiger
        # presence masks).
        state = np.transpose(env.state(),(2,0,1))
        state_with_ids = convert_numpy_binary_to_integer(state[5:13,:,:])
        env_map[0,:,:] = np.ones_like(env_map[0,:,:])*-1
        deer_mask = state[1,:,:] > 0
        tiger_mask = state[3,:,:] > 0
        mask = deer_mask | tiger_mask
        env_map[0,:,:][mask] = state_with_ids[:,:][mask]
        env_map[0,:,:][~mask] = -1
        env_map[1,:,:][~mask] = -1

        # Record the previous agent's action at the cell where it is now.
        if previousAgent['id'] is not None:
            position_indices = np.where(env_map[0,:,:] == previousAgent['id'])
            env_map[1, position_indices[0], position_indices[1]] = previousAgent['actual_action']
            previousAgent['id'] = None

        # Split the shared map into a deer-only and a tiger-only view
        # (kept identical to the training cell so the inputs match).
        mask = env_map[0,:,:] < indexOfFirstTiger
        deer_env_map[0,:,:][mask] = env_map[0,:,:][mask]
        deer_env_map[0,:,:][~mask] = -1
        deer_env_map[1,:,:][mask] = env_map[1,:,:][mask]

        mask = ~mask
        tiger_env_map[0,:,:][mask] = env_map[0,:,:][mask]
        tiger_env_map[0,:,:][~mask] = -1
        tiger_env_map[1,:,:][mask] = env_map[1,:,:][mask]

        # Select the species-specific model, mapper and map.
        if 'tiger' in agentType:
            agents_prev_data = tiger_prev_data
            animal = tiger
            action_mapper = tiger_action_mapper
            animal_env_map = tiger_env_map
            actual_num_actions = actual_tiger_actions
        else:
            agents_prev_data = deer_prev_data
            animal = deer
            action_mapper = deer_action_mapper
            animal_env_map = deer_env_map
            actual_num_actions = actual_deer_actions

        observation = handle_observations(observation, obs_indices_to_keep, agent_id, animal_env_map, actual_num_actions)

        action = None
        actual_action = None

        # instantiate previous data
        if agent not in agents_prev_data.keys():
            # First turn of this agent in the episode: random action.
            action, actual_action, prev_actual_action = generate_first_action(agentType)
            mask_observations(observation, prev_actual_action)
        else:
            # Use THIS agent's own previous actual action as its heading, as
            # the training loop does. Without this line the value left over
            # from whichever agent was initialised last was applied to every
            # agent, both for masking and for action mapping.
            prev_actual_action = agents_prev_data[agent][3]
            mask_observations(observation, prev_actual_action)
            action = animal.act(observation,activeAnimal)
            actual_action = action_mapper(action, prev_actual_action)

        # update previous agent
        previousAgent['id'] = agent_name_to_id(agent)
        previousAgent['name'] = agent
        previousAgent['actual_action'] = actual_action
        previousAgent['prev_ind'] = np.where(env_map[0,:,:] == previousAgent['id'])

        # save previous data (index 3 is the actual action read back above)
        agents_prev_data[agent] = [observation, action, done, actual_action, reward,  info]

        # step the function to next agent
        env.step(actual_action)
    

Episode:  0
Episode:  1


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [51]:
# deer_save_dir=
# tiger_save_dir=
# Fresh model instances restored from the named checkpoints, for evaluation.
deer2 = Animal(state_dim=deer_observation_shape, action_dim=deer_action_space, save_dir=deer_save_dir)
tiger2 = Animal(state_dim=tiger_observation_shape, action_dim=tiger_action_space, save_dir=tiger_save_dir)

# Build the checkpoint paths with pathlib so they resolve on every OS; the
# previous literals used Windows-only backslash separators. They are passed as
# str, which on Windows is the same string as before.
deer2.load(str(Path("deer_checkpoints") / "Basic_With_AdvReward2" / "animal_net_161.chkpt"))
tiger2.load(str(Path("tiger_checkpoints") / "Basic_With_AdvReward2" / "animal_net_64.chkpt"))
# tiger2.net.eval()
# deer2.net.eval()




In [58]:
# Roll out the checkpoint-restored `deer2` / `tiger2` models in a rendered
# window. No cache()/learn() calls are made here; the models are only asked
# to act.
# NOTE(review): this cell reuses `indexOfFirstTiger`, `actual_tiger_actions`,
# `actual_deer_actions` and `activeAnimal` left behind by the training cell,
# so it does not run on a fresh kernel. `activeAnimal` in particular holds
# whatever the last training iteration assigned -- confirm which value
# Animal.act should receive during evaluation.
map_size=50
env = tiger_deer_v4.env(map_size=map_size, minimap_mode=False, render_mode='human', max_cycles=max_cycles, extra_features=True)
example_episodes = 3
for e in range(example_episodes):
    print("Episode: ", e)
    env.reset(seed=None)
    state = np.transpose(env.state(),(2,0,1))
    # Per-episode maps, all initialised to -1. Layer 0 holds agent ids and
    # layer 1 the last actual action recorded for the agent in that cell.
    deer_env_map = np.ones((2, state.shape[1], state.shape[2]))*-1
    tiger_env_map = np.ones((2, state.shape[1], state.shape[2]))*-1
    env_map = np.ones((2, state.shape[1], state.shape[2]))*-1
    # Per-agent memory of the previous turn, keyed by agent name.
    deer_prev_data = {}
    tiger_prev_data = {}
    # Play the game!
    previousAgent = {'id':None, 'name': None, 'actual_action':None, 'prev_ind':None}
    for agent in env.agent_iter():

        agentType = get_agent_type_from_agent_name(agent)
        agent_id = agent_name_to_id(agent)

        observation, reward, termination, truncation, info = env.last()
        done = termination or truncation

        # Finished agents are stepped with None and otherwise skipped.
        if done:
            env.step(None)
            continue

        # Rebuild the id layer from the global state (channels 5..12 are
        # decoded as agent-id bits; channels 1 and 3 are used as deer / tiger
        # presence masks).
        state = np.transpose(env.state(),(2,0,1))
        state_with_ids = convert_numpy_binary_to_integer(state[5:13,:,:])
        env_map[0,:,:] = np.ones_like(env_map[0,:,:])*-1
        deer_mask = state[1,:,:] > 0
        tiger_mask = state[3,:,:] > 0
        mask = deer_mask | tiger_mask
        env_map[0,:,:][mask] = state_with_ids[:,:][mask]
        env_map[0,:,:][~mask] = -1
        env_map[1,:,:][~mask] = -1

        # Record the previous agent's action at the cell where it is now.
        if previousAgent['id'] is not None:
            position_indices = np.where(env_map[0,:,:] == previousAgent['id'])
            env_map[1, position_indices[0], position_indices[1]] = previousAgent['actual_action']
            previousAgent['id'] = None

        # Split the shared map into a deer-only and a tiger-only view
        # (kept identical to the training cell so the inputs match).
        mask = env_map[0,:,:] < indexOfFirstTiger
        deer_env_map[0,:,:][mask] = env_map[0,:,:][mask]
        deer_env_map[0,:,:][~mask] = -1
        deer_env_map[1,:,:][mask] = env_map[1,:,:][mask]

        mask = ~mask
        tiger_env_map[0,:,:][mask] = env_map[0,:,:][mask]
        tiger_env_map[0,:,:][~mask] = -1
        tiger_env_map[1,:,:][mask] = env_map[1,:,:][mask]

        # Select the species-specific model, mapper and map.
        if 'tiger' in agentType:
            agents_prev_data = tiger_prev_data
            animal = tiger2
            action_mapper = tiger_action_mapper
            animal_env_map = tiger_env_map
            actual_num_actions = actual_tiger_actions
        else:
            agents_prev_data = deer_prev_data
            animal = deer2
            action_mapper = deer_action_mapper
            animal_env_map = deer_env_map
            actual_num_actions = actual_deer_actions

        observation = handle_observations(observation, obs_indices_to_keep, agent_id, animal_env_map, actual_num_actions)

        action = None
        actual_action = None

        # instantiate previous data
        if agent not in agents_prev_data.keys():
            # First turn of this agent in the episode: random action.
            action, actual_action, prev_actual_action = generate_first_action(agentType)
            mask_observations(observation, prev_actual_action)
        else:
            # Use THIS agent's own previous actual action as its heading, as
            # the training loop does. Without this line the value left over
            # from whichever agent was initialised last was applied to every
            # agent, both for masking and for action mapping.
            prev_actual_action = agents_prev_data[agent][3]
            mask_observations(observation, prev_actual_action)
            action = animal.act(observation,activeAnimal)
            actual_action = action_mapper(action, prev_actual_action)

        # update previous agent
        previousAgent['id'] = agent_name_to_id(agent)
        previousAgent['name'] = agent
        previousAgent['actual_action'] = actual_action
        previousAgent['prev_ind'] = np.where(env_map[0,:,:] == previousAgent['id'])

        # save previous data (index 3 is the actual action read back above)
        agents_prev_data[agent] = [observation, action, done, actual_action, reward,  info]

        # step the function to next agent
        env.step(actual_action)
    

Episode:  0


SystemExit: 