In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from __future__ import print_function

import pickle
import os
import numpy as np
import tempfile
import tensorflow as tf
import matplotlib.pyplot as plt

from tf_rl.controller import DiscreteDeepQ, ModelController
from tf_rl.simulation import KarpathyGame
from tf_rl import simulate
from tf_rl.models import MLP
from collections import OrderedDict
from euclid import Vector2

from baselines import deepq
import baselines.common.tf_util as U
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule

In [None]:
# Configuration dictionary handed to KarpathyGame.
current_settings = {
    # Object types in eating order: an earlier entry is eaten by a later one
    # (predators eat prey).
    'objects': ['prey', 'pred', 'cue'],
    # Colour of each object type (three components each).
    'colors': {
        'prey': [212, 211, 208],
        'pred': [100, 37, 0],
        'cue': [0, 0, 0],
    },
    # Reward tables keyed by object type, then by object type again.
    # NOTE(review): presumably outer key = rewarded object, inner key = the
    # object it meets -- confirm in KarpathyGame.
    'object_reward': {
        'prey': {'prey': 0.1, 'pred': -0.1, 'cue': 0.0},
        'pred': {'prey': 1.0, 'pred': -1.0, 'cue': 0.0},
    },
    'hero_bounces_off_walls': False,
    'world_size': (500, 300),
    'maximum_velocity': {'prey': 0, 'pred': 50},
    'object_radius': 10.0,
    'cue_types': 2,
    'num_objects': OrderedDict([('prey', 5), ('pred', 5), ('cue', 1)]),
    # "Active" objects are the ones that are learning.
    'num_objects_active': OrderedDict([('prey', 0), ('pred', 5)]),
    # Network layout per species: 'one' shares a single DQN between all
    # preys/predators, 'multiple' creates one DQN per prey/predator.
    # Only really matters if the preys/predators are active.
    'network_prey': 'one',
    'network_pred': 'multiple',
    'num_observation_lines': 32,
    'observation_line_length': 75.0,
    'tolerable_distance_to_wall': 50,
    'wall_distance_penalty': -1.0,
    'delta_v': 50,
}

# Run mode: 'new' creates a new simulation from the values above,
# 'load' resumes from a previously trained graph.
RUN = 'load'

# Base names of the files written for this run.
MODEL_NAME = 'model-5preds10-load-100weight'
REPLAY_NAME = 'replay-5preds10-load-100weight'
ELAPSE_NAME = 'elapse-5preds10-load-100weight'
REWARDS_NAME = 'rewards-5preds10-load-100weight'
COLLISIONS_NAME = 'collsions-5preds10-load-100weight'

# Model checkpoint and replay-buffer pickle reloaded when RUN == 'load'.
MODEL_RE = '5preds-10w/model-5pred-2cues-500trials-10weight-2603626.ckpt'
REPLAY_RE = '5preds-10w/replay-5pred-2cues-500trials-10weight-2603626.pkl'

In [None]:
# create the game simulator
# Built from the `current_settings` dict; the resulting `g` supplies
# `observation_size` / `num_actions` to the graph-building cell and is the
# `simulation` argument passed to simulate().
g = KarpathyGame(current_settings)

In [None]:
# Start from an empty default graph so the 'pred<k>' scopes can be rebuilt
# when this cell is re-run.
tf.reset_default_graph()

# Per-network handles; entry k of every list belongs to scope 'pred<k>'.
all_act, all_train, all_update, all_debug, all_replay = [], [], [], [], []

# Build graphs
num_active_pred = current_settings['num_objects_active']['pred']
if num_active_pred != 0:
    # 'one' -> a single DQN; anything else -> one DQN per active predator.
    network_pred = 1 if current_settings['network_pred'] == 'one' else num_active_pred

    for idx in range(network_pred):
        scope_name = 'pred' + str(idx)
        with tf.variable_scope(scope_name):
            # Q-function: MLP with hidden layer sizes [200, 200].
            brain_pred = deepq.models.mlp([200, 200])
            # build_train returns the act / train / target-sync / debug
            # callables for this network.
            act, train, update_target, debug = deepq.build_train(
                make_obs_ph=lambda name: U.BatchInput((g.observation_size,), name=name),
                q_func=brain_pred,
                num_actions=g.num_actions,
                optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
            )
        # Each network gets its own replay buffer (size 50000).
        replay_buffer = ReplayBuffer(50000)
        all_act.append(act)
        all_train.append(train)
        all_update.append(update_target)
        all_debug.append(debug)
        all_replay.append(replay_buffer)

In [None]:
# Values forwarded to simulate() as `fps` and `action_every`.
FPS = 30
ACTION_EVERY = 3

# fast_mode picks the (wait, visualize_every) pair passed to simulate():
# (False, 100) when enabled, (True, 1) otherwise.
fast_mode = False
WAIT, VISUALIZE_EVERY = (False, 100) if fast_mode else (True, 1)

# Containers handed to simulate() and pickled afterwards.
# `collisions` is created here but not referenced again in this cell
# (g.collisions is what gets saved).
elapsed, rewards, collisions, acts = [], [], [], []
# One-element list; timesteps[0] is read after the run to name the output
# files (presumably simulate() updates it in place -- confirm).
timesteps = [0]
    
# Initializing or reloading variables
# Directory (relative to the working directory) holding checkpoints/pickles.
save_dir = 'saved_graphs'

# Start TensorFlow session with 2 CPUs
with U.make_session(2) as sess:

    # Initialize the parameters and copy them to the target networks.
    # Every build_train() call returned its own update_target function (all
    # collected in all_update), so each one has to be invoked. The bare
    # module-level `update_target` name only refers to the network built
    # last, and tf.variable_scope does not change which function is called.
    U.initialize()
    for sync_target in all_update:
        sync_target()

    if RUN == 'load':
        # To restore only a subset of variables, build the saver from it:
        #     restore = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='pred0')
        #     saver = tf.train.Saver(restore)
        # and remember to append a fresh replay buffer to all_replay.

        # reload models
        checkpoint_dir = os.path.join(os.getcwd(), save_dir)
        saver = tf.train.Saver()
        model_name = os.path.join(checkpoint_dir, MODEL_RE)
        saver.restore(sess, model_name)
        # reload replay buffers
        # NOTE: pickle.load can execute arbitrary code -- only point
        # REPLAY_RE at files produced by this project.
        replay_name = os.path.join(checkpoint_dir, REPLAY_RE)
        with open(replay_name, 'rb') as f:
            all_replay = pickle.load(f)

    # Make sure the output directory exists before the (long) simulation so
    # the results are not lost to a missing folder afterwards.
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    # Run simulation
    try:
        simulate(simulation=g,
                 replay=all_replay,
                 act=all_act,
                 train=all_train,
                 update=all_update,
                 debug=all_debug,
                 fps=FPS,
                 visualize_every=VISUALIZE_EVERY,
                 action_every=ACTION_EVERY,
                 wait=WAIT,
                 disable_training=False,
                 simulation_resolution=.001,
                 save_path=None,
                 timesteps=timesteps,
                 elapsed=elapsed,
                 all_rewards=rewards,
                 percent=1,
                 all_actions=acts)

    except KeyboardInterrupt:
        # A manual interrupt still falls through to the saving code below.
        print("Interrupted")
        g.shut_down_graphics()
        print('graphics shut down')

    # NOTE(review): unlike later cells of this notebook, this cell saves
    # neither the model checkpoint nor the replay buffers (MODEL_NAME and
    # REPLAY_NAME are unused here) -- confirm that this is intended.

    # Save trial times, rewards and collisions; file names carry the
    # timestep count reached by the run.
    run_suffix = '-' + str(timesteps[0])
    results = (
        (ELAPSE_NAME, elapsed),
        (REWARDS_NAME, rewards),
        (COLLISIONS_NAME, g.collisions),
    )
    for base_name, payload in results:
        with open(os.path.join(save_dir, base_name + run_suffix + '.pkl'), 'wb') as f:
            pickle.dump(payload, f)

In [None]:
# reset
%reset -f
from __future__ import print_function

import pickle
import os
import numpy as np
import tempfile
import tensorflow as tf
import matplotlib.pyplot as plt

from tf_rl.controller import DiscreteDeepQ, ModelController
from tf_rl.simulation import KarpathyGame
from tf_rl import simulate
from tf_rl.models import MLP
from collections import OrderedDict
from euclid import Vector2

from baselines import deepq
import baselines.common.tf_util as U
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule
tf.reset_default_graph()


# doing stuff
# Configuration dictionary handed to KarpathyGame.
current_settings = {
    # Object types in eating order: an earlier entry is eaten by a later one
    # (predators eat prey).
    'objects': ['prey', 'pred', 'cue'],
    # Colour of each object type (three components each).
    'colors': {
        'prey': [212, 211, 208],
        'pred': [100, 37, 0],
        'cue': [0, 0, 0],
    },
    # Reward tables keyed by object type, then by object type again.
    # NOTE(review): presumably outer key = rewarded object, inner key = the
    # object it meets -- confirm in KarpathyGame.
    'object_reward': {
        'prey': {'prey': 0.1, 'pred': -0.1, 'cue': 0.0},
        'pred': {'prey': 1.0, 'pred': -1.0, 'cue': 0.0},
    },
    'hero_bounces_off_walls': False,
    'world_size': (500, 300),
    'maximum_velocity': {'prey': 0, 'pred': 50},
    'object_radius': 10.0,
    'cue_types': 2,
    'num_objects': OrderedDict([('prey', 5), ('pred', 5), ('cue', 1)]),
    # "Active" objects are the ones that are learning.
    'num_objects_active': OrderedDict([('prey', 0), ('pred', 5)]),
    # Network layout per species: 'one' shares a single DQN between all
    # preys/predators, 'multiple' creates one DQN per prey/predator.
    # Only really matters if the preys/predators are active.
    'network_prey': 'one',
    'network_pred': 'multiple',
    'num_observation_lines': 32,
    'observation_line_length': 75.0,
    'tolerable_distance_to_wall': 50,
    'wall_distance_penalty': -1.0,
    'delta_v': 50,
}

# Run mode: 'new' creates a new simulation from the values above,
# 'load' resumes from a previously trained graph.
RUN = 'load'

# Base names of the files written for this run.
MODEL_NAME = 'model-5preds20-load-100weight'
REPLAY_NAME = 'replay-5preds20-load-100weight'
ELAPSE_NAME = 'elapse-5preds20-load-100weight'
REWARDS_NAME = 'rewards-5preds20-load-100weight'
COLLISIONS_NAME = 'collsions-5preds20-load-100weight'

# Model checkpoint and replay-buffer pickle reloaded when RUN == 'load'.
MODEL_RE = '5preds-20w/model-5pred-2cues-500trials-20weight-1561231.ckpt'
REPLAY_RE = '5preds-20w/replay-5pred-2cues-500trials-20weight-1561231.pkl'

# create the game simulator
g = KarpathyGame(current_settings)

# Start from an empty default graph so the 'pred<k>' scopes can be rebuilt
# when this cell is re-run.
tf.reset_default_graph()

# Per-network handles; entry k of every list belongs to scope 'pred<k>'.
all_act, all_train, all_update, all_debug, all_replay = [], [], [], [], []

# Build graphs
num_active_pred = current_settings['num_objects_active']['pred']
if num_active_pred != 0:
    # 'one' -> a single DQN; anything else -> one DQN per active predator.
    network_pred = 1 if current_settings['network_pred'] == 'one' else num_active_pred

    for idx in range(network_pred):
        scope_name = 'pred' + str(idx)
        with tf.variable_scope(scope_name):
            # Q-function: MLP with hidden layer sizes [200, 200].
            brain_pred = deepq.models.mlp([200, 200])
            # build_train returns the act / train / target-sync / debug
            # callables for this network.
            act, train, update_target, debug = deepq.build_train(
                make_obs_ph=lambda name: U.BatchInput((g.observation_size,), name=name),
                q_func=brain_pred,
                num_actions=g.num_actions,
                optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
            )
        # Each network gets its own replay buffer (size 50000).
        replay_buffer = ReplayBuffer(50000)
        all_act.append(act)
        all_train.append(train)
        all_update.append(update_target)
        all_debug.append(debug)
        all_replay.append(replay_buffer)
        
# Values forwarded to simulate() as `fps` and `action_every`.
FPS = 30
ACTION_EVERY = 3

# fast_mode picks the (wait, visualize_every) pair passed to simulate():
# (False, 100) when enabled, (True, 1) otherwise.
fast_mode = False
WAIT, VISUALIZE_EVERY = (False, 100) if fast_mode else (True, 1)

# Containers handed to simulate() and pickled afterwards.
elapsed, rewards = [], []
# One-element list; timesteps[0] is read after the run to name the output
# files (presumably simulate() updates it in place -- confirm).
timesteps = [0]
    
# Initializing or reloading variables
# Directory (relative to the working directory) holding checkpoints/pickles.
save_dir = 'saved_graphs'

# Start TensorFlow session with 2 CPUs
with U.make_session(2) as sess:

    # Initialize the parameters and copy them to the target networks.
    # Every build_train() call returned its own update_target function (all
    # collected in all_update), so each one has to be invoked. The bare
    # module-level `update_target` name only refers to the network built
    # last, and tf.variable_scope does not change which function is called.
    U.initialize()
    for sync_target in all_update:
        sync_target()

    if RUN == 'load':
        # To restore only a subset of variables, build the saver from it:
        #     restore = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='pred0')
        #     saver = tf.train.Saver(restore)
        # and remember to append a fresh replay buffer to all_replay.

        # reload models
        checkpoint_dir = os.path.join(os.getcwd(), save_dir)
        saver = tf.train.Saver()
        model_name = os.path.join(checkpoint_dir, MODEL_RE)
        saver.restore(sess, model_name)
        # reload replay buffers
        # NOTE: pickle.load can execute arbitrary code -- only point
        # REPLAY_RE at files produced by this project.
        replay_name = os.path.join(checkpoint_dir, REPLAY_RE)
        with open(replay_name, 'rb') as f:
            all_replay = pickle.load(f)

    # Make sure the output directory exists before the (long) simulation so
    # the results are not lost to a missing folder afterwards.
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    # Run simulation
    try:
        simulate(simulation=g,
                 replay=all_replay,
                 act=all_act,
                 train=all_train,
                 update=all_update,
                 debug=all_debug,
                 fps=FPS,
                 visualize_every=VISUALIZE_EVERY,
                 action_every=ACTION_EVERY,
                 wait=WAIT,
                 disable_training=False,
                 simulation_resolution=.001,
                 save_path=None,
                 timesteps=timesteps,
                 elapsed=elapsed,
                 all_rewards=rewards,
                 percent=1)

    except KeyboardInterrupt:
        # A manual interrupt still falls through to the saving code below.
        print("Interrupted")
        g.shut_down_graphics()
        print('graphics shut down')

    # File names carry the timestep count reached by the run.
    run_suffix = '-' + str(timesteps[0])

    # Save models
    model_name = os.path.join(save_dir, MODEL_NAME + run_suffix + '.ckpt')
    saver = tf.train.Saver()
    saver.save(sess, model_name)

    # NOTE(review): the replay buffers are not saved in this cell
    # (REPLAY_NAME is unused here) -- confirm that this is intended.

    # Save trial times, rewards and collisions.
    results = (
        (ELAPSE_NAME, elapsed),
        (REWARDS_NAME, rewards),
        (COLLISIONS_NAME, g.collisions),
    )
    for base_name, payload in results:
        with open(os.path.join(save_dir, base_name + run_suffix + '.pkl'), 'wb') as f:
            pickle.dump(payload, f)

In [None]:
# reset
%reset -f
from __future__ import print_function

import pickle
import os
import numpy as np
import tempfile
import tensorflow as tf
import matplotlib.pyplot as plt

from tf_rl.controller import DiscreteDeepQ, ModelController
from tf_rl.simulation import KarpathyGame
from tf_rl import simulate
from tf_rl.models import MLP
from collections import OrderedDict
from euclid import Vector2

from baselines import deepq
import baselines.common.tf_util as U
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule
tf.reset_default_graph()


# doing stuff
# Configuration dictionary handed to KarpathyGame.
current_settings = {
    # Object types in eating order: an earlier entry is eaten by a later one
    # (predators eat prey).
    'objects': ['prey', 'pred', 'cue'],
    # Colour of each object type (three components each).
    'colors': {
        'prey': [212, 211, 208],
        'pred': [100, 37, 0],
        'cue': [0, 0, 0],
    },
    # Reward tables keyed by object type, then by object type again.
    # NOTE(review): presumably outer key = rewarded object, inner key = the
    # object it meets -- confirm in KarpathyGame.
    'object_reward': {
        'prey': {'prey': 0.1, 'pred': -0.1, 'cue': 0.0},
        'pred': {'prey': 1.0, 'pred': -1.0, 'cue': 0.0},
    },
    'hero_bounces_off_walls': False,
    'world_size': (500, 300),
    'maximum_velocity': {'prey': 0, 'pred': 50},
    'object_radius': 10.0,
    'cue_types': 2,
    'num_objects': OrderedDict([('prey', 5), ('pred', 5), ('cue', 1)]),
    # "Active" objects are the ones that are learning.
    'num_objects_active': OrderedDict([('prey', 0), ('pred', 5)]),
    # Network layout per species: 'one' shares a single DQN between all
    # preys/predators, 'multiple' creates one DQN per prey/predator.
    # Only really matters if the preys/predators are active.
    'network_prey': 'one',
    'network_pred': 'multiple',
    'num_observation_lines': 32,
    'observation_line_length': 75.0,
    'tolerable_distance_to_wall': 50,
    'wall_distance_penalty': -1.0,
    'delta_v': 50,
}

# Run mode: 'new' creates a new simulation from the values above,
# 'load' resumes from a previously trained graph.
RUN = 'load'

# Base names of the files written for this run.
MODEL_NAME = 'model-5preds30-load-100weight'
REPLAY_NAME = 'replay-5preds30-load-100weight'
ELAPSE_NAME = 'elapse-5preds30-load-100weight'
REWARDS_NAME = 'rewards-5preds30-load-100weight'
COLLISIONS_NAME = 'collsions-5preds30-load-100weight'

# Model checkpoint and replay-buffer pickle reloaded when RUN == 'load'.
MODEL_RE = '5preds-30w/model-5pred-2cues-500trials-30weight-903616.ckpt'
REPLAY_RE = '5preds-30w/replay-5pred-2cues-500trials-30weight-903616.pkl'

# create the game simulator
g = KarpathyGame(current_settings)

# Start from an empty default graph so the 'pred<k>' scopes can be rebuilt
# when this cell is re-run.
tf.reset_default_graph()

# Per-network handles; entry k of every list belongs to scope 'pred<k>'.
all_act, all_train, all_update, all_debug, all_replay = [], [], [], [], []

# Build graphs
num_active_pred = current_settings['num_objects_active']['pred']
if num_active_pred != 0:
    # 'one' -> a single DQN; anything else -> one DQN per active predator.
    network_pred = 1 if current_settings['network_pred'] == 'one' else num_active_pred

    for idx in range(network_pred):
        scope_name = 'pred' + str(idx)
        with tf.variable_scope(scope_name):
            # Q-function: MLP with hidden layer sizes [200, 200].
            brain_pred = deepq.models.mlp([200, 200])
            # build_train returns the act / train / target-sync / debug
            # callables for this network.
            act, train, update_target, debug = deepq.build_train(
                make_obs_ph=lambda name: U.BatchInput((g.observation_size,), name=name),
                q_func=brain_pred,
                num_actions=g.num_actions,
                optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
            )
        # Each network gets its own replay buffer (size 50000).
        replay_buffer = ReplayBuffer(50000)
        all_act.append(act)
        all_train.append(train)
        all_update.append(update_target)
        all_debug.append(debug)
        all_replay.append(replay_buffer)
        
# Values forwarded to simulate() as `fps` and `action_every`.
FPS = 30
ACTION_EVERY = 3

# fast_mode picks the (wait, visualize_every) pair passed to simulate():
# (False, 100) when enabled, (True, 1) otherwise.
fast_mode = False
WAIT, VISUALIZE_EVERY = (False, 100) if fast_mode else (True, 1)

# Containers handed to simulate() and pickled afterwards.
elapsed, rewards = [], []
# One-element list; timesteps[0] is read after the run to name the output
# files (presumably simulate() updates it in place -- confirm).
timesteps = [0]
    
# Initializing or reloading variables
# Directory (relative to the working directory) holding checkpoints/pickles.
save_dir = 'saved_graphs'

# Start TensorFlow session with 2 CPUs
with U.make_session(2) as sess:

    # Initialize the parameters and copy them to the target networks.
    # Every build_train() call returned its own update_target function (all
    # collected in all_update), so each one has to be invoked. The bare
    # module-level `update_target` name only refers to the network built
    # last, and tf.variable_scope does not change which function is called.
    U.initialize()
    for sync_target in all_update:
        sync_target()

    if RUN == 'load':
        # To restore only a subset of variables, build the saver from it:
        #     restore = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='pred0')
        #     saver = tf.train.Saver(restore)
        # and remember to append a fresh replay buffer to all_replay.

        # reload models
        checkpoint_dir = os.path.join(os.getcwd(), save_dir)
        saver = tf.train.Saver()
        model_name = os.path.join(checkpoint_dir, MODEL_RE)
        saver.restore(sess, model_name)
        # reload replay buffers
        # NOTE: pickle.load can execute arbitrary code -- only point
        # REPLAY_RE at files produced by this project.
        replay_name = os.path.join(checkpoint_dir, REPLAY_RE)
        with open(replay_name, 'rb') as f:
            all_replay = pickle.load(f)

    # Make sure the output directory exists before the (long) simulation so
    # the results are not lost to a missing folder afterwards.
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    # Run simulation
    try:
        simulate(simulation=g,
                 replay=all_replay,
                 act=all_act,
                 train=all_train,
                 update=all_update,
                 debug=all_debug,
                 fps=FPS,
                 visualize_every=VISUALIZE_EVERY,
                 action_every=ACTION_EVERY,
                 wait=WAIT,
                 disable_training=False,
                 simulation_resolution=.001,
                 save_path=None,
                 timesteps=timesteps,
                 elapsed=elapsed,
                 all_rewards=rewards,
                 percent=1)

    except KeyboardInterrupt:
        # A manual interrupt still falls through to the saving code below.
        print("Interrupted")
        g.shut_down_graphics()
        print('graphics shut down')

    # File names carry the timestep count reached by the run.
    run_suffix = '-' + str(timesteps[0])

    # Save models
    model_name = os.path.join(save_dir, MODEL_NAME + run_suffix + '.ckpt')
    saver = tf.train.Saver()
    saver.save(sess, model_name)

    # NOTE(review): the replay buffers are not saved in this cell
    # (REPLAY_NAME is unused here) -- confirm that this is intended.

    # Save trial times, rewards and collisions.
    results = (
        (ELAPSE_NAME, elapsed),
        (REWARDS_NAME, rewards),
        (COLLISIONS_NAME, g.collisions),
    )
    for base_name, payload in results:
        with open(os.path.join(save_dir, base_name + run_suffix + '.pkl'), 'wb') as f:
            pickle.dump(payload, f)

In [None]:
# reset
%reset -f
from __future__ import print_function

import pickle
import os
import numpy as np
import tempfile
import tensorflow as tf
import matplotlib.pyplot as plt

from tf_rl.controller import DiscreteDeepQ, ModelController
from tf_rl.simulation import KarpathyGame
from tf_rl import simulate
from tf_rl.models import MLP
from collections import OrderedDict
from euclid import Vector2

from baselines import deepq
import baselines.common.tf_util as U
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule
tf.reset_default_graph()


# doing stuff
# Configuration dictionary handed to KarpathyGame.
current_settings = {
    # Object types in eating order: an earlier entry is eaten by a later one
    # (predators eat prey).
    'objects': ['prey', 'pred', 'cue'],
    # Colour of each object type (three components each).
    'colors': {
        'prey': [212, 211, 208],
        'pred': [100, 37, 0],
        'cue': [0, 0, 0],
    },
    # Reward tables keyed by object type, then by object type again.
    # NOTE(review): presumably outer key = rewarded object, inner key = the
    # object it meets -- confirm in KarpathyGame.
    'object_reward': {
        'prey': {'prey': 0.1, 'pred': -0.1, 'cue': 0.0},
        'pred': {'prey': 1.0, 'pred': -1.0, 'cue': 0.0},
    },
    'hero_bounces_off_walls': False,
    'world_size': (500, 300),
    'maximum_velocity': {'prey': 0, 'pred': 50},
    'object_radius': 10.0,
    'cue_types': 2,
    'num_objects': OrderedDict([('prey', 5), ('pred', 5), ('cue', 1)]),
    # "Active" objects are the ones that are learning.
    'num_objects_active': OrderedDict([('prey', 0), ('pred', 5)]),
    # Network layout per species: 'one' shares a single DQN between all
    # preys/predators, 'multiple' creates one DQN per prey/predator.
    # Only really matters if the preys/predators are active.
    'network_prey': 'one',
    'network_pred': 'multiple',
    'num_observation_lines': 32,
    'observation_line_length': 75.0,
    'tolerable_distance_to_wall': 50,
    'wall_distance_penalty': -1.0,
    'delta_v': 50,
}

# Run mode: 'new' creates a new simulation from the values above,
# 'load' resumes from a previously trained graph.
RUN = 'load'

# Base names of the files written for this run.
MODEL_NAME = 'model-5preds40-load-100weight'
REPLAY_NAME = 'replay-5preds40-load-100weight'
ELAPSE_NAME = 'elapse-5preds40-load-100weight'
REWARDS_NAME = 'rewards-5preds40-load-100weight'
COLLISIONS_NAME = 'collsions-5preds40-load-100weight'

# Model checkpoint and replay-buffer pickle reloaded when RUN == 'load'.
MODEL_RE = '5preds-40w/model-5pred-2cues-500trials-40weight-579358.ckpt'
REPLAY_RE = '5preds-40w/replay-5pred-2cues-500trials-40weight-579358.pkl'

# create the game simulator
g = KarpathyGame(current_settings)

# Start from an empty default graph so the 'pred<k>' scopes can be rebuilt
# when this cell is re-run.
tf.reset_default_graph()

# Per-network handles; entry k of every list belongs to scope 'pred<k>'.
all_act, all_train, all_update, all_debug, all_replay = [], [], [], [], []

# Build graphs
num_active_pred = current_settings['num_objects_active']['pred']
if num_active_pred != 0:
    # 'one' -> a single DQN; anything else -> one DQN per active predator.
    network_pred = 1 if current_settings['network_pred'] == 'one' else num_active_pred

    for idx in range(network_pred):
        scope_name = 'pred' + str(idx)
        with tf.variable_scope(scope_name):
            # Q-function: MLP with hidden layer sizes [200, 200].
            brain_pred = deepq.models.mlp([200, 200])
            # build_train returns the act / train / target-sync / debug
            # callables for this network.
            act, train, update_target, debug = deepq.build_train(
                make_obs_ph=lambda name: U.BatchInput((g.observation_size,), name=name),
                q_func=brain_pred,
                num_actions=g.num_actions,
                optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
            )
        # Each network gets its own replay buffer (size 50000).
        replay_buffer = ReplayBuffer(50000)
        all_act.append(act)
        all_train.append(train)
        all_update.append(update_target)
        all_debug.append(debug)
        all_replay.append(replay_buffer)
        
# Values forwarded to simulate() as `fps` and `action_every`.
FPS = 30
ACTION_EVERY = 3

# fast_mode picks the (wait, visualize_every) pair passed to simulate():
# (False, 100) when enabled, (True, 1) otherwise.
fast_mode = False
WAIT, VISUALIZE_EVERY = (False, 100) if fast_mode else (True, 1)

# Containers handed to simulate() and pickled afterwards.
elapsed, rewards = [], []
# One-element list; timesteps[0] is read after the run to name the output
# files (presumably simulate() updates it in place -- confirm).
timesteps = [0]
    
# Initializing or reloading variables
# Directory (relative to the working directory) holding checkpoints/pickles.
save_dir = 'saved_graphs'

# Start TensorFlow session with 2 CPUs
with U.make_session(2) as sess:

    # Initialize the parameters and copy them to the target networks.
    # Every build_train() call returned its own update_target function (all
    # collected in all_update), so each one has to be invoked. The bare
    # module-level `update_target` name only refers to the network built
    # last, and tf.variable_scope does not change which function is called.
    U.initialize()
    for sync_target in all_update:
        sync_target()

    if RUN == 'load':
        # To restore only a subset of variables, build the saver from it:
        #     restore = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='pred0')
        #     saver = tf.train.Saver(restore)
        # and remember to append a fresh replay buffer to all_replay.

        # reload models
        checkpoint_dir = os.path.join(os.getcwd(), save_dir)
        saver = tf.train.Saver()
        model_name = os.path.join(checkpoint_dir, MODEL_RE)
        saver.restore(sess, model_name)
        # reload replay buffers
        # NOTE: pickle.load can execute arbitrary code -- only point
        # REPLAY_RE at files produced by this project.
        replay_name = os.path.join(checkpoint_dir, REPLAY_RE)
        with open(replay_name, 'rb') as f:
            all_replay = pickle.load(f)

    # Make sure the output directory exists before the (long) simulation so
    # the results are not lost to a missing folder afterwards.
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    # Run simulation
    try:
        simulate(simulation=g,
                 replay=all_replay,
                 act=all_act,
                 train=all_train,
                 update=all_update,
                 debug=all_debug,
                 fps=FPS,
                 visualize_every=VISUALIZE_EVERY,
                 action_every=ACTION_EVERY,
                 wait=WAIT,
                 disable_training=False,
                 simulation_resolution=.001,
                 save_path=None,
                 timesteps=timesteps,
                 elapsed=elapsed,
                 all_rewards=rewards,
                 percent=1)

    except KeyboardInterrupt:
        # A manual interrupt still falls through to the saving code below.
        print("Interrupted")
        g.shut_down_graphics()
        print('graphics shut down')

    # File names carry the timestep count reached by the run.
    run_suffix = '-' + str(timesteps[0])

    # Save models
    model_name = os.path.join(save_dir, MODEL_NAME + run_suffix + '.ckpt')
    saver = tf.train.Saver()
    saver.save(sess, model_name)

    # Save replay buffers, trial times, rewards and collisions.
    results = (
        (REPLAY_NAME, all_replay),
        (ELAPSE_NAME, elapsed),
        (REWARDS_NAME, rewards),
        (COLLISIONS_NAME, g.collisions),
    )
    for base_name, payload in results:
        with open(os.path.join(save_dir, base_name + run_suffix + '.pkl'), 'wb') as f:
            pickle.dump(payload, f)

In [None]:
# reset
%reset -f
from __future__ import print_function

import pickle
import os
import numpy as np
import tempfile
import tensorflow as tf
import matplotlib.pyplot as plt

from tf_rl.controller import DiscreteDeepQ, ModelController
from tf_rl.simulation import KarpathyGame
from tf_rl import simulate
from tf_rl.models import MLP
from collections import OrderedDict
from euclid import Vector2

from baselines import deepq
import baselines.common.tf_util as U
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule
tf.reset_default_graph()


# doing stuff
# Configuration dictionary handed to KarpathyGame.
current_settings = {
    # Object types in eating order: an earlier entry is eaten by a later one
    # (predators eat prey).
    'objects': ['prey', 'pred', 'cue'],
    # Colour of each object type (three components each).
    'colors': {
        'prey': [212, 211, 208],
        'pred': [100, 37, 0],
        'cue': [0, 0, 0],
    },
    # Reward tables keyed by object type, then by object type again.
    # NOTE(review): presumably outer key = rewarded object, inner key = the
    # object it meets -- confirm in KarpathyGame.
    'object_reward': {
        'prey': {'prey': 0.1, 'pred': -0.1, 'cue': 0.0},
        'pred': {'prey': 1.0, 'pred': -1.0, 'cue': 0.0},
    },
    'hero_bounces_off_walls': False,
    'world_size': (500, 300),
    'maximum_velocity': {'prey': 0, 'pred': 50},
    'object_radius': 10.0,
    'cue_types': 2,
    'num_objects': OrderedDict([('prey', 5), ('pred', 5), ('cue', 1)]),
    # "Active" objects are the ones that are learning.
    'num_objects_active': OrderedDict([('prey', 0), ('pred', 5)]),
    # Network layout per species: 'one' shares a single DQN between all
    # preys/predators, 'multiple' creates one DQN per prey/predator.
    # Only really matters if the preys/predators are active.
    'network_prey': 'one',
    'network_pred': 'multiple',
    'num_observation_lines': 32,
    'observation_line_length': 75.0,
    'tolerable_distance_to_wall': 50,
    'wall_distance_penalty': -1.0,
    'delta_v': 50,
}

# Run mode: 'new' creates a new simulation from the values above,
# 'load' resumes from a previously trained graph.
RUN = 'load'

# Base names of the files written for this run.
MODEL_NAME = 'model-5preds50-load-100weight'
REPLAY_NAME = 'replay-5preds50-load-100weight'
ELAPSE_NAME = 'elapse-5preds50-load-100weight'
REWARDS_NAME = 'rewards-5preds50-load-100weight'
COLLISIONS_NAME = 'collsions-5preds50-load-100weight'

# Model checkpoint and replay-buffer pickle reloaded when RUN == 'load'.
MODEL_RE = '5preds-50w/model-5pred-2cues-500trials-50weight-516364.ckpt'
REPLAY_RE = '5preds-50w/replay-5pred-2cues-500trials-50weight-516364.pkl'

# create the game simulator
g = KarpathyGame(current_settings)

# Start from an empty default graph so the 'pred<k>' scopes can be rebuilt
# when this cell is re-run.
tf.reset_default_graph()

# Per-network handles; entry k of every list belongs to scope 'pred<k>'.
all_act, all_train, all_update, all_debug, all_replay = [], [], [], [], []

# Build graphs
num_active_pred = current_settings['num_objects_active']['pred']
if num_active_pred != 0:
    # 'one' -> a single DQN; anything else -> one DQN per active predator.
    network_pred = 1 if current_settings['network_pred'] == 'one' else num_active_pred

    for idx in range(network_pred):
        scope_name = 'pred' + str(idx)
        with tf.variable_scope(scope_name):
            # Q-function: MLP with hidden layer sizes [200, 200].
            brain_pred = deepq.models.mlp([200, 200])
            # build_train returns the act / train / target-sync / debug
            # callables for this network.
            act, train, update_target, debug = deepq.build_train(
                make_obs_ph=lambda name: U.BatchInput((g.observation_size,), name=name),
                q_func=brain_pred,
                num_actions=g.num_actions,
                optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
            )
        # Each network gets its own replay buffer (size 50000).
        replay_buffer = ReplayBuffer(50000)
        all_act.append(act)
        all_train.append(train)
        all_update.append(update_target)
        all_debug.append(debug)
        all_replay.append(replay_buffer)
        
# Values forwarded to simulate() as `fps` and `action_every`.
FPS = 30
ACTION_EVERY = 3

# fast_mode picks the (wait, visualize_every) pair passed to simulate():
# (False, 100) when enabled, (True, 1) otherwise.
fast_mode = False
WAIT, VISUALIZE_EVERY = (False, 100) if fast_mode else (True, 1)

# Containers handed to simulate() and pickled afterwards.
elapsed, rewards = [], []
# One-element list; timesteps[0] is read after the run to name the output
# files (presumably simulate() updates it in place -- confirm).
timesteps = [0]
    
# Initializing or reloading variables
# Directory (relative to the working directory) holding checkpoints/pickles.
save_dir = 'saved_graphs'

# Start TensorFlow session with 2 CPUs
with U.make_session(2) as sess:

    # Initialize the parameters and copy them to the target networks.
    # Every build_train() call returned its own update_target function (all
    # collected in all_update), so each one has to be invoked. The bare
    # module-level `update_target` name only refers to the network built
    # last, and tf.variable_scope does not change which function is called.
    U.initialize()
    for sync_target in all_update:
        sync_target()

    if RUN == 'load':
        # To restore only a subset of variables, build the saver from it:
        #     restore = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='pred0')
        #     saver = tf.train.Saver(restore)
        # and remember to append a fresh replay buffer to all_replay.

        # reload models
        checkpoint_dir = os.path.join(os.getcwd(), save_dir)
        saver = tf.train.Saver()
        model_name = os.path.join(checkpoint_dir, MODEL_RE)
        saver.restore(sess, model_name)
        # reload replay buffers
        # NOTE: pickle.load can execute arbitrary code -- only point
        # REPLAY_RE at files produced by this project.
        replay_name = os.path.join(checkpoint_dir, REPLAY_RE)
        with open(replay_name, 'rb') as f:
            all_replay = pickle.load(f)

    # Make sure the output directory exists before the (long) simulation so
    # the results are not lost to a missing folder afterwards.
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    # Run simulation
    try:
        simulate(simulation=g,
                 replay=all_replay,
                 act=all_act,
                 train=all_train,
                 update=all_update,
                 debug=all_debug,
                 fps=FPS,
                 visualize_every=VISUALIZE_EVERY,
                 action_every=ACTION_EVERY,
                 wait=WAIT,
                 disable_training=False,
                 simulation_resolution=.001,
                 save_path=None,
                 timesteps=timesteps,
                 elapsed=elapsed,
                 all_rewards=rewards,
                 percent=1)

    except KeyboardInterrupt:
        # A manual interrupt still falls through to the saving code below.
        print("Interrupted")
        g.shut_down_graphics()
        print('graphics shut down')

    # File names carry the timestep count reached by the run.
    run_suffix = '-' + str(timesteps[0])

    # Save models
    model_name = os.path.join(save_dir, MODEL_NAME + run_suffix + '.ckpt')
    saver = tf.train.Saver()
    saver.save(sess, model_name)

    # Save replay buffers, trial times, rewards and collisions.
    results = (
        (REPLAY_NAME, all_replay),
        (ELAPSE_NAME, elapsed),
        (REWARDS_NAME, rewards),
        (COLLISIONS_NAME, g.collisions),
    )
    for base_name, payload in results:
        with open(os.path.join(save_dir, base_name + run_suffix + '.pkl'), 'wb') as f:
            pickle.dump(payload, f)

In [None]:
# reset
%reset -f
from __future__ import print_function

import pickle
import os
import numpy as np
import tempfile
import tensorflow as tf
import matplotlib.pyplot as plt

from tf_rl.controller import DiscreteDeepQ, ModelController
from tf_rl.simulation import KarpathyGame
from tf_rl import simulate
from tf_rl.models import MLP
from collections import OrderedDict
from euclid import Vector2

from baselines import deepq
import baselines.common.tf_util as U
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule
# Clear the default TensorFlow graph before this cell builds anything.
tf.reset_default_graph()


# Settings dictionary handed to KarpathyGame further down in this cell.
current_settings = {
    #earlier objects are eaten by later objects (pred eat prey)
    'objects': [
        'prey',
        'pred',
        'cue',
    ],
    # one 3-element list per object type -- presumably RGB draw colours, TODO confirm
    'colors': {
        'prey': [212, 211, 208],
        'pred':  [100, 37, 0],
        'cue': [0,0,0],
    },
    # NOTE(review): looks like reward[acting type][type it touches]; confirm in KarpathyGame
    'object_reward': {
        'prey': {'prey': 0.1, 'pred': -0.1, 'cue': 0.0},
        'pred': {'prey': 1.0, 'pred': -1.0, 'cue': 0.0},
    },
    'hero_bounces_off_walls': False,
    'world_size': (500,300),   
    "maximum_velocity":      {'prey': 0, 'pred': 50},
    "object_radius": 10.0,
    "cue_types": 2,
    "num_objects": OrderedDict([('prey', 5), ('pred', 5), ('cue', 1)]),
    # active means that the objects are learning
    "num_objects_active": OrderedDict([('prey', 0), ('pred', 5)]), 
    #'multiple' to create each DQN for each prey/predator
    #'one' to use one DQN for all preys/predators
    # only really matters if the preys/predators are active
    # (the graph-building code in this cell reads network_pred only)
    "network_prey": 'one',
    "network_pred": 'multiple',
    "num_observation_lines" : 32,
    "observation_line_length": 75.,
    "tolerable_distance_to_wall": 50,
    "wall_distance_penalty":  -1.0,
    "delta_v": 50
}

#'new' to create new sim with values above
#'load' to load a previously trained graph
# (this cell only tests for 'load'; any other value skips the restore step)
RUN = 'load'  

# Name prefixes for the five artifacts written at the end of this cell:
# model checkpoint, replay buffers, trial times, rewards and collisions.
MODEL_NAME = 'model-5preds60-load-100weight'
REPLAY_NAME = 'replay-5preds60-load-100weight'
ELAPSE_NAME = 'elapse-5preds60-load-100weight'
REWARDS_NAME = 'rewards-5preds60-load-100weight'
# NOTE(review): 'collsions' is misspelled; left as-is because it is part of the output filename.
COLLISIONS_NAME = 'collsions-5preds60-load-100weight'

# Checkpoint and replay-buffer files to reload when RUN == 'load';
# both are resolved relative to <current working directory>/saved_graphs/.
MODEL_RE = '5preds-60w/model-5pred-2cues-500trials-60weight-343942.ckpt'
REPLAY_RE = '5preds-60w/replay-5pred-2cues-500trials-60weight-343942.pkl'

# create the game simulator
g = KarpathyGame(current_settings)

# Begin graph construction from an empty default graph.
tf.reset_default_graph()

# Parallel per-network collections: index k belongs to predator network k.
all_act, all_train, all_update, all_debug, all_replay = [], [], [], [], []


def _make_obs_ph(name):
    """Wrap U.BatchInput for observations of size g.observation_size."""
    return U.BatchInput((g.observation_size,), name=name)


# Build graphs
num_active_pred = current_settings['num_objects_active']['pred']
if num_active_pred != 0:
    # 'one' shares a single network; any other value gives each active
    # predator a network of its own.
    network_pred = 1 if current_settings['network_pred'] == 'one' else num_active_pred

    for i in range(network_pred):
        name = 'pred%d' % i
        with tf.variable_scope(name):
            brain_pred = deepq.models.mlp([200, 200])
            act, train, update_target, debug = deepq.build_train(
                make_obs_ph=_make_obs_ph,
                q_func=brain_pred,
                num_actions=g.num_actions,
                optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
            )
        replay_buffer = ReplayBuffer(50000)
        # Register this network's pieces in the parallel collections.
        for collection, piece in ((all_replay, replay_buffer),
                                  (all_act, act),
                                  (all_train, train),
                                  (all_update, update_target),
                                  (all_debug, debug)):
            collection.append(piece)

# Timing parameters forwarded to simulate() later in this cell.
FPS, ACTION_EVERY = 30, 3

# fast_mode selects the (WAIT, VISUALIZE_EVERY) pair passed to simulate().
fast_mode = False
WAIT, VISUALIZE_EVERY = (False, 100) if fast_mode else (True, 1)

# Containers handed to simulate() and pickled once the run is over.
elapsed, rewards, timesteps = [], [], [0]
    
# Initializing or reloading variables
# Start TensorFlow session with 2 CPUs
with U.make_session(2) as sess:

    # Initialize the parameters and copy them to the target networks.
    # Every network built in this cell registered its own sync callable in
    # all_update; the bare name `update_target` is rebound on each pass of the
    # build loop and therefore only refers to the network built last, so it
    # must not be used here. No variable scope is needed either: the callables
    # were already built and are merely executed.
    U.initialize()
    for sync_target in all_update:
        sync_target()

    if RUN == 'load':
#         # when only restoring a subset of variables
#         restore = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope = 'pred0')
#         saver = tf.train.Saver(restore)

        # reload models (replaces the freshly initialised values of the
        # variables stored in the checkpoint)
        saver = tf.train.Saver()
        current_dir = os.getcwd()
        model_name = current_dir + '/saved_graphs/' + MODEL_RE 
        saver.restore(sess, model_name)
        # reload replay buffers; the unpickled object replaces the empty
        # buffers created while building the graphs.
        # SECURITY: pickle.load can execute arbitrary code -- only point
        # REPLAY_RE at files written by this notebook.
        replay_name = current_dir + '/saved_graphs/' + REPLAY_RE 
        with open(replay_name, 'rb') as f:
            all_replay = pickle.load(f)

#         # remember to append buffer if restoring a subset of variables
#         all_replay.append(replay_buffer)

    # Run simulation
    # timesteps, elapsed and rewards are mutable containers; they are read
    # back below when saving (presumably filled in place by simulate -- confirm).
    try:
        simulate(simulation=g,
                 replay = all_replay,
                 act = all_act,
                 train = all_train,
                 update = all_update,
                 debug = all_debug,
                 fps=FPS,
                 visualize_every=VISUALIZE_EVERY,
                 action_every=ACTION_EVERY,
                 wait=WAIT,
                 disable_training=False,
                 simulation_resolution=.001,
                 save_path=None,
                 timesteps = timesteps,
                 elapsed = elapsed,
                 all_rewards = rewards,
                 percent = 1)

    except KeyboardInterrupt:
        # A manual interrupt ends the run early; the save steps below still
        # execute, so the progress made so far is written to disk.
        print("Interrupted")
        g.shut_down_graphics()
        print('graphics shut down')

    # Every output filename carries timesteps[0] as a suffix.
    # Save models    
    model_name = 'saved_graphs/' + MODEL_NAME + '-' + str(timesteps[0]) + '.ckpt'
    saver = tf.train.Saver()
    saver.save(sess, model_name)

    # Save replay buffers
    replay_name = 'saved_graphs/' + REPLAY_NAME + '-' + str(timesteps[0]) + '.pkl'
    with open(replay_name, "wb") as f:
        pickle.dump(all_replay, f)

    # Save trial times
    elapse_name = 'saved_graphs/' + ELAPSE_NAME + '-' + str(timesteps[0]) + '.pkl'
    with open(elapse_name, "wb") as f:
        pickle.dump(elapsed, f)

    # Save rewards
    rewards_name = 'saved_graphs/' + REWARDS_NAME + '-' + str(timesteps[0]) + '.pkl'
    with open(rewards_name, "wb") as f:
        pickle.dump(rewards, f)

    # Save collisions
    collisions_name = 'saved_graphs/' + COLLISIONS_NAME + '-' + str(timesteps[0]) + '.pkl'
    with open(collisions_name, "wb") as f:
        pickle.dump(g.collisions, f)

In [None]:
# reset
%reset -f
from __future__ import print_function

import pickle
import os
import numpy as np
import tempfile
import tensorflow as tf
import matplotlib.pyplot as plt

from tf_rl.controller import DiscreteDeepQ, ModelController
from tf_rl.simulation import KarpathyGame
from tf_rl import simulate
from tf_rl.models import MLP
from collections import OrderedDict
from euclid import Vector2

from baselines import deepq
import baselines.common.tf_util as U
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule
# Clear the default TensorFlow graph before this cell builds anything.
tf.reset_default_graph()


# Settings dictionary handed to KarpathyGame further down in this cell.
current_settings = {
    #earlier objects are eaten by later objects (pred eat prey)
    'objects': [
        'prey',
        'pred',
        'cue',
    ],
    # one 3-element list per object type -- presumably RGB draw colours, TODO confirm
    'colors': {
        'prey': [212, 211, 208],
        'pred':  [100, 37, 0],
        'cue': [0,0,0],
    },
    # NOTE(review): looks like reward[acting type][type it touches]; confirm in KarpathyGame
    'object_reward': {
        'prey': {'prey': 0.1, 'pred': -0.1, 'cue': 0.0},
        'pred': {'prey': 1.0, 'pred': -1.0, 'cue': 0.0},
    },
    'hero_bounces_off_walls': False,
    'world_size': (500,300),   
    "maximum_velocity":      {'prey': 0, 'pred': 50},
    "object_radius": 10.0,
    "cue_types": 2,
    "num_objects": OrderedDict([('prey', 5), ('pred', 5), ('cue', 1)]),
    # active means that the objects are learning
    "num_objects_active": OrderedDict([('prey', 0), ('pred', 5)]), 
    #'multiple' to create each DQN for each prey/predator
    #'one' to use one DQN for all preys/predators
    # only really matters if the preys/predators are active
    # (the graph-building code in this cell reads network_pred only)
    "network_prey": 'one',
    "network_pred": 'multiple',
    "num_observation_lines" : 32,
    "observation_line_length": 75.,
    "tolerable_distance_to_wall": 50,
    "wall_distance_penalty":  -1.0,
    "delta_v": 50
}

#'new' to create new sim with values above
#'load' to load a previously trained graph
# (this cell only tests for 'load'; any other value skips the restore step)
RUN = 'load'  

# Name prefixes for the five artifacts written at the end of this cell:
# model checkpoint, replay buffers, trial times, rewards and collisions.
MODEL_NAME = 'model-5preds80-load-100weight'
REPLAY_NAME = 'replay-5preds80-load-100weight'
ELAPSE_NAME = 'elapse-5preds80-load-100weight'
REWARDS_NAME = 'rewards-5preds80-load-100weight'
# NOTE(review): 'collsions' is misspelled; left as-is because it is part of the output filename.
COLLISIONS_NAME = 'collsions-5preds80-load-100weight'

# Checkpoint and replay-buffer files to reload when RUN == 'load';
# both are resolved relative to <current working directory>/saved_graphs/.
MODEL_RE = '5preds-80w/model-5pred-2cues-500trials-80weight-238297.ckpt'
REPLAY_RE = '5preds-80w/replay-5pred-2cues-500trials-80weight-238297.pkl'

# create the game simulator
g = KarpathyGame(current_settings)

# Begin graph construction from an empty default graph.
tf.reset_default_graph()

# Parallel per-network collections: index k belongs to predator network k.
all_act, all_train, all_update, all_debug, all_replay = [], [], [], [], []


def _make_obs_ph(name):
    """Wrap U.BatchInput for observations of size g.observation_size."""
    return U.BatchInput((g.observation_size,), name=name)


# Build graphs
num_active_pred = current_settings['num_objects_active']['pred']
if num_active_pred != 0:
    # 'one' shares a single network; any other value gives each active
    # predator a network of its own.
    network_pred = 1 if current_settings['network_pred'] == 'one' else num_active_pred

    for i in range(network_pred):
        name = 'pred%d' % i
        with tf.variable_scope(name):
            brain_pred = deepq.models.mlp([200, 200])
            act, train, update_target, debug = deepq.build_train(
                make_obs_ph=_make_obs_ph,
                q_func=brain_pred,
                num_actions=g.num_actions,
                optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
            )
        replay_buffer = ReplayBuffer(50000)
        # Register this network's pieces in the parallel collections.
        for collection, piece in ((all_replay, replay_buffer),
                                  (all_act, act),
                                  (all_train, train),
                                  (all_update, update_target),
                                  (all_debug, debug)):
            collection.append(piece)

# Timing parameters forwarded to simulate() later in this cell.
FPS, ACTION_EVERY = 30, 3

# fast_mode selects the (WAIT, VISUALIZE_EVERY) pair passed to simulate().
fast_mode = False
WAIT, VISUALIZE_EVERY = (False, 100) if fast_mode else (True, 1)

# Containers handed to simulate() and pickled once the run is over.
elapsed, rewards, timesteps = [], [], [0]
    
# Initializing or reloading variables
# Start TensorFlow session with 2 CPUs
with U.make_session(2) as sess:

    # Initialize the parameters and copy them to the target networks.
    # Every network built in this cell registered its own sync callable in
    # all_update; the bare name `update_target` is rebound on each pass of the
    # build loop and therefore only refers to the network built last, so it
    # must not be used here. No variable scope is needed either: the callables
    # were already built and are merely executed.
    U.initialize()
    for sync_target in all_update:
        sync_target()

    if RUN == 'load':
#         # when only restoring a subset of variables
#         restore = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope = 'pred0')
#         saver = tf.train.Saver(restore)

        # reload models (replaces the freshly initialised values of the
        # variables stored in the checkpoint)
        saver = tf.train.Saver()
        current_dir = os.getcwd()
        model_name = current_dir + '/saved_graphs/' + MODEL_RE 
        saver.restore(sess, model_name)
        # reload replay buffers; the unpickled object replaces the empty
        # buffers created while building the graphs.
        # SECURITY: pickle.load can execute arbitrary code -- only point
        # REPLAY_RE at files written by this notebook.
        replay_name = current_dir + '/saved_graphs/' + REPLAY_RE 
        with open(replay_name, 'rb') as f:
            all_replay = pickle.load(f)

#         # remember to append buffer if restoring a subset of variables
#         all_replay.append(replay_buffer)

    # Run simulation
    # timesteps, elapsed and rewards are mutable containers; they are read
    # back below when saving (presumably filled in place by simulate -- confirm).
    try:
        simulate(simulation=g,
                 replay = all_replay,
                 act = all_act,
                 train = all_train,
                 update = all_update,
                 debug = all_debug,
                 fps=FPS,
                 visualize_every=VISUALIZE_EVERY,
                 action_every=ACTION_EVERY,
                 wait=WAIT,
                 disable_training=False,
                 simulation_resolution=.001,
                 save_path=None,
                 timesteps = timesteps,
                 elapsed = elapsed,
                 all_rewards = rewards,
                 percent = 1)

    except KeyboardInterrupt:
        # A manual interrupt ends the run early; the save steps below still
        # execute, so the progress made so far is written to disk.
        print("Interrupted")
        g.shut_down_graphics()
        print('graphics shut down')

    # Every output filename carries timesteps[0] as a suffix.
    # Save models    
    model_name = 'saved_graphs/' + MODEL_NAME + '-' + str(timesteps[0]) + '.ckpt'
    saver = tf.train.Saver()
    saver.save(sess, model_name)

    # Save replay buffers
    replay_name = 'saved_graphs/' + REPLAY_NAME + '-' + str(timesteps[0]) + '.pkl'
    with open(replay_name, "wb") as f:
        pickle.dump(all_replay, f)

    # Save trial times
    elapse_name = 'saved_graphs/' + ELAPSE_NAME + '-' + str(timesteps[0]) + '.pkl'
    with open(elapse_name, "wb") as f:
        pickle.dump(elapsed, f)

    # Save rewards
    rewards_name = 'saved_graphs/' + REWARDS_NAME + '-' + str(timesteps[0]) + '.pkl'
    with open(rewards_name, "wb") as f:
        pickle.dump(rewards, f)

    # Save collisions
    collisions_name = 'saved_graphs/' + COLLISIONS_NAME + '-' + str(timesteps[0]) + '.pkl'
    with open(collisions_name, "wb") as f:
        pickle.dump(g.collisions, f)

In [None]:
# reset
%reset -f
from __future__ import print_function

import pickle
import os
import numpy as np
import tempfile
import tensorflow as tf
import matplotlib.pyplot as plt

from tf_rl.controller import DiscreteDeepQ, ModelController
from tf_rl.simulation import KarpathyGame
from tf_rl import simulate
from tf_rl.models import MLP
from collections import OrderedDict
from euclid import Vector2

from baselines import deepq
import baselines.common.tf_util as U
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule
# Clear the default TensorFlow graph before this cell builds anything.
tf.reset_default_graph()


# Settings dictionary handed to KarpathyGame further down in this cell.
current_settings = {
    #earlier objects are eaten by later objects (pred eat prey)
    'objects': [
        'prey',
        'pred',
        'cue',
    ],
    # one 3-element list per object type -- presumably RGB draw colours, TODO confirm
    'colors': {
        'prey': [212, 211, 208],
        'pred':  [100, 37, 0],
        'cue': [0,0,0],
    },
    # NOTE(review): looks like reward[acting type][type it touches]; confirm in KarpathyGame
    'object_reward': {
        'prey': {'prey': 0.1, 'pred': -0.1, 'cue': 0.0},
        'pred': {'prey': 1.0, 'pred': -1.0, 'cue': 0.0},
    },
    'hero_bounces_off_walls': False,
    'world_size': (500,300),   
    "maximum_velocity":      {'prey': 0, 'pred': 50},
    "object_radius": 10.0,
    "cue_types": 2,
    "num_objects": OrderedDict([('prey', 5), ('pred', 5), ('cue', 1)]),
    # active means that the objects are learning
    "num_objects_active": OrderedDict([('prey', 0), ('pred', 5)]), 
    #'multiple' to create each DQN for each prey/predator
    #'one' to use one DQN for all preys/predators
    # only really matters if the preys/predators are active
    # (the graph-building code in this cell reads network_pred only)
    "network_prey": 'one',
    "network_pred": 'multiple',
    "num_observation_lines" : 32,
    "observation_line_length": 75.,
    "tolerable_distance_to_wall": 50,
    "wall_distance_penalty":  -1.0,
    "delta_v": 50
}

#'new' to create new sim with values above
#'load' to load a previously trained graph
# (this cell only tests for 'load'; any other value skips the restore step)
RUN = 'load'  

# Name prefixes for the five artifacts written at the end of this cell:
# model checkpoint, replay buffers, trial times, rewards and collisions.
MODEL_NAME = 'model-5preds90-load-100weight'
REPLAY_NAME = 'replay-5preds90-load-100weight'
ELAPSE_NAME = 'elapse-5preds90-load-100weight'
REWARDS_NAME = 'rewards-5preds90-load-100weight'
# NOTE(review): 'collsions' is misspelled; left as-is because it is part of the output filename.
COLLISIONS_NAME = 'collsions-5preds90-load-100weight'

# Checkpoint and replay-buffer files to reload when RUN == 'load';
# both are resolved relative to <current working directory>/saved_graphs/.
MODEL_RE = '5preds-90w/model-5pred-2cues-500trials-90weight-180703.ckpt'
REPLAY_RE = '5preds-90w/replay-5pred-2cues-500trials-90weight-180703.pkl'

# create the game simulator
g = KarpathyGame(current_settings)

# Begin graph construction from an empty default graph.
tf.reset_default_graph()

# Parallel per-network collections: index k belongs to predator network k.
all_act, all_train, all_update, all_debug, all_replay = [], [], [], [], []


def _make_obs_ph(name):
    """Wrap U.BatchInput for observations of size g.observation_size."""
    return U.BatchInput((g.observation_size,), name=name)


# Build graphs
num_active_pred = current_settings['num_objects_active']['pred']
if num_active_pred != 0:
    # 'one' shares a single network; any other value gives each active
    # predator a network of its own.
    network_pred = 1 if current_settings['network_pred'] == 'one' else num_active_pred

    for i in range(network_pred):
        name = 'pred%d' % i
        with tf.variable_scope(name):
            brain_pred = deepq.models.mlp([200, 200])
            act, train, update_target, debug = deepq.build_train(
                make_obs_ph=_make_obs_ph,
                q_func=brain_pred,
                num_actions=g.num_actions,
                optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
            )
        replay_buffer = ReplayBuffer(50000)
        # Register this network's pieces in the parallel collections.
        for collection, piece in ((all_replay, replay_buffer),
                                  (all_act, act),
                                  (all_train, train),
                                  (all_update, update_target),
                                  (all_debug, debug)):
            collection.append(piece)

# Timing parameters forwarded to simulate() later in this cell.
FPS, ACTION_EVERY = 30, 3

# fast_mode selects the (WAIT, VISUALIZE_EVERY) pair passed to simulate().
fast_mode = False
WAIT, VISUALIZE_EVERY = (False, 100) if fast_mode else (True, 1)

# Containers handed to simulate() and pickled once the run is over.
elapsed, rewards, timesteps = [], [], [0]
    
# Initializing or reloading variables
# Start TensorFlow session with 2 CPUs
with U.make_session(2) as sess:

    # Initialize the parameters and copy them to the target networks.
    # Every network built in this cell registered its own sync callable in
    # all_update; the bare name `update_target` is rebound on each pass of the
    # build loop and therefore only refers to the network built last, so it
    # must not be used here. No variable scope is needed either: the callables
    # were already built and are merely executed.
    U.initialize()
    for sync_target in all_update:
        sync_target()

    if RUN == 'load':
#         # when only restoring a subset of variables
#         restore = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope = 'pred0')
#         saver = tf.train.Saver(restore)

        # reload models (replaces the freshly initialised values of the
        # variables stored in the checkpoint)
        saver = tf.train.Saver()
        current_dir = os.getcwd()
        model_name = current_dir + '/saved_graphs/' + MODEL_RE 
        saver.restore(sess, model_name)
        # reload replay buffers; the unpickled object replaces the empty
        # buffers created while building the graphs.
        # SECURITY: pickle.load can execute arbitrary code -- only point
        # REPLAY_RE at files written by this notebook.
        replay_name = current_dir + '/saved_graphs/' + REPLAY_RE 
        with open(replay_name, 'rb') as f:
            all_replay = pickle.load(f)

#         # remember to append buffer if restoring a subset of variables
#         all_replay.append(replay_buffer)

    # Run simulation
    # timesteps, elapsed and rewards are mutable containers; they are read
    # back below when saving (presumably filled in place by simulate -- confirm).
    try:
        simulate(simulation=g,
                 replay = all_replay,
                 act = all_act,
                 train = all_train,
                 update = all_update,
                 debug = all_debug,
                 fps=FPS,
                 visualize_every=VISUALIZE_EVERY,
                 action_every=ACTION_EVERY,
                 wait=WAIT,
                 disable_training=False,
                 simulation_resolution=.001,
                 save_path=None,
                 timesteps = timesteps,
                 elapsed = elapsed,
                 all_rewards = rewards,
                 percent = 1)

    except KeyboardInterrupt:
        # A manual interrupt ends the run early; the save steps below still
        # execute, so the progress made so far is written to disk.
        print("Interrupted")
        g.shut_down_graphics()
        print('graphics shut down')

    # Every output filename carries timesteps[0] as a suffix.
    # Save models    
    model_name = 'saved_graphs/' + MODEL_NAME + '-' + str(timesteps[0]) + '.ckpt'
    saver = tf.train.Saver()
    saver.save(sess, model_name)

    # Save replay buffers
    replay_name = 'saved_graphs/' + REPLAY_NAME + '-' + str(timesteps[0]) + '.pkl'
    with open(replay_name, "wb") as f:
        pickle.dump(all_replay, f)

    # Save trial times
    elapse_name = 'saved_graphs/' + ELAPSE_NAME + '-' + str(timesteps[0]) + '.pkl'
    with open(elapse_name, "wb") as f:
        pickle.dump(elapsed, f)

    # Save rewards
    rewards_name = 'saved_graphs/' + REWARDS_NAME + '-' + str(timesteps[0]) + '.pkl'
    with open(rewards_name, "wb") as f:
        pickle.dump(rewards, f)

    # Save collisions
    collisions_name = 'saved_graphs/' + COLLISIONS_NAME + '-' + str(timesteps[0]) + '.pkl'
    with open(collisions_name, "wb") as f:
        pickle.dump(g.collisions, f)

In [None]:
# reset
%reset -f
from __future__ import print_function

import pickle
import os
import numpy as np
import tempfile
import tensorflow as tf
import matplotlib.pyplot as plt

from tf_rl.controller import DiscreteDeepQ, ModelController
from tf_rl.simulation import KarpathyGame
from tf_rl import simulate
from tf_rl.models import MLP
from collections import OrderedDict
from euclid import Vector2

from baselines import deepq
import baselines.common.tf_util as U
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule
# Clear the default TensorFlow graph before this cell builds anything.
tf.reset_default_graph()


# Settings dictionary handed to KarpathyGame further down in this cell.
current_settings = {
    #earlier objects are eaten by later objects (pred eat prey)
    'objects': [
        'prey',
        'pred',
        'cue',
    ],
    # one 3-element list per object type -- presumably RGB draw colours, TODO confirm
    'colors': {
        'prey': [212, 211, 208],
        'pred':  [100, 37, 0],
        'cue': [0,0,0],
    },
    # NOTE(review): looks like reward[acting type][type it touches]; confirm in KarpathyGame
    'object_reward': {
        'prey': {'prey': 0.1, 'pred': -0.1, 'cue': 0.0},
        'pred': {'prey': 1.0, 'pred': -1.0, 'cue': 0.0},
    },
    'hero_bounces_off_walls': False,
    'world_size': (500,300),   
    "maximum_velocity":      {'prey': 0, 'pred': 50},
    "object_radius": 10.0,
    "cue_types": 2,
    "num_objects": OrderedDict([('prey', 5), ('pred', 5), ('cue', 1)]),
    # active means that the objects are learning
    "num_objects_active": OrderedDict([('prey', 0), ('pred', 5)]), 
    #'multiple' to create each DQN for each prey/predator
    #'one' to use one DQN for all preys/predators
    # only really matters if the preys/predators are active
    # (the graph-building code in this cell reads network_pred only)
    "network_prey": 'one',
    "network_pred": 'multiple',
    "num_observation_lines" : 32,
    "observation_line_length": 75.,
    "tolerable_distance_to_wall": 50,
    "wall_distance_penalty":  -1.0,
    "delta_v": 50
}

#'new' to create new sim with values above
#'load' to load a previously trained graph
# (this cell only tests for 'load'; any other value skips the restore step)
RUN = 'load'  

# Name prefixes for the five artifacts written at the end of this cell:
# model checkpoint, replay buffers, trial times, rewards and collisions.
MODEL_NAME = 'model-5preds100-load-100weight'
REPLAY_NAME = 'replay-5preds100-load-100weight'
ELAPSE_NAME = 'elapse-5preds100-load-100weight'
REWARDS_NAME = 'rewards-5preds100-load-100weight'
# NOTE(review): 'collsions' is misspelled; left as-is because it is part of the output filename.
COLLISIONS_NAME = 'collsions-5preds100-load-100weight'

# Checkpoint and replay-buffer files to reload when RUN == 'load';
# both are resolved relative to <current working directory>/saved_graphs/.
MODEL_RE = '5preds-free/model-5pred-2cues-500trials-189205.ckpt'
REPLAY_RE = '5preds-free/replay-5pred-2cues-500trials-189205.pkl'

# create the game simulator
g = KarpathyGame(current_settings)

# Begin graph construction from an empty default graph.
tf.reset_default_graph()

# Parallel per-network collections: index k belongs to predator network k.
all_act, all_train, all_update, all_debug, all_replay = [], [], [], [], []


def _make_obs_ph(name):
    """Wrap U.BatchInput for observations of size g.observation_size."""
    return U.BatchInput((g.observation_size,), name=name)


# Build graphs
num_active_pred = current_settings['num_objects_active']['pred']
if num_active_pred != 0:
    # 'one' shares a single network; any other value gives each active
    # predator a network of its own.
    network_pred = 1 if current_settings['network_pred'] == 'one' else num_active_pred

    for i in range(network_pred):
        name = 'pred%d' % i
        with tf.variable_scope(name):
            brain_pred = deepq.models.mlp([200, 200])
            act, train, update_target, debug = deepq.build_train(
                make_obs_ph=_make_obs_ph,
                q_func=brain_pred,
                num_actions=g.num_actions,
                optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
            )
        replay_buffer = ReplayBuffer(50000)
        # Register this network's pieces in the parallel collections.
        for collection, piece in ((all_replay, replay_buffer),
                                  (all_act, act),
                                  (all_train, train),
                                  (all_update, update_target),
                                  (all_debug, debug)):
            collection.append(piece)

# Timing parameters forwarded to simulate() later in this cell.
FPS, ACTION_EVERY = 30, 3

# fast_mode selects the (WAIT, VISUALIZE_EVERY) pair passed to simulate().
fast_mode = False
WAIT, VISUALIZE_EVERY = (False, 100) if fast_mode else (True, 1)

# Containers handed to simulate() and pickled once the run is over.
elapsed, rewards, timesteps = [], [], [0]
    
# Initializing or reloading variables
# Start TensorFlow session with 2 CPUs
with U.make_session(2) as sess:

    # Initialize the parameters and copy them to the target networks.
    # Every network built in this cell registered its own sync callable in
    # all_update; the bare name `update_target` is rebound on each pass of the
    # build loop and therefore only refers to the network built last, so it
    # must not be used here. No variable scope is needed either: the callables
    # were already built and are merely executed.
    U.initialize()
    for sync_target in all_update:
        sync_target()

    if RUN == 'load':
#         # when only restoring a subset of variables
#         restore = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope = 'pred0')
#         saver = tf.train.Saver(restore)

        # reload models (replaces the freshly initialised values of the
        # variables stored in the checkpoint)
        saver = tf.train.Saver()
        current_dir = os.getcwd()
        model_name = current_dir + '/saved_graphs/' + MODEL_RE 
        saver.restore(sess, model_name)
        # reload replay buffers; the unpickled object replaces the empty
        # buffers created while building the graphs.
        # SECURITY: pickle.load can execute arbitrary code -- only point
        # REPLAY_RE at files written by this notebook.
        replay_name = current_dir + '/saved_graphs/' + REPLAY_RE 
        with open(replay_name, 'rb') as f:
            all_replay = pickle.load(f)

#         # remember to append buffer if restoring a subset of variables
#         all_replay.append(replay_buffer)

    # Run simulation
    # timesteps, elapsed and rewards are mutable containers; they are read
    # back below when saving (presumably filled in place by simulate -- confirm).
    try:
        simulate(simulation=g,
                 replay = all_replay,
                 act = all_act,
                 train = all_train,
                 update = all_update,
                 debug = all_debug,
                 fps=FPS,
                 visualize_every=VISUALIZE_EVERY,
                 action_every=ACTION_EVERY,
                 wait=WAIT,
                 disable_training=False,
                 simulation_resolution=.001,
                 save_path=None,
                 timesteps = timesteps,
                 elapsed = elapsed,
                 all_rewards = rewards,
                 percent = 1)

    except KeyboardInterrupt:
        # A manual interrupt ends the run early; the save steps below still
        # execute, so the progress made so far is written to disk.
        print("Interrupted")
        g.shut_down_graphics()
        print('graphics shut down')

    # Every output filename carries timesteps[0] as a suffix.
    # Save models    
    model_name = 'saved_graphs/' + MODEL_NAME + '-' + str(timesteps[0]) + '.ckpt'
    saver = tf.train.Saver()
    saver.save(sess, model_name)

    # Save replay buffers
    replay_name = 'saved_graphs/' + REPLAY_NAME + '-' + str(timesteps[0]) + '.pkl'
    with open(replay_name, "wb") as f:
        pickle.dump(all_replay, f)

    # Save trial times
    elapse_name = 'saved_graphs/' + ELAPSE_NAME + '-' + str(timesteps[0]) + '.pkl'
    with open(elapse_name, "wb") as f:
        pickle.dump(elapsed, f)

    # Save rewards
    rewards_name = 'saved_graphs/' + REWARDS_NAME + '-' + str(timesteps[0]) + '.pkl'
    with open(rewards_name, "wb") as f:
        pickle.dump(rewards, f)

    # Save collisions
    collisions_name = 'saved_graphs/' + COLLISIONS_NAME + '-' + str(timesteps[0]) + '.pkl'
    with open(collisions_name, "wb") as f:
        pickle.dump(g.collisions, f)