In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from __future__ import print_function

import pickle
import os
import numpy as np
import tempfile
import tensorflow as tf
import matplotlib.pyplot as plt

from tf_rl.controller import DiscreteDeepQ, ModelController
from tf_rl.simulation import KarpathyGame
from tf_rl import simulate
from tf_rl.models import MLP
from collections import OrderedDict
from euclid import Vector2

from baselines import deepq
import baselines.common.tf_util as U
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule

In [3]:
# Simulation settings; this dict is what KarpathyGame(current_settings) receives.
current_settings = {
    # Object kinds. Per the original note, earlier kinds are eaten by later
    # ones (pred eats prey).
    'objects': ['prey', 'pred', 'cue'],
    # Three-component colour per object kind.
    'colors': {
        'prey': [212, 211, 208],
        'pred': [100, 37, 0],
        'cue': [0, 0, 0],
    },
    # Reward table; outer and inner keys are both object kinds.
    # NOTE(review): presumably outer = the learner, inner = what it touches;
    # confirm against KarpathyGame.
    'object_reward': {
        'prey': {'prey': 0.1, 'pred': -0.1, 'cue': 0.0},
        'pred': {'prey': 1.0, 'pred': -1.0, 'cue': 0.0},
    },
    'hero_bounces_off_walls': False,
    'world_size': (500, 300),
    'maximum_velocity': {'prey': 0, 'pred': 50},
    'object_radius': 10.0,
    'cue_types': 2,
    # Object counts per kind.
    'num_objects': OrderedDict([
        ('prey', 5),
        ('pred', 1),
        ('cue', 1),
    ]),
    # "Active" means that the objects are learning.
    'num_objects_active': OrderedDict([
        ('prey', 0),
        ('pred', 1),
    ]),
    # 'multiple' creates one DQN per prey/predator, 'one' uses a single DQN
    # for all preys/predators; only really matters for active kinds.
    'network_prey': 'one',
    'network_pred': 'multiple',
    'num_observation_lines': 32,
    'observation_line_length': 75.0,
    'tolerable_distance_to_wall': 50,
    'wall_distance_penalty': -1.0,
    'delta_v': 50,
}

# Run mode: 'new' creates a new simulation from the settings dict,
# 'load' restores a previously trained graph.
RUN = 'new'

# File-name stems used when saving this run; all four share one suffix.
_run_suffix = '-1pred-2cues-500trials-10'
MODEL_NAME = 'model' + _run_suffix
REPLAY_NAME = 'replay' + _run_suffix
ELAPSE_NAME = 'elapse' + _run_suffix
REWARDS_NAME = 'rewards' + _run_suffix

# File-name stems read back when RUN == 'load' (model checkpoint, replay buffers).
MODEL_RE, REPLAY_RE = 'model-1-191685', 'replay-1-191685'

# Instantiate the game simulator from the settings dict.
g = KarpathyGame(current_settings)

# Clear the default TensorFlow graph before any network is built.
tf.reset_default_graph()

# Parallel per-network containers: entry k of each list belongs to the same
# network (the values returned by deepq.build_train, plus its replay buffer).
all_act, all_train, all_update, all_debug, all_replay = [], [], [], [], []

# Build graphs
# For every network: build a DQN with deepq.build_train inside its own
# variable scope, create a 50000-entry replay buffer, and append the results
# to the parallel all_* lists. Prey networks are built first, then predators.
#
# Second tuple element is the base variable-scope name.
# CHANGE 'pred10' when starting a differently numbered run.
for kind, scope_base in (('prey', 'prey'), ('pred', 'pred10')):
    num_active = current_settings['num_objects_active'][kind]
    if num_active == 0:
        # Nothing of this kind is learning, so no network is needed.
        continue

    # 'one' shares a single network between all active objects of this kind;
    # any other value builds one network per active object.
    if current_settings['network_' + kind] == 'one':
        num_networks = 1
    else:
        num_networks = num_active

    for i in range(num_networks):
        if kind == 'prey':
            name = scope_base + str(i)
        elif num_networks == 1:
            # Single predator network: keep the bare scope name so variable
            # names in saved checkpoints stay exactly as before.
            name = scope_base
        else:
            # Several predator networks must not share one scope, otherwise
            # the second build collides with the first one's variables.
            name = scope_base + '_' + str(i)

        with tf.variable_scope(name):
            q_network = deepq.models.mlp([200, 200])
            act, train, update_target, debug = deepq.build_train(
                make_obs_ph=lambda name: U.BatchInput((g.observation_size,), name=name),
                q_func=q_network,
                num_actions=g.num_actions,
                optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
            )

        replay_buffer = ReplayBuffer(50000)
        all_replay.append(replay_buffer)
        all_act.append(act)
        all_train.append(train)
        all_update.append(update_target)
        all_debug.append(debug)

# Simulation parameters, later passed to simulate() as fps / action_every.
FPS, ACTION_EVERY = 30, 3

# fast_mode selects between (WAIT=False, VISUALIZE_EVERY=100) and
# (WAIT=True, VISUALIZE_EVERY=1).
fast_mode = False
WAIT, VISUALIZE_EVERY = (False, 100) if fast_mode else (True, 1)

# Containers handed to simulate(); timesteps is a one-element list whose
# single entry is later used in the names of the saved files.
elapsed, rewards = [], []
timesteps = [0]
    
# Initializing or reloading variables, running the simulation, saving results.
# All artefacts are read from / written to this directory (relative to cwd).
save_dir = 'saved_graphs'

# Start TensorFlow session with 2 CPUs
with U.make_session(2) as sess:

    # Initialize the parameters and copy them to the target networks.
    # Every network's own update function is called exactly once; looping
    # over the module-level `update_target` would only ever touch the network
    # that happened to be built last.
    U.initialize()
    for sync_target in all_update:
        sync_target()

    if RUN == 'load':
        # To restore only a subset of variables, build the saver from
        #   tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='pred0')
        # and remember to append a fresh replay buffer to all_replay afterwards.

        # reload models
        saver = tf.train.Saver()
        load_dir = os.path.join(os.getcwd(), save_dir)
        saver.restore(sess, os.path.join(load_dir, MODEL_RE + '.ckpt'))

        # reload replay buffers
        # NOTE(security): pickle.load can execute arbitrary code from the
        # file; only load replay buffers produced by your own runs.
        with open(os.path.join(load_dir, REPLAY_RE + '.pkl'), 'rb') as f:
            all_replay = pickle.load(f)

    # Run simulation
    try:
        simulate(simulation=g,
                 replay=all_replay,
                 act=all_act,
                 train=all_train,
                 update=all_update,
                 debug=all_debug,
                 fps=FPS,
                 visualize_every=VISUALIZE_EVERY,
                 action_every=ACTION_EVERY,
                 wait=WAIT,
                 disable_training=False,
                 simulation_resolution=.001,
                 save_path=None,
                 timesteps=timesteps,
                 elapsed=elapsed,
                 all_rewards=rewards)

    except KeyboardInterrupt:
        # An interrupted run is still saved below.
        print("Interrupted")
        g.shut_down_graphics()
        print('graphics shut down')

    # Create the output directory if needed so the finished run is not lost
    # to a missing-directory error at save time.
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    # Every file name ends with the value simulate() left in timesteps[0].
    step_suffix = '-' + str(timesteps[0])

    # Save models
    saver = tf.train.Saver()
    saver.save(sess, os.path.join(save_dir, MODEL_NAME + step_suffix + '.ckpt'))

    # Save replay buffers, trial times and rewards
    pickled_outputs = (
        (REPLAY_NAME, all_replay),
        (ELAPSE_NAME, elapsed),
        (REWARDS_NAME, rewards),
    )
    for stem, payload in pickled_outputs:
        with open(os.path.join(save_dir, stem + step_suffix + '.pkl'), "wb") as f:
            pickle.dump(payload, f)

%reset -f 
from __future__ import print_function

import pickle
import os
import numpy as np
import tempfile
import tensorflow as tf
import matplotlib.pyplot as plt

from tf_rl.controller import DiscreteDeepQ, ModelController
from tf_rl.simulation import KarpathyGame
from tf_rl import simulate
from tf_rl.models import MLP
from collections import OrderedDict
from euclid import Vector2

from baselines import deepq
import baselines.common.tf_util as U
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule
# Clear the default TensorFlow graph at the end of the run
# (presumably so the finished graph does not linger until the next cell).
tf.reset_default_graph()



[2017-07-24 14:23:27,150] VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead; VARIABLES will be removed after 2017-03-02.




[2017-07-24 14:23:27,297] VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead; VARIABLES will be removed after 2017-03-02.


0
0
0
0
0
1
1
2
4
6
7
9
10
10
13
17
22
25
25
27
30
32
37
37
39
42
45
45
49
57
60
63
64
66
70
75
83
93
97
104
115
121
128
137
152
166
178
192
206
218
234
247
280
303
329
350
377
402
427
460
493
done


In [4]:
# Simulation settings; this dict is what KarpathyGame(current_settings) receives.
current_settings = {
    # Object kinds. Per the original note, earlier kinds are eaten by later
    # ones (pred eats prey).
    'objects': ['prey', 'pred', 'cue'],
    # Three-component colour per object kind.
    'colors': {
        'prey': [212, 211, 208],
        'pred': [100, 37, 0],
        'cue': [0, 0, 0],
    },
    # Reward table; outer and inner keys are both object kinds.
    # NOTE(review): presumably outer = the learner, inner = what it touches;
    # confirm against KarpathyGame.
    'object_reward': {
        'prey': {'prey': 0.1, 'pred': -0.1, 'cue': 0.0},
        'pred': {'prey': 1.0, 'pred': -1.0, 'cue': 0.0},
    },
    'hero_bounces_off_walls': False,
    'world_size': (500, 300),
    'maximum_velocity': {'prey': 0, 'pred': 50},
    'object_radius': 10.0,
    'cue_types': 2,
    # Object counts per kind.
    'num_objects': OrderedDict([
        ('prey', 5),
        ('pred', 1),
        ('cue', 1),
    ]),
    # "Active" means that the objects are learning.
    'num_objects_active': OrderedDict([
        ('prey', 0),
        ('pred', 1),
    ]),
    # 'multiple' creates one DQN per prey/predator, 'one' uses a single DQN
    # for all preys/predators; only really matters for active kinds.
    'network_prey': 'one',
    'network_pred': 'multiple',
    'num_observation_lines': 32,
    'observation_line_length': 75.0,
    'tolerable_distance_to_wall': 50,
    'wall_distance_penalty': -1.0,
    'delta_v': 50,
}

# Run mode: 'new' creates a new simulation from the settings dict,
# 'load' restores a previously trained graph.
RUN = 'new'

# File-name stems used when saving this run; all four share one suffix.
_run_suffix = '-1pred-2cues-500trials-11'
MODEL_NAME = 'model' + _run_suffix
REPLAY_NAME = 'replay' + _run_suffix
ELAPSE_NAME = 'elapse' + _run_suffix
REWARDS_NAME = 'rewards' + _run_suffix

# File-name stems read back when RUN == 'load' (model checkpoint, replay buffers).
MODEL_RE, REPLAY_RE = 'model-1-191685', 'replay-1-191685'

# Instantiate the game simulator from the settings dict.
g = KarpathyGame(current_settings)

# Clear the default TensorFlow graph before any network is built.
tf.reset_default_graph()

# Parallel per-network containers: entry k of each list belongs to the same
# network (the values returned by deepq.build_train, plus its replay buffer).
all_act, all_train, all_update, all_debug, all_replay = [], [], [], [], []

# Build graphs
# For every network: build a DQN with deepq.build_train inside its own
# variable scope, create a 50000-entry replay buffer, and append the results
# to the parallel all_* lists. Prey networks are built first, then predators.
#
# Second tuple element is the base variable-scope name.
# CHANGE 'pred11' when starting a differently numbered run.
for kind, scope_base in (('prey', 'prey'), ('pred', 'pred11')):
    num_active = current_settings['num_objects_active'][kind]
    if num_active == 0:
        # Nothing of this kind is learning, so no network is needed.
        continue

    # 'one' shares a single network between all active objects of this kind;
    # any other value builds one network per active object.
    if current_settings['network_' + kind] == 'one':
        num_networks = 1
    else:
        num_networks = num_active

    for i in range(num_networks):
        if kind == 'prey':
            name = scope_base + str(i)
        elif num_networks == 1:
            # Single predator network: keep the bare scope name so variable
            # names in saved checkpoints stay exactly as before.
            name = scope_base
        else:
            # Several predator networks must not share one scope, otherwise
            # the second build collides with the first one's variables.
            name = scope_base + '_' + str(i)

        with tf.variable_scope(name):
            q_network = deepq.models.mlp([200, 200])
            act, train, update_target, debug = deepq.build_train(
                make_obs_ph=lambda name: U.BatchInput((g.observation_size,), name=name),
                q_func=q_network,
                num_actions=g.num_actions,
                optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
            )

        replay_buffer = ReplayBuffer(50000)
        all_replay.append(replay_buffer)
        all_act.append(act)
        all_train.append(train)
        all_update.append(update_target)
        all_debug.append(debug)

# Simulation parameters, later passed to simulate() as fps / action_every.
FPS, ACTION_EVERY = 30, 3

# fast_mode selects between (WAIT=False, VISUALIZE_EVERY=100) and
# (WAIT=True, VISUALIZE_EVERY=1).
fast_mode = False
WAIT, VISUALIZE_EVERY = (False, 100) if fast_mode else (True, 1)

# Containers handed to simulate(); timesteps is a one-element list whose
# single entry is later used in the names of the saved files.
elapsed, rewards = [], []
timesteps = [0]
    
# Initializing or reloading variables, running the simulation, saving results.
# All artefacts are read from / written to this directory (relative to cwd).
save_dir = 'saved_graphs'

# Start TensorFlow session with 2 CPUs
with U.make_session(2) as sess:

    # Initialize the parameters and copy them to the target networks.
    # Every network's own update function is called exactly once; looping
    # over the module-level `update_target` would only ever touch the network
    # that happened to be built last.
    U.initialize()
    for sync_target in all_update:
        sync_target()

    if RUN == 'load':
        # To restore only a subset of variables, build the saver from
        #   tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='pred0')
        # and remember to append a fresh replay buffer to all_replay afterwards.

        # reload models
        saver = tf.train.Saver()
        load_dir = os.path.join(os.getcwd(), save_dir)
        saver.restore(sess, os.path.join(load_dir, MODEL_RE + '.ckpt'))

        # reload replay buffers
        # NOTE(security): pickle.load can execute arbitrary code from the
        # file; only load replay buffers produced by your own runs.
        with open(os.path.join(load_dir, REPLAY_RE + '.pkl'), 'rb') as f:
            all_replay = pickle.load(f)

    # Run simulation
    try:
        simulate(simulation=g,
                 replay=all_replay,
                 act=all_act,
                 train=all_train,
                 update=all_update,
                 debug=all_debug,
                 fps=FPS,
                 visualize_every=VISUALIZE_EVERY,
                 action_every=ACTION_EVERY,
                 wait=WAIT,
                 disable_training=False,
                 simulation_resolution=.001,
                 save_path=None,
                 timesteps=timesteps,
                 elapsed=elapsed,
                 all_rewards=rewards)

    except KeyboardInterrupt:
        # An interrupted run is still saved below.
        print("Interrupted")
        g.shut_down_graphics()
        print('graphics shut down')

    # Create the output directory if needed so the finished run is not lost
    # to a missing-directory error at save time.
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    # Every file name ends with the value simulate() left in timesteps[0].
    step_suffix = '-' + str(timesteps[0])

    # Save models
    saver = tf.train.Saver()
    saver.save(sess, os.path.join(save_dir, MODEL_NAME + step_suffix + '.ckpt'))

    # Save replay buffers, trial times and rewards
    pickled_outputs = (
        (REPLAY_NAME, all_replay),
        (ELAPSE_NAME, elapsed),
        (REWARDS_NAME, rewards),
    )
    for stem, payload in pickled_outputs:
        with open(os.path.join(save_dir, stem + step_suffix + '.pkl'), "wb") as f:
            pickle.dump(payload, f)

%reset -f 
from __future__ import print_function

import pickle
import os
import numpy as np
import tempfile
import tensorflow as tf
import matplotlib.pyplot as plt

from tf_rl.controller import DiscreteDeepQ, ModelController
from tf_rl.simulation import KarpathyGame
from tf_rl import simulate
from tf_rl.models import MLP
from collections import OrderedDict
from euclid import Vector2

from baselines import deepq
import baselines.common.tf_util as U
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule
# Clear the default TensorFlow graph at the end of the run
# (presumably so the finished graph does not linger until the next cell).
tf.reset_default_graph()



[2017-07-24 14:35:41,122] VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead; VARIABLES will be removed after 2017-03-02.




[2017-07-24 14:35:41,204] VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead; VARIABLES will be removed after 2017-03-02.


0
1
2
4
5
6
6
7
7
9
12
13
16
16
19
23
26
29
38
44
52
68
80
93
103
114
132
146
157
168
185
214
244
261
278
288
313
348
365
382
409
444
468
done


In [5]:
# Simulation settings; this dict is what KarpathyGame(current_settings) receives.
current_settings = {
    # Object kinds. Per the original note, earlier kinds are eaten by later
    # ones (pred eats prey).
    'objects': ['prey', 'pred', 'cue'],
    # Three-component colour per object kind.
    'colors': {
        'prey': [212, 211, 208],
        'pred': [100, 37, 0],
        'cue': [0, 0, 0],
    },
    # Reward table; outer and inner keys are both object kinds.
    # NOTE(review): presumably outer = the learner, inner = what it touches;
    # confirm against KarpathyGame.
    'object_reward': {
        'prey': {'prey': 0.1, 'pred': -0.1, 'cue': 0.0},
        'pred': {'prey': 1.0, 'pred': -1.0, 'cue': 0.0},
    },
    'hero_bounces_off_walls': False,
    'world_size': (500, 300),
    'maximum_velocity': {'prey': 0, 'pred': 50},
    'object_radius': 10.0,
    'cue_types': 2,
    # Object counts per kind.
    'num_objects': OrderedDict([
        ('prey', 5),
        ('pred', 1),
        ('cue', 1),
    ]),
    # "Active" means that the objects are learning.
    'num_objects_active': OrderedDict([
        ('prey', 0),
        ('pred', 1),
    ]),
    # 'multiple' creates one DQN per prey/predator, 'one' uses a single DQN
    # for all preys/predators; only really matters for active kinds.
    'network_prey': 'one',
    'network_pred': 'multiple',
    'num_observation_lines': 32,
    'observation_line_length': 75.0,
    'tolerable_distance_to_wall': 50,
    'wall_distance_penalty': -1.0,
    'delta_v': 50,
}

# Run mode: 'new' creates a new simulation from the settings dict,
# 'load' restores a previously trained graph.
RUN = 'new'

# File-name stems used when saving this run; all four share one suffix.
_run_suffix = '-1pred-2cues-500trials-12'
MODEL_NAME = 'model' + _run_suffix
REPLAY_NAME = 'replay' + _run_suffix
ELAPSE_NAME = 'elapse' + _run_suffix
REWARDS_NAME = 'rewards' + _run_suffix

# File-name stems read back when RUN == 'load' (model checkpoint, replay buffers).
MODEL_RE, REPLAY_RE = 'model-1-191685', 'replay-1-191685'

# Instantiate the game simulator from the settings dict.
g = KarpathyGame(current_settings)

# Clear the default TensorFlow graph before any network is built.
tf.reset_default_graph()

# Parallel per-network containers: entry k of each list belongs to the same
# network (the values returned by deepq.build_train, plus its replay buffer).
all_act, all_train, all_update, all_debug, all_replay = [], [], [], [], []

# Build graphs
# For every network: build a DQN with deepq.build_train inside its own
# variable scope, create a 50000-entry replay buffer, and append the results
# to the parallel all_* lists. Prey networks are built first, then predators.
#
# Second tuple element is the base variable-scope name.
# CHANGE 'pred12' when starting a differently numbered run.
for kind, scope_base in (('prey', 'prey'), ('pred', 'pred12')):
    num_active = current_settings['num_objects_active'][kind]
    if num_active == 0:
        # Nothing of this kind is learning, so no network is needed.
        continue

    # 'one' shares a single network between all active objects of this kind;
    # any other value builds one network per active object.
    if current_settings['network_' + kind] == 'one':
        num_networks = 1
    else:
        num_networks = num_active

    for i in range(num_networks):
        if kind == 'prey':
            name = scope_base + str(i)
        elif num_networks == 1:
            # Single predator network: keep the bare scope name so variable
            # names in saved checkpoints stay exactly as before.
            name = scope_base
        else:
            # Several predator networks must not share one scope, otherwise
            # the second build collides with the first one's variables.
            name = scope_base + '_' + str(i)

        with tf.variable_scope(name):
            q_network = deepq.models.mlp([200, 200])
            act, train, update_target, debug = deepq.build_train(
                make_obs_ph=lambda name: U.BatchInput((g.observation_size,), name=name),
                q_func=q_network,
                num_actions=g.num_actions,
                optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
            )

        replay_buffer = ReplayBuffer(50000)
        all_replay.append(replay_buffer)
        all_act.append(act)
        all_train.append(train)
        all_update.append(update_target)
        all_debug.append(debug)

# Simulation parameters, later passed to simulate() as fps / action_every.
FPS, ACTION_EVERY = 30, 3

# fast_mode selects between (WAIT=False, VISUALIZE_EVERY=100) and
# (WAIT=True, VISUALIZE_EVERY=1).
fast_mode = False
WAIT, VISUALIZE_EVERY = (False, 100) if fast_mode else (True, 1)

# Containers handed to simulate(); timesteps is a one-element list whose
# single entry is later used in the names of the saved files.
elapsed, rewards = [], []
timesteps = [0]
    
# Initializing or reloading variables, running the simulation, saving results.
# All artefacts are read from / written to this directory (relative to cwd).
save_dir = 'saved_graphs'

# Start TensorFlow session with 2 CPUs
with U.make_session(2) as sess:

    # Initialize the parameters and copy them to the target networks.
    # Every network's own update function is called exactly once; looping
    # over the module-level `update_target` would only ever touch the network
    # that happened to be built last.
    U.initialize()
    for sync_target in all_update:
        sync_target()

    if RUN == 'load':
        # To restore only a subset of variables, build the saver from
        #   tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='pred0')
        # and remember to append a fresh replay buffer to all_replay afterwards.

        # reload models
        saver = tf.train.Saver()
        load_dir = os.path.join(os.getcwd(), save_dir)
        saver.restore(sess, os.path.join(load_dir, MODEL_RE + '.ckpt'))

        # reload replay buffers
        # NOTE(security): pickle.load can execute arbitrary code from the
        # file; only load replay buffers produced by your own runs.
        with open(os.path.join(load_dir, REPLAY_RE + '.pkl'), 'rb') as f:
            all_replay = pickle.load(f)

    # Run simulation
    try:
        simulate(simulation=g,
                 replay=all_replay,
                 act=all_act,
                 train=all_train,
                 update=all_update,
                 debug=all_debug,
                 fps=FPS,
                 visualize_every=VISUALIZE_EVERY,
                 action_every=ACTION_EVERY,
                 wait=WAIT,
                 disable_training=False,
                 simulation_resolution=.001,
                 save_path=None,
                 timesteps=timesteps,
                 elapsed=elapsed,
                 all_rewards=rewards)

    except KeyboardInterrupt:
        # An interrupted run is still saved below.
        print("Interrupted")
        g.shut_down_graphics()
        print('graphics shut down')

    # Create the output directory if needed so the finished run is not lost
    # to a missing-directory error at save time.
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    # Every file name ends with the value simulate() left in timesteps[0].
    step_suffix = '-' + str(timesteps[0])

    # Save models
    saver = tf.train.Saver()
    saver.save(sess, os.path.join(save_dir, MODEL_NAME + step_suffix + '.ckpt'))

    # Save replay buffers, trial times and rewards
    pickled_outputs = (
        (REPLAY_NAME, all_replay),
        (ELAPSE_NAME, elapsed),
        (REWARDS_NAME, rewards),
    )
    for stem, payload in pickled_outputs:
        with open(os.path.join(save_dir, stem + step_suffix + '.pkl'), "wb") as f:
            pickle.dump(payload, f)

%reset -f 
from __future__ import print_function

import pickle
import os
import numpy as np
import tempfile
import tensorflow as tf
import matplotlib.pyplot as plt

from tf_rl.controller import DiscreteDeepQ, ModelController
from tf_rl.simulation import KarpathyGame
from tf_rl import simulate
from tf_rl.models import MLP
from collections import OrderedDict
from euclid import Vector2

from baselines import deepq
import baselines.common.tf_util as U
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule
# Clear the default TensorFlow graph at the end of the run
# (presumably so the finished graph does not linger until the next cell).
tf.reset_default_graph()



[2017-07-24 14:45:15,914] VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead; VARIABLES will be removed after 2017-03-02.




[2017-07-24 14:45:15,980] VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead; VARIABLES will be removed after 2017-03-02.


0
0
0
0
2
2
3
3
3
3
3
7
9
10
10
12
14
15
17
18
19
23
23
24
26
27
29
33
33
33
39
40
42
47
49
52
58
63
70
73
73
76
81
83
87
95
101
105
110
114
119
121
125
127
131
134
138
154
162
175
186
200
221
244
266
268
288
308
324
344
376
417
457
492
done


In [6]:
# Simulation settings; this dict is what KarpathyGame(current_settings) receives.
current_settings = {
    # Object kinds. Per the original note, earlier kinds are eaten by later
    # ones (pred eats prey).
    'objects': ['prey', 'pred', 'cue'],
    # Three-component colour per object kind.
    'colors': {
        'prey': [212, 211, 208],
        'pred': [100, 37, 0],
        'cue': [0, 0, 0],
    },
    # Reward table; outer and inner keys are both object kinds.
    # NOTE(review): presumably outer = the learner, inner = what it touches;
    # confirm against KarpathyGame.
    'object_reward': {
        'prey': {'prey': 0.1, 'pred': -0.1, 'cue': 0.0},
        'pred': {'prey': 1.0, 'pred': -1.0, 'cue': 0.0},
    },
    'hero_bounces_off_walls': False,
    'world_size': (500, 300),
    'maximum_velocity': {'prey': 0, 'pred': 50},
    'object_radius': 10.0,
    'cue_types': 2,
    # Object counts per kind.
    'num_objects': OrderedDict([
        ('prey', 5),
        ('pred', 1),
        ('cue', 1),
    ]),
    # "Active" means that the objects are learning.
    'num_objects_active': OrderedDict([
        ('prey', 0),
        ('pred', 1),
    ]),
    # 'multiple' creates one DQN per prey/predator, 'one' uses a single DQN
    # for all preys/predators; only really matters for active kinds.
    'network_prey': 'one',
    'network_pred': 'multiple',
    'num_observation_lines': 32,
    'observation_line_length': 75.0,
    'tolerable_distance_to_wall': 50,
    'wall_distance_penalty': -1.0,
    'delta_v': 50,
}

# Run mode: 'new' creates a new simulation from the settings dict,
# 'load' restores a previously trained graph.
RUN = 'new'

# File-name stems used when saving this run; all four share one suffix.
_run_suffix = '-1pred-2cues-500trials-13'
MODEL_NAME = 'model' + _run_suffix
REPLAY_NAME = 'replay' + _run_suffix
ELAPSE_NAME = 'elapse' + _run_suffix
REWARDS_NAME = 'rewards' + _run_suffix

# File-name stems read back when RUN == 'load' (model checkpoint, replay buffers).
MODEL_RE, REPLAY_RE = 'model-1-191685', 'replay-1-191685'

# Instantiate the game simulator from the settings dict.
g = KarpathyGame(current_settings)

# Clear the default TensorFlow graph before any network is built.
tf.reset_default_graph()

# Parallel per-network containers: entry k of each list belongs to the same
# network (the values returned by deepq.build_train, plus its replay buffer).
all_act, all_train, all_update, all_debug, all_replay = [], [], [], [], []

# Build graphs
# For every network: build a DQN with deepq.build_train inside its own
# variable scope, create a 50000-entry replay buffer, and append the results
# to the parallel all_* lists. Prey networks are built first, then predators.
#
# Second tuple element is the base variable-scope name.
# CHANGE 'pred13' when starting a differently numbered run.
for kind, scope_base in (('prey', 'prey'), ('pred', 'pred13')):
    num_active = current_settings['num_objects_active'][kind]
    if num_active == 0:
        # Nothing of this kind is learning, so no network is needed.
        continue

    # 'one' shares a single network between all active objects of this kind;
    # any other value builds one network per active object.
    if current_settings['network_' + kind] == 'one':
        num_networks = 1
    else:
        num_networks = num_active

    for i in range(num_networks):
        if kind == 'prey':
            name = scope_base + str(i)
        elif num_networks == 1:
            # Single predator network: keep the bare scope name so variable
            # names in saved checkpoints stay exactly as before.
            name = scope_base
        else:
            # Several predator networks must not share one scope, otherwise
            # the second build collides with the first one's variables.
            name = scope_base + '_' + str(i)

        with tf.variable_scope(name):
            q_network = deepq.models.mlp([200, 200])
            act, train, update_target, debug = deepq.build_train(
                make_obs_ph=lambda name: U.BatchInput((g.observation_size,), name=name),
                q_func=q_network,
                num_actions=g.num_actions,
                optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
            )

        replay_buffer = ReplayBuffer(50000)
        all_replay.append(replay_buffer)
        all_act.append(act)
        all_train.append(train)
        all_update.append(update_target)
        all_debug.append(debug)

# Simulation parameters, later passed to simulate() as fps / action_every.
FPS, ACTION_EVERY = 30, 3

# fast_mode selects between (WAIT=False, VISUALIZE_EVERY=100) and
# (WAIT=True, VISUALIZE_EVERY=1).
fast_mode = False
WAIT, VISUALIZE_EVERY = (False, 100) if fast_mode else (True, 1)

# Containers handed to simulate(); timesteps is a one-element list whose
# single entry is later used in the names of the saved files.
elapsed, rewards = [], []
timesteps = [0]
    
# Initializing or reloading variables, running the simulation, saving results.
# All artefacts are read from / written to this directory (relative to cwd).
save_dir = 'saved_graphs'

# Start TensorFlow session with 2 CPUs
with U.make_session(2) as sess:

    # Initialize the parameters and copy them to the target networks.
    # Every network's own update function is called exactly once; looping
    # over the module-level `update_target` would only ever touch the network
    # that happened to be built last.
    U.initialize()
    for sync_target in all_update:
        sync_target()

    if RUN == 'load':
        # To restore only a subset of variables, build the saver from
        #   tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='pred0')
        # and remember to append a fresh replay buffer to all_replay afterwards.

        # reload models
        saver = tf.train.Saver()
        load_dir = os.path.join(os.getcwd(), save_dir)
        saver.restore(sess, os.path.join(load_dir, MODEL_RE + '.ckpt'))

        # reload replay buffers
        # NOTE(security): pickle.load can execute arbitrary code from the
        # file; only load replay buffers produced by your own runs.
        with open(os.path.join(load_dir, REPLAY_RE + '.pkl'), 'rb') as f:
            all_replay = pickle.load(f)

    # Run simulation
    try:
        simulate(simulation=g,
                 replay=all_replay,
                 act=all_act,
                 train=all_train,
                 update=all_update,
                 debug=all_debug,
                 fps=FPS,
                 visualize_every=VISUALIZE_EVERY,
                 action_every=ACTION_EVERY,
                 wait=WAIT,
                 disable_training=False,
                 simulation_resolution=.001,
                 save_path=None,
                 timesteps=timesteps,
                 elapsed=elapsed,
                 all_rewards=rewards)

    except KeyboardInterrupt:
        # An interrupted run is still saved below.
        print("Interrupted")
        g.shut_down_graphics()
        print('graphics shut down')

    # Create the output directory if needed so the finished run is not lost
    # to a missing-directory error at save time.
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    # Every file name ends with the value simulate() left in timesteps[0].
    step_suffix = '-' + str(timesteps[0])

    # Save models
    saver = tf.train.Saver()
    saver.save(sess, os.path.join(save_dir, MODEL_NAME + step_suffix + '.ckpt'))

    # Save replay buffers, trial times and rewards
    pickled_outputs = (
        (REPLAY_NAME, all_replay),
        (ELAPSE_NAME, elapsed),
        (REWARDS_NAME, rewards),
    )
    for stem, payload in pickled_outputs:
        with open(os.path.join(save_dir, stem + step_suffix + '.pkl'), "wb") as f:
            pickle.dump(payload, f)

%reset -f 
from __future__ import print_function

import pickle
import os
import numpy as np
import tempfile
import tensorflow as tf
import matplotlib.pyplot as plt

from tf_rl.controller import DiscreteDeepQ, ModelController
from tf_rl.simulation import KarpathyGame
from tf_rl import simulate
from tf_rl.models import MLP
from collections import OrderedDict
from euclid import Vector2

from baselines import deepq
import baselines.common.tf_util as U
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule
tf.reset_default_graph()



[2017-07-24 15:01:42,701] VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead; VARIABLES will be removed after 2017-03-02.




[2017-07-24 15:01:42,778] VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead; VARIABLES will be removed after 2017-03-02.


0
1
1
4
11
14
14
15
18
19
19
20
24
27
33
38
43
52
64
71
83
92
102
121
130
140
151
175
196
215
236
245
276
314
348
391
418
458
493
done


In [7]:
# Settings handed to KarpathyGame when the simulator is created.
current_settings = {
    # Order matters: a type is eaten by the types listed after it
    # (predators eat prey).
    "objects": ["prey", "pred", "cue"],
    "colors": {
        "prey": [212, 211, 208],
        "pred": [100, 37, 0],
        "cue": [0, 0, 0],
    },
    # Nested reward table: outer key 'prey'/'pred', inner key an object type.
    "object_reward": {
        "prey": {"prey": 0.1, "pred": -0.1, "cue": 0.0},
        "pred": {"prey": 1.0, "pred": -1.0, "cue": 0.0},
    },
    "hero_bounces_off_walls": False,
    "world_size": (500, 300),
    "maximum_velocity": {"prey": 0, "pred": 50},
    "object_radius": 10.0,
    "cue_types": 2,
    "num_objects": OrderedDict([("prey", 5), ("pred", 1), ("cue", 1)]),
    # "Active" objects are the ones that learn.
    "num_objects_active": OrderedDict([("prey", 0), ("pred", 1)]),
    # "one":      a single DQN shared by all preys/predators
    # "multiple": a separate DQN for each prey/predator
    # Only relevant for object types that have active members.
    "network_prey": "one",
    "network_pred": "multiple",
    "num_observation_lines": 32,
    "observation_line_length": 75.0,
    "tolerable_distance_to_wall": 50,
    "wall_distance_penalty": -1.0,
    "delta_v": 50,
}

# Run mode:
#   "new"  - create a new simulation from the settings above
#   "load" - restore a previously trained graph
RUN = "new"

# File-name stems under which the results of a new run are saved.
MODEL_NAME = "model-1pred-2cues-500trials-14"
REPLAY_NAME = "replay-1pred-2cues-500trials-14"
ELAPSE_NAME = "elapse-1pred-2cues-500trials-14"
REWARDS_NAME = "rewards-1pred-2cues-500trials-14"

# File-name stems of the model / replay buffers restored when RUN == "load".
MODEL_RE = "model-1-191685"
REPLAY_RE = "replay-1-191685"

# create the game simulator
g = KarpathyGame(current_settings)

# Start from an empty default graph before any network is built.
tf.reset_default_graph()

# Index-aligned per-network handles: entry k of every list belongs to the
# k-th network built below (prey networks first, then predator networks).
all_act = []
all_train = []
all_update = []
all_debug = []
all_replay = []


def _network_count(kind):
    """Return how many DQNs to build for `kind` ('prey' or 'pred').

    Nothing is built when no object of that kind is active; otherwise one
    shared network when the 'network_<kind>' setting is 'one', else one
    network per active object.
    """
    active = current_settings['num_objects_active'][kind]
    if active != 0 and current_settings['network_' + kind] == 'one':
        return 1
    return active


# CHANGE THIS for every run: base variable-scope name of the predator network(s)
PRED_SCOPE = 'pred14'

network_prey = _network_count('prey')
network_pred = _network_count('pred')

# Build graphs
scope_names = ['prey' + str(i) for i in range(network_prey)]
if network_pred == 1:
    # A single predator network keeps the bare name, so variable names in
    # saved checkpoints stay the same as before.
    scope_names.append(PRED_SCOPE)
else:
    # Several predator networks each need their own scope; re-entering one
    # shared scope would make the second build clash with the first one's
    # variables.
    scope_names.extend(PRED_SCOPE + '_' + str(i) for i in range(network_pred))

for name in scope_names:
    with tf.variable_scope(name):
        q_network = deepq.models.mlp([200, 200])
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: U.BatchInput((g.observation_size,), name=name),
            q_func=q_network,
            num_actions=g.num_actions,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        )
    # Every network gets its own replay buffer.
    replay_buffer = ReplayBuffer(50000)
    all_replay.append(replay_buffer)
    all_act.append(act)
    all_train.append(train)
    all_update.append(update_target)
    all_debug.append(debug)

# Pacing values that are passed to simulate() further down.
FPS, ACTION_EVERY = 30, 3

# Fast mode turns WAIT off and visualizes only every 100th step; otherwise
# WAIT is on and every step is visualized (exact meaning is up to simulate()).
fast_mode = False
WAIT = not fast_mode
VISUALIZE_EVERY = 100 if fast_mode else 1

# Containers handed to simulate(); they are pickled once the run is over.
elapsed, rewards, timesteps = [], [], [0]
    
# Initializing or reloading variables
SAVE_DIR = 'saved_graphs'

# Make sure the output directory exists before the (long) simulation starts,
# so the results are not lost to a missing-directory error when saving.
if not os.path.isdir(SAVE_DIR):
    os.makedirs(SAVE_DIR)

# Start TensorFlow session with 2 CPUs
with U.make_session(2) as sess:

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    # Call the update function of every network that was built. Going through
    # all_update (instead of re-calling only the last `update_target`) makes
    # sure each network is covered, also when several networks exist.
    for sync_target in all_update:
        sync_target()

    if RUN == 'load':
        # To restore only a subset of the variables, build the saver as
        #     restore = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope = 'pred0')
        #     saver = tf.train.Saver(restore)
        # and remember to append the fresh `replay_buffer` to all_replay
        # after loading.

        # reload models
        saver = tf.train.Saver()
        current_dir = os.getcwd()
        model_name = current_dir + '/saved_graphs/' + MODEL_RE + '.ckpt'
        saver.restore(sess, model_name)
        # reload replay buffers
        # NOTE: unpickling can execute arbitrary code; only load files that
        # were written by a trusted run of this notebook.
        replay_name = current_dir + '/saved_graphs/' + REPLAY_RE + '.pkl'
        with open(replay_name, 'rb') as f:
            all_replay = pickle.load(f)

    # Run simulation
    try:
        simulate(simulation=g,
                 replay=all_replay,
                 act=all_act,
                 train=all_train,
                 update=all_update,
                 debug=all_debug,
                 fps=FPS,
                 visualize_every=VISUALIZE_EVERY,
                 action_every=ACTION_EVERY,
                 wait=WAIT,
                 disable_training=False,
                 simulation_resolution=.001,
                 save_path=None,
                 timesteps=timesteps,
                 elapsed=elapsed,
                 all_rewards=rewards)

    except KeyboardInterrupt:
        # A manual interrupt still falls through to the save steps below.
        print("Interrupted")
        g.shut_down_graphics()
        print('graphics shut down')

    # Every output file is tagged with timesteps[0]
    # (presumably the step count reached by simulate() - TODO confirm).
    run_tag = '-' + str(timesteps[0])

    # Save models
    model_name = SAVE_DIR + '/' + MODEL_NAME + run_tag + '.ckpt'
    saver = tf.train.Saver()
    saver.save(sess, model_name)

    # Save replay buffers, trial times and rewards, one pickle file each
    for stem, payload in ((REPLAY_NAME, all_replay),
                          (ELAPSE_NAME, elapsed),
                          (REWARDS_NAME, rewards)):
        with open(SAVE_DIR + '/' + stem + run_tag + '.pkl', "wb") as f:
            pickle.dump(payload, f)

%reset -f 
from __future__ import print_function

import pickle
import os
import numpy as np
import tempfile
import tensorflow as tf
import matplotlib.pyplot as plt

from tf_rl.controller import DiscreteDeepQ, ModelController
from tf_rl.simulation import KarpathyGame
from tf_rl import simulate
from tf_rl.models import MLP
from collections import OrderedDict
from euclid import Vector2

from baselines import deepq
import baselines.common.tf_util as U
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule
tf.reset_default_graph()



[2017-07-24 15:09:50,243] VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead; VARIABLES will be removed after 2017-03-02.




[2017-07-24 15:09:50,310] VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead; VARIABLES will be removed after 2017-03-02.


0
1
2
2
4
4
5
6
9
10
11
13
18
21
22
26
26
34
39
44
46
51
61
81
84
97
110
116
124
132
155
170
187
199
220
238
250
258
283
307
340
365
391
435
473
486
done


In [8]:
# Settings handed to KarpathyGame when the simulator is created.
current_settings = {
    # Order matters: a type is eaten by the types listed after it
    # (predators eat prey).
    "objects": ["prey", "pred", "cue"],
    "colors": {
        "prey": [212, 211, 208],
        "pred": [100, 37, 0],
        "cue": [0, 0, 0],
    },
    # Nested reward table: outer key 'prey'/'pred', inner key an object type.
    "object_reward": {
        "prey": {"prey": 0.1, "pred": -0.1, "cue": 0.0},
        "pred": {"prey": 1.0, "pred": -1.0, "cue": 0.0},
    },
    "hero_bounces_off_walls": False,
    "world_size": (500, 300),
    "maximum_velocity": {"prey": 0, "pred": 50},
    "object_radius": 10.0,
    "cue_types": 2,
    "num_objects": OrderedDict([("prey", 5), ("pred", 1), ("cue", 1)]),
    # "Active" objects are the ones that learn.
    "num_objects_active": OrderedDict([("prey", 0), ("pred", 1)]),
    # "one":      a single DQN shared by all preys/predators
    # "multiple": a separate DQN for each prey/predator
    # Only relevant for object types that have active members.
    "network_prey": "one",
    "network_pred": "multiple",
    "num_observation_lines": 32,
    "observation_line_length": 75.0,
    "tolerable_distance_to_wall": 50,
    "wall_distance_penalty": -1.0,
    "delta_v": 50,
}

# Run mode:
#   "new"  - create a new simulation from the settings above
#   "load" - restore a previously trained graph
RUN = "new"

# File-name stems under which the results of a new run are saved.
MODEL_NAME = "model-1pred-2cues-500trials-15"
REPLAY_NAME = "replay-1pred-2cues-500trials-15"
ELAPSE_NAME = "elapse-1pred-2cues-500trials-15"
REWARDS_NAME = "rewards-1pred-2cues-500trials-15"

# File-name stems of the model / replay buffers restored when RUN == "load".
MODEL_RE = "model-1-191685"
REPLAY_RE = "replay-1-191685"

# create the game simulator
g = KarpathyGame(current_settings)

# Start from an empty default graph before any network is built.
tf.reset_default_graph()

# Index-aligned per-network handles: entry k of every list belongs to the
# k-th network built below (prey networks first, then predator networks).
all_act = []
all_train = []
all_update = []
all_debug = []
all_replay = []


def _network_count(kind):
    """Return how many DQNs to build for `kind` ('prey' or 'pred').

    Nothing is built when no object of that kind is active; otherwise one
    shared network when the 'network_<kind>' setting is 'one', else one
    network per active object.
    """
    active = current_settings['num_objects_active'][kind]
    if active != 0 and current_settings['network_' + kind] == 'one':
        return 1
    return active


# CHANGE THIS for every run: base variable-scope name of the predator network(s)
PRED_SCOPE = 'pred15'

network_prey = _network_count('prey')
network_pred = _network_count('pred')

# Build graphs
scope_names = ['prey' + str(i) for i in range(network_prey)]
if network_pred == 1:
    # A single predator network keeps the bare name, so variable names in
    # saved checkpoints stay the same as before.
    scope_names.append(PRED_SCOPE)
else:
    # Several predator networks each need their own scope; re-entering one
    # shared scope would make the second build clash with the first one's
    # variables.
    scope_names.extend(PRED_SCOPE + '_' + str(i) for i in range(network_pred))

for name in scope_names:
    with tf.variable_scope(name):
        q_network = deepq.models.mlp([200, 200])
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: U.BatchInput((g.observation_size,), name=name),
            q_func=q_network,
            num_actions=g.num_actions,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        )
    # Every network gets its own replay buffer.
    replay_buffer = ReplayBuffer(50000)
    all_replay.append(replay_buffer)
    all_act.append(act)
    all_train.append(train)
    all_update.append(update_target)
    all_debug.append(debug)

# Pacing values that are passed to simulate() further down.
FPS, ACTION_EVERY = 30, 3

# Fast mode turns WAIT off and visualizes only every 100th step; otherwise
# WAIT is on and every step is visualized (exact meaning is up to simulate()).
fast_mode = False
WAIT = not fast_mode
VISUALIZE_EVERY = 100 if fast_mode else 1

# Containers handed to simulate(); they are pickled once the run is over.
elapsed, rewards, timesteps = [], [], [0]
    
# Initializing or reloading variables
SAVE_DIR = 'saved_graphs'

# Make sure the output directory exists before the (long) simulation starts,
# so the results are not lost to a missing-directory error when saving.
if not os.path.isdir(SAVE_DIR):
    os.makedirs(SAVE_DIR)

# Start TensorFlow session with 2 CPUs
with U.make_session(2) as sess:

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    # Call the update function of every network that was built. Going through
    # all_update (instead of re-calling only the last `update_target`) makes
    # sure each network is covered, also when several networks exist.
    for sync_target in all_update:
        sync_target()

    if RUN == 'load':
        # To restore only a subset of the variables, build the saver as
        #     restore = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope = 'pred0')
        #     saver = tf.train.Saver(restore)
        # and remember to append the fresh `replay_buffer` to all_replay
        # after loading.

        # reload models
        saver = tf.train.Saver()
        current_dir = os.getcwd()
        model_name = current_dir + '/saved_graphs/' + MODEL_RE + '.ckpt'
        saver.restore(sess, model_name)
        # reload replay buffers
        # NOTE: unpickling can execute arbitrary code; only load files that
        # were written by a trusted run of this notebook.
        replay_name = current_dir + '/saved_graphs/' + REPLAY_RE + '.pkl'
        with open(replay_name, 'rb') as f:
            all_replay = pickle.load(f)

    # Run simulation
    try:
        simulate(simulation=g,
                 replay=all_replay,
                 act=all_act,
                 train=all_train,
                 update=all_update,
                 debug=all_debug,
                 fps=FPS,
                 visualize_every=VISUALIZE_EVERY,
                 action_every=ACTION_EVERY,
                 wait=WAIT,
                 disable_training=False,
                 simulation_resolution=.001,
                 save_path=None,
                 timesteps=timesteps,
                 elapsed=elapsed,
                 all_rewards=rewards)

    except KeyboardInterrupt:
        # A manual interrupt still falls through to the save steps below.
        print("Interrupted")
        g.shut_down_graphics()
        print('graphics shut down')

    # Every output file is tagged with timesteps[0]
    # (presumably the step count reached by simulate() - TODO confirm).
    run_tag = '-' + str(timesteps[0])

    # Save models
    model_name = SAVE_DIR + '/' + MODEL_NAME + run_tag + '.ckpt'
    saver = tf.train.Saver()
    saver.save(sess, model_name)

    # Save replay buffers, trial times and rewards, one pickle file each
    for stem, payload in ((REPLAY_NAME, all_replay),
                          (ELAPSE_NAME, elapsed),
                          (REWARDS_NAME, rewards)):
        with open(SAVE_DIR + '/' + stem + run_tag + '.pkl', "wb") as f:
            pickle.dump(payload, f)

%reset -f 
from __future__ import print_function

import pickle
import os
import numpy as np
import tempfile
import tensorflow as tf
import matplotlib.pyplot as plt

from tf_rl.controller import DiscreteDeepQ, ModelController
from tf_rl.simulation import KarpathyGame
from tf_rl import simulate
from tf_rl.models import MLP
from collections import OrderedDict
from euclid import Vector2

from baselines import deepq
import baselines.common.tf_util as U
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule
tf.reset_default_graph()



[2017-07-24 15:18:56,851] VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead; VARIABLES will be removed after 2017-03-02.




[2017-07-24 15:18:56,917] VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead; VARIABLES will be removed after 2017-03-02.


0
1
2
2
5
5
7
7
9
11
13
13
13
17
18
19
21
23
26
28
28
30
33
38
42
42
44
49
51
51
51
56
56
57
59
60
62
63
66
68
70
76
80
84
86
93
99
104
113
121
127
136
140
146
154
170
173
173
181
186
189
189
189
189
189
189
189
192
192
194
211
239
268
301
317
322
335
335
335
337
344
344
344
344
344
344
344
344
344
344
344
344
344
346
346
346
346
348
359
364
381
394
412
425
434
435
455
474
480
486
498
done


In [9]:
# Settings handed to KarpathyGame when the simulator is created.
current_settings = {
    # Order matters: a type is eaten by the types listed after it
    # (predators eat prey).
    "objects": ["prey", "pred", "cue"],
    "colors": {
        "prey": [212, 211, 208],
        "pred": [100, 37, 0],
        "cue": [0, 0, 0],
    },
    # Nested reward table: outer key 'prey'/'pred', inner key an object type.
    "object_reward": {
        "prey": {"prey": 0.1, "pred": -0.1, "cue": 0.0},
        "pred": {"prey": 1.0, "pred": -1.0, "cue": 0.0},
    },
    "hero_bounces_off_walls": False,
    "world_size": (500, 300),
    "maximum_velocity": {"prey": 0, "pred": 50},
    "object_radius": 10.0,
    "cue_types": 2,
    "num_objects": OrderedDict([("prey", 5), ("pred", 1), ("cue", 1)]),
    # "Active" objects are the ones that learn.
    "num_objects_active": OrderedDict([("prey", 0), ("pred", 1)]),
    # "one":      a single DQN shared by all preys/predators
    # "multiple": a separate DQN for each prey/predator
    # Only relevant for object types that have active members.
    "network_prey": "one",
    "network_pred": "multiple",
    "num_observation_lines": 32,
    "observation_line_length": 75.0,
    "tolerable_distance_to_wall": 50,
    "wall_distance_penalty": -1.0,
    "delta_v": 50,
}

# Run mode:
#   "new"  - create a new simulation from the settings above
#   "load" - restore a previously trained graph
RUN = "new"

# File-name stems under which the results of a new run are saved.
MODEL_NAME = "model-1pred-2cues-500trials-16"
REPLAY_NAME = "replay-1pred-2cues-500trials-16"
ELAPSE_NAME = "elapse-1pred-2cues-500trials-16"
REWARDS_NAME = "rewards-1pred-2cues-500trials-16"

# File-name stems of the model / replay buffers restored when RUN == "load".
MODEL_RE = "model-1-191685"
REPLAY_RE = "replay-1-191685"

# create the game simulator
g = KarpathyGame(current_settings)

# Start from an empty default graph before any network is built.
tf.reset_default_graph()

# Index-aligned per-network handles: entry k of every list belongs to the
# k-th network built below (prey networks first, then predator networks).
all_act = []
all_train = []
all_update = []
all_debug = []
all_replay = []


def _network_count(kind):
    """Return how many DQNs to build for `kind` ('prey' or 'pred').

    Nothing is built when no object of that kind is active; otherwise one
    shared network when the 'network_<kind>' setting is 'one', else one
    network per active object.
    """
    active = current_settings['num_objects_active'][kind]
    if active != 0 and current_settings['network_' + kind] == 'one':
        return 1
    return active


# CHANGE THIS for every run: base variable-scope name of the predator network(s)
PRED_SCOPE = 'pred16'

network_prey = _network_count('prey')
network_pred = _network_count('pred')

# Build graphs
scope_names = ['prey' + str(i) for i in range(network_prey)]
if network_pred == 1:
    # A single predator network keeps the bare name, so variable names in
    # saved checkpoints stay the same as before.
    scope_names.append(PRED_SCOPE)
else:
    # Several predator networks each need their own scope; re-entering one
    # shared scope would make the second build clash with the first one's
    # variables.
    scope_names.extend(PRED_SCOPE + '_' + str(i) for i in range(network_pred))

for name in scope_names:
    with tf.variable_scope(name):
        q_network = deepq.models.mlp([200, 200])
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: U.BatchInput((g.observation_size,), name=name),
            q_func=q_network,
            num_actions=g.num_actions,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        )
    # Every network gets its own replay buffer.
    replay_buffer = ReplayBuffer(50000)
    all_replay.append(replay_buffer)
    all_act.append(act)
    all_train.append(train)
    all_update.append(update_target)
    all_debug.append(debug)

# Pacing values that are passed to simulate() further down.
FPS, ACTION_EVERY = 30, 3

# Fast mode turns WAIT off and visualizes only every 100th step; otherwise
# WAIT is on and every step is visualized (exact meaning is up to simulate()).
fast_mode = False
WAIT = not fast_mode
VISUALIZE_EVERY = 100 if fast_mode else 1

# Containers handed to simulate(); they are pickled once the run is over.
elapsed, rewards, timesteps = [], [], [0]
    
# Initializing or reloading variables
SAVE_DIR = 'saved_graphs'

# Make sure the output directory exists before the (long) simulation starts,
# so the results are not lost to a missing-directory error when saving.
if not os.path.isdir(SAVE_DIR):
    os.makedirs(SAVE_DIR)

# Start TensorFlow session with 2 CPUs
with U.make_session(2) as sess:

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    # Call the update function of every network that was built. Going through
    # all_update (instead of re-calling only the last `update_target`) makes
    # sure each network is covered, also when several networks exist.
    for sync_target in all_update:
        sync_target()

    if RUN == 'load':
        # To restore only a subset of the variables, build the saver as
        #     restore = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope = 'pred0')
        #     saver = tf.train.Saver(restore)
        # and remember to append the fresh `replay_buffer` to all_replay
        # after loading.

        # reload models
        saver = tf.train.Saver()
        current_dir = os.getcwd()
        model_name = current_dir + '/saved_graphs/' + MODEL_RE + '.ckpt'
        saver.restore(sess, model_name)
        # reload replay buffers
        # NOTE: unpickling can execute arbitrary code; only load files that
        # were written by a trusted run of this notebook.
        replay_name = current_dir + '/saved_graphs/' + REPLAY_RE + '.pkl'
        with open(replay_name, 'rb') as f:
            all_replay = pickle.load(f)

    # Run simulation
    try:
        simulate(simulation=g,
                 replay=all_replay,
                 act=all_act,
                 train=all_train,
                 update=all_update,
                 debug=all_debug,
                 fps=FPS,
                 visualize_every=VISUALIZE_EVERY,
                 action_every=ACTION_EVERY,
                 wait=WAIT,
                 disable_training=False,
                 simulation_resolution=.001,
                 save_path=None,
                 timesteps=timesteps,
                 elapsed=elapsed,
                 all_rewards=rewards)

    except KeyboardInterrupt:
        # A manual interrupt still falls through to the save steps below.
        print("Interrupted")
        g.shut_down_graphics()
        print('graphics shut down')

    # Every output file is tagged with timesteps[0]
    # (presumably the step count reached by simulate() - TODO confirm).
    run_tag = '-' + str(timesteps[0])

    # Save models
    model_name = SAVE_DIR + '/' + MODEL_NAME + run_tag + '.ckpt'
    saver = tf.train.Saver()
    saver.save(sess, model_name)

    # Save replay buffers, trial times and rewards, one pickle file each
    for stem, payload in ((REPLAY_NAME, all_replay),
                          (ELAPSE_NAME, elapsed),
                          (REWARDS_NAME, rewards)):
        with open(SAVE_DIR + '/' + stem + run_tag + '.pkl', "wb") as f:
            pickle.dump(payload, f)

%reset -f 
from __future__ import print_function

import pickle
import os
import numpy as np
import tempfile
import tensorflow as tf
import matplotlib.pyplot as plt

from tf_rl.controller import DiscreteDeepQ, ModelController
from tf_rl.simulation import KarpathyGame
from tf_rl import simulate
from tf_rl.models import MLP
from collections import OrderedDict
from euclid import Vector2

from baselines import deepq
import baselines.common.tf_util as U
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule
tf.reset_default_graph()



[2017-07-24 15:42:14,167] VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead; VARIABLES will be removed after 2017-03-02.




[2017-07-24 15:42:14,249] VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead; VARIABLES will be removed after 2017-03-02.


0
0
0
2
3
5
6
7
10
12
14
14
15
16
19
19
21
22
23
23
23
23
23
23
24
24
24
25
27
27
28
29
29
32
33
35
35
36
37
37
40
40
40
40
41
41
44
48
54
64
71
77
78
85
91
96
112
138
150
177
194
217
246
269
269
304
318
320
329
329
343
378
388
401
417
433
476
done


In [10]:
# Settings handed to KarpathyGame when the simulator is created.
current_settings = {
    # Order matters: a type is eaten by the types listed after it
    # (predators eat prey).
    "objects": ["prey", "pred", "cue"],
    "colors": {
        "prey": [212, 211, 208],
        "pred": [100, 37, 0],
        "cue": [0, 0, 0],
    },
    # Nested reward table: outer key 'prey'/'pred', inner key an object type.
    "object_reward": {
        "prey": {"prey": 0.1, "pred": -0.1, "cue": 0.0},
        "pred": {"prey": 1.0, "pred": -1.0, "cue": 0.0},
    },
    "hero_bounces_off_walls": False,
    "world_size": (500, 300),
    "maximum_velocity": {"prey": 0, "pred": 50},
    "object_radius": 10.0,
    "cue_types": 2,
    "num_objects": OrderedDict([("prey", 5), ("pred", 1), ("cue", 1)]),
    # "Active" objects are the ones that learn.
    "num_objects_active": OrderedDict([("prey", 0), ("pred", 1)]),
    # "one":      a single DQN shared by all preys/predators
    # "multiple": a separate DQN for each prey/predator
    # Only relevant for object types that have active members.
    "network_prey": "one",
    "network_pred": "multiple",
    "num_observation_lines": 32,
    "observation_line_length": 75.0,
    "tolerable_distance_to_wall": 50,
    "wall_distance_penalty": -1.0,
    "delta_v": 50,
}

# Run mode:
#   "new"  - create a new simulation from the settings above
#   "load" - restore a previously trained graph
RUN = "new"

# File-name stems under which the results of a new run are saved.
MODEL_NAME = "model-1pred-2cues-500trials-17"
REPLAY_NAME = "replay-1pred-2cues-500trials-17"
ELAPSE_NAME = "elapse-1pred-2cues-500trials-17"
REWARDS_NAME = "rewards-1pred-2cues-500trials-17"

# File-name stems of the model / replay buffers restored when RUN == "load".
MODEL_RE = "model-1-191685"
REPLAY_RE = "replay-1-191685"

# create the game simulator
g = KarpathyGame(current_settings)

# Start from an empty default graph before any network is built.
tf.reset_default_graph()

# Index-aligned per-network handles: entry k of every list belongs to the
# k-th network built below (prey networks first, then predator networks).
all_act = []
all_train = []
all_update = []
all_debug = []
all_replay = []


def _network_count(kind):
    """Return how many DQNs to build for `kind` ('prey' or 'pred').

    Nothing is built when no object of that kind is active; otherwise one
    shared network when the 'network_<kind>' setting is 'one', else one
    network per active object.
    """
    active = current_settings['num_objects_active'][kind]
    if active != 0 and current_settings['network_' + kind] == 'one':
        return 1
    return active


# CHANGE THIS for every run: base variable-scope name of the predator network(s)
PRED_SCOPE = 'pred17'

network_prey = _network_count('prey')
network_pred = _network_count('pred')

# Build graphs
scope_names = ['prey' + str(i) for i in range(network_prey)]
if network_pred == 1:
    # A single predator network keeps the bare name, so variable names in
    # saved checkpoints stay the same as before.
    scope_names.append(PRED_SCOPE)
else:
    # Several predator networks each need their own scope; re-entering one
    # shared scope would make the second build clash with the first one's
    # variables.
    scope_names.extend(PRED_SCOPE + '_' + str(i) for i in range(network_pred))

for name in scope_names:
    with tf.variable_scope(name):
        q_network = deepq.models.mlp([200, 200])
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: U.BatchInput((g.observation_size,), name=name),
            q_func=q_network,
            num_actions=g.num_actions,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        )
    # Every network gets its own replay buffer.
    replay_buffer = ReplayBuffer(50000)
    all_replay.append(replay_buffer)
    all_act.append(act)
    all_train.append(train)
    all_update.append(update_target)
    all_debug.append(debug)

# Pacing values that are passed to simulate() further down.
FPS, ACTION_EVERY = 30, 3

# Fast mode turns WAIT off and visualizes only every 100th step; otherwise
# WAIT is on and every step is visualized (exact meaning is up to simulate()).
fast_mode = False
WAIT = not fast_mode
VISUALIZE_EVERY = 100 if fast_mode else 1

# Containers handed to simulate(); they are pickled once the run is over.
elapsed, rewards, timesteps = [], [], [0]
    
# Initializing or reloading variables
SAVE_DIR = 'saved_graphs'

# Make sure the output directory exists before the (long) simulation starts,
# so the results are not lost to a missing-directory error when saving.
if not os.path.isdir(SAVE_DIR):
    os.makedirs(SAVE_DIR)

# Start TensorFlow session with 2 CPUs
with U.make_session(2) as sess:

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    # Call the update function of every network that was built. Going through
    # all_update (instead of re-calling only the last `update_target`) makes
    # sure each network is covered, also when several networks exist.
    for sync_target in all_update:
        sync_target()

    if RUN == 'load':
        # To restore only a subset of the variables, build the saver as
        #     restore = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope = 'pred0')
        #     saver = tf.train.Saver(restore)
        # and remember to append the fresh `replay_buffer` to all_replay
        # after loading.

        # reload models
        saver = tf.train.Saver()
        current_dir = os.getcwd()
        model_name = current_dir + '/saved_graphs/' + MODEL_RE + '.ckpt'
        saver.restore(sess, model_name)
        # reload replay buffers
        # NOTE: unpickling can execute arbitrary code; only load files that
        # were written by a trusted run of this notebook.
        replay_name = current_dir + '/saved_graphs/' + REPLAY_RE + '.pkl'
        with open(replay_name, 'rb') as f:
            all_replay = pickle.load(f)

    # Run simulation
    try:
        simulate(simulation=g,
                 replay=all_replay,
                 act=all_act,
                 train=all_train,
                 update=all_update,
                 debug=all_debug,
                 fps=FPS,
                 visualize_every=VISUALIZE_EVERY,
                 action_every=ACTION_EVERY,
                 wait=WAIT,
                 disable_training=False,
                 simulation_resolution=.001,
                 save_path=None,
                 timesteps=timesteps,
                 elapsed=elapsed,
                 all_rewards=rewards)

    except KeyboardInterrupt:
        # A manual interrupt still falls through to the save steps below.
        print("Interrupted")
        g.shut_down_graphics()
        print('graphics shut down')

    # Every output file is tagged with timesteps[0]
    # (presumably the step count reached by simulate() - TODO confirm).
    run_tag = '-' + str(timesteps[0])

    # Save models
    model_name = SAVE_DIR + '/' + MODEL_NAME + run_tag + '.ckpt'
    saver = tf.train.Saver()
    saver.save(sess, model_name)

    # Save replay buffers, trial times and rewards, one pickle file each
    for stem, payload in ((REPLAY_NAME, all_replay),
                          (ELAPSE_NAME, elapsed),
                          (REWARDS_NAME, rewards)):
        with open(SAVE_DIR + '/' + stem + run_tag + '.pkl', "wb") as f:
            pickle.dump(payload, f)

%reset -f 
from __future__ import print_function

import pickle
import os
import numpy as np
import tempfile
import tensorflow as tf
import matplotlib.pyplot as plt

from tf_rl.controller import DiscreteDeepQ, ModelController
from tf_rl.simulation import KarpathyGame
from tf_rl import simulate
from tf_rl.models import MLP
from collections import OrderedDict
from euclid import Vector2

from baselines import deepq
import baselines.common.tf_util as U
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule
tf.reset_default_graph()



[2017-07-24 15:57:59,126] VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead; VARIABLES will be removed after 2017-03-02.




[2017-07-24 15:57:59,196] VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead; VARIABLES will be removed after 2017-03-02.


0
1
1
3
3
6
6
6
7
7
11
12
12
12
15
17
17
17
17
19
24
27
30
35
39
49
56
60
68
72
78
87
97
112
116
126
131
144
165
178
190
201
218
240
266
277
289
316
339
359
387
427
442
464
done


In [11]:
# Settings dictionary handed to KarpathyGame(current_settings) later in this cell.
current_settings = {
    # earlier objects are eaten by later objects (pred eat prey)
    'objects': [
        'prey',
        'pred',
        'cue',
    ],
    'colors': {
        'prey': [212, 211, 208],
        'pred': [100, 37, 0],
        'cue': [0, 0, 0],
    },
    'object_reward': {
        'prey': {'prey': 0.1, 'pred': -0.1, 'cue': 0.0},
        'pred': {'prey': 1.0, 'pred': -1.0, 'cue': 0.0},
    },
    'hero_bounces_off_walls': False,
    'world_size': (500, 300),
    'maximum_velocity': {'prey': 0, 'pred': 50},
    'object_radius': 10.0,
    'cue_types': 2,
    'num_objects': OrderedDict([('prey', 5), ('pred', 1), ('cue', 1)]),
    # active means that the objects are learning
    'num_objects_active': OrderedDict([('prey', 0), ('pred', 1)]),
    # 'multiple' to create each DQN for each prey/predator
    # 'one' to use one DQN for all preys/predators
    # only really matters if the preys/predators are active
    'network_prey': 'one',
    'network_pred': 'multiple',
    'num_observation_lines': 32,
    'observation_line_length': 75.,
    'tolerable_distance_to_wall': 50,
    'wall_distance_penalty': -1.0,
    'delta_v': 50,
}

# 'new' to create new sim with values above
# 'load' to load a previously trained graph
RUN = 'new'

# Single source of truth for this run's tag. The four output names below are
# derived from it so that starting a new run means editing exactly one string
# (a global search/replace of the run number can silently rewrite unrelated
# digits, e.g. inside the reload names).
RUN_TAG = '1pred-2cues-500trials-18'

# Base names of the four artifacts written at the end of the run
# (checkpoint, replay buffers, trial times, rewards).
MODEL_NAME = 'model-' + RUN_TAG
REPLAY_NAME = 'replay-' + RUN_TAG
ELAPSE_NAME = 'elapse-' + RUN_TAG
REWARDS_NAME = 'rewards-' + RUN_TAG

# Names of the checkpoint / replay buffers to read back; only used when
# RUN == 'load'.
MODEL_RE = 'model-1-191685'
REPLAY_RE = 'replay-1-191685'

# create the game simulator
g = KarpathyGame(current_settings)

# Start from an empty default graph before building this run's networks.
tf.reset_default_graph()

# Parallel lists: entry k of each list belongs to the k-th network built
# below (prey networks first, then pred networks).
all_act = []
all_train = []
all_update = []
all_debug = []
all_replay = []

# Build graphs
# (species, variable-scope prefix) in build order. The pred prefix carries the
# run number: CHANGE THIS when starting a new run.
for species, scope_prefix in (('prey', 'prey'), ('pred', 'pred18')):
    num_active = current_settings['num_objects_active'][species]
    if num_active == 0:
        continue

    # 'one' -> a single DQN for the whole species, otherwise one DQN per
    # active object.
    if current_settings['network_' + species] == 'one':
        num_networks = 1
    else:
        num_networks = num_active

    for i in range(num_networks):
        # Each network needs its own variable scope. Previously every pred
        # network was built under the same scope name, so a second pred
        # network would try to create variables whose names already exist.
        # The first pred network keeps the bare prefix so its variable names
        # are unchanged.
        if species == 'prey':
            name = scope_prefix + str(i)
        elif i == 0:
            name = scope_prefix
        else:
            name = scope_prefix + '_' + str(i)

        with tf.variable_scope(name):
            q_network = deepq.models.mlp([200, 200])
            act, train, update_target, debug = deepq.build_train(
                make_obs_ph=lambda name: U.BatchInput((g.observation_size,), name=name),
                q_func=q_network,
                num_actions=g.num_actions,
                optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
            )
        replay_buffer = ReplayBuffer(50000)
        all_replay.append(replay_buffer)
        all_act.append(act)
        all_train.append(train)
        all_update.append(update_target)
        all_debug.append(debug)

# Simulate
FPS          = 30
ACTION_EVERY = 3

# fast_mode = True passes wait=False and visualize_every=100 to simulate();
# otherwise wait=True and visualize_every=1.
fast_mode = False
if fast_mode:
    WAIT, VISUALIZE_EVERY = False, 100
else:
    WAIT, VISUALIZE_EVERY = True, 1

# Containers handed to simulate(). They are read after it returns
# (timesteps[0] goes into the file names, the other two are pickled), so
# simulate() presumably fills them in place -- confirm against tf_rl.simulate.
elapsed = []
rewards = []
timesteps = [0]

# Directory, relative to the working directory, that holds checkpoints and pickles.
SAVE_DIR = 'saved_graphs'

# Initializing or reloading variables
# Start TensorFlow session with 2 CPUs
with U.make_session(2) as sess:

    # Initialize the parameters and copy them to the target networks.
    # all_update holds one update_target callable per built network; call each
    # of them once. (The previous loop called only the most recently built
    # update_target, once per active object, leaving any other network's
    # target unsynchronised.)
    U.initialize()
    for sync_target in all_update:
        sync_target()

    if RUN == 'load':
#         # when only restoring a subset of variables
#         restore = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope = 'pred0')
#         saver = tf.train.Saver(restore)

        # reload models
        saver = tf.train.Saver()
        current_dir = os.getcwd()
        model_name = current_dir + '/' + SAVE_DIR + '/' + MODEL_RE + '.ckpt'
        saver.restore(sess, model_name)
        # reload replay buffers
        # NOTE: unpickling can execute arbitrary code; only load files that
        # were written by this notebook.
        replay_name = current_dir + '/' + SAVE_DIR + '/' + REPLAY_RE + '.pkl'
        with open(replay_name, 'rb') as f:
            all_replay = pickle.load(f)

#         # remember to append buffer if restoring a subset of variables
#         all_replay.append(replay_buffer)

    # Run simulation
    try:
        simulate(simulation=g,
                 replay=all_replay,
                 act=all_act,
                 train=all_train,
                 update=all_update,
                 debug=all_debug,
                 fps=FPS,
                 visualize_every=VISUALIZE_EVERY,
                 action_every=ACTION_EVERY,
                 wait=WAIT,
                 disable_training=False,
                 simulation_resolution=.001,
                 save_path=None,
                 timesteps=timesteps,
                 elapsed=elapsed,
                 all_rewards=rewards)

    except KeyboardInterrupt:
        # An interrupted run still falls through to the save code below.
        print("Interrupted")
        g.shut_down_graphics()
        print('graphics shut down')

    # Make sure the output directory exists before anything is written to it.
    if not os.path.isdir(SAVE_DIR):
        os.makedirs(SAVE_DIR)

    # Every artifact name ends with the value of timesteps[0] after the run.
    step_suffix = '-' + str(timesteps[0])

    # Save models
    model_name = SAVE_DIR + '/' + MODEL_NAME + step_suffix + '.ckpt'
    saver = tf.train.Saver()
    saver.save(sess, model_name)

    # Save replay buffers, trial times and rewards
    for base_name, payload in ((REPLAY_NAME, all_replay),
                               (ELAPSE_NAME, elapsed),
                               (REWARDS_NAME, rewards)):
        pickle_name = SAVE_DIR + '/' + base_name + step_suffix + '.pkl'
        with open(pickle_name, "wb") as f:
            pickle.dump(payload, f)

%reset -f 
from __future__ import print_function

import pickle
import os
import numpy as np
import tempfile
import tensorflow as tf
import matplotlib.pyplot as plt

from tf_rl.controller import DiscreteDeepQ, ModelController
from tf_rl.simulation import KarpathyGame
from tf_rl import simulate
from tf_rl.models import MLP
from collections import OrderedDict
from euclid import Vector2

from baselines import deepq
import baselines.common.tf_util as U
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule
tf.reset_default_graph()



[2017-07-24 16:09:45,427] VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead; VARIABLES will be removed after 2017-03-02.




[2017-07-24 16:09:45,506] VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead; VARIABLES will be removed after 2017-03-02.


0
0
1
5
6
8
9
9
10
10
11
12
12
13
13
13
13
13
14
14
14
16
21
24
25
29
34
36
40
42
45
50
53
57
58
58
61
67
69
71
75
80
84
88
97
101
107
110
116
121
128
139
146
157
172
192
205
217
229
247
258
276
293
315
329
346
363
387
414
445
478
done


In [12]:
# Settings dictionary handed to KarpathyGame(current_settings) later in this cell.
current_settings = {
    # earlier objects are eaten by later objects (pred eat prey)
    'objects': [
        'prey',
        'pred',
        'cue',
    ],
    'colors': {
        'prey': [212, 211, 208],
        'pred': [100, 37, 0],
        'cue': [0, 0, 0],
    },
    'object_reward': {
        'prey': {'prey': 0.1, 'pred': -0.1, 'cue': 0.0},
        'pred': {'prey': 1.0, 'pred': -1.0, 'cue': 0.0},
    },
    'hero_bounces_off_walls': False,
    'world_size': (500, 300),
    'maximum_velocity': {'prey': 0, 'pred': 50},
    'object_radius': 10.0,
    'cue_types': 2,
    'num_objects': OrderedDict([('prey', 5), ('pred', 1), ('cue', 1)]),
    # active means that the objects are learning
    'num_objects_active': OrderedDict([('prey', 0), ('pred', 1)]),
    # 'multiple' to create each DQN for each prey/predator
    # 'one' to use one DQN for all preys/predators
    # only really matters if the preys/predators are active
    'network_prey': 'one',
    'network_pred': 'multiple',
    'num_observation_lines': 32,
    'observation_line_length': 75.,
    'tolerable_distance_to_wall': 50,
    'wall_distance_penalty': -1.0,
    'delta_v': 50,
}

# 'new' to create new sim with values above
# 'load' to load a previously trained graph
RUN = 'new'

# Single source of truth for this run's tag. The four output names below are
# derived from it so that starting a new run means editing exactly one string
# (a global search/replace of the run number can silently rewrite unrelated
# digits, e.g. inside the reload names).
RUN_TAG = '1pred-2cues-500trials-19'

# Base names of the four artifacts written at the end of the run
# (checkpoint, replay buffers, trial times, rewards).
MODEL_NAME = 'model-' + RUN_TAG
REPLAY_NAME = 'replay-' + RUN_TAG
ELAPSE_NAME = 'elapse-' + RUN_TAG
REWARDS_NAME = 'rewards-' + RUN_TAG

# Names of the checkpoint / replay buffers to read back; only used when
# RUN == 'load'.
MODEL_RE = 'model-1-191685'
REPLAY_RE = 'replay-1-191685'

# create the game simulator
g = KarpathyGame(current_settings)

# Start from an empty default graph before building this run's networks.
tf.reset_default_graph()

# Parallel lists: entry k of each list belongs to the k-th network built
# below (prey networks first, then pred networks).
all_act = []
all_train = []
all_update = []
all_debug = []
all_replay = []

# Build graphs
# (species, variable-scope prefix) in build order. The pred prefix carries the
# run number: CHANGE THIS when starting a new run.
for species, scope_prefix in (('prey', 'prey'), ('pred', 'pred19')):
    num_active = current_settings['num_objects_active'][species]
    if num_active == 0:
        continue

    # 'one' -> a single DQN for the whole species, otherwise one DQN per
    # active object.
    if current_settings['network_' + species] == 'one':
        num_networks = 1
    else:
        num_networks = num_active

    for i in range(num_networks):
        # Each network needs its own variable scope. Previously every pred
        # network was built under the same scope name, so a second pred
        # network would try to create variables whose names already exist.
        # The first pred network keeps the bare prefix so its variable names
        # are unchanged.
        if species == 'prey':
            name = scope_prefix + str(i)
        elif i == 0:
            name = scope_prefix
        else:
            name = scope_prefix + '_' + str(i)

        with tf.variable_scope(name):
            q_network = deepq.models.mlp([200, 200])
            act, train, update_target, debug = deepq.build_train(
                make_obs_ph=lambda name: U.BatchInput((g.observation_size,), name=name),
                q_func=q_network,
                num_actions=g.num_actions,
                optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
            )
        replay_buffer = ReplayBuffer(50000)
        all_replay.append(replay_buffer)
        all_act.append(act)
        all_train.append(train)
        all_update.append(update_target)
        all_debug.append(debug)

# Simulate
FPS          = 30
ACTION_EVERY = 3

# fast_mode = True passes wait=False and visualize_every=100 to simulate();
# otherwise wait=True and visualize_every=1.
fast_mode = False
if fast_mode:
    WAIT, VISUALIZE_EVERY = False, 100
else:
    WAIT, VISUALIZE_EVERY = True, 1

# Containers handed to simulate(). They are read after it returns
# (timesteps[0] goes into the file names, the other two are pickled), so
# simulate() presumably fills them in place -- confirm against tf_rl.simulate.
elapsed = []
rewards = []
timesteps = [0]

# Directory, relative to the working directory, that holds checkpoints and pickles.
SAVE_DIR = 'saved_graphs'

# Initializing or reloading variables
# Start TensorFlow session with 2 CPUs
with U.make_session(2) as sess:

    # Initialize the parameters and copy them to the target networks.
    # all_update holds one update_target callable per built network; call each
    # of them once. (The previous loop called only the most recently built
    # update_target, once per active object, leaving any other network's
    # target unsynchronised.)
    U.initialize()
    for sync_target in all_update:
        sync_target()

    if RUN == 'load':
#         # when only restoring a subset of variables
#         restore = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope = 'pred0')
#         saver = tf.train.Saver(restore)

        # reload models
        saver = tf.train.Saver()
        current_dir = os.getcwd()
        model_name = current_dir + '/' + SAVE_DIR + '/' + MODEL_RE + '.ckpt'
        saver.restore(sess, model_name)
        # reload replay buffers
        # NOTE: unpickling can execute arbitrary code; only load files that
        # were written by this notebook.
        replay_name = current_dir + '/' + SAVE_DIR + '/' + REPLAY_RE + '.pkl'
        with open(replay_name, 'rb') as f:
            all_replay = pickle.load(f)

#         # remember to append buffer if restoring a subset of variables
#         all_replay.append(replay_buffer)

    # Run simulation
    try:
        simulate(simulation=g,
                 replay=all_replay,
                 act=all_act,
                 train=all_train,
                 update=all_update,
                 debug=all_debug,
                 fps=FPS,
                 visualize_every=VISUALIZE_EVERY,
                 action_every=ACTION_EVERY,
                 wait=WAIT,
                 disable_training=False,
                 simulation_resolution=.001,
                 save_path=None,
                 timesteps=timesteps,
                 elapsed=elapsed,
                 all_rewards=rewards)

    except KeyboardInterrupt:
        # An interrupted run still falls through to the save code below.
        print("Interrupted")
        g.shut_down_graphics()
        print('graphics shut down')

    # Make sure the output directory exists before anything is written to it.
    if not os.path.isdir(SAVE_DIR):
        os.makedirs(SAVE_DIR)

    # Every artifact name ends with the value of timesteps[0] after the run.
    step_suffix = '-' + str(timesteps[0])

    # Save models
    model_name = SAVE_DIR + '/' + MODEL_NAME + step_suffix + '.ckpt'
    saver = tf.train.Saver()
    saver.save(sess, model_name)

    # Save replay buffers, trial times and rewards
    for base_name, payload in ((REPLAY_NAME, all_replay),
                               (ELAPSE_NAME, elapsed),
                               (REWARDS_NAME, rewards)):
        pickle_name = SAVE_DIR + '/' + base_name + step_suffix + '.pkl'
        with open(pickle_name, "wb") as f:
            pickle.dump(payload, f)

%reset -f 
from __future__ import print_function

import pickle
import os
import numpy as np
import tempfile
import tensorflow as tf
import matplotlib.pyplot as plt

from tf_rl.controller import DiscreteDeepQ, ModelController
from tf_rl.simulation import KarpathyGame
from tf_rl import simulate
from tf_rl.models import MLP
from collections import OrderedDict
from euclid import Vector2

from baselines import deepq
import baselines.common.tf_util as U
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule
tf.reset_default_graph()



[2017-07-24 16:23:52,999] VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead; VARIABLES will be removed after 2017-03-02.




[2017-07-24 16:23:53,074] VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead; VARIABLES will be removed after 2017-03-02.


0
2
4
6
9
11
12
12
12
12
12
14
14
15
16
17
17
17
21
22
24
25
28
29
29
29
29
30
32
34
35
35
37
37
37
37
39
41
41
41
44
44
51
57
65
70
75
77
78
80
85
86
86
87
90
94
95
95
99
105
108
113
114
119
132
143
158
178
190
198
215
232
246
255
259
272
289
302
325
346
363
399
428
451
478
done


In [13]:
# Settings dictionary handed to KarpathyGame(current_settings) later in this cell.
current_settings = {
    # earlier objects are eaten by later objects (pred eat prey)
    'objects': [
        'prey',
        'pred',
        'cue',
    ],
    'colors': {
        'prey': [212, 211, 208],
        'pred': [100, 37, 0],
        'cue': [0, 0, 0],
    },
    'object_reward': {
        'prey': {'prey': 0.1, 'pred': -0.1, 'cue': 0.0},
        'pred': {'prey': 1.0, 'pred': -1.0, 'cue': 0.0},
    },
    'hero_bounces_off_walls': False,
    'world_size': (500, 300),
    'maximum_velocity': {'prey': 0, 'pred': 50},
    'object_radius': 10.0,
    'cue_types': 2,
    'num_objects': OrderedDict([('prey', 5), ('pred', 1), ('cue', 1)]),
    # active means that the objects are learning
    'num_objects_active': OrderedDict([('prey', 0), ('pred', 1)]),
    # 'multiple' to create each DQN for each prey/predator
    # 'one' to use one DQN for all preys/predators
    # only really matters if the preys/predators are active
    'network_prey': 'one',
    'network_pred': 'multiple',
    'num_observation_lines': 32,
    'observation_line_length': 75.,
    'tolerable_distance_to_wall': 50,
    'wall_distance_penalty': -1.0,
    'delta_v': 50,
}

# 'new' to create new sim with values above
# 'load' to load a previously trained graph
RUN = 'new'

# Single source of truth for this run's tag. The four output names below are
# derived from it so that starting a new run means editing exactly one string
# instead of search/replacing the run number through the whole cell.
RUN_TAG = '1pred-2cues-500trials-20'

# Base names of the four artifacts written at the end of the run
# (checkpoint, replay buffers, trial times, rewards).
MODEL_NAME = 'model-' + RUN_TAG
REPLAY_NAME = 'replay-' + RUN_TAG
ELAPSE_NAME = 'elapse-' + RUN_TAG
REWARDS_NAME = 'rewards-' + RUN_TAG

# Names of the checkpoint / replay buffers to read back; only used when
# RUN == 'load'. Restored to the '191685' names used by the earlier cells:
# the '19' inside them had been rewritten to '20' when the run number was
# search/replaced, leaving names that do not match the other cells.
MODEL_RE = 'model-1-191685'
REPLAY_RE = 'replay-1-191685'

# create the game simulator
g = KarpathyGame(current_settings)

# Start from an empty default graph before building this run's networks.
tf.reset_default_graph()

# Parallel lists: entry k of each list belongs to the k-th network built
# below (prey networks first, then pred networks).
all_act = []
all_train = []
all_update = []
all_debug = []
all_replay = []

# Build graphs
# (species, variable-scope prefix) in build order. The pred prefix carries the
# run number: CHANGE THIS when starting a new run.
for species, scope_prefix in (('prey', 'prey'), ('pred', 'pred20')):
    num_active = current_settings['num_objects_active'][species]
    if num_active == 0:
        continue

    # 'one' -> a single DQN for the whole species, otherwise one DQN per
    # active object.
    if current_settings['network_' + species] == 'one':
        num_networks = 1
    else:
        num_networks = num_active

    for i in range(num_networks):
        # Each network needs its own variable scope. Previously every pred
        # network was built under the same scope name, so a second pred
        # network would try to create variables whose names already exist.
        # The first pred network keeps the bare prefix so its variable names
        # are unchanged.
        if species == 'prey':
            name = scope_prefix + str(i)
        elif i == 0:
            name = scope_prefix
        else:
            name = scope_prefix + '_' + str(i)

        with tf.variable_scope(name):
            q_network = deepq.models.mlp([200, 200])
            act, train, update_target, debug = deepq.build_train(
                make_obs_ph=lambda name: U.BatchInput((g.observation_size,), name=name),
                q_func=q_network,
                num_actions=g.num_actions,
                optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
            )
        replay_buffer = ReplayBuffer(50000)
        all_replay.append(replay_buffer)
        all_act.append(act)
        all_train.append(train)
        all_update.append(update_target)
        all_debug.append(debug)

# Simulate
FPS          = 30
ACTION_EVERY = 3

# fast_mode = True passes wait=False and visualize_every=100 to simulate();
# otherwise wait=True and visualize_every=1.
fast_mode = False
if fast_mode:
    WAIT, VISUALIZE_EVERY = False, 100
else:
    WAIT, VISUALIZE_EVERY = True, 1

# Containers handed to simulate(). They are read after it returns
# (timesteps[0] goes into the file names, the other two are pickled), so
# simulate() presumably fills them in place -- confirm against tf_rl.simulate.
elapsed = []
rewards = []
timesteps = [0]

# Directory, relative to the working directory, that holds checkpoints and pickles.
SAVE_DIR = 'saved_graphs'

# Initializing or reloading variables
# Start TensorFlow session with 2 CPUs
with U.make_session(2) as sess:

    # Initialize the parameters and copy them to the target networks.
    # all_update holds one update_target callable per built network; call each
    # of them once. (The previous loop called only the most recently built
    # update_target, once per active object, leaving any other network's
    # target unsynchronised.)
    U.initialize()
    for sync_target in all_update:
        sync_target()

    if RUN == 'load':
#         # when only restoring a subset of variables
#         restore = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope = 'pred0')
#         saver = tf.train.Saver(restore)

        # reload models
        saver = tf.train.Saver()
        current_dir = os.getcwd()
        model_name = current_dir + '/' + SAVE_DIR + '/' + MODEL_RE + '.ckpt'
        saver.restore(sess, model_name)
        # reload replay buffers
        # NOTE: unpickling can execute arbitrary code; only load files that
        # were written by this notebook.
        replay_name = current_dir + '/' + SAVE_DIR + '/' + REPLAY_RE + '.pkl'
        with open(replay_name, 'rb') as f:
            all_replay = pickle.load(f)

#         # remember to append buffer if restoring a subset of variables
#         all_replay.append(replay_buffer)

    # Run simulation
    try:
        simulate(simulation=g,
                 replay=all_replay,
                 act=all_act,
                 train=all_train,
                 update=all_update,
                 debug=all_debug,
                 fps=FPS,
                 visualize_every=VISUALIZE_EVERY,
                 action_every=ACTION_EVERY,
                 wait=WAIT,
                 disable_training=False,
                 simulation_resolution=.001,
                 save_path=None,
                 timesteps=timesteps,
                 elapsed=elapsed,
                 all_rewards=rewards)

    except KeyboardInterrupt:
        # An interrupted run still falls through to the save code below.
        print("Interrupted")
        g.shut_down_graphics()
        print('graphics shut down')

    # Make sure the output directory exists before anything is written to it.
    if not os.path.isdir(SAVE_DIR):
        os.makedirs(SAVE_DIR)

    # Every artifact name ends with the value of timesteps[0] after the run.
    step_suffix = '-' + str(timesteps[0])

    # Save models
    model_name = SAVE_DIR + '/' + MODEL_NAME + step_suffix + '.ckpt'
    saver = tf.train.Saver()
    saver.save(sess, model_name)

    # Save replay buffers, trial times and rewards
    for base_name, payload in ((REPLAY_NAME, all_replay),
                               (ELAPSE_NAME, elapsed),
                               (REWARDS_NAME, rewards)):
        pickle_name = SAVE_DIR + '/' + base_name + step_suffix + '.pkl'
        with open(pickle_name, "wb") as f:
            pickle.dump(payload, f)

%reset -f 
from __future__ import print_function

import pickle
import os
import numpy as np
import tempfile
import tensorflow as tf
import matplotlib.pyplot as plt

from tf_rl.controller import DiscreteDeepQ, ModelController
from tf_rl.simulation import KarpathyGame
from tf_rl import simulate
from tf_rl.models import MLP
from collections import OrderedDict
from euclid import Vector2

from baselines import deepq
import baselines.common.tf_util as U
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule
tf.reset_default_graph()



[2017-07-24 16:41:18,743] VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead; VARIABLES will be removed after 2017-03-02.




[2017-07-24 16:41:18,810] VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead; VARIABLES will be removed after 2017-03-02.


0
0
4
6
8
8
10
12
12
12
13
16
16
16
16
16
20
20
20
21
21
21
21
21
23
23
23
23
24
28
28
32
33
38
40
44
48
50
52
56
59
60
61
65
74
76
80
83
91
92
95
106
108
112
117
121
129
140
144
149
158
165
172
181
191
198
213
220
225
239
252
256
266
287
296
314
314
318
318
330
335
343
369
383
407
427
448
473
486
done


In [14]:
# Settings dictionary handed to KarpathyGame(current_settings) later in this cell.
current_settings = {
    # earlier objects are eaten by later objects (pred eat prey)
    'objects': [
        'prey',
        'pred',
        'cue',
    ],
    'colors': {
        'prey': [212, 211, 208],
        'pred': [100, 37, 0],
        'cue': [0, 0, 0],
    },
    'object_reward': {
        'prey': {'prey': 0.1, 'pred': -0.1, 'cue': 0.0},
        'pred': {'prey': 1.0, 'pred': -1.0, 'cue': 0.0},
    },
    'hero_bounces_off_walls': False,
    'world_size': (500, 300),
    'maximum_velocity': {'prey': 0, 'pred': 50},
    'object_radius': 10.0,
    'cue_types': 2,
    'num_objects': OrderedDict([('prey', 5), ('pred', 1), ('cue', 1)]),
    # active means that the objects are learning
    'num_objects_active': OrderedDict([('prey', 0), ('pred', 1)]),
    # 'multiple' to create each DQN for each prey/predator
    # 'one' to use one DQN for all preys/predators
    # only really matters if the preys/predators are active
    'network_prey': 'one',
    'network_pred': 'multiple',
    'num_observation_lines': 32,
    'observation_line_length': 75.,
    'tolerable_distance_to_wall': 50,
    'wall_distance_penalty': -1.0,
    'delta_v': 50,
}

# 'new' to create new sim with values above
# 'load' to load a previously trained graph
RUN = 'new'

# Single source of truth for this run's tag. The four output names below are
# derived from it so that starting a new run means editing exactly one string
# instead of search/replacing the run number through the whole cell.
RUN_TAG = '1pred-2cues-500trials-21'

# Base names of the four artifacts written at the end of the run
# (checkpoint, replay buffers, trial times, rewards).
MODEL_NAME = 'model-' + RUN_TAG
REPLAY_NAME = 'replay-' + RUN_TAG
ELAPSE_NAME = 'elapse-' + RUN_TAG
REWARDS_NAME = 'rewards-' + RUN_TAG

# Names of the checkpoint / replay buffers to read back; only used when
# RUN == 'load'. Restored to the '191685' names used by the earlier cells:
# the leading digits had been rewritten to '21' when the run number was
# search/replaced, leaving names that do not match the other cells.
MODEL_RE = 'model-1-191685'
REPLAY_RE = 'replay-1-191685'

# create the game simulator
g = KarpathyGame(current_settings)

# Start from an empty default graph before building this run's networks.
tf.reset_default_graph()

# Parallel lists: entry k of each list belongs to the k-th network built
# below (prey networks first, then pred networks).
all_act = []
all_train = []
all_update = []
all_debug = []
all_replay = []

# Build graphs
# (species, variable-scope prefix) in build order. The pred prefix carries the
# run number: CHANGE THIS when starting a new run.
for species, scope_prefix in (('prey', 'prey'), ('pred', 'pred21')):
    num_active = current_settings['num_objects_active'][species]
    if num_active == 0:
        continue

    # 'one' -> a single DQN for the whole species, otherwise one DQN per
    # active object.
    if current_settings['network_' + species] == 'one':
        num_networks = 1
    else:
        num_networks = num_active

    for i in range(num_networks):
        # Each network needs its own variable scope. Previously every pred
        # network was built under the same scope name, so a second pred
        # network would try to create variables whose names already exist.
        # The first pred network keeps the bare prefix so its variable names
        # are unchanged.
        if species == 'prey':
            name = scope_prefix + str(i)
        elif i == 0:
            name = scope_prefix
        else:
            name = scope_prefix + '_' + str(i)

        with tf.variable_scope(name):
            q_network = deepq.models.mlp([200, 200])
            act, train, update_target, debug = deepq.build_train(
                make_obs_ph=lambda name: U.BatchInput((g.observation_size,), name=name),
                q_func=q_network,
                num_actions=g.num_actions,
                optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
            )
        replay_buffer = ReplayBuffer(50000)
        all_replay.append(replay_buffer)
        all_act.append(act)
        all_train.append(train)
        all_update.append(update_target)
        all_debug.append(debug)

# Simulate
FPS          = 30
ACTION_EVERY = 3

# fast_mode = True passes wait=False and visualize_every=100 to simulate();
# otherwise wait=True and visualize_every=1.
fast_mode = False
if fast_mode:
    WAIT, VISUALIZE_EVERY = False, 100
else:
    WAIT, VISUALIZE_EVERY = True, 1

# Containers handed to simulate(). They are read after it returns
# (timesteps[0] goes into the file names, the other two are pickled), so
# simulate() presumably fills them in place -- confirm against tf_rl.simulate.
elapsed = []
rewards = []
timesteps = [0]

# Directory, relative to the working directory, that holds checkpoints and pickles.
SAVE_DIR = 'saved_graphs'

# Initializing or reloading variables
# Start TensorFlow session with 2 CPUs
with U.make_session(2) as sess:

    # Initialize the parameters and copy them to the target networks.
    # all_update holds one update_target callable per built network; call each
    # of them once. (The previous loop called only the most recently built
    # update_target, once per active object, leaving any other network's
    # target unsynchronised.)
    U.initialize()
    for sync_target in all_update:
        sync_target()

    if RUN == 'load':
#         # when only restoring a subset of variables
#         restore = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope = 'pred0')
#         saver = tf.train.Saver(restore)

        # reload models
        saver = tf.train.Saver()
        current_dir = os.getcwd()
        model_name = current_dir + '/' + SAVE_DIR + '/' + MODEL_RE + '.ckpt'
        saver.restore(sess, model_name)
        # reload replay buffers
        # NOTE: unpickling can execute arbitrary code; only load files that
        # were written by this notebook.
        replay_name = current_dir + '/' + SAVE_DIR + '/' + REPLAY_RE + '.pkl'
        with open(replay_name, 'rb') as f:
            all_replay = pickle.load(f)

#         # remember to append buffer if restoring a subset of variables
#         all_replay.append(replay_buffer)

    # Run simulation
    try:
        simulate(simulation=g,
                 replay=all_replay,
                 act=all_act,
                 train=all_train,
                 update=all_update,
                 debug=all_debug,
                 fps=FPS,
                 visualize_every=VISUALIZE_EVERY,
                 action_every=ACTION_EVERY,
                 wait=WAIT,
                 disable_training=False,
                 simulation_resolution=.001,
                 save_path=None,
                 timesteps=timesteps,
                 elapsed=elapsed,
                 all_rewards=rewards)

    except KeyboardInterrupt:
        # An interrupted run still falls through to the save code below.
        print("Interrupted")
        g.shut_down_graphics()
        print('graphics shut down')

    # Make sure the output directory exists before anything is written to it.
    if not os.path.isdir(SAVE_DIR):
        os.makedirs(SAVE_DIR)

    # Every artifact name ends with the value of timesteps[0] after the run.
    step_suffix = '-' + str(timesteps[0])

    # Save models
    model_name = SAVE_DIR + '/' + MODEL_NAME + step_suffix + '.ckpt'
    saver = tf.train.Saver()
    saver.save(sess, model_name)

    # Save replay buffers, trial times and rewards
    for base_name, payload in ((REPLAY_NAME, all_replay),
                               (ELAPSE_NAME, elapsed),
                               (REWARDS_NAME, rewards)):
        pickle_name = SAVE_DIR + '/' + base_name + step_suffix + '.pkl'
        with open(pickle_name, "wb") as f:
            pickle.dump(payload, f)

# Clear the interactive namespace (-f skips the confirmation prompt).
# Presumably done so that nothing from the finished run leaks into the next cell;
# as a consequence every import has to be repeated right after it.
%reset -f 
from __future__ import print_function

import pickle
import os
import numpy as np
import tempfile
import tensorflow as tf
import matplotlib.pyplot as plt

from tf_rl.controller import DiscreteDeepQ, ModelController
from tf_rl.simulation import KarpathyGame
from tf_rl import simulate
from tf_rl.models import MLP
from collections import OrderedDict
from euclid import Vector2

from baselines import deepq
import baselines.common.tf_util as U
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule
# Discard the default TensorFlow graph so the next cell builds its networks from scratch.
tf.reset_default_graph()



[2017-07-24 16:59:11,541] VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead; VARIABLES will be removed after 2017-03-02.




[2017-07-24 16:59:11,615] VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead; VARIABLES will be removed after 2017-03-02.


0
0
4
9
12
13
16
18
20
20
31
34
35
36
37
40
46
53
60
61
69
82
87
92
101
103
103
111
113
115
119
133
140
144
144
149
152
158
175
194
205
215
223
232
235
251
262
276
288
298
304
315
342
365
402
448
486
done


In [None]:
# Training cell for one run: configure the world, build one DQN per learning
# agent network, run the simulation, then save the model, replay buffers,
# trial times and rewards under SAVE_DIR.
current_settings = {
    # earlier objects are eaten by later objects (pred eat prey)
    'objects': [
        'prey',
        'pred',
        'cue',
    ],
    'colors': {
        'prey': [212, 211, 208],
        'pred':  [100, 37, 0],
        'cue': [0,0,0],
    },
    'object_reward': {
        'prey': {'prey': 0.1, 'pred': -0.1, 'cue': 0.0},
        'pred': {'prey': 1.0, 'pred': -1.0, 'cue': 0.0},
    },
    'hero_bounces_off_walls': False,
    'world_size': (500,300),
    "maximum_velocity":      {'prey': 0, 'pred': 50},
    "object_radius": 10.0,
    "cue_types": 2,
    "num_objects": OrderedDict([('prey', 5), ('pred', 1), ('cue', 1)]),
    # active means that the objects are learning
    "num_objects_active": OrderedDict([('prey', 0), ('pred', 1)]),
    # 'multiple' to create each DQN for each prey/predator
    # 'one' to use one DQN for all preys/predators
    # only really matters if the preys/predators are active
    "network_prey": 'one',
    "network_pred": 'multiple',
    "num_observation_lines" : 32,
    "observation_line_length": 75.,
    "tolerable_distance_to_wall": 50,
    "wall_distance_penalty":  -1.0,
    "delta_v": 50
}

# 'new' to create new sim with values above
# 'load' to load a previously trained graph
RUN = 'new'

# Identifier of this run. This is the ONLY value to edit when the cell is
# copied for another run: the save-file names and the predator variable
# scope below are all derived from it.
RUN_ID = 22

# Directory (relative to the working directory) for checkpoints and pickles
SAVE_DIR = 'saved_graphs'

# Base names of the files written at the end of this run
MODEL_NAME = 'model-1pred-2cues-500trials-' + str(RUN_ID)
REPLAY_NAME = 'replay-1pred-2cues-500trials-' + str(RUN_ID)
ELAPSE_NAME = 'elapse-1pred-2cues-500trials-' + str(RUN_ID)
REWARDS_NAME = 'rewards-1pred-2cues-500trials-' + str(RUN_ID)

# Base names of the model/replay buffers reloaded when RUN == 'load'
MODEL_RE = 'model-1-221685'
REPLAY_RE = 'replay-1-221685'

# create the game simulator
g = KarpathyGame(current_settings)

tf.reset_default_graph()

# Parallel lists: index k of each list belongs to the k-th network built below
# (prey networks first, then predator networks).
all_act = []
all_train = []
all_update = []
all_debug = []
all_replay = []

# Build graphs
for kind in ('prey', 'pred'):
    num_active = current_settings['num_objects_active'][kind]
    if num_active == 0:
        # no learning agents of this kind -> no network to build
        continue

    # 'one' shares a single DQN between all agents of this kind
    if current_settings['network_' + kind] == 'one':
        num_networks = 1
    else:
        num_networks = num_active

    for i in range(num_networks):
        # Every network needs its own variable scope. A single predator
        # network keeps the plain 'pred<RUN_ID>' name used by earlier runs;
        # additional predator networks get an index suffix so they do not
        # share one scope.
        if kind == 'prey':
            scope_name = 'prey' + str(i)
        elif num_networks == 1:
            scope_name = 'pred' + str(RUN_ID)
        else:
            scope_name = 'pred' + str(RUN_ID) + '_' + str(i)

        with tf.variable_scope(scope_name):
            brain = deepq.models.mlp([200, 200])
            act, train, update_target, debug = deepq.build_train(
                make_obs_ph=lambda name: U.BatchInput((g.observation_size,), name=name),
                q_func=brain,
                num_actions=g.num_actions,
                optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
            )
        all_replay.append(ReplayBuffer(50000))
        all_act.append(act)
        all_train.append(train)
        all_update.append(update_target)
        all_debug.append(debug)

# Simulate
FPS          = 30
ACTION_EVERY = 3

fast_mode = False
if fast_mode:
    WAIT, VISUALIZE_EVERY = False, 100
else:
    WAIT, VISUALIZE_EVERY = True, 1

# Filled in by simulate(); timesteps is a one-element list so the final step
# count can be read back here for the save-file names.
elapsed = []
rewards = []
timesteps = [0]

# Initializing or reloading variables
# Start TensorFlow session with 2 CPUs
with U.make_session(2) as sess:

    # Initialize the parameters and copy them to the target networks.
    # Iterate over all_update so that EVERY network built above gets its
    # target synchronised exactly once (the loop variable `update_target`
    # left over from graph building only refers to the last network).
    U.initialize()
    for sync_target in all_update:
        sync_target()

    if RUN == 'load':
#         # when only restoring a subset of variables
#         restore = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope = 'pred0')
#         saver = tf.train.Saver(restore)

        # reload models
        saver = tf.train.Saver()
        load_dir = os.path.join(os.getcwd(), SAVE_DIR)
        saver.restore(sess, os.path.join(load_dir, MODEL_RE + '.ckpt'))
        # reload replay buffers
        # NOTE: pickle.load executes arbitrary code; only load files you created yourself.
        with open(os.path.join(load_dir, REPLAY_RE + '.pkl'), 'rb') as f:
            all_replay = pickle.load(f)

#         # remember to append buffer if restoring a subset of variables
#         all_replay.append(replay_buffer)

    # Run simulation
    try:
        simulate(simulation=g,
                 replay = all_replay,
                 act = all_act,
                 train = all_train,
                 update = all_update,
                 debug = all_debug,
                 fps=FPS,
                 visualize_every=VISUALIZE_EVERY,
                 action_every=ACTION_EVERY,
                 wait=WAIT,
                 disable_training=False,
                 simulation_resolution=.001,
                 save_path=None,
                 timesteps = timesteps,
                 elapsed = elapsed,
                 all_rewards = rewards)

    except KeyboardInterrupt:
        # A manual interrupt still falls through to the save section below.
        print("Interrupted")
        g.shut_down_graphics()
        print('graphics shut down')

    # Make sure the output directory exists before anything is written to it.
    if not os.path.isdir(SAVE_DIR):
        os.makedirs(SAVE_DIR)

    # Every file name carries the number of timesteps reached by this run.
    step_suffix = '-' + str(timesteps[0])

    # Save models
    saver = tf.train.Saver()
    saver.save(sess, os.path.join(SAVE_DIR, MODEL_NAME + step_suffix + '.ckpt'))

    # Save replay buffers, trial times and rewards
    for base_name, payload in ((REPLAY_NAME, all_replay),
                               (ELAPSE_NAME, elapsed),
                               (REWARDS_NAME, rewards)):
        with open(os.path.join(SAVE_DIR, base_name + step_suffix + '.pkl'), "wb") as f:
            pickle.dump(payload, f)

# Clear the interactive namespace (-f skips the confirmation prompt).
# Presumably done so that nothing from the finished run leaks into the next cell;
# as a consequence every import has to be repeated right after it.
%reset -f 
from __future__ import print_function

import pickle
import os
import numpy as np
import tempfile
import tensorflow as tf
import matplotlib.pyplot as plt

from tf_rl.controller import DiscreteDeepQ, ModelController
from tf_rl.simulation import KarpathyGame
from tf_rl import simulate
from tf_rl.models import MLP
from collections import OrderedDict
from euclid import Vector2

from baselines import deepq
import baselines.common.tf_util as U
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule
# Discard the default TensorFlow graph so the next cell builds its networks from scratch.
tf.reset_default_graph()



[2017-07-24 17:10:29,014] VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead; VARIABLES will be removed after 2017-03-02.




[2017-07-24 17:10:29,081] VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead; VARIABLES will be removed after 2017-03-02.


0
1
2
3
5
7
10
16
18
18
21
24
28
30
34
39
43
43
46
50
57
59
60
64
66
70
72
75
79
81
83
88
98
99
104
110
113
118
124
134
142
144
152
153
163
168
176
186
196
211
219
230
244
255
267
275
286
313
349
371
408
443
492
done


In [None]:
# Training cell for one run: configure the world, build one DQN per learning
# agent network, run the simulation, then save the model, replay buffers,
# trial times and rewards under SAVE_DIR.
current_settings = {
    # earlier objects are eaten by later objects (pred eat prey)
    'objects': [
        'prey',
        'pred',
        'cue',
    ],
    'colors': {
        'prey': [212, 211, 208],
        'pred':  [100, 37, 0],
        'cue': [0,0,0],
    },
    'object_reward': {
        'prey': {'prey': 0.1, 'pred': -0.1, 'cue': 0.0},
        'pred': {'prey': 1.0, 'pred': -1.0, 'cue': 0.0},
    },
    'hero_bounces_off_walls': False,
    'world_size': (500,300),
    "maximum_velocity":      {'prey': 0, 'pred': 50},
    "object_radius": 10.0,
    "cue_types": 2,
    "num_objects": OrderedDict([('prey', 5), ('pred', 1), ('cue', 1)]),
    # active means that the objects are learning
    "num_objects_active": OrderedDict([('prey', 0), ('pred', 1)]),
    # 'multiple' to create each DQN for each prey/predator
    # 'one' to use one DQN for all preys/predators
    # only really matters if the preys/predators are active
    "network_prey": 'one',
    "network_pred": 'multiple',
    "num_observation_lines" : 32,
    "observation_line_length": 75.,
    "tolerable_distance_to_wall": 50,
    "wall_distance_penalty":  -1.0,
    "delta_v": 50
}

# 'new' to create new sim with values above
# 'load' to load a previously trained graph
RUN = 'new'

# Identifier of this run. This is the ONLY value to edit when the cell is
# copied for another run: the save-file names and the predator variable
# scope below are all derived from it.
RUN_ID = 23

# Directory (relative to the working directory) for checkpoints and pickles
SAVE_DIR = 'saved_graphs'

# Base names of the files written at the end of this run
MODEL_NAME = 'model-1pred-2cues-500trials-' + str(RUN_ID)
REPLAY_NAME = 'replay-1pred-2cues-500trials-' + str(RUN_ID)
ELAPSE_NAME = 'elapse-1pred-2cues-500trials-' + str(RUN_ID)
REWARDS_NAME = 'rewards-1pred-2cues-500trials-' + str(RUN_ID)

# Base names of the model/replay buffers reloaded when RUN == 'load'
MODEL_RE = 'model-1-231685'
REPLAY_RE = 'replay-1-231685'

# create the game simulator
g = KarpathyGame(current_settings)

tf.reset_default_graph()

# Parallel lists: index k of each list belongs to the k-th network built below
# (prey networks first, then predator networks).
all_act = []
all_train = []
all_update = []
all_debug = []
all_replay = []

# Build graphs
for kind in ('prey', 'pred'):
    num_active = current_settings['num_objects_active'][kind]
    if num_active == 0:
        # no learning agents of this kind -> no network to build
        continue

    # 'one' shares a single DQN between all agents of this kind
    if current_settings['network_' + kind] == 'one':
        num_networks = 1
    else:
        num_networks = num_active

    for i in range(num_networks):
        # Every network needs its own variable scope. A single predator
        # network keeps the plain 'pred<RUN_ID>' name used by earlier runs;
        # additional predator networks get an index suffix so they do not
        # share one scope.
        if kind == 'prey':
            scope_name = 'prey' + str(i)
        elif num_networks == 1:
            scope_name = 'pred' + str(RUN_ID)
        else:
            scope_name = 'pred' + str(RUN_ID) + '_' + str(i)

        with tf.variable_scope(scope_name):
            brain = deepq.models.mlp([200, 200])
            act, train, update_target, debug = deepq.build_train(
                make_obs_ph=lambda name: U.BatchInput((g.observation_size,), name=name),
                q_func=brain,
                num_actions=g.num_actions,
                optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
            )
        all_replay.append(ReplayBuffer(50000))
        all_act.append(act)
        all_train.append(train)
        all_update.append(update_target)
        all_debug.append(debug)

# Simulate
FPS          = 30
ACTION_EVERY = 3

fast_mode = False
if fast_mode:
    WAIT, VISUALIZE_EVERY = False, 100
else:
    WAIT, VISUALIZE_EVERY = True, 1

# Filled in by simulate(); timesteps is a one-element list so the final step
# count can be read back here for the save-file names.
elapsed = []
rewards = []
timesteps = [0]

# Initializing or reloading variables
# Start TensorFlow session with 2 CPUs
with U.make_session(2) as sess:

    # Initialize the parameters and copy them to the target networks.
    # Iterate over all_update so that EVERY network built above gets its
    # target synchronised exactly once (the loop variable `update_target`
    # left over from graph building only refers to the last network).
    U.initialize()
    for sync_target in all_update:
        sync_target()

    if RUN == 'load':
#         # when only restoring a subset of variables
#         restore = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope = 'pred0')
#         saver = tf.train.Saver(restore)

        # reload models
        saver = tf.train.Saver()
        load_dir = os.path.join(os.getcwd(), SAVE_DIR)
        saver.restore(sess, os.path.join(load_dir, MODEL_RE + '.ckpt'))
        # reload replay buffers
        # NOTE: pickle.load executes arbitrary code; only load files you created yourself.
        with open(os.path.join(load_dir, REPLAY_RE + '.pkl'), 'rb') as f:
            all_replay = pickle.load(f)

#         # remember to append buffer if restoring a subset of variables
#         all_replay.append(replay_buffer)

    # Run simulation
    try:
        simulate(simulation=g,
                 replay = all_replay,
                 act = all_act,
                 train = all_train,
                 update = all_update,
                 debug = all_debug,
                 fps=FPS,
                 visualize_every=VISUALIZE_EVERY,
                 action_every=ACTION_EVERY,
                 wait=WAIT,
                 disable_training=False,
                 simulation_resolution=.001,
                 save_path=None,
                 timesteps = timesteps,
                 elapsed = elapsed,
                 all_rewards = rewards)

    except KeyboardInterrupt:
        # A manual interrupt still falls through to the save section below.
        print("Interrupted")
        g.shut_down_graphics()
        print('graphics shut down')

    # Make sure the output directory exists before anything is written to it.
    if not os.path.isdir(SAVE_DIR):
        os.makedirs(SAVE_DIR)

    # Every file name carries the number of timesteps reached by this run.
    step_suffix = '-' + str(timesteps[0])

    # Save models
    saver = tf.train.Saver()
    saver.save(sess, os.path.join(SAVE_DIR, MODEL_NAME + step_suffix + '.ckpt'))

    # Save replay buffers, trial times and rewards
    for base_name, payload in ((REPLAY_NAME, all_replay),
                               (ELAPSE_NAME, elapsed),
                               (REWARDS_NAME, rewards)):
        with open(os.path.join(SAVE_DIR, base_name + step_suffix + '.pkl'), "wb") as f:
            pickle.dump(payload, f)

# Clear the interactive namespace (-f skips the confirmation prompt).
# Presumably done so that nothing from the finished run leaks into the next cell;
# as a consequence every import has to be repeated right after it.
%reset -f 
from __future__ import print_function

import pickle
import os
import numpy as np
import tempfile
import tensorflow as tf
import matplotlib.pyplot as plt

from tf_rl.controller import DiscreteDeepQ, ModelController
from tf_rl.simulation import KarpathyGame
from tf_rl import simulate
from tf_rl.models import MLP
from collections import OrderedDict
from euclid import Vector2

from baselines import deepq
import baselines.common.tf_util as U
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule
# Discard the default TensorFlow graph so the next cell builds its networks from scratch.
tf.reset_default_graph()



[2017-07-24 17:23:21,534] VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead; VARIABLES will be removed after 2017-03-02.




[2017-07-24 17:23:21,605] VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead; VARIABLES will be removed after 2017-03-02.


0
2
6
9
9
10
12
14
15
16
16
18
19
21
23
28
29
31
33
37
40
44
47
54
61
74
80
83
90
98
105
115
126
148
165
183
201
226
251
278
314


In [None]:
# Training cell for one run: configure the world, build one DQN per learning
# agent network, run the simulation, then save the model, replay buffers,
# trial times and rewards under SAVE_DIR.
current_settings = {
    # earlier objects are eaten by later objects (pred eat prey)
    'objects': [
        'prey',
        'pred',
        'cue',
    ],
    'colors': {
        'prey': [212, 211, 208],
        'pred':  [100, 37, 0],
        'cue': [0,0,0],
    },
    'object_reward': {
        'prey': {'prey': 0.1, 'pred': -0.1, 'cue': 0.0},
        'pred': {'prey': 1.0, 'pred': -1.0, 'cue': 0.0},
    },
    'hero_bounces_off_walls': False,
    'world_size': (500,300),
    "maximum_velocity":      {'prey': 0, 'pred': 50},
    "object_radius": 10.0,
    "cue_types": 2,
    "num_objects": OrderedDict([('prey', 5), ('pred', 1), ('cue', 1)]),
    # active means that the objects are learning
    "num_objects_active": OrderedDict([('prey', 0), ('pred', 1)]),
    # 'multiple' to create each DQN for each prey/predator
    # 'one' to use one DQN for all preys/predators
    # only really matters if the preys/predators are active
    "network_prey": 'one',
    "network_pred": 'multiple',
    "num_observation_lines" : 32,
    "observation_line_length": 75.,
    "tolerable_distance_to_wall": 50,
    "wall_distance_penalty":  -1.0,
    "delta_v": 50
}

# 'new' to create new sim with values above
# 'load' to load a previously trained graph
RUN = 'new'

# Identifier of this run. This is the ONLY value to edit when the cell is
# copied for another run: the save-file names and the predator variable
# scope below are all derived from it.
RUN_ID = 24

# Directory (relative to the working directory) for checkpoints and pickles
SAVE_DIR = 'saved_graphs'

# Base names of the files written at the end of this run
MODEL_NAME = 'model-1pred-2cues-500trials-' + str(RUN_ID)
REPLAY_NAME = 'replay-1pred-2cues-500trials-' + str(RUN_ID)
ELAPSE_NAME = 'elapse-1pred-2cues-500trials-' + str(RUN_ID)
REWARDS_NAME = 'rewards-1pred-2cues-500trials-' + str(RUN_ID)

# Base names of the model/replay buffers reloaded when RUN == 'load'
MODEL_RE = 'model-1-241685'
REPLAY_RE = 'replay-1-241685'

# create the game simulator
g = KarpathyGame(current_settings)

tf.reset_default_graph()

# Parallel lists: index k of each list belongs to the k-th network built below
# (prey networks first, then predator networks).
all_act = []
all_train = []
all_update = []
all_debug = []
all_replay = []

# Build graphs
for kind in ('prey', 'pred'):
    num_active = current_settings['num_objects_active'][kind]
    if num_active == 0:
        # no learning agents of this kind -> no network to build
        continue

    # 'one' shares a single DQN between all agents of this kind
    if current_settings['network_' + kind] == 'one':
        num_networks = 1
    else:
        num_networks = num_active

    for i in range(num_networks):
        # Every network needs its own variable scope. A single predator
        # network keeps the plain 'pred<RUN_ID>' name used by earlier runs;
        # additional predator networks get an index suffix so they do not
        # share one scope.
        if kind == 'prey':
            scope_name = 'prey' + str(i)
        elif num_networks == 1:
            scope_name = 'pred' + str(RUN_ID)
        else:
            scope_name = 'pred' + str(RUN_ID) + '_' + str(i)

        with tf.variable_scope(scope_name):
            brain = deepq.models.mlp([200, 200])
            act, train, update_target, debug = deepq.build_train(
                make_obs_ph=lambda name: U.BatchInput((g.observation_size,), name=name),
                q_func=brain,
                num_actions=g.num_actions,
                optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
            )
        all_replay.append(ReplayBuffer(50000))
        all_act.append(act)
        all_train.append(train)
        all_update.append(update_target)
        all_debug.append(debug)

# Simulate
FPS          = 30
ACTION_EVERY = 3

fast_mode = False
if fast_mode:
    WAIT, VISUALIZE_EVERY = False, 100
else:
    WAIT, VISUALIZE_EVERY = True, 1

# Filled in by simulate(); timesteps is a one-element list so the final step
# count can be read back here for the save-file names.
elapsed = []
rewards = []
timesteps = [0]

# Initializing or reloading variables
# Start TensorFlow session with 2 CPUs
with U.make_session(2) as sess:

    # Initialize the parameters and copy them to the target networks.
    # Iterate over all_update so that EVERY network built above gets its
    # target synchronised exactly once (the loop variable `update_target`
    # left over from graph building only refers to the last network).
    U.initialize()
    for sync_target in all_update:
        sync_target()

    if RUN == 'load':
#         # when only restoring a subset of variables
#         restore = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope = 'pred0')
#         saver = tf.train.Saver(restore)

        # reload models
        saver = tf.train.Saver()
        load_dir = os.path.join(os.getcwd(), SAVE_DIR)
        saver.restore(sess, os.path.join(load_dir, MODEL_RE + '.ckpt'))
        # reload replay buffers
        # NOTE: pickle.load executes arbitrary code; only load files you created yourself.
        with open(os.path.join(load_dir, REPLAY_RE + '.pkl'), 'rb') as f:
            all_replay = pickle.load(f)

#         # remember to append buffer if restoring a subset of variables
#         all_replay.append(replay_buffer)

    # Run simulation
    try:
        simulate(simulation=g,
                 replay = all_replay,
                 act = all_act,
                 train = all_train,
                 update = all_update,
                 debug = all_debug,
                 fps=FPS,
                 visualize_every=VISUALIZE_EVERY,
                 action_every=ACTION_EVERY,
                 wait=WAIT,
                 disable_training=False,
                 simulation_resolution=.001,
                 save_path=None,
                 timesteps = timesteps,
                 elapsed = elapsed,
                 all_rewards = rewards)

    except KeyboardInterrupt:
        # A manual interrupt still falls through to the save section below.
        print("Interrupted")
        g.shut_down_graphics()
        print('graphics shut down')

    # Make sure the output directory exists before anything is written to it.
    if not os.path.isdir(SAVE_DIR):
        os.makedirs(SAVE_DIR)

    # Every file name carries the number of timesteps reached by this run.
    step_suffix = '-' + str(timesteps[0])

    # Save models
    saver = tf.train.Saver()
    saver.save(sess, os.path.join(SAVE_DIR, MODEL_NAME + step_suffix + '.ckpt'))

    # Save replay buffers, trial times and rewards
    for base_name, payload in ((REPLAY_NAME, all_replay),
                               (ELAPSE_NAME, elapsed),
                               (REWARDS_NAME, rewards)):
        with open(os.path.join(SAVE_DIR, base_name + step_suffix + '.pkl'), "wb") as f:
            pickle.dump(payload, f)

# Clear the interactive namespace (-f skips the confirmation prompt).
# Presumably done so that nothing from the finished run leaks into the next cell;
# as a consequence every import has to be repeated right after it.
%reset -f 
from __future__ import print_function

import pickle
import os
import numpy as np
import tempfile
import tensorflow as tf
import matplotlib.pyplot as plt

from tf_rl.controller import DiscreteDeepQ, ModelController
from tf_rl.simulation import KarpathyGame
from tf_rl import simulate
from tf_rl.models import MLP
from collections import OrderedDict
from euclid import Vector2

from baselines import deepq
import baselines.common.tf_util as U
from baselines.deepq.replay_buffer import ReplayBuffer
from baselines.common.schedules import LinearSchedule
# Discard the default TensorFlow graph so the next cell builds its networks from scratch.
tf.reset_default_graph()