# 1. Start the environment
- Prepare unity environment

In [1]:
from unityagents import UnityEnvironment
import numpy as np

env = UnityEnvironment(file_name="Banana_Windows_x86_64/Banana.exe")

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: BananaBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 37
        Number of stacked Vector Observation: 1
        Vector Action space type: discrete
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


# 2.  Get the default brain

In [2]:
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# 3. Training through DQN and tensorflow
  - Implement DQN with experience memory   
  - number of hidden layers and hidden nodes
  - hidden layers = 2 and hidden nodes = 2/3 * input(37) + output(4) = 27.xx(28)
    https://web.archive.org/web/20140721050413/http://www.heatonresearch.com/node/707

In [3]:
import tensorflow as tf

class DQNetwork:
    def __init__(self, learning_rate=0.01, state_size=37, hidden_size = 256, action_size=4, name='DQNetwork'):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        
        with tf.device("/device:GPU:0"):
        #with tf.variable_scope(name):
            self.inputs = tf.placeholder(tf.float32, [None, state_size], name='inputs')
            self.actions = tf.placeholder(tf.int32, [None], name='actions')
            self.one_hot_actions = tf.one_hot(self.actions, action_size)
            
            self.hidden1 = tf.layers.dense(inputs=self.inputs, units=hidden_size, activation=tf.nn.relu)
            #self.hidden2 = tf.layers.dense(inputs=self.hidden1, units=hidden_size, activation=tf.nn.relu)

            self.output = tf.layers.dense(inputs=self.hidden1, units=action_size, activation=None)

            self.target_Q = tf.placeholder(tf.float32, [None], name='target')
                        
            self.Qmul = tf.multiply(self.output, self.one_hot_actions)
            self.Q = tf.reduce_sum(tf.multiply(self.output, self.one_hot_actions), axis=1)

            self.loss = tf.reduce_mean(tf.square(self.target_Q - self.Q))
            #self.opt = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)
            self.opt = tf.train.RMSPropOptimizer(self.learning_rate).minimize(self.loss)

## 3.1 Hyperparameters

In [4]:
state_size = 37
action_size = 4

train_episodes = 1000          # max number of episodes to learn from
max_steps = 200                # max steps in an episode
gamma = 0.99                   # future reward discount

# Exploration parameters
explore_start = 1.0            # exploration probability at start
explore_stop = 0.01            # minimum exploration probability 
decay_rate = 0.00001            # exponential decay rate for exploration prob

# Network parameters
learning_rate = 0.0001         # Q-network learning rate

# Memory parameters
memory_size = 1000000            # memory capacity
batch_size = 64                # experience mini-batch size
pretrain_length = batch_size   # number experiences to pretrain the memory

#Target network update
#C = 1000
C = 10000
hidden_size = 512

## 3.2 Define the experience memory

In [5]:
from collections import deque

class Memory():
    def __init__(self, max_size=1000):
        self.buffer = deque(maxlen=max_size)
    
    def add(self, experience):
        if len(self.buffer) >= self.buffer.maxlen:
            self.buffer.popleft()         
        self.buffer.append(experience)
        
    def sample(self, batch_size):
        idx = np.random.choice(np.arange(len(self.buffer)),
                               size=batch_size, 
                               replace=False)
        return [self.buffer[i] for i in idx]

## 3.3 Pretrain the experience memory

In [6]:
#Allocate memory
memory = Memory(max_size=memory_size)

#Reset the environment
env_info = env.reset(train_mode=True)[brain_name]
state = env_info.vector_observations[0]

for i in range(pretrain_length):    
    #Choose an action using random policy
    action = np.random.choice(4)
    
    #Take the action
    env_info = env.step(action)[brain_name]
    
    #Observe reward and next state
    reward = env_info.rewards[0]
    next_state = env_info.vector_observations[0]
    done = env_info.local_done[0]                    

    if done:
        print('Done')
        next_state = np.zeros(state_size)
        memory.add((state, action, reward, next_state, done))

        #Reset the environment
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        continue
    else:
        memory.add((state, action, reward, next_state, done))
        
    #S <- S'
    state = next_state

## 3.4 Training

In [7]:
def get_copy_var_ops(*, dest_scope_name="target", src_scope_name="main"):
    with tf.device("/device:GPU:0"):        
        op_holder = []

        src_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=src_scope_name)
        dest_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=dest_scope_name)

        for src_var, dest_var in zip(src_vars, dest_vars):
            #op_holder.append(dest_var.assign(src_var.value()))
            op_holder.append(dest_var.assign(src_var))
        return op_holder

In [11]:
def predict_action(explore_start, explore_stop, decay_rate, step, state):
    explore_p = explore_stop + (explore_start - explore_stop)*np.exp(-decay_rate*step) 
    if explore_p > np.random.rand():
        # Make a random action
        action = np.random.choice(action_size)
    else:
        # Get action from Q-network
        next_Q = sess.run(mainDQN.output, feed_dict={mainDQN.inputs: state.reshape(1, state_size)})
        action = np.asscalar(np.argmax(next_Q[0]))
        #action = np.random.choice(action_size)
        
    return action, explore_p

In [14]:
def validation():
    with tf.Session(config=tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)) as sess:    
        scores = []
        for ep in range(100):
            env_info = env.reset(train_mode=False)[brain_name]
            state = env_info.vector_observations[0]

            score = 0                                          # initialize the score        
            while True:
                Qs = sess.run(mainDQN.output, feed_dict={mainDQN.inputs: state.reshape((1, *state.shape))})
                print(Qs)
                action = np.asscalar(np.argmax(Qs[0]))
                env_info = env.step(action)[brain_name]

                #Observe reward and next state
                reward = env_info.rewards[0]
                next_state = env_info.vector_observations[0]

                score += reward

                if done:
                    break
                state = next_state
            print("episode: {}".format(ep+1), "Score: {}".format(score))
            scores.append(score)
        print("Average score in 100 episodes: {}".format(np.sum(scores) / 100.0))

In [15]:
rewards_list = []
with tf.Session(config=tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)) as sess:
    validation()

FailedPreconditionError: Attempting to use uninitialized value dense_1/bias
	 [[Node: dense_1/bias/read = Identity[T=DT_FLOAT, _class=["loc:@dense_1/bias"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](dense_1/bias)]]

Caused by op 'dense_1/bias/read', defined at:
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\ipykernel\kernelapp.py", line 497, in start
    self.io_loop.start()
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\tornado\platform\asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\asyncio\base_events.py", line 422, in run_forever
    self._run_once()
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\asyncio\base_events.py", line 1434, in _run_once
    handle._run()
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\asyncio\events.py", line 145, in _run
    self._callback(*self._args)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\tornado\platform\asyncio.py", line 122, in _handle_events
    handler_func(fileobj, events)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\tornado\stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\zmq\eventloop\zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\zmq\eventloop\zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\zmq\eventloop\zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\tornado\stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\ipykernel\kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\ipykernel\kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\ipykernel\kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\ipykernel\ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\ipykernel\zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\IPython\core\interactiveshell.py", line 2662, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\IPython\core\interactiveshell.py", line 2785, in _run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\IPython\core\interactiveshell.py", line 2901, in run_ast_nodes
    if self.run_code(code, result):
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\IPython\core\interactiveshell.py", line 2961, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-13e2520f986f>", line 3, in <module>
    mainDQN = DQNetwork(name="main", hidden_size=hidden_size, learning_rate=learning_rate)
  File "<ipython-input-3-a1182b3f2970>", line 18, in __init__
    self.output = tf.layers.dense(inputs=self.hidden1, units=action_size, activation=None)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\tensorflow\python\layers\core.py", line 253, in dense
    return layer.apply(inputs)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\tensorflow\python\layers\base.py", line 825, in apply
    return self.__call__(inputs, *args, **kwargs)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\tensorflow\python\layers\base.py", line 696, in __call__
    self.build(input_shapes)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\tensorflow\python\layers\core.py", line 146, in build
    trainable=True)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\tensorflow\python\layers\base.py", line 546, in add_variable
    partitioner=partitioner)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\tensorflow\python\training\checkpointable.py", line 415, in _add_variable_with_custom_getter
    **kwargs_for_getter)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 1297, in get_variable
    constraint=constraint)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 1093, in get_variable
    constraint=constraint)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 439, in get_variable
    constraint=constraint)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 408, in _true_getter
    use_resource=use_resource, constraint=constraint)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 800, in _get_single_variable
    use_resource=use_resource)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 2157, in variable
    use_resource=use_resource)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 2147, in <lambda>
    previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 2130, in default_variable_creator
    constraint=constraint)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\tensorflow\python\ops\variables.py", line 235, in __init__
    constraint=constraint)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\tensorflow\python\ops\variables.py", line 391, in _init_from_args
    self._snapshot = array_ops.identity(self._variable, name="read")
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\tensorflow\python\ops\array_ops.py", line 142, in identity
    return gen_array_ops.identity(input, name=name)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\tensorflow\python\ops\gen_array_ops.py", line 3658, in identity
    "Identity", input=input, name=name)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\tensorflow\python\framework\ops.py", line 3290, in create_op
    op_def=op_def)
  File "C:\Users\Lab\Anaconda3\envs\p1\lib\site-packages\tensorflow\python\framework\ops.py", line 1654, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

FailedPreconditionError (see above for traceback): Attempting to use uninitialized value dense_1/bias
	 [[Node: dense_1/bias/read = Identity[T=DT_FLOAT, _class=["loc:@dense_1/bias"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](dense_1/bias)]]


In [12]:
with tf.device("/device:GPU:0"):
    tf.reset_default_graph()
    mainDQN = DQNetwork(name="main", hidden_size=hidden_size, learning_rate=learning_rate)
    #targetDQN = DQNetwork(name="target", hidden_size=hidden_size, learning_rate=learning_rate)

In [13]:
rewards_list = []
with tf.Session(config=tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)) as sess:
#with tf.Session() as sess:
    # Initialize variables
    sess.run(tf.global_variables_initializer())
    
    #Needs to be same weights between main and target in start
    #copy_ops = get_copy_var_ops(dest_scope_name="target", src_scope_name="main")
    #sess.run(copy_ops)
    
    step = 0
    for ep in range(0, train_episodes):
    #for ep in range(0, 1):
        env_info = env.reset(train_mode=True)[brain_name] # reset the environment
        state = env_info.vector_observations[0]
        
        total_reward = 0
        while True:
            step += 1
            
            #Select an action with probability e
            action, ex_p = predict_action(explore_start, explore_stop, decay_rate, step, state)
            
            #Execute action
            env_info = env.step(action)[brain_name]
            
            #Observe reward and next state
            reward = env_info.rewards[0]
            next_state = env_info.vector_observations[0]
            done = env_info.local_done[0]

            total_reward += reward
            
            if done:
                next_state = np.zeros(state_size)
            
            #Store transition (st, at, rt, st+1, done) in D
            memory.add((state, action, reward, next_state, done))
            
            # Sample mini-batch from memory
            batch = memory.sample(batch_size)
            
            #states = np.array([each[0].reshape(1, *each[0].shape) for each in batch])
            states = [each[0] for each in batch]
            actions = [each[1] for each in batch]
            rewards = [each[2] for each in batch]
            next_states = [each[3] for each in batch]
            dones = [each[4] for each in batch]
            targets = []

            #next_Qs = sess.run(targetDQN.output, feed_dict={targetDQN.inputs: next_states})
            next_Qs = sess.run(mainDQN.output, feed_dict={mainDQN.inputs: next_states})
            
            for i in range(len(batch)):
                if dones[i]:
                    target = rewards[i]
                else:
                    target = rewards[i] + gamma * np.max(next_Qs[i])                    
                targets.append(target)

            loss, _ = sess.run([mainDQN.loss, mainDQN.opt], feed_dict={mainDQN.inputs: states, mainDQN.target_Q: targets, mainDQN.actions: actions})                   

            """
            if step % C == 0:
                copy_ops = get_copy_var_ops(dest_scope_name="target", src_scope_name="main")
                sess.run(copy_ops)
            """
            if done:
                print('Episode: {}'.format(ep),
                      'Step: {}'.format(step),
                      'Total reward: {}'.format(total_reward),
                      'Training loss: {:.4f}'.format(loss),
                      'Explore P: {:.4f}'.format(ex_p))
                if ep % 100 == 0:
                    #Qs = sess.run(mainDQN.output, feed_dict={mainDQN.inputs: states})
                    #print(Qs)
                    validation()
                rewards_list.append((ep, total_reward))              
                break
            else:
                state = next_state


Episode: 0 Step: 300 Total reward: 1.0 Training loss: 2.9699 Explore P: 0.9970
[[4.5943675 5.3294854 3.2046866 3.1603777]
 [7.944023  4.3194623 4.7484026 4.498185 ]
 [6.846814  3.5674865 4.3562427 3.3723652]
 [7.2145023 3.9676092 4.6181912 3.7733212]
 [5.008889  3.1177785 2.8903613 2.3879185]
 [3.9745803 3.3708763 2.5378127 2.3254042]
 [5.1605244 7.303274  4.261751  4.508261 ]
 [4.4340906 4.9364247 3.0416093 3.6619143]
 [9.137946  4.8083463 5.81684   4.9352255]
 [4.384675  5.3765717 3.296164  3.0554485]
 [4.1635075 4.5745883 2.9408143 2.9118452]
 [9.079639  4.7138877 5.6797037 4.733161 ]
 [3.3951483 3.3976843 2.1190996 1.9494985]
 [3.6946673 4.547998  2.5740614 2.6980965]
 [3.3560114 4.05257   2.2425976 2.2120473]
 [5.6104727 3.279546  2.939683  3.047233 ]
 [6.804757  3.6524189 4.0024967 3.514903 ]
 [3.8017979 3.2434516 2.5931134 2.1953077]
 [5.6742673 8.323836  4.417793  4.562245 ]
 [5.1507683 7.4517713 4.1765275 4.2849364]
 [3.919662  5.137943  2.7202942 2.6587715]
 [4.9110594 6.7109

Episode: 66 Step: 20100 Total reward: 1.0 Training loss: 546.0077 Explore P: 0.8197
Episode: 67 Step: 20400 Total reward: -1.0 Training loss: 1.1745 Explore P: 0.8173
Episode: 68 Step: 20700 Total reward: 1.0 Training loss: 1.3673 Explore P: 0.8149
Episode: 69 Step: 21000 Total reward: -4.0 Training loss: 172.5473 Explore P: 0.8125
Episode: 70 Step: 21300 Total reward: -3.0 Training loss: 1.1160 Explore P: 0.8101
Episode: 71 Step: 21600 Total reward: -1.0 Training loss: 0.9799 Explore P: 0.8077
Episode: 72 Step: 21900 Total reward: 0.0 Training loss: 1.0722 Explore P: 0.8053
Episode: 73 Step: 22200 Total reward: 0.0 Training loss: 0.8191 Explore P: 0.8029
Episode: 74 Step: 22500 Total reward: -1.0 Training loss: 0.8246 Explore P: 0.8005
Episode: 75 Step: 22800 Total reward: 1.0 Training loss: 0.9014 Explore P: 0.7982
Episode: 76 Step: 23100 Total reward: 0.0 Training loss: 0.7501 Explore P: 0.7958
Episode: 77 Step: 23400 Total reward: 1.0 Training loss: 0.9764 Explore P: 0.7934
Episode

Episode: 132 Step: 39900 Total reward: 1.0 Training loss: 0.0225 Explore P: 0.6743
Episode: 133 Step: 40200 Total reward: 0.0 Training loss: 0.0349 Explore P: 0.6723
Episode: 134 Step: 40500 Total reward: 0.0 Training loss: 0.0254 Explore P: 0.6703
Episode: 135 Step: 40800 Total reward: 1.0 Training loss: 0.0289 Explore P: 0.6683
Episode: 136 Step: 41100 Total reward: 1.0 Training loss: 0.0340 Explore P: 0.6664
Episode: 137 Step: 41400 Total reward: 0.0 Training loss: 0.0101 Explore P: 0.6644
Episode: 138 Step: 41700 Total reward: 0.0 Training loss: 0.0414 Explore P: 0.6624
Episode: 139 Step: 42000 Total reward: 1.0 Training loss: 0.0245 Explore P: 0.6605
Episode: 140 Step: 42300 Total reward: 2.0 Training loss: 0.0078 Explore P: 0.6585
Episode: 141 Step: 42600 Total reward: -1.0 Training loss: 0.0094 Explore P: 0.6566
Episode: 142 Step: 42900 Total reward: 0.0 Training loss: 0.0082 Explore P: 0.6546
Episode: 143 Step: 43200 Total reward: 0.0 Training loss: 0.0079 Explore P: 0.6527
Epi

Episode: 201 Step: 60600 Total reward: 0.0 Training loss: 0.0047 Explore P: 0.5501
Episode: 202 Step: 60900 Total reward: 3.0 Training loss: 0.0049 Explore P: 0.5485
Episode: 203 Step: 61200 Total reward: 3.0 Training loss: 0.0064 Explore P: 0.5468
Episode: 204 Step: 61500 Total reward: -1.0 Training loss: 0.0045 Explore P: 0.5452
Episode: 205 Step: 61800 Total reward: 1.0 Training loss: 0.0048 Explore P: 0.5436
Episode: 206 Step: 62100 Total reward: 3.0 Training loss: 0.3298 Explore P: 0.5420
Episode: 207 Step: 62400 Total reward: 1.0 Training loss: 0.3317 Explore P: 0.5404
Episode: 208 Step: 62700 Total reward: 2.0 Training loss: 0.0053 Explore P: 0.5389
Episode: 209 Step: 63000 Total reward: 2.0 Training loss: 0.0055 Explore P: 0.5373
Episode: 210 Step: 63300 Total reward: 5.0 Training loss: 0.0055 Explore P: 0.5357
Episode: 211 Step: 63600 Total reward: 5.0 Training loss: 0.0056 Explore P: 0.5341
Episode: 212 Step: 63900 Total reward: 1.0 Training loss: 0.0095 Explore P: 0.5325
Epi

Episode: 300 Step: 90300 Total reward: 3.0 Training loss: 0.0093 Explore P: 0.4113
[[5.4182587 5.4903746 5.420751  5.3342366]
 [5.112575  5.1782117 5.172853  5.169282 ]
 [5.1260657 5.074095  5.1716204 5.1305065]
 [5.146574  5.268119  5.3786025 5.2481422]
 [5.1045094 5.1741505 5.225743  5.1806955]
 [5.1949167 5.2543674 5.317104  5.2322626]
 [5.200549  5.189224  5.2944713 5.1808763]
 [5.1207094 5.0574408 5.2026215 5.138609 ]
 [5.5345097 5.5321803 5.3916063 5.3933787]
 [5.2493687 5.319669  5.2346635 5.39453  ]
 [5.038951  5.1189265 5.1189528 5.17536  ]
 [5.4635453 5.4307284 5.520057  5.462821 ]
 [5.2949014 5.3066764 5.371245  5.34429  ]
 [5.0859895 5.2005553 5.2810287 5.1281986]
 [5.1208005 5.21648   5.1959176 5.3149695]
 [5.339264  5.3397727 5.2850056 5.315044 ]
 [5.4625406 5.2748647 5.4850183 5.240383 ]
 [5.3015347 5.486179  5.3895926 5.39131  ]
 [5.3391414 5.3737316 5.4860682 5.3829784]
 [5.1650753 5.22003   5.323519  5.214961 ]
 [5.0704594 5.1434636 5.1416144 5.2086763]
 [5.0902157 5.

Episode: 366 Step: 110100 Total reward: 7.0 Training loss: 0.0103 Explore P: 0.3392
Episode: 367 Step: 110400 Total reward: 2.0 Training loss: 0.0227 Explore P: 0.3382
Episode: 368 Step: 110700 Total reward: 1.0 Training loss: 0.5493 Explore P: 0.3372
Episode: 369 Step: 111000 Total reward: 4.0 Training loss: 0.0088 Explore P: 0.3363
Episode: 370 Step: 111300 Total reward: 6.0 Training loss: 0.0062 Explore P: 0.3353
Episode: 371 Step: 111600 Total reward: 2.0 Training loss: 0.0362 Explore P: 0.3343
Episode: 372 Step: 111900 Total reward: 4.0 Training loss: 0.6433 Explore P: 0.3333
Episode: 373 Step: 112200 Total reward: 6.0 Training loss: 0.0058 Explore P: 0.3324
Episode: 374 Step: 112500 Total reward: 6.0 Training loss: 0.0143 Explore P: 0.3314
Episode: 375 Step: 112800 Total reward: 13.0 Training loss: 0.5001 Explore P: 0.3304
Episode: 376 Step: 113100 Total reward: 6.0 Training loss: 0.5683 Explore P: 0.3295
Episode: 377 Step: 113400 Total reward: 5.0 Training loss: 0.0101 Explore P

Episode: 431 Step: 129600 Total reward: 12.0 Training loss: 0.0104 Explore P: 0.2809
Episode: 432 Step: 129900 Total reward: 10.0 Training loss: 0.0116 Explore P: 0.2801
Episode: 433 Step: 130200 Total reward: 3.0 Training loss: 0.0077 Explore P: 0.2793
Episode: 434 Step: 130500 Total reward: 2.0 Training loss: 0.0213 Explore P: 0.2785
Episode: 435 Step: 130800 Total reward: 5.0 Training loss: 0.6209 Explore P: 0.2777
Episode: 436 Step: 131100 Total reward: 6.0 Training loss: 0.0092 Explore P: 0.2769
Episode: 437 Step: 131400 Total reward: 3.0 Training loss: 0.0184 Explore P: 0.2761
Episode: 438 Step: 131700 Total reward: 8.0 Training loss: 0.0085 Explore P: 0.2753
Episode: 439 Step: 132000 Total reward: 6.0 Training loss: 0.0160 Explore P: 0.2745
Episode: 440 Step: 132300 Total reward: 1.0 Training loss: 0.0207 Explore P: 0.2737
Episode: 441 Step: 132600 Total reward: 6.0 Training loss: 0.0180 Explore P: 0.2729
Episode: 442 Step: 132900 Total reward: 2.0 Training loss: 0.0127 Explore 

Episode: 501 Step: 150600 Total reward: 13.0 Training loss: 0.0106 Explore P: 0.2296
Episode: 502 Step: 150900 Total reward: 7.0 Training loss: 0.0272 Explore P: 0.2289
Episode: 503 Step: 151200 Total reward: 13.0 Training loss: 0.0071 Explore P: 0.2283
Episode: 504 Step: 151500 Total reward: 7.0 Training loss: 0.0380 Explore P: 0.2276
Episode: 505 Step: 151800 Total reward: 6.0 Training loss: 0.0149 Explore P: 0.2270
Episode: 506 Step: 152100 Total reward: 8.0 Training loss: 0.0092 Explore P: 0.2263
Episode: 507 Step: 152400 Total reward: 8.0 Training loss: 0.0483 Explore P: 0.2257
Episode: 508 Step: 152700 Total reward: 4.0 Training loss: 0.0206 Explore P: 0.2250
Episode: 509 Step: 153000 Total reward: 7.0 Training loss: 0.0266 Explore P: 0.2244
Episode: 510 Step: 153300 Total reward: 5.0 Training loss: 1.2601 Explore P: 0.2237
Episode: 511 Step: 153600 Total reward: 1.0 Training loss: 0.0191 Explore P: 0.2231
Episode: 512 Step: 153900 Total reward: 5.0 Training loss: 0.5553 Explore 

Episode: 599 Step: 180000 Total reward: 8.0 Training loss: 0.0106 Explore P: 0.1736
Episode: 600 Step: 180300 Total reward: 11.0 Training loss: 0.0314 Explore P: 0.1732
[[6.8907027 6.689128  6.7615776 6.811649 ]
 [6.789767  6.4702835 6.6327047 6.820863 ]
 [7.555445  7.0636396 7.0939302 7.2217197]
 [6.945875  6.768117  6.8879795 6.757513 ]
 [6.558544  6.4925356 6.553587  6.4386463]
 [6.9100604 6.7527657 6.691964  6.8273664]
 [6.893442  6.6835794 6.742337  6.5538473]
 [7.245818  6.899577  6.9208975 6.8831763]
 [6.6127963 6.384124  6.5778027 6.606642 ]
 [6.621251  6.4734106 6.613615  6.6464643]
 [6.67159   6.5605288 6.669571  6.589736 ]
 [7.381959  6.9919844 6.8782988 7.0358915]
 [7.3520103 7.048824  6.836019  6.8938427]
 [6.586057  6.4531674 6.6112213 6.523432 ]
 [5.501063  6.2254124 6.111953  6.0229387]
 [7.129411  6.784399  6.7839537 6.7329874]
 [6.735307  6.6599026 6.633748  6.6885643]
 [6.6895094 6.631706  6.691147  6.642028 ]
 [6.7737584 6.5607357 6.646698  6.6695957]
 [6.73528   6.

Episode: 664 Step: 199500 Total reward: 4.0 Training loss: 0.0069 Explore P: 0.1447
Episode: 665 Step: 199800 Total reward: 10.0 Training loss: 0.0218 Explore P: 0.1443
Episode: 666 Step: 200100 Total reward: 5.0 Training loss: 0.0130 Explore P: 0.1438
Episode: 667 Step: 200400 Total reward: 12.0 Training loss: 0.0062 Explore P: 0.1434
Episode: 668 Step: 200700 Total reward: 4.0 Training loss: 0.0099 Explore P: 0.1430
Episode: 669 Step: 201000 Total reward: 8.0 Training loss: 0.0272 Explore P: 0.1426
Episode: 670 Step: 201300 Total reward: 8.0 Training loss: 0.0151 Explore P: 0.1423
Episode: 671 Step: 201600 Total reward: 1.0 Training loss: 0.0208 Explore P: 0.1419
Episode: 672 Step: 201900 Total reward: 8.0 Training loss: 0.0093 Explore P: 0.1415
Episode: 673 Step: 202200 Total reward: 10.0 Training loss: 0.0279 Explore P: 0.1411
Episode: 674 Step: 202500 Total reward: 8.0 Training loss: 0.6322 Explore P: 0.1407
Episode: 675 Step: 202800 Total reward: 7.0 Training loss: 1.3080 Explore

Episode: 729 Step: 219000 Total reward: 7.0 Training loss: 0.0199 Explore P: 0.1208
Episode: 730 Step: 219300 Total reward: 12.0 Training loss: 0.0226 Explore P: 0.1205
Episode: 731 Step: 219600 Total reward: 7.0 Training loss: 0.0174 Explore P: 0.1201
Episode: 732 Step: 219900 Total reward: 11.0 Training loss: 0.0181 Explore P: 0.1198
Episode: 733 Step: 220200 Total reward: 9.0 Training loss: 0.0091 Explore P: 0.1195
Episode: 734 Step: 220500 Total reward: 8.0 Training loss: 0.0086 Explore P: 0.1191
Episode: 735 Step: 220800 Total reward: 11.0 Training loss: 0.0094 Explore P: 0.1188
Episode: 736 Step: 221100 Total reward: 10.0 Training loss: 0.0244 Explore P: 0.1185
Episode: 737 Step: 221400 Total reward: 8.0 Training loss: 0.0349 Explore P: 0.1182
Episode: 738 Step: 221700 Total reward: 4.0 Training loss: 0.0097 Explore P: 0.1178
Episode: 739 Step: 222000 Total reward: 3.0 Training loss: 0.0221 Explore P: 0.1175
Episode: 740 Step: 222300 Total reward: 3.0 Training loss: 0.0379 Explor

Episode: 801 Step: 240600 Total reward: 15.0 Training loss: 0.0073 Explore P: 0.0993
Episode: 802 Step: 240900 Total reward: 7.0 Training loss: 0.7205 Explore P: 0.0990
Episode: 803 Step: 241200 Total reward: 10.0 Training loss: 0.0133 Explore P: 0.0987
Episode: 804 Step: 241500 Total reward: 2.0 Training loss: 0.0183 Explore P: 0.0985
Episode: 805 Step: 241800 Total reward: 10.0 Training loss: 0.0113 Explore P: 0.0982
Episode: 806 Step: 242100 Total reward: 8.0 Training loss: 0.0081 Explore P: 0.0979
Episode: 807 Step: 242400 Total reward: 8.0 Training loss: 0.7574 Explore P: 0.0977
Episode: 808 Step: 242700 Total reward: 2.0 Training loss: 0.0080 Explore P: 0.0974
Episode: 809 Step: 243000 Total reward: 4.0 Training loss: 0.0395 Explore P: 0.0972
Episode: 810 Step: 243300 Total reward: 10.0 Training loss: 0.0274 Explore P: 0.0969
Episode: 811 Step: 243600 Total reward: 7.0 Training loss: 0.0196 Explore P: 0.0966
Episode: 812 Step: 243900 Total reward: 9.0 Training loss: 0.0649 Explor

Episode: 899 Step: 270000 Total reward: 8.0 Training loss: 0.0237 Explore P: 0.0765
Episode: 900 Step: 270300 Total reward: 8.0 Training loss: 0.0230 Explore P: 0.0763
[[6.2342644 6.2780814 6.3764734 6.304111 ]
 [7.0287304 6.946236  6.884027  6.9791727]
 [6.732742  6.651694  6.5572276 6.706773 ]
 [6.484576  6.4636035 6.6676445 6.5562277]
 [6.709205  6.648992  6.5243134 6.7498736]
 [6.3245296 6.4577303 6.47974   6.4938655]
 [6.4721847 6.5750275 6.557279  6.5728745]
 [6.7713146 6.7090626 6.757108  6.6826115]
 [6.4175267 6.4988494 6.539616  6.497197 ]
 [6.3582    6.4573517 6.460789  6.5458055]
 [6.504364  6.5195293 6.6154866 6.5664034]
 [6.165149  6.2635145 6.240793  6.343864 ]
 [6.450278  6.489879  6.634521  6.507358 ]
 [6.4561906 6.5473213 6.701424  6.496718 ]
 [6.0114517 6.344222  6.2417293 6.3820825]
 [6.24392   6.330599  6.353925  6.5063515]
 [6.736348  6.779067  6.577601  6.9418015]
 [6.48073   6.5300736 6.509212  6.5505404]
 [6.2600794 6.303651  6.3366656 6.4237075]
 [6.3734145 6.4

Episode: 964 Step: 289500 Total reward: 6.0 Training loss: 0.0607 Explore P: 0.0647
Episode: 965 Step: 289800 Total reward: 7.0 Training loss: 0.0365 Explore P: 0.0646
Episode: 966 Step: 290100 Total reward: 14.0 Training loss: 0.0136 Explore P: 0.0644
Episode: 967 Step: 290400 Total reward: 9.0 Training loss: 0.0438 Explore P: 0.0643
Episode: 968 Step: 290700 Total reward: 12.0 Training loss: 0.0181 Explore P: 0.0641
Episode: 969 Step: 291000 Total reward: 5.0 Training loss: 0.6210 Explore P: 0.0639
Episode: 970 Step: 291300 Total reward: 7.0 Training loss: 0.0121 Explore P: 0.0638
Episode: 971 Step: 291600 Total reward: 11.0 Training loss: 0.0126 Explore P: 0.0636
Episode: 972 Step: 291900 Total reward: 2.0 Training loss: 0.7874 Explore P: 0.0634
Episode: 973 Step: 292200 Total reward: 17.0 Training loss: 0.0323 Explore P: 0.0633
Episode: 974 Step: 292500 Total reward: 9.0 Training loss: 0.6886 Explore P: 0.0631
Episode: 975 Step: 292800 Total reward: 3.0 Training loss: 0.0232 Explor

In [None]:
safe_file = './layer1_512_weight.ckpt'
with tf.Session(config=tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)) as sess:
    saver = tf.train.Saver()
    saver.save(sess, safe_file)

Layer 1
   - hidden size : 512, performance : 
Layer 2
   - hidden size : 
Dueling Network


# 4. Validation
  - The task is episodic, and in order to solve the environment, your agent must get an average score of +13 over 100 consecutive episodes.

In [None]:
with tf.Session(config=tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)) as sess:    
    scores = []
    for ep in range(100):
        env_info = env.reset(train_mode=False)[brain_name]
        state = env_info.vector_observations[0]

        score = 0                                          # initialize the score        
        while True:
            Qs = sess.run(mainDQN.output, feed_dict={mainDQN.inputs: state.reshape((1, *state.shape))})
            print(Qs)
            action = np.asscalar(np.argmax(Qs[0]))
            env_info = env.step(action)[brain_name]

            #Observe reward and next state
            reward = env_info.rewards[0]
            next_state = env_info.vector_observations[0]

            score += reward

            if done:
                break
            state = next_state
        print("episode: {}".format(ep+1), "Score: {}".format(score))
        scores.append(score)
    print("Average score in 100 episodes: {}".format(np.sum(scores) / 100.0))

In [None]:
"""
env_info = env.reset(train_mode=True)[brain_name] # reset the environment
state = env_info.vector_observations[0]            # get the current state
score = 0                                          # initialize the score
while True:
    action = np.random.randint(4)        # select an action
    env_info = env.step(action)[brain_name]        # send the action to the environment
    next_state = env_info.vector_observations[0]   # get the next state
    reward = env_info.rewards[0]                   # get the reward
    done = env_info.local_done[0]                  # see if episode has finished
    score += reward                                # update the score
    state = next_state                             # roll over the state to next time step
    if done:                                       # exit loop if episode finished
        break
    
print("Score: {}".format(score))
"""

In [None]:
#env.close()