# 1. Start the environment
- Prepare unity environment

In [1]:
from unityagents import UnityEnvironment
import numpy as np
import tensorflow as tf
from collections import deque

env = UnityEnvironment(file_name="Banana_Windows_x86_64/Banana.exe")

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: BananaBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 37
        Number of stacked Vector Observation: 1
        Vector Action space type: discrete
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


# 2.  Get the default brain

In [2]:
# get the default brain
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# 3. Hyperparameters

In [3]:
state_size = 37
action_size = 4
learning_rate = 0.001
hidden_size = 256   

train_episodes = 1000          # max number of episodes to learn from
gamma = 0.99                   # future reward discount

# Exploration parameters
explore_start = 1.0            # exploration probability at start
explore_stop = 0.01            # minimum exploration probability 
decay_rate = 0.00001            # exponential decay rate for exploration prob

# Memory parameters
memory_size = 100000            # memory capacity
batch_size = 64                # experience mini-batch size
pretrain_length = batch_size   # number experiences to pretrain the memory

# 4. DQN

In [4]:
class DQNetwork:
    def __init__(self, name='main'):
        with tf.device("/device:GPU:0"):
            self.inputs = tf.placeholder(tf.float32, [None, state_size], name='inputs')
            self.target = tf.placeholder(tf.float32, [None], name='target')
            self.actions = tf.placeholder(tf.int32, [None], name='actions')
            self.one_hot_actions = tf.one_hot(self.actions, action_size)
            
            self.hidden = tf.layers.dense(inputs=self.inputs, units=hidden_size, activation=tf.nn.relu, name='hidden')
            self.output = tf.layers.dense(inputs=self.hidden, units=action_size, activation=None)
            
            self.Q = tf.reduce_sum(tf.multiply(self.output, self.one_hot_actions))

            self.loss = tf.reduce_mean(tf.square(self.target - self.Q))
            #self.opt = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)
            self.opt = tf.train.RMSPropOptimizer(learning_rate).minimize(self.loss)

# 5. The replay memory

In [5]:
class Memory():
    def __init__(self, max_size=1000):
        self.buffer = deque(maxlen=max_size)
    
    def add(self, experience):
        if len(self.buffer) >= self.buffer.maxlen:
            self.buffer.popleft()         
        self.buffer.append(experience)
        
    def sample(self, batch_size):
        if batch_size > len(self.buffer):
            batch_size = len(self.buffer)

        idx = np.random.choice(np.arange(len(self.buffer)),
                               size=batch_size, 
                               replace=False)
        return [self.buffer[i] for i in idx]

## 5.1. Pretrain in the replay memory

In [6]:
memory = Memory(max_size=memory_size)

env_info = env.reset(train_mode=True)[brain_name]
state = env_info.vector_observations[0]

for i in range(pretrain_length):    
    action = np.random.choice(action_size)
    
    env_info = env.step(action)[brain_name]
    
    #Observe reward and next state
    reward = env_info.rewards[0]
    next_state = env_info.vector_observations[0]
    done = env_info.local_done[0]                    
    
    if done:
        next_state = np.zeros(37)
        memory.add((state, action, reward, next_state, done))
        
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        continue
    else:
        memory.add((state, action, reward, next_state, done))
        
    #S <- S'
    state = next_state

# 6. Train

In [8]:
def predict_action(explore_start, explore_stop, decay_rate, step, state):    
    explore_p = explore_stop + (explore_start - explore_stop)*np.exp(-decay_rate*step) 
    if explore_p > np.random.rand():
        action = np.random.choice(action_size)
    else:
        next_Q = sess.run(mainDQN.output, feed_dict={mainDQN.inputs: state.reshape(1, state_size)})
        action = np.asscalar(np.argmax(next_Q[0]))
    return action, explore_p

with tf.device("/device:GPU:0"):
    tf.reset_default_graph()
    mainDQN = DQNetwork(name="main")    

In [9]:
init_op = tf.global_variables_initializer()

save_file = './Model/basic.ckpt'
saver = tf.train.Saver()

with tf.Session(config=tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)) as sess:
    sess.run(init_op)
   
    step = 0
    for ep in range(0, train_episodes):
        env_info = env.reset(train_mode=True)[brain_name] # reset the environment
        state = env_info.vector_observations[0]
                      
        total_reward = 0
        while True:
            step += 1
            
            #Select an action with probability e
            action, ex_p = predict_action(explore_start, explore_stop, decay_rate, step, state)
            
            #Execute action
            env_info = env.step(action)[brain_name]
            
            #Observe reward and next state
            reward = env_info.rewards[0]
            next_state = env_info.vector_observations[0]
            done = env_info.local_done[0]
            
            total_reward += reward

            if done:
                next_state = np.zeros(state_size)

            #Store transition (st, at, rt, st+1, done) in D
            memory.add((state, action, reward, next_state, done))
            
            # Sample mini-batch from memory
            batch = memory.sample(batch_size)
            
            states = [each[0] for each in batch]
            actions = [each[1] for each in batch]
            rewards = [each[2] for each in batch]
            next_states = [each[3] for each in batch]
            dones = [each[4] for each in batch]
            targets= []
                        
            next_Qs = sess.run(mainDQN.output, feed_dict={mainDQN.inputs: next_states})
            batch_sum = 0
            for i in range(len(batch)):
                if dones[i]:
                    target = rewards[i]
                else:
                    target = rewards[i] + gamma * np.max(next_Qs[i])                    
                targets.append(target)
                batch_sum += rewards[i]

            loss, _ = sess.run([mainDQN.loss, mainDQN.opt], feed_dict={mainDQN.inputs: states, mainDQN.target: targets, mainDQN.actions: actions})                   

            if done:
                print('Episode: {}'.format(ep),
                      'Total reward: {}'.format(total_reward),
                      'Training sum: {:.2f}'.format(batch_sum),
                      'Training loss: {:.3f}'.format(loss),
                      'Explore P: {:.4f}'.format(ex_p))
                #if ep % 50 == 0:
                    #Qs = sess.run(mainDQN.output, feed_dict={mainDQN.inputs: states})
                    #print(Qs)
                #rewards_list.append((ep, total_reward))              
                break
            state = next_state
    
    save_path = saver.save(sess, save_file)
    print("Model saved in path: %s" % save_path)
    with tf.variable_scope('hidden', reuse=True):
        w = tf.get_variable('kernel')
        print(w)

Episode: 0 Total reward: 2.0 Training sum: 1.00 Training loss: 13.769 Explore P: 0.9970
Episode: 1 Total reward: 1.0 Training sum: 0.00 Training loss: 0.188 Explore P: 0.9941
Episode: 2 Total reward: 0.0 Training sum: 1.00 Training loss: 3.125 Explore P: 0.9911
Episode: 3 Total reward: 0.0 Training sum: 0.00 Training loss: 2.828 Explore P: 0.9882
Episode: 4 Total reward: -1.0 Training sum: 0.00 Training loss: 1.758 Explore P: 0.9853
Episode: 5 Total reward: -2.0 Training sum: 0.00 Training loss: 3.010 Explore P: 0.9823
Episode: 6 Total reward: 2.0 Training sum: 0.00 Training loss: 0.690 Explore P: 0.9794
Episode: 7 Total reward: -1.0 Training sum: -1.00 Training loss: 0.753 Explore P: 0.9765
Episode: 8 Total reward: 0.0 Training sum: 0.00 Training loss: 0.676 Explore P: 0.9736
Episode: 9 Total reward: 1.0 Training sum: -1.00 Training loss: 2.126 Explore P: 0.9707
Episode: 10 Total reward: 1.0 Training sum: 0.00 Training loss: 0.443 Explore P: 0.9679
Episode: 11 Total reward: -1.0 Train

Episode: 93 Total reward: -2.0 Training sum: 0.00 Training loss: 0.001 Explore P: 0.7567
Episode: 94 Total reward: 1.0 Training sum: 0.00 Training loss: 0.000 Explore P: 0.7545
Episode: 95 Total reward: 0.0 Training sum: 0.00 Training loss: 0.002 Explore P: 0.7523
Episode: 96 Total reward: -2.0 Training sum: 0.00 Training loss: 0.000 Explore P: 0.7500
Episode: 97 Total reward: 0.0 Training sum: 0.00 Training loss: 0.000 Explore P: 0.7478
Episode: 98 Total reward: 0.0 Training sum: 0.00 Training loss: 0.000 Explore P: 0.7456
Episode: 99 Total reward: 2.0 Training sum: 0.00 Training loss: 0.000 Explore P: 0.7434
Episode: 100 Total reward: 0.0 Training sum: 0.00 Training loss: 0.001 Explore P: 0.7412
Episode: 101 Total reward: 0.0 Training sum: 0.00 Training loss: 0.000 Explore P: 0.7390
Episode: 102 Total reward: -1.0 Training sum: -1.00 Training loss: 0.016 Explore P: 0.7368
Episode: 103 Total reward: 0.0 Training sum: 1.00 Training loss: 0.016 Explore P: 0.7347
Episode: 104 Total rewar

Episode: 185 Total reward: 0.0 Training sum: 1.00 Training loss: 0.016 Explore P: 0.5766
Episode: 186 Total reward: -2.0 Training sum: 0.00 Training loss: 0.000 Explore P: 0.5749
Episode: 187 Total reward: 2.0 Training sum: 1.00 Training loss: 0.016 Explore P: 0.5732
Episode: 188 Total reward: 1.0 Training sum: 1.00 Training loss: 0.016 Explore P: 0.5716
Episode: 189 Total reward: 2.0 Training sum: 1.00 Training loss: 0.015 Explore P: 0.5699
Episode: 190 Total reward: 0.0 Training sum: 1.00 Training loss: 0.016 Explore P: 0.5682
Episode: 191 Total reward: 0.0 Training sum: 1.00 Training loss: 0.016 Explore P: 0.5665
Episode: 192 Total reward: 1.0 Training sum: 1.00 Training loss: 0.015 Explore P: 0.5649
Episode: 193 Total reward: -1.0 Training sum: 0.00 Training loss: 0.000 Explore P: 0.5632
Episode: 194 Total reward: 0.0 Training sum: 1.00 Training loss: 0.016 Explore P: 0.5615
Episode: 195 Total reward: -1.0 Training sum: 0.00 Training loss: 0.000 Explore P: 0.5599
Episode: 196 Total

Episode: 277 Total reward: 1.0 Training sum: 0.00 Training loss: 0.001 Explore P: 0.4400
Episode: 278 Total reward: 2.0 Training sum: 0.00 Training loss: 0.000 Explore P: 0.4387
Episode: 279 Total reward: 1.0 Training sum: 0.00 Training loss: 0.000 Explore P: 0.4374
Episode: 280 Total reward: 2.0 Training sum: 1.00 Training loss: 0.016 Explore P: 0.4361
Episode: 281 Total reward: 3.0 Training sum: 0.00 Training loss: 0.000 Explore P: 0.4348
Episode: 282 Total reward: 0.0 Training sum: 0.00 Training loss: 0.032 Explore P: 0.4336
Episode: 283 Total reward: -1.0 Training sum: 0.00 Training loss: 0.000 Explore P: 0.4323
Episode: 284 Total reward: -1.0 Training sum: 0.00 Training loss: 0.000 Explore P: 0.4310
Episode: 285 Total reward: -1.0 Training sum: 0.00 Training loss: 0.000 Explore P: 0.4298
Episode: 286 Total reward: 0.0 Training sum: -2.00 Training loss: 0.032 Explore P: 0.4285
Episode: 287 Total reward: 0.0 Training sum: -1.00 Training loss: 0.016 Explore P: 0.4273
Episode: 288 Tot

Episode: 369 Total reward: 0.0 Training sum: 0.00 Training loss: 0.000 Explore P: 0.3363
Episode: 370 Total reward: 1.0 Training sum: -1.00 Training loss: 0.016 Explore P: 0.3353
Episode: 371 Total reward: 1.0 Training sum: 0.00 Training loss: 0.000 Explore P: 0.3343
Episode: 372 Total reward: 0.0 Training sum: 0.00 Training loss: 0.000 Explore P: 0.3333
Episode: 373 Total reward: 0.0 Training sum: 0.00 Training loss: 0.000 Explore P: 0.3324
Episode: 374 Total reward: 2.0 Training sum: 1.00 Training loss: 0.016 Explore P: 0.3314
Episode: 375 Total reward: 1.0 Training sum: -1.00 Training loss: 0.016 Explore P: 0.3304
Episode: 376 Total reward: 0.0 Training sum: 0.00 Training loss: 0.000 Explore P: 0.3295
Episode: 377 Total reward: 2.0 Training sum: 0.00 Training loss: 0.000 Explore P: 0.3285
Episode: 378 Total reward: 4.0 Training sum: 0.00 Training loss: 0.000 Explore P: 0.3276
Episode: 379 Total reward: 0.0 Training sum: 1.00 Training loss: 0.016 Explore P: 0.3266
Episode: 380 Total 

Episode: 461 Total reward: 0.0 Training sum: 0.00 Training loss: 0.000 Explore P: 0.2576
Episode: 462 Total reward: 2.0 Training sum: 0.00 Training loss: 0.000 Explore P: 0.2568
Episode: 463 Total reward: 1.0 Training sum: 0.00 Training loss: 0.000 Explore P: 0.2561
Episode: 464 Total reward: -2.0 Training sum: 0.00 Training loss: 0.000 Explore P: 0.2554
Episode: 465 Total reward: 0.0 Training sum: 1.00 Training loss: 0.016 Explore P: 0.2546
Episode: 466 Total reward: -1.0 Training sum: -1.00 Training loss: 0.016 Explore P: 0.2539
Episode: 467 Total reward: -1.0 Training sum: 0.00 Training loss: 0.000 Explore P: 0.2532
Episode: 468 Total reward: 1.0 Training sum: 0.00 Training loss: 0.000 Explore P: 0.2524
Episode: 469 Total reward: -2.0 Training sum: -1.00 Training loss: 0.017 Explore P: 0.2517
Episode: 470 Total reward: 0.0 Training sum: -1.00 Training loss: 0.016 Explore P: 0.2510
Episode: 471 Total reward: -3.0 Training sum: 0.00 Training loss: 0.000 Explore P: 0.2503
Episode: 472 

KeyboardInterrupt: 

# 7. Validation

In [7]:
save_file = './Model/basic.ckpt'

tf.reset_default_graph()
DQN = DQNetwork(name="main")

saver = tf.train.Saver()

#with tf.device("/device:GPU:0") as sess:
with tf.Session(config=tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)) as sess:
    saver.restore(sess, save_file)
    
    scores = []
    print("Validation Start")
    for ep in range(1, 101):
        env_info = env.reset(train_mode=False)[brain_name]
        state = env_info.vector_observations[0]

        score = 0                                          # initialize the score        
        while True:
            #Choose action
            """
            # Get action from Q-network            
            Qs = sess.run(DQN.output, feed_dict={DQN.inputs: state.reshape((1, *state.shape))})
            action = np.asscalar(np.argmax(Qs[0]))
            next_Qs = sess.run(DQN.output, feed_dict={DQN.inputs: state.reshape((1, *state.shape))})
            
            Qs[0] = [q / sum(Qs[0]) for q in Qs[0]]

            if Qs[0, action] > np.random.uniform():
                pass
            else:
                action = np.random.choice(4) # roll the dice!
            """
            Qs = sess.run(DQN.output, feed_dict={DQN.inputs: state.reshape((1, *state.shape))})
            action = np.asscalar(np.argmax(Qs[0]))
            #print("action: ", action, ", Q : ",Qs[0])
            #sleep(0.1)
            
            #Execute action
            env_info = env.step(action)[brain_name]

            #Observe reward and next state
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            next_state = env_info.vector_observations[0]

            #Store reward
            score += reward

            if done:
                break
            """
            next_Qs = sess.run(mainDQN.output, feed_dict={mainDQN.inputs: next_state})
            target = reward + gamma * np.max(next_Qs[0])
            loss, _ = sess.run([mainDQN.loss, mainDQN.opt], feed_dict={mainDQN.inputs: state.reshape((1, *state.shape)), mainDQN.target: target, mainDQN.actions: action})
            """
            state = next_state
        print("episode: {}, Score : {}".format(ep, score))
        scores.append(score)
    print("Average score in 100 episodes: {}".format(np.sum(scores) / 100.0))    

INFO:tensorflow:Restoring parameters from ./Model/basic.ckpt


NotFoundError: Unsuccessful TensorSliceReader constructor: Failed to find any matching files for ./Model/basic.ckpt
	 [[Node: save/RestoreV2 = RestoreV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_save/Const_0_0, save/RestoreV2/tensor_names, save/RestoreV2/shape_and_slices)]]

Caused by op 'save/RestoreV2', defined at:
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\ipykernel\kernelapp.py", line 499, in start
    self.io_loop.start()
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\tornado\platform\asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\asyncio\base_events.py", line 422, in run_forever
    self._run_once()
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\asyncio\base_events.py", line 1434, in _run_once
    handle._run()
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\asyncio\events.py", line 145, in _run
    self._callback(*self._args)
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\tornado\ioloop.py", line 758, in _run_callback
    ret = callback()
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\tornado\stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\tornado\gen.py", line 1233, in inner
    self.run()
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\tornado\gen.py", line 1147, in run
    yielded = self.gen.send(value)
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\ipykernel\kernelbase.py", line 359, in dispatch_queue
    yield self.process_one()
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\tornado\gen.py", line 346, in wrapper
    runner = Runner(result, future, yielded)
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\tornado\gen.py", line 1080, in __init__
    self.run()
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\tornado\gen.py", line 1147, in run
    yielded = self.gen.send(value)
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\ipykernel\kernelbase.py", line 346, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\tornado\gen.py", line 326, in wrapper
    yielded = next(result)
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\ipykernel\kernelbase.py", line 259, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\tornado\gen.py", line 326, in wrapper
    yielded = next(result)
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\ipykernel\kernelbase.py", line 513, in execute_request
    user_expressions, allow_stdin,
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\tornado\gen.py", line 326, in wrapper
    yielded = next(result)
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\ipykernel\ipkernel.py", line 294, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\ipykernel\zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\IPython\core\interactiveshell.py", line 2817, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\IPython\core\interactiveshell.py", line 2843, in _run_cell
    return runner(coro)
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\IPython\core\async_helpers.py", line 67, in _pseudo_sync_runner
    coro.send(None)
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\IPython\core\interactiveshell.py", line 3018, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\IPython\core\interactiveshell.py", line 3183, in run_ast_nodes
    if (yield from self.run_code(code, result)):
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\IPython\core\interactiveshell.py", line 3265, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-5b92b678fa1f>", line 6, in <module>
    saver = tf.train.Saver()
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\tensorflow\python\training\saver.py", line 1311, in __init__
    self.build()
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\tensorflow\python\training\saver.py", line 1320, in build
    self._build(self._filename, build_save=True, build_restore=True)
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\tensorflow\python\training\saver.py", line 1357, in _build
    build_save=build_save, build_restore=build_restore)
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\tensorflow\python\training\saver.py", line 809, in _build_internal
    restore_sequentially, reshape)
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\tensorflow\python\training\saver.py", line 448, in _AddRestoreOps
    restore_sequentially)
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\tensorflow\python\training\saver.py", line 860, in bulk_restore
    return io_ops.restore_v2(filename_tensor, names, slices, dtypes)
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\tensorflow\python\ops\gen_io_ops.py", line 1541, in restore_v2
    shape_and_slices=shape_and_slices, dtypes=dtypes, name=name)
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\tensorflow\python\framework\ops.py", line 3290, in create_op
    op_def=op_def)
  File "C:\Users\AHI\Anaconda3\envs\P1_Navigation\lib\site-packages\tensorflow\python\framework\ops.py", line 1654, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

NotFoundError (see above for traceback): Unsuccessful TensorSliceReader constructor: Failed to find any matching files for ./Model/basic.ckpt
	 [[Node: save/RestoreV2 = RestoreV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_save/Const_0_0, save/RestoreV2/tensor_names, save/RestoreV2/shape_and_slices)]]


Basic Code
일단 Hidden size 팍 줄이고, Layer 늘려볼것.
그리고 나서 Target network 적용
그리고 나서 prioritized 방식 적용

메모
1. Validation 시 좌우, 앞뒤와 같은 반복움직임;
   - Local maxima인거 같은데 음;;;
2. Memory replay 개선 : prioritized 방식
   - 학습 Convergence 속도 때문
3. Target Q 다시 적용
   - 이게 가장 시급한거 같음;
4. Dueling:
5. 4 stack
