## Grid World Environment setting
* states, actions, transition probability

In [2]:
# set state
import numpy as np

# Grid dimensions. The map array below has shape (nCols+2, nRows+2), i.e. nCols
# counts entries along the first axis and nRows along the second.
nCols = 3
nRows = 4
nWalls = 1
states = list(range(nCols * nRows - nWalls))  # one integer id per free cell
N_STATES = len(states)

terminal_state = [3, 6]
win_state = [3]
lose_state = [6]
# an episode may start from any non-terminal state
start_state = [x for x in states if x not in terminal_state]

# set map
# 0 marks a free cell and -1 a blocked cell. The grid is surrounded by a
# one-cell border of -1 so that stepping off the edge is handled exactly like
# stepping into the wall.
# NOTE: `map` shadows the Python builtin; the name is kept for compatibility.
map = -np.ones((nCols + 2, nRows + 2))
map[1:nCols + 1, 1:nRows + 1] = 0
map[2, 2] = -1  # add wall

# set action
actions = [0, 1, 2, 3]
N_ACTIONS = len(actions)

# states -> location
# locations[s] is the (i, j) index into `map` of state s; free cells are
# numbered while scanning the first axis in the outer loop.
locations = [(i + 1, j + 1)
             for i in range(nCols)
             for j in range(nRows)
             if map[i + 1, j + 1] == 0]
assert len(locations) == N_STATES, "number of free cells must equal N_STATES"

# action -> move
# move[a] is the (di, dj) offset applied by action a; with the map printed as
# an array these are left, up, right, down.
move = [(0, -1), (-1, 0), (0, 1), (1, 0)]  # match index with actions
assert len(move) == N_ACTIONS

# set transition probability
# The chosen action is executed with probability 0.8; with probability 0.1 each
# the neighbouring action index (a-1 or a+1, wrapping around) is executed
# instead. A move into a blocked cell leaves the agent where it is.
P = np.zeros((N_STATES, N_ACTIONS, N_STATES))  # P[S,A,S']
for s in range(N_STATES):
    for a in range(N_ACTIONS):
        current_location = locations[s]
        for slip, prob in ((0, 0.8), (-1, 0.1), (1, 0.1)):
            di, dj = move[(a + slip) % N_ACTIONS]
            next_location = (current_location[0] + di, current_location[1] + dj)
            if map[next_location] == -1:  # there is barrier or wall
                next_location = current_location
            next_s = states[locations.index(next_location)]
            P[s, a, next_s] += prob
# np.random.choice(p=...) requires every P[s, a, :] to be a distribution
assert np.allclose(P.sum(axis=2), 1.0), "each P[s, a, :] must sum to 1"

# rewards s,a ---  R(s,a)  ---> s'
step_reward = -0.02  # reward for every non-terminal state (alternative: -0.5)
R = step_reward * np.ones((N_STATES, N_ACTIONS))
R[win_state, :] = 1
R[lose_state, :] = -1

# discount factor
gamma = 0.99


def _deterministic_policy(action_per_state):
    """Return an (N_STATES, N_ACTIONS) one-hot policy.

    action_per_state[s] is the action taken with probability 1 in state s.
    """
    one_hot = np.zeros((N_STATES, N_ACTIONS))
    one_hot[np.arange(N_STATES), action_per_state] = 1
    return one_hot


# policy : given state which action would u choose
# assume that we know the policy; policy[s, a] is the probability of a in s
bad_policy = _deterministic_policy([2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1])

random_policy = 0.25 * np.ones((N_STATES, N_ACTIONS))

optimal_policy = _deterministic_policy([2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0])

# optimal policy mixed with a little uniform noise, renormalised per state
ep = 0.1
optimalWithNoise_policy = optimal_policy + (ep / N_ACTIONS) * np.ones((N_STATES, N_ACTIONS))
optimalWithNoise_policy = optimalWithNoise_policy / np.sum(optimalWithNoise_policy, axis=1).reshape((N_STATES, 1))

for _pi in (bad_policy, random_policy, optimal_policy, optimalWithNoise_policy):
    assert np.allclose(_pi.sum(axis=1), 1.0), "each policy row must sum to 1"

In [3]:
import tensorflow as tf

## Every visit Monte Carlo Policy Evaluation
* Function Approximation(linear combination)
$$V(s) = X(s)^{T}w$$
* loss function
$$J(w) = E_{\pi}[(G_{t}\:-\:X(s)^{T}w)^{2}]$$
* gradient descent
$$\nabla_{w}J(w) = -2\:*\:E_{\pi}[(G_{t}\:-\:X(s)^{T}w)\:*\:X(s)]$$
* stochastic gradient descent(batch size 1)
$$\nabla_{w}J(w) = -2\:*(G_{t}\:-\:X(s)^{T}w)\:*\:X(s)$$
* update parameter vector w (the constant factor 2 is absorbed into the learning rate $\alpha$)
$$\Delta w = \alpha\:*\:(G_{t}\:-\:X(s)^{T}w)*X(s) $$

In [9]:
# Every-visit Monte Carlo Policy Evaluation for V
#
# Simulates `epoch` episodes under `policy` and estimates V(s) in two ways:
#   * tabular every-visit MC: average of all returns observed from s
#   * linear function approximation V(s) = one_hot(s)^T W trained by SGD
#     (batch size 1) on the squared error between the return G(t) and V(s(t))
import time

tf.reset_default_graph()
start = time.time()
print("Tensorflow version : ")
print(tf.__version__)
print()


## set hyperparameters
epoch = 1000    # number of simulated episodes
lr_rate = 0.01  # gradient-descent step size
policy = optimalWithNoise_policy # Evaluation -> follow given policy
## MC evaluation
num_visit = np.zeros(N_STATES) # N(s): how many times s was visited
cum_gain = np.zeros(N_STATES) # S(s): sum of the returns observed from s
with tf.device('/cpu:0'):
    ## set tensorflow variable
    state_tf = tf.placeholder(tf.int32,shape=[None],name = "state")
    gain_tf = tf.placeholder(tf.float32,shape=[None],name = 'gain')
    #### number state -> matrix ex. 3 -> [0 0 0 1 0 0 0 0 0 0 0]
    W = tf.get_variable(name='W', \
                        shape = [N_STATES,1],\
                        dtype = tf.float32, \
                        initializer=tf.random_uniform_initializer(-1.0,1.0))
state_tf_one_hot = tf.one_hot(state_tf,N_STATES)
# linear combination representation of the state value function, shape [batch, 1]
V_tf = tf.matmul(state_tf_one_hot, W)
# gain_tf is [batch] while V_tf is [batch, 1]; subtracting them directly would
# broadcast to [batch, batch], so the returns are reshaped to a column first.
MC_error = tf.reshape(gain_tf, [-1, 1]) - V_tf
loss = tf.reduce_mean(tf.square(MC_error)) #mean-square-error
opt = tf.train.GradientDescentOptimizer(learning_rate=lr_rate)
train_ops = opt.minimize(loss)

with tf.Session() as sess:
    tf.global_variables_initializer().run()

    for _ in range(epoch):
        reward_history = []      # r(t) for every step, terminal reward included
        simulation_history = []  # s(t) for every step, terminal state included
        done = False
        s = np.random.choice(start_state) # random initial state

        while not done:
            simulation_history.append(s)
            a = np.random.choice(actions,p=policy[s,:])
            reward_history.append(R[s,a])
            s1 = np.random.choice(states,p=P[s,a,:])

            if s1 in terminal_state:
                done = True
                # the terminal state is recorded with its own reward; R is the
                # same for every action there, so column 0 is used
                simulation_history.append(s1)
                reward_history.append(R[s1,0])
            else:
                s = s1

        # After finishing one simulation update the value function -> offline
        # evaluate G(t) backwards: G(T) = r(T), G(t) = r(t) + gamma * G(t+1)
        gain_history = []
        G = 0.0
        for r in reversed(reward_history):
            G = r + gamma * G
            gain_history.append(G)
        gain_history.reverse()  # gain_history[t] now lines up with simulation_history[t]

        ##-------------------- This is for Exact MC
        # every-visit: each occurrence of a state contributes its return
        for visited, G in zip(simulation_history, gain_history):
            num_visit[visited] += 1
            cum_gain[visited] += G
        ##-------------------- This is for Function approximation MC
        # one SGD step per visited state
        for visited, G in zip(simulation_history, gain_history):
            feed_dict = {state_tf: [visited], gain_tf: [G]}
            sess.run(train_ops, feed_dict=feed_dict)

    # after finishing all epochs: read the learned value of every state
    V_final = [sess.run(V_tf, feed_dict={state_tf: [s]})[0][0] for s in states]
    print("Function Approximation result")
    print(V_final)
# tabular estimate S(s)/N(s); the small constant avoids 0/0 for unvisited states
V = cum_gain/(num_visit+1.0e-8)
print()
print()
print("Exact Value Function from MC")
print(V)
print()
print()
print("it takes "+str(round(time.time()-start))+" sec")

Tensorflow version : 
1.15.0



InvalidArgumentError: Cannot assign a device for operation W/Initializer/random_uniform/RandomUniform: Could not satisfy explicit device specification '' because the node node W/Initializer/random_uniform/RandomUniform (defined at /home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py:1748) placed on device Device assignments active during op 'W/Initializer/random_uniform/RandomUniform' creation:
  with tf.device(None): </home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tensorflow_core/python/ops/variables.py:1816>
  with tf.device(/GPU:0): <<ipython-input-9-298df1d72c53>:18>  was colocated with a group of nodes that required incompatible device '/device:GPU:0'. All available devices [/job:localhost/replica:0/task:0/device:CPU:0, /job:localhost/replica:0/task:0/device:XLA_CPU:0, /job:localhost/replica:0/task:0/device:XLA_GPU:0]. 
Colocation Debug Info:
Colocation group had the following types and supported devices: 
Root Member(assigned_device_name_index_=-1 requested_device_name_='/device:GPU:0' assigned_device_name_='' resource_device_name_='/device:GPU:0' supported_device_types_=[CPU] possible_devices_=[]
Assign: CPU 
ApplyGradientDescent: CPU 
RandomUniform: CPU XLA_CPU XLA_GPU 
Const: CPU XLA_CPU XLA_GPU 
Mul: CPU XLA_CPU XLA_GPU 
Sub: CPU XLA_CPU XLA_GPU 
Add: CPU XLA_CPU XLA_GPU 
Identity: CPU XLA_CPU XLA_GPU 
VariableV2: CPU 

Colocation members, user-requested devices, and framework assigned devices, if any:
  W/Initializer/random_uniform/shape (Const) 
  W/Initializer/random_uniform/min (Const) 
  W/Initializer/random_uniform/max (Const) 
  W/Initializer/random_uniform/RandomUniform (RandomUniform) 
  W/Initializer/random_uniform/sub (Sub) 
  W/Initializer/random_uniform/mul (Mul) 
  W/Initializer/random_uniform (Add) 
  W (VariableV2) /device:GPU:0
  W/Assign (Assign) /device:GPU:0
  W/read (Identity) /device:GPU:0
  GradientDescent/update_W/ApplyGradientDescent (ApplyGradientDescent) /device:GPU:0

	 [[node W/Initializer/random_uniform/RandomUniform (defined at /home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py:1748) ]]Additional information about colocations:No node-device colocations were active during op 'W/Initializer/random_uniform/RandomUniform' creation.
Device assignments active during op 'W/Initializer/random_uniform/RandomUniform' creation:
  with tf.device(None): </home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tensorflow_core/python/ops/variables.py:1816>
  with tf.device(/GPU:0): <<ipython-input-9-298df1d72c53>:18>

Original stack trace for 'W/Initializer/random_uniform/RandomUniform':
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/traitlets/config/application.py", line 664, in launch_instance
    app.start()
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 583, in start
    self.io_loop.start()
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 148, in start
    self.asyncio_loop.run_forever()
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/asyncio/base_events.py", line 442, in run_forever
    self._run_once()
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/asyncio/base_events.py", line 1462, in _run_once
    handle._run()
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/asyncio/events.py", line 145, in _run
    self._callback(*self._args)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tornado/ioloop.py", line 690, in <lambda>
    lambda f: self._run_callback(functools.partial(callback, future))
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tornado/ioloop.py", line 743, in _run_callback
    ret = callback()
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tornado/gen.py", line 787, in inner
    self.run()
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tornado/gen.py", line 748, in run
    yielded = self.gen.send(value)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 361, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 268, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 541, in execute_request
    user_expressions, allow_stdin,
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tornado/gen.py", line 209, in wrapper
    yielded = next(result)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 300, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2848, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2874, in _run_cell
    return runner(coro)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
    coro.send(None)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3051, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3242, in run_ast_nodes
    if (await self.run_code(code, result,  async_=asy)):
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-9-298df1d72c53>", line 26, in <module>
    initializer=tf.random_uniform_initializer(-1.0,1.0))
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tensorflow_core/python/ops/variable_scope.py", line 1500, in get_variable
    aggregation=aggregation)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tensorflow_core/python/ops/variable_scope.py", line 1243, in get_variable
    aggregation=aggregation)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tensorflow_core/python/ops/variable_scope.py", line 567, in get_variable
    aggregation=aggregation)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tensorflow_core/python/ops/variable_scope.py", line 519, in _true_getter
    aggregation=aggregation)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tensorflow_core/python/ops/variable_scope.py", line 933, in _get_single_variable
    aggregation=aggregation)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tensorflow_core/python/ops/variables.py", line 258, in __call__
    return cls._variable_v1_call(*args, **kwargs)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tensorflow_core/python/ops/variables.py", line 219, in _variable_v1_call
    shape=shape)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tensorflow_core/python/ops/variables.py", line 197, in <lambda>
    previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tensorflow_core/python/ops/variable_scope.py", line 2519, in default_variable_creator
    shape=shape)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tensorflow_core/python/ops/variables.py", line 262, in __call__
    return super(VariableMetaclass, cls).__call__(*args, **kwargs)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tensorflow_core/python/ops/variables.py", line 1688, in __init__
    shape=shape)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tensorflow_core/python/ops/variables.py", line 1818, in _init_from_args
    initial_value(), name="initial_value", dtype=dtype)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tensorflow_core/python/ops/variable_scope.py", line 905, in <lambda>
    partition_info=partition_info)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tensorflow_core/python/ops/init_ops.py", line 283, in __call__
    shape, self.minval, self.maxval, dtype, seed=self.seed)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tensorflow_core/python/ops/random_ops.py", line 245, in random_uniform
    rnd = gen_random_ops.random_uniform(shape, dtype, seed=seed1, seed2=seed2)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tensorflow_core/python/ops/gen_random_ops.py", line 822, in random_uniform
    name=name)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tensorflow_core/python/framework/op_def_library.py", line 794, in _apply_op_helper
    op_def=op_def)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tensorflow_core/python/util/deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 3357, in create_op
    attrs, op_def, compute_device)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 3426, in _create_op_internal
    op_def=op_def)
  File "/home/donghyun/anaconda2/envs/RL_STUDY/lib/python3.6/site-packages/tensorflow_core/python/framework/ops.py", line 1748, in __init__
    self._traceback = tf_stack.extract_stack()


In [23]:
# Every-visit Monte Carlo Policy Evaluation for Q
#
# Simulates `epoch` episodes under `policy` and fits a linear approximation
# Q(s, a) = [one_hot(s), one_hot(a)]^T W by SGD (batch size 1) on the squared
# error between the return G(t) and Q(s(t), a(t)).
import time

tf.reset_default_graph()
start = time.time()
print("Tensorflow version : ")
print(tf.__version__)
print()


## set hyperparameters
epoch = 1000    # number of simulated episodes
lr_rate = 0.01  # gradient-descent step size
policy = optimalWithNoise_policy # Evaluation -> follow given policy

with tf.device('/cpu:0'):
    ## set tensorflow variable
    state_tf = tf.placeholder(tf.int32,shape=[None],name = "state")
    action_tf = tf.placeholder(tf.int32,shape=[None],name = "action")
    gain_tf = tf.placeholder(tf.float32,shape=[None],name = 'gain')
    #### number state -> matrix ex. 3 -> [0 0 0 1 0 0 0 0 0 0 0]
    W = tf.get_variable(name='W', \
                        shape = [N_STATES+N_ACTIONS,1],\
                        dtype = tf.float32, \
                        initializer=tf.random_uniform_initializer(-1.0,1.0))
state_tf_one_hot = tf.one_hot(state_tf,N_STATES)
action_tf_one_hot = tf.one_hot(action_tf,N_ACTIONS)

# feature vector: state one-hot followed by action one-hot
state_action_tf_one_hot = tf.concat([state_tf_one_hot, action_tf_one_hot], 1)

# linear combination representation of the action value function, shape
# [batch, 1]; with these features Q(s, a) is one state weight plus one action weight
Q = tf.matmul(state_action_tf_one_hot, W)
# gain_tf is [batch] while Q is [batch, 1]; subtracting them directly would
# broadcast to [batch, batch], so the returns are reshaped to a column first.
MC_error = tf.reshape(gain_tf, [-1, 1]) - Q
loss = tf.reduce_mean(tf.square(MC_error)) #mean-square-error
opt = tf.train.GradientDescentOptimizer(learning_rate=lr_rate)
train_ops = opt.minimize(loss)

with tf.Session() as sess:
    tf.global_variables_initializer().run()

    for _ in range(epoch):
        reward_history = []      # r(t) for every step, terminal reward included
        simulation_history = []  # (s(t), a(t)) for every step
        done = False
        s = np.random.choice(start_state) # random initial state

        while not done:
            # sample the action first so the recorded pair is the action
            # actually taken in s (not a stale one from an earlier step)
            a = np.random.choice(actions,p=policy[s,:])
            simulation_history.append((s,a))
            reward_history.append(R[s,a])
            s1 = np.random.choice(states,p=P[s,a,:])

            if s1 in terminal_state:
                done = True
                # the terminal state is recorded with action 0 and its own
                # reward; R is the same for every action there
                simulation_history.append((s1,0))
                reward_history.append(R[s1,0])
            else:
                s = s1

        # After finishing one simulation update the value function -> offline
        # evaluate G(t) backwards: G(T) = r(T), G(t) = r(t) + gamma * G(t+1)
        gain_history = []
        G = 0.0
        for r in reversed(reward_history):
            G = r + gamma * G
            gain_history.append(G)
        gain_history.reverse()  # gain_history[t] now lines up with simulation_history[t]

        ##-------------------- This is for Function approximation MC
        # one SGD step per visited (state, action) pair
        for (visited_s, taken_a), G in zip(simulation_history, gain_history):
            feed_dict = {state_tf: [visited_s],\
                         action_tf: [taken_a],\
                         gain_tf: [G]}
            sess.run(train_ops, feed_dict=feed_dict)

    # after finishing all epochs: read the learned value of every (s, a)
    Q_final= np.empty((N_STATES,N_ACTIONS))
    for s in states:
        for a in actions:
            feed_dict = {state_tf: [s],action_tf: [a]}
            Q_now = sess.run(Q,feed_dict=feed_dict)
            Q_final[s,a] = Q_now[0][0]
    print("Function Approximation result")
    print(Q_final)

print("it takes "+str(round(time.time()-start))+" sec")

Tensorflow version : 
1.5.0

Function Approximation result
[[ 0.76324111  0.79037249  0.77992487  0.69187081]
 [ 0.8347652   0.86189657  0.85144901  0.76339489]
 [ 0.91392899  0.9410603   0.93061274  0.84255862]
 [ 1.01648712  1.04361844  1.03317094  0.94511676]
 [ 0.77556163  0.80269301  0.79224539  0.70419133]
 [ 0.51105231  0.53818369  0.52773613  0.43968201]
 [-0.54534531 -0.51821393 -0.52866149 -0.61671561]
 [ 0.75443184  0.78156316  0.77111566  0.68306148]
 [ 0.71992838  0.7470597   0.73661214  0.64855802]
 [ 0.70071876  0.72785008  0.71740258  0.6293484 ]
 [ 0.41266191  0.43979326  0.4293457   0.34129158]]
it takes 7 sec
