In [1]:
import os
import math
import random
import numpy as np

import tensorflow as tf

from gym_2048.envs.game2048_env import Game2048Env

  from ._conv import register_converters as _register_converters


In [2]:
# env = Game2048Env()

In [3]:
# obs = env.reset()
# print(obs)
# print(env.render())

In [4]:
# state = preprocess_observation(obs)

In [5]:
# obs, reward, done, info = env.step(0)
# env.render()
# print(reward, done)

In [6]:
class ReplayMemory:
    def __init__(self, maxlen):
        self.maxlen = maxlen
        self.buf = np.empty(shape=maxlen, dtype=np.object)
        self.index = 0
        self.length = 0
        
    def append(self, data):
        self.buf[self.index] = data
        self.length = min(self.length + 1, self.maxlen)
        self.index = (self.index + 1) % self.maxlen
    
    def sample(self, batch_size, with_replacement=True):
        if with_replacement:
            indices = np.random.randint(self.length, size=batch_size) # faster
        else:
            indices = np.random.permutation(self.length)[:batch_size]
        return self.buf[indices]

In [7]:
class Stats:
    def __init__(self, maxlen):
        self.maxlen = maxlen
        self.score = np.zeros(shape=maxlen, dtype=np.int32)
        self.highest = np.zeros(shape=maxlen, dtype=np.int32)
        self.index = 0
        self.total_game = 0
        self.reach2048 = 0
        self.highest_reached = 0
        
    def append(self, score, highest):
        self.score[self.index] = score
        self.highest[self.index] = highest
        self.index = (self.index + 1) % self.maxlen
        self.total_game += 1
        self.reach2048 += 1 if highest == 2048 else 0
        self.highest_reached = max(self.highest_reached, highest)
        
    def getStat(self):
        return [int(self.score.mean()), int(self.highest.mean()), self.total_game, self.reach2048, self.highest_reached]

In [8]:
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)
    
def q_network(X_state, name):
    with tf.variable_scope(name) as scope:
        hidden1 = tf.layers.dense(inputs = X_state, 
                                 units = n_hidden,
                                 activation=tf.nn.relu,
                                 kernel_initializer=initializer)
        hidden2 = tf.layers.dense(inputs = hidden1, 
                                 units = n_hidden,
                                 activation=tf.nn.relu,
                                 kernel_initializer=initializer)
        hidden3 = tf.layers.dense(inputs = hidden2, 
                                 units = n_hidden,
                                 activation=tf.nn.relu,
                                 kernel_initializer=initializer)
        hidden4 = tf.layers.dense(inputs = hidden3, 
                                 units = n_hidden,
                                 activation=tf.nn.relu,
                                 kernel_initializer=initializer)
        outputs = tf.layers.dense(inputs = hidden4, 
                                  units = n_outputs,
                                  kernel_initializer=initializer)
    trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       scope=scope.name)
    trainable_vars_by_name = {var.name[len(scope.name):]: var
                              for var in trainable_vars}
    return outputs, trainable_vars_by_name

# def q_network(X_state, name):
#     with tf.variable_scope(name) as scope:
#         hidden1 = tf.layers.conv2d(X_state, filters=32, 
#                                    kernel_size=(2, 2), strides=(1, 1), padding="SAME", 
#                                    activation=tf.nn.relu, kernel_initializer=initializer)
#         hidden2 = tf.layers.conv2d(hidden1, filters=32, 
#                                    kernel_size=(2, 2), strides=(1, 1), padding="SAME", 
#                                    activation=tf.nn.relu, kernel_initializer=initializer)
#         hidden3 = tf.layers.conv2d(hidden2, filters=32, 
#                                    kernel_size=(2, 2), strides=(1, 1), padding="SAME", 
#                                    activation=tf.nn.relu, kernel_initializer=initializer)
#         last_conv_layer_flat = tf.reshape(hidden3, shape=[-1, 4*4*32])
#         dense1 = tf.layers.dense(inputs = last_conv_layer_flat, 
#                                   units = 100,
#                                   kernel_initializer=initializer)
#         outputs = tf.layers.dense(inputs = dense1, 
#                                   units = n_outputs,
#                                   kernel_initializer=initializer)
#     trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
#                                        scope=scope.name)
#     trainable_vars_by_name = {var.name[len(scope.name):]: var
#                               for var in trainable_vars}
#     return outputs, trainable_vars_by_name

def sample_memories(batch_size):
    cols = [[], [], [], [], []] # state, action, reward, next_state, continue
    for memory in replay_memory.sample(batch_size):
        for col, value in zip(cols, memory):
            col.append(value)
    cols = [np.array(col) for col in cols]
    return cols[0], cols[1].reshape(-1, 1), cols[2].reshape(-1, 1), cols[3], cols[4].reshape(-1, 1)

def epsilon_greedy(q_values, step):
    epsilon = max(eps_min, eps_max - (eps_max-eps_min) * step/eps_decay_steps)
    if np.random.rand() < epsilon:
        return np.random.randint(n_outputs) # random action
    else:
        return np.argmax(q_values) # optimal action

# def preprocess_observation(obs):
#     return obs/2048

def preprocess_observation(obs):
    result = np.zeros((4, 4, 16), dtype = np.uint8)
    for i, value in enumerate(obs):
        if value != 0:
            depth = int(math.log(value, 2)-1)
            result[i%4, i//4, depth] = 1
    return result.flatten()

In [9]:
# import matplotlib.pyplot as plt

# X = range(0, 400000, 1000)
# y = [max(eps_min, eps_max - (eps_max-eps_min) * x/eps_decay_steps) for x in X]

# plt.plot(X, y)
# plt.show()

In [10]:
replay_memory_size = 50000
replay_memory = ReplayMemory(replay_memory_size)
stat = Stats(10)

n_input = [None, 256]
# n_input = [None, 4, 4, 16]
n_hidden = 100
n_outputs = 4

eps_min = 0.1
eps_max = 1.0
eps_decay_steps = 200000

learning_rate = 0.01
momentum = 0.95

n_steps = eps_decay_steps*2  # total number of training steps
training_start = replay_memory_size  # start training after 10,000 game iterations
training_interval = 4  # run a training step every 4 game iterations
save_steps = 1000  # save the model every 1,000 training steps
copy_steps = 3000  # copy online DQN to target DQN every 10,000 training steps
discount_rate = 0.95
batch_size = 50
iteration = 0  # game iterations
checkpoint_path = "F:/training_data/DQN/Pacman/my_dqn_2048.ckpt" #"./my_dqn.ckpt"
done = True # env needs to be reset

loss_val = np.infty
game_length = 0
total_max_q = 0
mean_max_q = 0.0

total_reward = []
max_reward = 0

config = tf.ConfigProto(device_count = {'GPU': 0})

In [11]:
reset_graph()

initializer = tf.variance_scaling_initializer()

X_state = tf.placeholder(tf.float32, shape=n_input)

online_q_values, online_vars = q_network(X_state, name="q_networks/online")
target_q_values, target_vars = q_network(X_state, name="q_networks/target")

copy_ops = [target_var.assign(online_vars[var_name])
            for var_name, target_var in target_vars.items()]
copy_online_to_target = tf.group(*copy_ops)

In [12]:
with tf.variable_scope("train"):
    X_action = tf.placeholder(tf.int32, shape=[None, 1])  # action taken (shape batch x 1)
    y = tf.placeholder(tf.float32, shape=[None, 1])       # Q value computed (shape batch x 1)
    q_value = tf.reduce_sum(online_q_values * tf.one_hot(X_action, n_outputs), axis=1, keepdims=True)  # element-wise product Q-value x Action_OHE(batch x 4)
    error = tf.abs(y - q_value)
    clipped_error = tf.clip_by_value(error, 0.0, 1.0)
    linear_error = 2 * (error - clipped_error)
    loss = tf.reduce_mean(tf.square(clipped_error) + linear_error)

    global_step = tf.Variable(0, trainable=False, name='global_step')
    optimizer = tf.train.MomentumOptimizer(learning_rate, momentum, use_nesterov=True)
    training_op = optimizer.minimize(loss, global_step=global_step)

init = tf.global_variables_initializer()
saver = tf.train.Saver()

In [19]:
env = Game2048Env()

with tf.Session(config = config) as sess:  #config = config
#     if os.path.isfile(checkpoint_path + ".index"):
#         saver.restore(sess, checkpoint_path)
#     else:
#         init.run()
#         copy_online_to_target.run() 
    init.run()
    copy_online_to_target.run()
    
    while True:
        step = global_step.eval()
        if step >= n_steps:
            break
        iteration += 1
        avg_score, avg_highest, count_game, count_success, highest_reached = stat.getStat()
#         print("""\r Iter {}\tTraining step {}/{} ({:.1f})%\tLoss {:2f}\tMean Max-Q {:2f}\t AVG Score {}\tAVG Highest {}\t Game {} \t Win {}\t Best {} \t""".format(iteration, step, n_steps, step * 100 / n_steps, loss_val, mean_max_q, avg_score, avg_highest, count_game, count_success, highest_reached), end="\r")
#         print("Iter {}\tTraining step {}/{} ({:.1f})%\tLoss {:2f}\tMean Max-Q {:2f}".format(iteration, step, n_steps, step * 100 / n_steps, loss_val, mean_max_q), end="\r")
        print("\r Iter {}\t{}".format(iteration, highest_reached), end="")
    
        if done: # game over, start again
            obs = env.reset()
            state = preprocess_observation(obs)
            print(state.shape)

        # Online DQN evaluates what to do
        q_values = online_q_values.eval(feed_dict={X_state: [state]})
        action = epsilon_greedy(q_values, step)

        # Online DQN plays
        obs, reward, done, info = env.step(action)
        next_state = preprocess_observation(obs)

        # Let's memorize what happened
        replay_memory.append((state, action, reward, next_state, 1.0 - done))
        state = next_state

        # Compute statistics for tracking progress (not shown in the book)
        total_max_q += q_values.max()
        game_length += 1
        if done:
            mean_max_q = total_max_q / game_length
            total_max_q = 0.0
            game_length = 0
            stat.append(env.score, env.highest())

        if iteration < training_start or iteration % training_interval != 0:
            continue # oly tranin after warmup period and at regular intervals
        
        # Sample memories and use the target DQN to produce the target Q-Value
        X_state_val, X_action_val, rewards, X_next_state_val, continues = sample_memories(batch_size)
        next_q_values = target_q_values.eval(feed_dict={X_state: X_next_state_val})
        y_val = rewards + continues * discount_rate * np.max(next_q_values, axis=1, keepdims=True)

        # Train the online DQN
        _, loss_val = sess.run([training_op, loss], feed_dict={
            X_state: X_state_val, X_action: X_action_val, y: y_val})

        # Regularly copy the online DQN to the target DQN
        if step % copy_steps == 0:
            copy_online_to_target.run()

#         # And save regularly
#         if step % save_steps == 0:
#             saver.save(sess, checkpoint_path)

 Iter 121495	256(256,)
 Iter 121579	256(256,)
 Iter 121721	256(256,)
 Iter 121834	256(256,)
 Iter 121958	256(256,)
 Iter 122169	256(256,)
 Iter 122345	256(256,)
 Iter 122436	256(256,)
 Iter 122517	256(256,)
 Iter 122646	256(256,)
 Iter 122750	256(256,)
 Iter 122878	256(256,)
 Iter 122950	256(256,)
 Iter 123033	256(256,)
 Iter 123169	256(256,)
 Iter 123292	256(256,)
 Iter 123366	256

KeyboardInterrupt: 

In [20]:
env = Game2048Env()

with tf.Session(config=config) as sess:
#     saver.restore(sess, checkpoint_path)
    obs = env.reset()
    state = preprocess_observation(obs)
    while True:
        q_values = online_q_values.eval(feed_dict={X_state: [state]})
        action = np.argmax(q_values)
        obs, reward, done, info = env.step(action)
        state = preprocess_observation(obs)
        print(action, reward)
#         env.render()
        if done:
            env.close()
            break

(256,)


FailedPreconditionError: Attempting to use uninitialized value q_networks/online/dense_1/kernel
	 [[Node: q_networks/online/dense_1/kernel/read = Identity[T=DT_FLOAT, _class=["loc:@q_networks/online/dense_1/kernel"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](q_networks/online/dense_1/kernel)]]

Caused by op 'q_networks/online/dense_1/kernel/read', defined at:
  File "C:\python36\envs\machine_learning\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\python36\envs\machine_learning\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\python36\envs\machine_learning\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\python36\envs\machine_learning\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\python36\envs\machine_learning\lib\site-packages\ipykernel\kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "C:\python36\envs\machine_learning\lib\site-packages\zmq\eventloop\ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "C:\python36\envs\machine_learning\lib\site-packages\tornado\ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "C:\python36\envs\machine_learning\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\python36\envs\machine_learning\lib\site-packages\zmq\eventloop\zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "C:\python36\envs\machine_learning\lib\site-packages\zmq\eventloop\zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "C:\python36\envs\machine_learning\lib\site-packages\zmq\eventloop\zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "C:\python36\envs\machine_learning\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\python36\envs\machine_learning\lib\site-packages\ipykernel\kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "C:\python36\envs\machine_learning\lib\site-packages\ipykernel\kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "C:\python36\envs\machine_learning\lib\site-packages\ipykernel\kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "C:\python36\envs\machine_learning\lib\site-packages\ipykernel\ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\python36\envs\machine_learning\lib\site-packages\ipykernel\zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\python36\envs\machine_learning\lib\site-packages\IPython\core\interactiveshell.py", line 2698, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\python36\envs\machine_learning\lib\site-packages\IPython\core\interactiveshell.py", line 2802, in run_ast_nodes
    if self.run_code(code, result):
  File "C:\python36\envs\machine_learning\lib\site-packages\IPython\core\interactiveshell.py", line 2862, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-11-6a6ed56f5baf>", line 7, in <module>
    online_q_values, online_vars = q_network(X_state, name="q_networks/online")
  File "<ipython-input-8-511916cb5e00>", line 15, in q_network
    kernel_initializer=initializer)
  File "C:\python36\envs\machine_learning\lib\site-packages\tensorflow\python\layers\core.py", line 253, in dense
    return layer.apply(inputs)
  File "C:\python36\envs\machine_learning\lib\site-packages\tensorflow\python\layers\base.py", line 762, in apply
    return self.__call__(inputs, *args, **kwargs)
  File "C:\python36\envs\machine_learning\lib\site-packages\tensorflow\python\layers\base.py", line 636, in __call__
    self.build(input_shapes)
  File "C:\python36\envs\machine_learning\lib\site-packages\tensorflow\python\layers\core.py", line 137, in build
    trainable=True)
  File "C:\python36\envs\machine_learning\lib\site-packages\tensorflow\python\layers\base.py", line 504, in add_variable
    partitioner=partitioner)
  File "C:\python36\envs\machine_learning\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 1262, in get_variable
    constraint=constraint)
  File "C:\python36\envs\machine_learning\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 1097, in get_variable
    constraint=constraint)
  File "C:\python36\envs\machine_learning\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 435, in get_variable
    constraint=constraint)
  File "C:\python36\envs\machine_learning\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 404, in _true_getter
    use_resource=use_resource, constraint=constraint)
  File "C:\python36\envs\machine_learning\lib\site-packages\tensorflow\python\ops\variable_scope.py", line 806, in _get_single_variable
    constraint=constraint)
  File "C:\python36\envs\machine_learning\lib\site-packages\tensorflow\python\ops\variables.py", line 229, in __init__
    constraint=constraint)
  File "C:\python36\envs\machine_learning\lib\site-packages\tensorflow\python\ops\variables.py", line 376, in _init_from_args
    self._snapshot = array_ops.identity(self._variable, name="read")
  File "C:\python36\envs\machine_learning\lib\site-packages\tensorflow\python\ops\array_ops.py", line 127, in identity
    return gen_array_ops.identity(input, name=name)
  File "C:\python36\envs\machine_learning\lib\site-packages\tensorflow\python\ops\gen_array_ops.py", line 2728, in identity
    "Identity", input=input, name=name)
  File "C:\python36\envs\machine_learning\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "C:\python36\envs\machine_learning\lib\site-packages\tensorflow\python\framework\ops.py", line 3160, in create_op
    op_def=op_def)
  File "C:\python36\envs\machine_learning\lib\site-packages\tensorflow\python\framework\ops.py", line 1625, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

FailedPreconditionError (see above for traceback): Attempting to use uninitialized value q_networks/online/dense_1/kernel
	 [[Node: q_networks/online/dense_1/kernel/read = Identity[T=DT_FLOAT, _class=["loc:@q_networks/online/dense_1/kernel"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](q_networks/online/dense_1/kernel)]]


In [17]:
obs = env.reset()
state = preprocess_observation(obs)
state

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8)