In [1]:
import os
import math
import random
import numpy as np

import tensorflow as tf

from gym_2048.envs.game2048_env import Game2048Env

  from ._conv import register_converters as _register_converters


In [2]:
class ReplayMemory:
    def __init__(self, maxlen):
        self.maxlen = maxlen
        self.reset()

    def append(self, state, action, reward, state_final, cont):
        self.state_init[self.index] = state
        self.action[self.index] = action
        self.reward[self.index] = reward
        self.state_final[self.index] = state_final
        self.not_over[self.index] = cont
        self.length = min(self.length + 1, self.maxlen)
        self.index = (self.index + 1) % self.maxlen

    @property
    def isFull(self):
        return self.length == self.maxlen

    def sample(self, batch_size, with_replacement=True):
        if with_replacement:
            indices = np.random.randint(self.length, size=batch_size) # faster
        else:
            indices = np.random.permutation(self.length)[:batch_size]

        return self.state_init[indices], self.action[indices], self.reward[indices], self.state_final[indices], self.not_over[indices]

    def reset(self):
        self.state_init = np.empty(shape=(self.maxlen, 256), dtype=np.uint8)
        self.action = np.empty(shape=(self.maxlen, 1), dtype=np.uint8)
        self.reward = np.empty(shape=(self.maxlen, 1), dtype=np.int32)
        self.state_final = np.empty(shape=(self.maxlen, 256), dtype=np.uint8)
        self.not_over = np.empty(shape=(self.maxlen, 1), dtype=np.bool_)
        self.index = 0
        self.length = 0

In [3]:
class Stats:
    def __init__(self, maxlen):
        self.maxlen = maxlen
        self.reset()

    def append(self, score, highest):
        self.score[self.index] = score
        self.highest[self.index] = highest
        self.index = (self.index + 1) % self.maxlen
        self.total_game += 1
        self.reach2048 += 1 if highest == 2048 else 0
        self.highest_reached = max(self.highest_reached, highest)

    def getStat(self):
        return [self.score.mean(),
                self.highest.mean(),
                self.total_game,
                self.reach2048,
                self.highest_reached]

    def reset(self):
        self.score = np.zeros(shape=self.maxlen, dtype=np.int32)
        self.highest = np.zeros(shape=self.maxlen, dtype=np.int32)
        self.index = 0
        self.total_game = 0
        self.reach2048 = 0
        self.highest_reached = 0

In [11]:
class DQN:
    def __init__(self):
        self.stat = Stats(maxlen = 10)
        self.replay_memory_size = 5000
        self.replay_memory = ReplayMemory(maxlen = self.replay_memory_size)

        self.n_input = [None, 256]
        self.n_hidden = [100, 100, 100]
        self.names = ["H1", "H2", "H3"]
        self.n_outputs = 4

        self.learning_rate = 0.01
        self.momentum = 0.95
        self.discount_rate = 0.95
        self.batch_size = 50

        self.eps_min = 0.1
        self.eps_max = 1.0
        self.eps_decay_steps = 200000

        self.n_steps = self.eps_decay_steps * 2
        self.training_start = self.replay_memory_size  # start learning only when replay memory is full
        self.training_interval = 1

        self.save_steps = 1000
        self.copy_steps = 3000

        self.loss_val = np.infty
        self.game_length = 0
        self.total_max_q = 0
        self.mean_max_q = 0.0

        self.config = tf.ConfigProto(device_count = {'GPU': 0})

        self.checkpoint_path = "F:/training_data/DQN/Pacman/my_dqn_2048.ckpt"

        self.reset_graph()

    def reset_graph(self, seed=42):
        tf.reset_default_graph()
        tf.set_random_seed(seed)
        np.random.seed(seed)

        self.replay_memory.reset()
        self.stat.reset()

        self.initializer = tf.variance_scaling_initializer()

        self.X_state = tf.placeholder(tf.float32, shape=self.n_input)
        self.actor_Q_values, self.weight_actor = self.createModel(name = "Actor", prev_layer = self.X_state)
        self.critic_Q_values, self.weight_critic = self.createModel(name="Critic", prev_layer = self.X_state)

        self.transfertLearning = self.CopyCriticToActor()

        # with tf.variable_scope("train"):
        self.X_action = tf.placeholder(tf.int32, shape=[None, 1])  # action taken (shape batch x 1)
        self.y = tf.placeholder(tf.float32, shape=[None, 1])       # Q-value computed (shape batch x 1)
        self.pred_q_value = tf.reduce_sum(self.actor_Q_values * tf.one_hot(self.X_action, self.n_outputs), axis=1,
                                keepdims=True)         # element-wise product Q-value x Action_OHE(batch x 4)

        # clipped loss
        error = tf.abs(self.y - self.pred_q_value)
        clipped_error = tf.clip_by_value(error, -1.0, 1.0)
        linear_error = 2 * (error - clipped_error)
        self.loss = tf.reduce_mean(tf.square(clipped_error) + linear_error)

        self.global_step = tf.Variable(0, trainable=False, name='global_step')
        self.optimizer = tf.train.MomentumOptimizer(self.learning_rate, self.momentum, use_nesterov=True)
        self.training_operation = self.optimizer.minimize(self.loss, global_step=self.global_step)

        self.init = tf.global_variables_initializer()
        self.saver = tf.train.Saver()

    def CopyCriticToActor(self):
        copy_operations = [target_var.assign(self.weight_actor[var_name])
            for var_name, target_var in self.weight_critic.items()]
        return tf.group(*copy_operations)

    def createModel(self, name, prev_layer):
        with tf.variable_scope(name) as scope:
            for name_layer, n_unit in zip(self.names, self.n_hidden):
                prev_layer = tf.layers.dense(inputs=prev_layer,
                                             units=n_unit,
                                             activation=tf.nn.relu,
                                             # name=name+ "/" + name_layer,
                                             kernel_initializer=self.initializer)
        outputs = tf.layers.dense(inputs=prev_layer,
                                  units=self.n_outputs,
                                  kernel_initializer=self.initializer)
        tensors = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)
        dict_weight = {}
        for var in tensors:
            dict_weight[var.name[len(scope.name):]] = var  # keep only the end the name
        return outputs, dict_weight

    def epsilon_greedy(self, q_values, step):
        epsilon = max(self.eps_min, self.eps_max - (self.eps_max - self.eps_min) * step / self.eps_decay_steps)
        if np.random.rand() < epsilon:
            return np.random.randint(self.n_outputs)  # random action
        else:
            return np.argmax(q_values)  # optimal action

    def preprocess_observation(self, obs):
        result = np.zeros((16, 4, 4), dtype=np.uint8)
        for i, value in enumerate(obs):
            if value != 0:
                depth = int(math.log(value, 2) - 1)
                result[depth, i % 4, i // 4] = 1
        return result.flatten()

    def train(self):
        env = Game2048Env()
        done = True  # to directly reset the game

        with tf.Session(config=self.config) as sess:
            if os.path.isfile(self.checkpoint_path + ".index"):
                self.saver.restore(sess, self.checkpoint_path)
            else:
                self.init.run()
                self.transfertLearning.run()

            step = self.global_step.eval()
            iter = 0
            while step < self.n_steps:
                step = self.global_step.eval()

                if done:  # game over, start again
                    self.stat.append(env.score, env.highest())
                    obs = env.reset()
                    state = self.preprocess_observation(obs)

                avg_score, avg_highest, count_game, count_success, highest_reached = self.stat.getStat()
#                 print("\rIter {}\tTraining step {}/{} ({:.1f})%\tAVG Score {}"
#                       "\tAVG Highest {}\tGame {}\t Win {}\t Best {}\t".format(
#                         iter, step, self.n_steps, step * 100 / self.n_steps, avg_score, avg_highest,
#                         count_game, count_success, highest_reached), end="")

                print("\rIter {}\tTraining step {}/{} ({:.1f})%\tGame {}t Best {}".format(
                        iter, step, self.n_steps, step * 100 / self.n_steps, count_game, highest_reached), end="")

                q_values = self.actor_Q_values.eval(feed_dict={self.X_state: [state]})
                action = self.epsilon_greedy(q_values, step)

                obs, reward, done, info = env.step(action)
                next_state = self.preprocess_observation(obs)

                self.replay_memory.append(state, action, reward, next_state, 1.0 - done)

                state = next_state
                iter += 1
                if self.replay_memory.isFull:
                    X_state_val, X_action_val, Rewards, X_next_state_val, Continues = self.replay_memory.sample(self.batch_size)
                    next_q_values = self.critic_Q_values.eval(feed_dict={self.X_state: X_next_state_val})
                    y_val = Rewards + Continues * self.discount_rate * np.max(next_q_values, axis=1, keepdims=True)

                    # Train the online DQN
                    _, loss_val, b = sess.run([self.training_operation, self.loss, self.pred_q_value ], feed_dict={
                        self.X_state: X_state_val, self.X_action: X_action_val, self.y: y_val})

                    # Regularly copy the online DQN to the target DQN
                    if step % self.copy_steps == 0:
                        self.transfertLearning.run()

                    if step % self.save_steps == 0:
                        self.saver.save(sess, self.checkpoint_path)

    def play(self, n_iter = 1):
        env = Game2048Env()
        i = 0
        with tf.Session(config=self.config) as sess:
            self.saver.restore(sess, self.checkpoint_path)
            for i in range(n_iter):
                obs = env.reset()
                state = self.preprocess_observation(obs)
                while True:
                    q_values = self.actor_Q_values.eval(feed_dict={self.X_state: [state]})
                    action = np.argmax(q_values)
                    obs, reward, done, info = env.step(action)
                    if reward < 0:
                        i+=1
                    if i == 10:
                        break
                    state = self.preprocess_observation(obs)
                    print(action, reward)
                    # env.render()
                    if done:
                        print("Game {} - Max Reached = {} - Score {}".format(i, env.highest(), env.score))
                        break
            env.close()

In [8]:
dqn = DQN()

In [9]:
dqn.train()

Iter 0	Training step 0/400000 (0.0)%	Game 1t Best 2Iter 1	Training step 0/400000 (0.0)%	Game 1t Best 2Iter 2	Training step 0/400000 (0.0)%	Game 1t Best 2Iter 3	Training step 0/400000 (0.0)%	Game 1t Best 2Iter 4	Training step 0/400000 (0.0)%	Game 1t Best 2Iter 5	Training step 0/400000 (0.0)%	Game 1t Best 2Iter 6	Training step 0/400000 (0.0)%	Game 1t Best 2Iter 7	Training step 0/400000 (0.0)%	Game 1t Best 2Iter 8	Training step 0/400000 (0.0)%	Game 1t Best 2Iter 9	Training step 0/400000 (0.0)%	Game 1t Best 2Iter 10	Training step 0/400000 (0.0)%	Game 1t Best 2Iter 11	Training step 0/400000 (0.0)%	Game 1t Best 2Iter 12	Training step 0/400000 (0.0)%	Game 1t Best 2Iter 13	Training step 0/400000 (0.0)%	Game 1t Best 2Iter 14	Training step 0/400000 (0.0)%	Game 1t Best 2Iter 15	Training step 0/400000 (0.0)%	Game 1t Best 2Iter 16	Training step 0/400000 (0.0)%	Game 1t Best 2Iter 17	Training step 0/400000 (0.0)%	Game 1t Best 2Iter 18	Training step 0/400000 (0.0)%	Game 1t Best 2It

Iter 307	Training step 0/400000 (0.0)%	Game 3t Best 128Iter 308	Training step 0/400000 (0.0)%	Game 3t Best 128Iter 309	Training step 0/400000 (0.0)%	Game 3t Best 128Iter 310	Training step 0/400000 (0.0)%	Game 3t Best 128Iter 311	Training step 0/400000 (0.0)%	Game 3t Best 128Iter 312	Training step 0/400000 (0.0)%	Game 3t Best 128Iter 313	Training step 0/400000 (0.0)%	Game 3t Best 128Iter 314	Training step 0/400000 (0.0)%	Game 3t Best 128Iter 315	Training step 0/400000 (0.0)%	Game 3t Best 128Iter 316	Training step 0/400000 (0.0)%	Game 3t Best 128Iter 317	Training step 0/400000 (0.0)%	Game 3t Best 128Iter 318	Training step 0/400000 (0.0)%	Game 3t Best 128Iter 319	Training step 0/400000 (0.0)%	Game 3t Best 128Iter 320	Training step 0/400000 (0.0)%	Game 3t Best 128Iter 321	Training step 0/400000 (0.0)%	Game 3t Best 128Iter 322	Training step 0/400000 (0.0)%	Game 3t Best 128Iter 323	Training step 0/400000 (0.0)%	Game 3t Best 128Iter 324	Training step 0/400000 (0.0)%	Game 3t 

Iter 599	Training step 0/400000 (0.0)%	Game 5t Best 128Iter 600	Training step 0/400000 (0.0)%	Game 5t Best 128Iter 601	Training step 0/400000 (0.0)%	Game 5t Best 128Iter 602	Training step 0/400000 (0.0)%	Game 5t Best 128Iter 603	Training step 0/400000 (0.0)%	Game 5t Best 128Iter 604	Training step 0/400000 (0.0)%	Game 5t Best 128Iter 605	Training step 0/400000 (0.0)%	Game 5t Best 128Iter 606	Training step 0/400000 (0.0)%	Game 5t Best 128Iter 607	Training step 0/400000 (0.0)%	Game 5t Best 128Iter 608	Training step 0/400000 (0.0)%	Game 5t Best 128Iter 609	Training step 0/400000 (0.0)%	Game 5t Best 128Iter 610	Training step 0/400000 (0.0)%	Game 5t Best 128Iter 611	Training step 0/400000 (0.0)%	Game 5t Best 128Iter 612	Training step 0/400000 (0.0)%	Game 5t Best 128Iter 613	Training step 0/400000 (0.0)%	Game 5t Best 128Iter 614	Training step 0/400000 (0.0)%	Game 5t Best 128Iter 615	Training step 0/400000 (0.0)%	Game 5t Best 128Iter 616	Training step 0/400000 (0.0)%	Game 5t 

Iter 891	Training step 0/400000 (0.0)%	Game 7t Best 128Iter 892	Training step 0/400000 (0.0)%	Game 7t Best 128Iter 893	Training step 0/400000 (0.0)%	Game 7t Best 128Iter 894	Training step 0/400000 (0.0)%	Game 7t Best 128Iter 895	Training step 0/400000 (0.0)%	Game 7t Best 128Iter 896	Training step 0/400000 (0.0)%	Game 7t Best 128Iter 897	Training step 0/400000 (0.0)%	Game 7t Best 128Iter 898	Training step 0/400000 (0.0)%	Game 7t Best 128Iter 899	Training step 0/400000 (0.0)%	Game 7t Best 128Iter 900	Training step 0/400000 (0.0)%	Game 7t Best 128Iter 901	Training step 0/400000 (0.0)%	Game 7t Best 128Iter 902	Training step 0/400000 (0.0)%	Game 7t Best 128Iter 903	Training step 0/400000 (0.0)%	Game 7t Best 128Iter 904	Training step 0/400000 (0.0)%	Game 8t Best 128Iter 905	Training step 0/400000 (0.0)%	Game 8t Best 128Iter 906	Training step 0/400000 (0.0)%	Game 8t Best 128Iter 907	Training step 0/400000 (0.0)%	Game 8t Best 128Iter 908	Training step 0/400000 (0.0)%	Game 8t 

Iter 1183	Training step 0/400000 (0.0)%	Game 10t Best 128Iter 1184	Training step 0/400000 (0.0)%	Game 10t Best 128Iter 1185	Training step 0/400000 (0.0)%	Game 10t Best 128Iter 1186	Training step 0/400000 (0.0)%	Game 10t Best 128Iter 1187	Training step 0/400000 (0.0)%	Game 10t Best 128Iter 1188	Training step 0/400000 (0.0)%	Game 10t Best 128Iter 1189	Training step 0/400000 (0.0)%	Game 10t Best 128Iter 1190	Training step 0/400000 (0.0)%	Game 10t Best 128Iter 1191	Training step 0/400000 (0.0)%	Game 10t Best 128Iter 1192	Training step 0/400000 (0.0)%	Game 10t Best 128Iter 1193	Training step 0/400000 (0.0)%	Game 10t Best 128Iter 1194	Training step 0/400000 (0.0)%	Game 10t Best 128Iter 1195	Training step 0/400000 (0.0)%	Game 10t Best 128Iter 1196	Training step 0/400000 (0.0)%	Game 10t Best 128Iter 1197	Training step 0/400000 (0.0)%	Game 10t Best 128Iter 1198	Training step 0/400000 (0.0)%	Game 10t Best 128Iter 1199	Training step 0/400000 (0.0)%	Game 10t Best 128Iter 1200	Tra

Iter 1475	Training step 0/400000 (0.0)%	Game 12t Best 128Iter 1476	Training step 0/400000 (0.0)%	Game 12t Best 128Iter 1477	Training step 0/400000 (0.0)%	Game 12t Best 128Iter 1478	Training step 0/400000 (0.0)%	Game 12t Best 128Iter 1479	Training step 0/400000 (0.0)%	Game 12t Best 128Iter 1480	Training step 0/400000 (0.0)%	Game 12t Best 128Iter 1481	Training step 0/400000 (0.0)%	Game 12t Best 128Iter 1482	Training step 0/400000 (0.0)%	Game 12t Best 128Iter 1483	Training step 0/400000 (0.0)%	Game 12t Best 128Iter 1484	Training step 0/400000 (0.0)%	Game 12t Best 128Iter 1485	Training step 0/400000 (0.0)%	Game 12t Best 128Iter 1486	Training step 0/400000 (0.0)%	Game 12t Best 128Iter 1487	Training step 0/400000 (0.0)%	Game 12t Best 128Iter 1488	Training step 0/400000 (0.0)%	Game 12t Best 128Iter 1489	Training step 0/400000 (0.0)%	Game 12t Best 128Iter 1490	Training step 0/400000 (0.0)%	Game 12t Best 128Iter 1491	Training step 0/400000 (0.0)%	Game 12t Best 128Iter 1492	Tra

Iter 1782	Training step 0/400000 (0.0)%	Game 14t Best 128Iter 1783	Training step 0/400000 (0.0)%	Game 14t Best 128Iter 1784	Training step 0/400000 (0.0)%	Game 14t Best 128Iter 1785	Training step 0/400000 (0.0)%	Game 14t Best 128Iter 1786	Training step 0/400000 (0.0)%	Game 14t Best 128Iter 1787	Training step 0/400000 (0.0)%	Game 14t Best 128Iter 1788	Training step 0/400000 (0.0)%	Game 14t Best 128Iter 1789	Training step 0/400000 (0.0)%	Game 14t Best 128Iter 1790	Training step 0/400000 (0.0)%	Game 14t Best 128Iter 1791	Training step 0/400000 (0.0)%	Game 14t Best 128Iter 1792	Training step 0/400000 (0.0)%	Game 14t Best 128Iter 1793	Training step 0/400000 (0.0)%	Game 14t Best 128Iter 1794	Training step 0/400000 (0.0)%	Game 14t Best 128Iter 1795	Training step 0/400000 (0.0)%	Game 14t Best 128Iter 1796	Training step 0/400000 (0.0)%	Game 14t Best 128Iter 1797	Training step 0/400000 (0.0)%	Game 14t Best 128Iter 1798	Training step 0/400000 (0.0)%	Game 14t Best 128Iter 1799	Tra

Iter 2089	Training step 0/400000 (0.0)%	Game 17t Best 128Iter 2090	Training step 0/400000 (0.0)%	Game 17t Best 128Iter 2091	Training step 0/400000 (0.0)%	Game 17t Best 128Iter 2092	Training step 0/400000 (0.0)%	Game 17t Best 128Iter 2093	Training step 0/400000 (0.0)%	Game 17t Best 128Iter 2094	Training step 0/400000 (0.0)%	Game 17t Best 128Iter 2095	Training step 0/400000 (0.0)%	Game 17t Best 128Iter 2096	Training step 0/400000 (0.0)%	Game 17t Best 128Iter 2097	Training step 0/400000 (0.0)%	Game 17t Best 128Iter 2098	Training step 0/400000 (0.0)%	Game 17t Best 128Iter 2099	Training step 0/400000 (0.0)%	Game 17t Best 128Iter 2100	Training step 0/400000 (0.0)%	Game 17t Best 128Iter 2101	Training step 0/400000 (0.0)%	Game 17t Best 128Iter 2102	Training step 0/400000 (0.0)%	Game 17t Best 128Iter 2103	Training step 0/400000 (0.0)%	Game 17t Best 128Iter 2104	Training step 0/400000 (0.0)%	Game 17t Best 128Iter 2105	Training step 0/400000 (0.0)%	Game 17t Best 128Iter 2106	Tra

Iter 2386	Training step 0/400000 (0.0)%	Game 19t Best 128Iter 2387	Training step 0/400000 (0.0)%	Game 19t Best 128Iter 2388	Training step 0/400000 (0.0)%	Game 19t Best 128Iter 2389	Training step 0/400000 (0.0)%	Game 19t Best 128Iter 2390	Training step 0/400000 (0.0)%	Game 19t Best 128Iter 2391	Training step 0/400000 (0.0)%	Game 19t Best 128Iter 2392	Training step 0/400000 (0.0)%	Game 19t Best 128Iter 2393	Training step 0/400000 (0.0)%	Game 19t Best 128Iter 2394	Training step 0/400000 (0.0)%	Game 19t Best 128Iter 2395	Training step 0/400000 (0.0)%	Game 19t Best 128Iter 2396	Training step 0/400000 (0.0)%	Game 19t Best 128Iter 2397	Training step 0/400000 (0.0)%	Game 19t Best 128Iter 2398	Training step 0/400000 (0.0)%	Game 19t Best 128Iter 2399	Training step 0/400000 (0.0)%	Game 19t Best 128Iter 2400	Training step 0/400000 (0.0)%	Game 19t Best 128Iter 2401	Training step 0/400000 (0.0)%	Game 19t Best 128Iter 2402	Training step 0/400000 (0.0)%	Game 19t Best 128Iter 2403	Tra

Iter 2657	Training step 0/400000 (0.0)%	Game 22t Best 128Iter 2658	Training step 0/400000 (0.0)%	Game 22t Best 128Iter 2659	Training step 0/400000 (0.0)%	Game 22t Best 128Iter 2660	Training step 0/400000 (0.0)%	Game 22t Best 128Iter 2661	Training step 0/400000 (0.0)%	Game 22t Best 128Iter 2662	Training step 0/400000 (0.0)%	Game 22t Best 128Iter 2663	Training step 0/400000 (0.0)%	Game 22t Best 128Iter 2664	Training step 0/400000 (0.0)%	Game 22t Best 128Iter 2665	Training step 0/400000 (0.0)%	Game 22t Best 128Iter 2666	Training step 0/400000 (0.0)%	Game 22t Best 128Iter 2667	Training step 0/400000 (0.0)%	Game 22t Best 128Iter 2668	Training step 0/400000 (0.0)%	Game 22t Best 128Iter 2669	Training step 0/400000 (0.0)%	Game 22t Best 128Iter 2670	Training step 0/400000 (0.0)%	Game 22t Best 128Iter 2671	Training step 0/400000 (0.0)%	Game 22t Best 128Iter 2672	Training step 0/400000 (0.0)%	Game 22t Best 128Iter 2673	Training step 0/400000 (0.0)%	Game 22t Best 128Iter 2674	Tra

Iter 2935	Training step 0/400000 (0.0)%	Game 25t Best 128Iter 2936	Training step 0/400000 (0.0)%	Game 25t Best 128Iter 2937	Training step 0/400000 (0.0)%	Game 25t Best 128Iter 2938	Training step 0/400000 (0.0)%	Game 25t Best 128Iter 2939	Training step 0/400000 (0.0)%	Game 25t Best 128Iter 2940	Training step 0/400000 (0.0)%	Game 25t Best 128Iter 2941	Training step 0/400000 (0.0)%	Game 25t Best 128Iter 2942	Training step 0/400000 (0.0)%	Game 25t Best 128Iter 2943	Training step 0/400000 (0.0)%	Game 25t Best 128Iter 2944	Training step 0/400000 (0.0)%	Game 25t Best 128Iter 2945	Training step 0/400000 (0.0)%	Game 25t Best 128Iter 2946	Training step 0/400000 (0.0)%	Game 25t Best 128Iter 2947	Training step 0/400000 (0.0)%	Game 25t Best 128Iter 2948	Training step 0/400000 (0.0)%	Game 25t Best 128Iter 2949	Training step 0/400000 (0.0)%	Game 25t Best 128Iter 2950	Training step 0/400000 (0.0)%	Game 25t Best 128Iter 2951	Training step 0/400000 (0.0)%	Game 25t Best 128Iter 2952	Tra

Iter 3235	Training step 0/400000 (0.0)%	Game 27t Best 128Iter 3236	Training step 0/400000 (0.0)%	Game 27t Best 128Iter 3237	Training step 0/400000 (0.0)%	Game 27t Best 128Iter 3238	Training step 0/400000 (0.0)%	Game 27t Best 128Iter 3239	Training step 0/400000 (0.0)%	Game 27t Best 128Iter 3240	Training step 0/400000 (0.0)%	Game 27t Best 128Iter 3241	Training step 0/400000 (0.0)%	Game 27t Best 128Iter 3242	Training step 0/400000 (0.0)%	Game 27t Best 128Iter 3243	Training step 0/400000 (0.0)%	Game 27t Best 128Iter 3244	Training step 0/400000 (0.0)%	Game 27t Best 128Iter 3245	Training step 0/400000 (0.0)%	Game 27t Best 128Iter 3246	Training step 0/400000 (0.0)%	Game 27t Best 128Iter 3247	Training step 0/400000 (0.0)%	Game 27t Best 128Iter 3248	Training step 0/400000 (0.0)%	Game 27t Best 128Iter 3249	Training step 0/400000 (0.0)%	Game 27t Best 128Iter 3250	Training step 0/400000 (0.0)%	Game 27t Best 128Iter 3251	Training step 0/400000 (0.0)%	Game 27t Best 128Iter 3252	Tra

Iter 3541	Training step 0/400000 (0.0)%	Game 29t Best 128Iter 3542	Training step 0/400000 (0.0)%	Game 29t Best 128Iter 3543	Training step 0/400000 (0.0)%	Game 29t Best 128Iter 3544	Training step 0/400000 (0.0)%	Game 29t Best 128Iter 3545	Training step 0/400000 (0.0)%	Game 29t Best 128Iter 3546	Training step 0/400000 (0.0)%	Game 29t Best 128Iter 3547	Training step 0/400000 (0.0)%	Game 29t Best 128Iter 3548	Training step 0/400000 (0.0)%	Game 29t Best 128Iter 3549	Training step 0/400000 (0.0)%	Game 29t Best 128Iter 3550	Training step 0/400000 (0.0)%	Game 29t Best 128Iter 3551	Training step 0/400000 (0.0)%	Game 29t Best 128Iter 3552	Training step 0/400000 (0.0)%	Game 29t Best 128Iter 3553	Training step 0/400000 (0.0)%	Game 29t Best 128Iter 3554	Training step 0/400000 (0.0)%	Game 29t Best 128Iter 3555	Training step 0/400000 (0.0)%	Game 29t Best 128Iter 3556	Training step 0/400000 (0.0)%	Game 29t Best 128Iter 3557	Training step 0/400000 (0.0)%	Game 29t Best 128Iter 3558	Tra

Iter 3831	Training step 0/400000 (0.0)%	Game 31t Best 128Iter 3832	Training step 0/400000 (0.0)%	Game 31t Best 128Iter 3833	Training step 0/400000 (0.0)%	Game 31t Best 128Iter 3834	Training step 0/400000 (0.0)%	Game 31t Best 128Iter 3835	Training step 0/400000 (0.0)%	Game 31t Best 128Iter 3836	Training step 0/400000 (0.0)%	Game 31t Best 128Iter 3837	Training step 0/400000 (0.0)%	Game 31t Best 128Iter 3838	Training step 0/400000 (0.0)%	Game 31t Best 128Iter 3839	Training step 0/400000 (0.0)%	Game 31t Best 128Iter 3840	Training step 0/400000 (0.0)%	Game 31t Best 128Iter 3841	Training step 0/400000 (0.0)%	Game 31t Best 128Iter 3842	Training step 0/400000 (0.0)%	Game 31t Best 128Iter 3843	Training step 0/400000 (0.0)%	Game 31t Best 128Iter 3844	Training step 0/400000 (0.0)%	Game 31t Best 128Iter 3845	Training step 0/400000 (0.0)%	Game 31t Best 128Iter 3846	Training step 0/400000 (0.0)%	Game 31t Best 128Iter 3847	Training step 0/400000 (0.0)%	Game 31t Best 128Iter 3848	Tra

Iter 4142	Training step 0/400000 (0.0)%	Game 33t Best 128Iter 4143	Training step 0/400000 (0.0)%	Game 33t Best 128Iter 4144	Training step 0/400000 (0.0)%	Game 33t Best 128Iter 4145	Training step 0/400000 (0.0)%	Game 33t Best 128Iter 4146	Training step 0/400000 (0.0)%	Game 33t Best 128Iter 4147	Training step 0/400000 (0.0)%	Game 33t Best 128Iter 4148	Training step 0/400000 (0.0)%	Game 33t Best 128Iter 4149	Training step 0/400000 (0.0)%	Game 33t Best 128Iter 4150	Training step 0/400000 (0.0)%	Game 33t Best 128Iter 4151	Training step 0/400000 (0.0)%	Game 33t Best 128Iter 4152	Training step 0/400000 (0.0)%	Game 33t Best 128Iter 4153	Training step 0/400000 (0.0)%	Game 33t Best 128Iter 4154	Training step 0/400000 (0.0)%	Game 33t Best 128Iter 4155	Training step 0/400000 (0.0)%	Game 33t Best 128Iter 4156	Training step 0/400000 (0.0)%	Game 33t Best 128Iter 4157	Training step 0/400000 (0.0)%	Game 33t Best 128Iter 4158	Training step 0/400000 (0.0)%	Game 34t Best 128Iter 4159	Tra

Iter 4438	Training step 0/400000 (0.0)%	Game 36t Best 128Iter 4439	Training step 0/400000 (0.0)%	Game 36t Best 128Iter 4440	Training step 0/400000 (0.0)%	Game 36t Best 128Iter 4441	Training step 0/400000 (0.0)%	Game 36t Best 128Iter 4442	Training step 0/400000 (0.0)%	Game 36t Best 128Iter 4443	Training step 0/400000 (0.0)%	Game 36t Best 128Iter 4444	Training step 0/400000 (0.0)%	Game 36t Best 128Iter 4445	Training step 0/400000 (0.0)%	Game 36t Best 128Iter 4446	Training step 0/400000 (0.0)%	Game 36t Best 128Iter 4447	Training step 0/400000 (0.0)%	Game 36t Best 128Iter 4448	Training step 0/400000 (0.0)%	Game 36t Best 128Iter 4449	Training step 0/400000 (0.0)%	Game 36t Best 128Iter 4450	Training step 0/400000 (0.0)%	Game 36t Best 128Iter 4451	Training step 0/400000 (0.0)%	Game 36t Best 128Iter 4452	Training step 0/400000 (0.0)%	Game 36t Best 128Iter 4453	Training step 0/400000 (0.0)%	Game 36t Best 128Iter 4454	Training step 0/400000 (0.0)%	Game 36t Best 128Iter 4455	Tra

Iter 4750	Training step 0/400000 (0.0)%	Game 39t Best 128Iter 4751	Training step 0/400000 (0.0)%	Game 39t Best 128Iter 4752	Training step 0/400000 (0.0)%	Game 39t Best 128Iter 4753	Training step 0/400000 (0.0)%	Game 39t Best 128Iter 4754	Training step 0/400000 (0.0)%	Game 39t Best 128Iter 4755	Training step 0/400000 (0.0)%	Game 39t Best 128Iter 4756	Training step 0/400000 (0.0)%	Game 39t Best 128Iter 4757	Training step 0/400000 (0.0)%	Game 39t Best 128Iter 4758	Training step 0/400000 (0.0)%	Game 39t Best 128Iter 4759	Training step 0/400000 (0.0)%	Game 39t Best 128Iter 4760	Training step 0/400000 (0.0)%	Game 39t Best 128Iter 4761	Training step 0/400000 (0.0)%	Game 39t Best 128Iter 4762	Training step 0/400000 (0.0)%	Game 39t Best 128Iter 4763	Training step 0/400000 (0.0)%	Game 39t Best 128Iter 4764	Training step 0/400000 (0.0)%	Game 39t Best 128Iter 4765	Training step 0/400000 (0.0)%	Game 39t Best 128Iter 4766	Training step 0/400000 (0.0)%	Game 39t Best 128Iter 4767	Tra

Iter 404999	Training step 400000/400000 (100.0)%	Game 2986t Best 512

In [None]:
temp = dqn.replay_memory.reward.flatten()
np.unique(temp, return_counts=True)

In [12]:
dqn.play()

INFO:tensorflow:Restoring parameters from F:/training_data/DQN/Pacman/my_dqn_2048.ckpt


INFO:tensorflow:Restoring parameters from F:/training_data/DQN/Pacman/my_dqn_2048.ckpt


3 0
3 0
3 4
3 0
3 4
3 -1
3 -1
3 -1
3 -1
3 -1
3 -1
3 -1
3 -1
3 -1
