In [None]:
import numpy as np
import numpy.random as rnd
import random
import tensorflow as tf

from functools import reduce

from PIL import Image
from io import BytesIO
import base64, json, re, time, threading
import multiprocessing

from scipy.misc import imresize
%run Dino_Server.ipynb

In [9]:
class Memory:
    """
    Fixed-capacity replay buffer that stores transitions in a ring.

    Each row of the backing object array holds one transition as
    (state1, action, reward, state2, crashed). Once `size` transitions have
    been stored, the oldest rows are overwritten.
    """

    def __init__(self, size):
        """
        :param size: maximum number of transitions kept at any time.
        """
        self.size = size
        self.iter = 0            # index of the row the next transition is written to
        self.current_size = 0    # number of rows that currently hold a transition
        self.mem = np.empty((size, 5), dtype=object)

    def remember(self, state1, action, reward, state2, crashed):
        """
        Stores one transition at the current write position and advances it,
        wrapping around to row 0 when the end of the buffer is reached.
        """
        slot = self.iter
        self.mem[slot, :] = (state1, action, reward, state2, crashed)
        self.current_size = min(self.current_size + 1, self.size)
        self.iter = (slot + 1) % self.size

    def sample(self, n):
        """
        Draws up to `n` distinct stored transitions at random.

        :param n: requested batch size; capped at the number of stored transitions.
        :return: a generator yielding five stacked arrays, in the order
                 states1, actions, rewards, states2, crashed.
        """
        count = min(self.current_size, n)
        chosen = random.sample(range(self.current_size), count)
        batch = self.mem[chosen]
        # batch has one row per transition; its transpose has one row per field
        return (np.stack(column, axis=0) for column in batch.T)

In [10]:
class DDQNAgent:
    """
    Agent that combines a main network, a target network and a replay memory.

    The main network selects actions and is trained; the target network supplies
    the Q-values of the next states used to build the training targets and is
    only changed by `update_target_network`. Exploration follows an
    epsilon-greedy scheme whose probability is lowered by `explore_less`.
    """

    def __init__(self, session, num_actions, width, height, path, writer=None):
        """
        :param session: TensorFlow session shared by both networks.
        :param num_actions: number of discrete actions the agent can choose from.
        :param width: width of a single input frame.
        :param height: height of a single input frame.
        :param path: prefix for checkpoint files; "rex.ckpt" is appended to it
                     verbatim, so a directory must end with a path separator.
        :param writer: optional summary writer handed to the main network.
        """
        self.path_checkpoints = path
        self.session = session
        self.num_actions = num_actions
        self.memory_size = 10000
        self.explore_prob = 1.
        self.explore_min = 0.01
        self.explore_decay = 0.997
        self.batch_size = 32
        self.discount = .95
        self.memory = Memory(self.memory_size)
        self.main_dqn = DQN(session, height, width, num_actions, "main", writer)
        self.target_dqn = DQN(session, height, width, num_actions, "target", None)
        # initializes every global variable in the graph, i.e. both networks
        self.session.run(tf.global_variables_initializer())

        # start with identical weights in both networks
        self.update_target_network()
        self.saver = tf.train.Saver()

    def act(self, state):
        """
        :return: an action and a boolean.
        The returned boolean: - False: action generated by the DQN
                              - True: random action (exploration)
        """
        if self.explore_prob > 0 and rnd.rand() <= self.explore_prob:
            # explore
            return rnd.randint(self.num_actions), True

        return self.main_dqn.get_action(state), False

    def remember(self, state, action, reward, state_next, crashed):
        """Stores one transition in the replay memory."""
        self.memory.remember(state, action, reward, state_next, crashed)

    def replay(self, cnt):
        """
        Trains the main network on one random batch from the replay memory.

        Does nothing until the memory holds at least `batch_size` transitions.

        :param cnt: step counter forwarded to the main network's `train`.
        """
        if self.memory.current_size < self.batch_size:
            return

        print("...Training...")
        states, actions, rewards, states_next, crashes = self.memory.sample(self.batch_size)

        # Work on a float copy: adding the discounted Q-value in place to an
        # integer reward array would silently truncate it.
        target = np.array(rewards, dtype=np.float64)
        # Force a boolean mask: `~` on integer flags is a bitwise NOT and would
        # be used as (negative) row indices instead of a mask.
        not_terminal = ~np.asarray(crashes, dtype=bool)

        # add Q value of next state to not terminal states (i.e. not crashed)
        if not_terminal.any():
            next_q = self.target_dqn.get_action_and_q(states_next[not_terminal])[1]
            target[not_terminal] += self.discount * next_q

        self.main_dqn.train(states, actions, target, cnt)

    def explore_less(self):
        """Multiplies the exploration probability by the decay, never going below the minimum."""
        self.explore_prob = max(self.explore_min, self.explore_prob * self.explore_decay)

    def update_target_network(self):
        """Copies the main network's variables into the target network."""
        self.target_dqn.tranfer_variables_from(self.main_dqn)

    def save(self, cnt):
        """
        Writes a checkpoint of the session.

        :param cnt: passed to the saver as `global_step`.
        """
        save_path = self.saver.save(self.session, self.path_checkpoints + "rex.ckpt", global_step=cnt)
        print("Model saved in file: %s" % save_path)

    def load(self, checkpoint_name):
        """Restores the session's variables from the given checkpoint."""
        self.saver.restore(self.session, checkpoint_name)
        print("Model restored:", checkpoint_name)

In [11]:
class DQN:
    """
    Dueling Q-network built with the TensorFlow 1.x graph API.

    The input is a stack of 4 frames of size height x width. Three convolutional
    layers (with one max-pooling step after the first) feed two separate fully
    connected streams, one producing a single state value and one producing an
    advantage per action; both are combined into the Q-values.
    """

    def __init__(self, session, height, width, num_actions, name, writer=None):
        """
        :param session: TensorFlow session used to run all operations.
        :param height: height of a single input frame.
        :param width: width of a single input frame.
        :param num_actions: number of outputs of the advantage stream.
        :param name: variable scope under which the network is created.
        :param writer: optional summary writer; when given, `train` logs to it.
        """
        self.num_actions = num_actions
        self.height = height
        self.width = width
        self.name = name
        self.vars = []
        self.session = session

        self.summary_ops = []
        # merged summary op, built once at the end of _create_network
        self.merged_summary = None
        # assign ops of the last variable transfer, cached per source network
        self._transfer_source = None
        self._transfer_ops = []
        self._create_network()
        self.writer = writer

    def linear(self, x, output_size, name, activation_fn=tf.nn.relu):
        """
        Fully connected layer.

        :param x: 2-D input tensor; its second dimension is the input size.
        :param output_size: number of output units.
        :param name: variable scope of the layer.
        :param activation_fn: applied to the output; pass None for a linear layer.
        :return: (output tensor, weight variable, bias variable)
        """
        shape = x.get_shape().as_list()

        with tf.variable_scope(name):
            w = tf.Variable(tf.random_normal([shape[1], output_size], stddev=.02), dtype=tf.float32, name='w')
            b = tf.Variable(tf.zeros([output_size]), name='b')
            out = tf.nn.bias_add(tf.matmul(x, w), b)

            if activation_fn is not None:
                out = activation_fn(out)

        return out, w, b

    def conv2d(self, x, output_dim, kernel_shape, stride, name):
        """
        Convolutional layer with VALID padding followed by a ReLU.

        :param x: 4-D input tensor.
        :param output_dim: size of the bias; must equal the last entry of kernel_shape.
        :param kernel_shape: [kernel_height, kernel_width, in_channels, out_channels]
        :param stride: [stride_height, stride_width]
        :param name: variable scope of the layer.
        :return: (output tensor, weight variable, bias variable)
        """
        stride = [1, stride[0], stride[1], 1]

        with tf.variable_scope(name):
            w = tf.Variable(tf.truncated_normal(kernel_shape, mean=0, stddev=.1), dtype=tf.float32, name="w")
            conv = tf.nn.conv2d(x, w, stride, "VALID")
            b = tf.Variable(tf.constant(0.1, shape=[output_dim]), name="b")
            out = tf.nn.bias_add(conv, b)
            out = tf.nn.relu(out)

        return out, w, b

    def max_pool_2x2(self, x, kernel_shape, name):
        """
        Max pooling with SAME padding whose stride equals the kernel size.

        :param kernel_shape: [kernel_height, kernel_width]; despite the method
                             name, any kernel size is accepted.
        """
        ksize = [1, *kernel_shape, 1]
        strides = [1, *kernel_shape, 1]
        return tf.nn.max_pool(x, ksize, strides, padding='SAME', name=name)

    def get_action_and_q(self, states):
        """
        returns array:
            array[0]: actions: is a array of length len(state) with the action with the highest score
            array[1]: q value: is a array of length len(state) with the Q-value belonging to the action
        """
        states = states.reshape(-1, 4, self.height, self.width)
        return self.session.run([self.a, self.Q], {self.state: states})

    def get_action(self, states):
        """
        returns action(s),
            - if states contains only a single state then we return the optimal action as an integer,
            - if states contains an array of states then we return the optimal action for each state of the array
        """
        states = states.reshape(-1, 4, self.height, self.width)
        num_states = states.shape[0]
        actions = self.session.run(self.a, {self.state: states})
        return actions[0] if num_states == 1 else actions

    def train(self, states, actions, targets, cnt):
        """
        Runs one optimisation step and, if a writer was given, logs the summaries.

        :param states: batch of states, reshaped to (-1, 4, height, width).
        :param actions: the action taken in each state.
        :param targets: the target Q-value for each (state, action) pair.
        :param cnt: global step under which the summaries are written.
        """
        states = states.reshape(-1, 4, self.height, self.width)
        feed_dict = {self.state: states, self.actions: actions, self.Q_target: targets}
        # The merged summary op is created once in _create_network. Merging here
        # would add a new op to the graph on every training step.
        summary, _ = self.session.run([self.merged_summary, self.minimize], feed_dict)
        if self.writer:
            self.writer.add_summary(summary, global_step=cnt)

    def tranfer_variables_from(self, other):
        """
            Transfers the values of the variables from other to self.

            The assign operations are built on the first call for a given
            source network and reused afterwards, so that repeated transfers
            do not keep adding operations to the graph.
        """
        if self._transfer_source is not other:
            self._transfer_ops = [
                var_self.assign(var_other.value())
                for var_self, var_other in zip(self.vars, other.vars)
            ]
            self._transfer_source = other

        self.session.run(self._transfer_ops)

    # Correctly spelled alias; the original name is kept for existing callers.
    transfer_variables_from = tranfer_variables_from

    def _create_network(self):
        """Builds placeholders, layers, loss, optimiser and summaries inside the scope `self.name`."""

        with tf.variable_scope(self.name):
            self.state = tf.placeholder(shape=[None, 4, self.height, self.width], dtype=tf.float32)
            # move the frame axis last: (batch, 4, h, w) -> (batch, h, w, 4)
            self.state_perm = tf.transpose(self.state, perm=[0, 2, 3, 1])
            # log the first frame of each state as an image
            self.summary_ops.append(tf.summary.image("states", self.state[:, 0, :, :][..., tf.newaxis], max_outputs=10))

            conv1, w1, b1 = self.conv2d(self.state_perm, 32, [8, 8, 4, 32], [4, 4], "conv1")
            max_pool = self.max_pool_2x2(conv1, [2, 2], "maxpool")
            conv2, w2, b2 = self.conv2d(max_pool, 64, [4, 4, 32, 64], [2, 2], "conv2")
            conv3, w3, b3 = self.conv2d(conv2, 64, [3, 3, 64, 64], [1, 1], "conv3")
            self.vars += [w1, b1, w2, b2, w3, b3]

            # flatten everything but the batch dimension
            shape = conv3.get_shape().as_list()
            conv3_flat = tf.reshape(conv3, [-1, reduce(lambda x, y: x * y, shape[1:])])

            value_hid, w4, b4 = self.linear(conv3_flat, 512, "value_hid")
            adv_hid, w5, b5 = self.linear(conv3_flat, 512, "adv_hid")

            value, w6, b6 = self.linear(value_hid, 1, "value", activation_fn=None)
            advantage, w7, b7 = self.linear(adv_hid, self.num_actions, "advantage", activation_fn=None)
            self.vars += [w4, b4, w5, b5, w6, b6, w7, b7]

            # Q = V + (A - mean(A)); the value broadcasts over the action axis
            self.Qs = value + (advantage - tf.reduce_mean(advantage, axis=1, keepdims=True))

            # action with highest Q values
            self.a = tf.argmax(self.Qs, 1)
            self.Q = tf.reduce_max(self.Qs, 1)
            # self.Q holds one value per state, but a scalar summary needs a
            # single value: log the batch mean and register it with the others.
            self.summary_ops.append(tf.summary.scalar("Q", tf.reduce_mean(self.Q)))

            self.Q_target = tf.placeholder(shape=[None], dtype=tf.float32)
            self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
            actions_onehot = tf.one_hot(self.actions, self.num_actions, on_value=1., off_value=0., axis=1, dtype=tf.float32)

            # Q-value of the action that was actually taken
            Q_tmp = tf.reduce_sum(tf.multiply(self.Qs, actions_onehot), axis=1)
            loss = tf.reduce_mean(tf.square(self.Q_target - Q_tmp))
            self.summary_ops.append(tf.summary.scalar("loss", loss))
            optimizer = tf.train.AdamOptimizer()
            self.minimize = optimizer.minimize(loss)

            # build the merged summary exactly once; train() only runs it
            self.merged_summary = tf.summary.merge(self.summary_ops)

In [12]:
class Action:
    """
    Integer codes of the actions the agent can take.

    UP is 0, DOWN is 1 and FORWARD is 2.
    """
    UP, DOWN, FORWARD = range(3)

In [13]:
class Environment:
    """
    Environment class is responsible for passing the actions to the game.
    It is also responsible for retrieving the game status and the reward.

    Communication with the game runs over a websocket server that is started
    in a daemon thread. Incoming game states are handed to the caller through
    a queue.
    """
    # message sent to the game for each action
    actions = {Action.UP:'UP', Action.FORWARD:'FORTH', Action.DOWN:'DOWN'}

    def __init__(self, host, port, debug=False):
        """
        Starts the websocket server and waits for the game in the background.

        :param host: host the websocket server binds to.
        :param port: port the websocket server listens on.
        :param debug: when True, connection and message events are printed.
        """
        self.debug = debug
        self.queue = multiprocessing.Queue()
        self.game_client = None
        self.server = WebsocketServer(port, host=host)
        self.server.set_fn_new_client(self.new_client)
        self.server.set_fn_message_received(self.new_message)
        thread = threading.Thread(target=self.server.run_forever)
        thread.daemon = True
        thread.start()

    def new_client(self, client, server):
        """Callback of the server: remembers the connected game and greets it."""
        if self.debug: print("GameAgent: Game just connected")
        self.game_client = client
        self.server.send_message(self.game_client, "Connection to Game Agent Established")

    def new_message(self, client, server, message):
        """
        Callback of the server: decodes a game message and queues (image, crashed).

        The message is a JSON object with the keys 'world' (a base64 encoded
        PNG, optionally prefixed as a data URL) and 'crashed'.
        """
        if self.debug: print("GameAgent: Incoming data from game")
        data = json.loads(message)
        image, crashed = data['world'], data['crashed']

        image = re.sub('data:image/png;base64,', '', image)
        # convert image from base64 decoding to np array
        image = np.array(Image.open(BytesIO(base64.b64decode(image))))

        # A JSON boolean arrives as Python True, a stringified flag as
        # 'True'/'true'; both mean that the TRex crashed.
        crashed = crashed is True or crashed in ('True', 'true')

        self.queue.put((image, crashed))

    def start_game(self):
        """
        Starts the game and lets the TRex run for four seconds and then returns the initial state.

        :return: the initial state of the game (np.array, reward, crashed).
        """
        # game can not be started as long as the browser is not ready
        while self.game_client is None:
            time.sleep(1)

        self.server.send_message(self.game_client, "START")
        time.sleep(4)
        return self.get_state(Action.FORWARD)

    def refresh_game(self):
        """Asks the game to refresh, pausing half a second before and one second after the request."""
        time.sleep(0.5)
        print("...refreshing game...")
        self.server.send_message(self.game_client, "REFRESH")
        time.sleep(1)

    def do_action(self, action):
        """
        Performs action and returns the updated status

        :param action:  Must come from the class Action.
                        The only allowed actions are Action.UP, Action.Down and Action.FORWARD.
        :return: return the image of the game after performing the action, the reward (after the action) and
                        whether the TRex crashed or not.
        """
        if action != Action.FORWARD:
            # nothing needs to be sent when the action is going forward
            self.server.send_message(self.game_client, self.actions[action])

        time.sleep(.05)
        return self.get_state(action)

    def get_state(self, action):
        """
        Requests the current state from the game and blocks until it arrives.

        :param action: the action that led to this state; it determines the reward.
        :return: (image, reward, crashed). The reward is -100 after a crash,
                 otherwise -5 for Action.UP, -3 for Action.DOWN and 1 for
                 anything else.
        """
        self.server.send_message(self.game_client, "STATE")

        image, crashed = self.queue.get()

        if crashed:
            reward = -100.
        else:
            if action == Action.UP:
                reward = -5.
            elif action == Action.DOWN:
                reward = -3.
            else:
                reward = 1.

        return image, reward, crashed

In [14]:
class Preprocessor:
    """
    Turns raw game frames into small binary-like images and maintains the
    stack of the four most recent frames that forms a state.
    """

    def __init__(self, width, height):
        """
        :param width: width of a processed frame.
        :param height: height of a processed frame.
        """
        self.width = width
        self.height = height
        # stack of the last four frames; None until the first frame is seen
        self.state = None

    def process(self, frame):
        """
        Crops, thresholds and resizes a raw frame.

        :param frame: array indexed as frame[row, column, channel]; only
                      channel 0 of the left 68% of the columns is used.
        :return: array resized to (height, width) and divided by 255.
        """
        roi_height, roi_width = frame.shape[0], int(frame.shape[1] * .68)
        processed = np.zeros((roi_height, roi_width))

        roi = frame[:, :roi_width, 0]
        # mark every pixel brighter than 50 ...
        all_obstacles_idx = roi > 50
        processed[all_obstacles_idx] = 1
        # ... and clear those brighter than 200 again, so that only the
        # intensities in (50, 200] stay marked
        unharmful_obstacles_idx = roi > 200
        processed[unharmful_obstacles_idx] = 0

        # NOTE(review): scipy.misc.imresize was removed in SciPy 1.3; this
        # call needs an older SciPy. Confirm the pinned version.
        processed = imresize(processed, (self.height, self.width, 1))
        processed = processed / 255.0
        return processed

    def get_initial_state(self, first_frame):
        """
        Starts a new state consisting of four copies of the given frame.

        :return: array whose first axis has length 4.
        """
        self.state = np.array([first_frame, first_frame, first_frame, first_frame])
        return self.state

    def get_updated_state(self, next_frame):
        """
        Drops the oldest frame of the state and appends the given one.

        If no state exists yet, the state is started from `next_frame` exactly
        as `get_initial_state` would do.

        :return: array whose first axis has length 4.
        """
        if self.state is None:
            return self.get_initial_state(next_frame)

        self.state = np.array([*self.state[-3:], next_frame])
        return self.state