In [None]:
import numpy as np
from collections import deque
import random
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, initializers
from AtomasWrapper import AtomasWrapper

In [None]:
# simple for now
def create_q_model(state_size = 20, num_actions = 19):
    """Build the Q-network: one Conv1D layer followed by three Dense layers.

    Args:
        state_size: length of the state vector; the network input has shape
            (state_size, 1). Defaults to 20.
        num_actions: number of Q-values the network outputs (one per action).
            Defaults to 19.

    Returns:
        An uncompiled keras.Model mapping a batch of states of shape
        (batch, state_size, 1) to Q-values of shape (batch, num_actions).
    """

    def init_kwargs():
        # every layer uses the same initialisation scheme; fresh initializer
        # objects are created per layer
        return dict(
            kernel_initializer = initializers.TruncatedNormal(mean = 0., stddev = 0.01),
            bias_initializer = initializers.Constant(0.01),
        )

    state = layers.Input(shape = (state_size, 1))

    # the kernel spans the whole state, so the convolution yields a single
    # position with 256 features
    conv = layers.Conv1D(
        filters = 256,
        kernel_size = state_size,
        strides = 1,
        activation = "relu",
        **init_kwargs()
    )(state)

    hidden = layers.Flatten()(conv)

    # NOTE(review): consecutive Dense layers with a linear activation compose
    # into a single linear map; kept as in the original, but a non-linearity
    # such as "relu" is probably what was intended here.
    for _ in range(2):
        hidden = layers.Dense(
            units = 128,
            activation = "linear",
            **init_kwargs()
        )(hidden)

    # one unbounded Q-value per action
    q_value = layers.Dense(
        units = num_actions,
        activation = "linear",
        **init_kwargs()
    )(hidden)

    return keras.Model(inputs = state, outputs = q_value)

In [None]:
model = create_q_model()
# summary() prints the layer table itself and returns None, so wrapping it in
# print() would append a stray "None" line to the cell output
model.summary()

In [None]:
# training procedure adapted from Yale's S&DS 365: Intermediate Machine Learning, Assignment 4.3.

# --- action space ---
NUM_ACTIONS = 19             # length of the one-hot action vector passed to the game

# --- learning ---
GAMMA = 0.99                 # discount applied to the best next-state Q-value
STEP_SIZE = 1e-4             # learning rate handed to the optimizer
BATCH_SIZE = 32              # transitions sampled from the replay memory per update
EPSILON = 0.2                # probability of taking a random action

# --- schedule ---
OBSERVE = 10_000             # steps collected before gradient updates begin
TRAINING = 10_000_000        # total steps (observing + training)
REPLAY_MEMORY = 10_000       # maximum number of stored transitions

In [None]:
def dql_atomas(model, optimizer, loss_function):
    """Train a Q-network on Atomas with epsilon-greedy deep Q-learning.

    Runs TRAINING game steps. Each step picks an action (random with
    probability EPSILON, otherwise the argmax of the model's Q-values), plays
    it through AtomasWrapper.step and stores the transition in a replay buffer
    capped at REPLAY_MEMORY entries. Once more than OBSERVE steps have passed,
    each step also fits the model on a random batch of BATCH_SIZE stored
    transitions. Progress is printed every 500 steps and the model is saved to
    "first_model" every 10000 steps.

    Args:
        model: Keras model mapping a batch of states to NUM_ACTIONS Q-values.
        optimizer: Keras optimizer used to apply the gradients.
        loss_function: callable invoked as loss_function(target_q, predicted_q).

    Returns:
        None. The model is trained in place.
    """

    # initiate game
    game = AtomasWrapper()

    # replay buffer of (state, action, reward, next_state, terminal) tuples;
    # maxlen makes the deque discard the oldest transition automatically
    history_data = deque(maxlen = REPLAY_MEMORY)

    # get the first observation from the game
    current_state, reward, terminal = game.check()

    # target value assigned to a transition that ended the game
    terminal_penalty = 16.0

    # training
    t = 0

    while t < TRAINING:

        if np.random.rand() < EPSILON:
            # random action
            action = np.random.choice(NUM_ACTIONS)
        else:
            # compute the Q function for the current state (batch of one)
            current_state_tensor = tf.convert_to_tensor(current_state)
            current_state_tensor = tf.expand_dims(current_state_tensor, 0)
            q_value = model(current_state_tensor, training = False)

            # greedy action
            action = int(tf.argmax(q_value[0]).numpy())

        # take the action (encoded one-hot) and observe the reward and the next state
        action_vec = np.zeros(NUM_ACTIONS)
        action_vec[action] = 1
        next_state, reward, terminal = game.step(action_vec)

        # store the observation
        history_data.append((current_state, action, reward, next_state,
                            terminal))

        # train if done observing; the length check keeps random.sample from
        # raising if OBSERVE is ever configured smaller than BATCH_SIZE
        if t > OBSERVE and len(history_data) >= BATCH_SIZE:

            # sample a batch
            batch = random.sample(history_data, BATCH_SIZE)
            state_sample = np.array([d[0] for d in batch])
            action_sample = np.array([d[1] for d in batch])
            # float32 matches the dtype of the model output used below
            reward_sample = np.array([d[2] for d in batch], dtype = np.float32)
            state_next_sample = np.array([d[3] for d in batch])
            terminal_sample = np.array([float(d[4]) for d in batch], dtype = np.float32)

            # compute the updated Q-values for the samples; the bootstrap
            # target is a pure inference pass, hence training = False
            future_rewards = model(tf.convert_to_tensor(state_next_sample), training = False)
            terminal_tensor = tf.convert_to_tensor(terminal_sample)
            updated_q_value = reward_sample + (GAMMA * tf.reduce_max(future_rewards, axis = 1))

            # for terminal transitions replace the target with -terminal_penalty
            updated_q_value = (updated_q_value * (1 - terminal_tensor)
                               - (terminal_tensor * terminal_penalty))

            # train the model on the states and updated Q-values
            with tf.GradientTape() as tape:

                # compute the current Q-values of the actions actually taken
                current_q_value = model(state_sample, training = True)
                mask = tf.one_hot(action_sample, NUM_ACTIONS)
                current_q_value = tf.reduce_sum(tf.multiply(current_q_value, mask), axis = 1)

                # compute the loss
                loss = loss_function(updated_q_value, current_q_value)

            # backpropagation
            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

            # plain float so the log shows a number instead of a tensor repr
            loss = float(loss)

        else:
            loss = 0.0

        # update current state and counter
        # NOTE(review): the loop keeps stepping after a terminal transition;
        # presumably AtomasWrapper restarts the game on its own - confirm.
        current_state = next_state
        t += 1

        if t % 500 == 0: # originally 500
            print(f"STEP {t} | PHASE {'observe' if t <= OBSERVE else 'train'}",
                  f"| ACTION {action} | REWARD {reward} | LOSS {loss}")
            if t % 10000 == 0:
                model.save("first_model")
                # NOTE(review): a second AtomasWrapper is created and activated
                # here while the loop continues to use `game`; the purpose is
                # not visible from this file - confirm it is intended.
                wrapper = AtomasWrapper()
                wrapper.activate()


In [None]:
def train_atomas(start_from_ckpt = False, ckpt_path = None):
    """Build (or load) the Q-network and run the deep Q-learning loop.

    Args:
        start_from_ckpt: when True, resume from a saved model instead of
            creating a freshly initialised one.
        ckpt_path: path of the saved model to load when start_from_ckpt is
            True. Defaults to "first_model", the path dql_atomas saves to.

    Returns:
        None.
    """

    # seed every RNG the training code draws from: NumPy (epsilon-greedy
    # exploration), `random` (replay sampling) and TensorFlow (weight init)
    np.random.seed(37)
    random.seed(37)
    tf.random.set_seed(37)

    if start_from_ckpt:
        # if you want to start from a checkpoint
        model = keras.models.load_model("first_model" if ckpt_path is None else ckpt_path)
    else:
        model = create_q_model()

    # specify the optimizer and loss function
    optimizer = keras.optimizers.Adam(learning_rate = STEP_SIZE, clipnorm = 1.0)
    loss_function = keras.losses.MeanSquaredError()

    # train model
    dql_atomas(model = model, optimizer = optimizer, loss_function = loss_function)

In [None]:
# start a fresh training run (no checkpoint is loaded)
train_atomas(start_from_ckpt = False, ckpt_path = None)