# In [None]:
from datetime import datetime
import gym
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import math
import random
import matplotlib.pyplot as plt
import pickle
import logging
import lz4.frame as lz4f

# In [None]:
#define the agent
class Agent:
    """Deep Q-Network agent with an optional target network and replay memory.

    The agent keeps a "live" model that is trained on minibatches sampled
    from a replay memory and, unless ``disable_double`` is set, a separate
    target model that is only refreshed through :meth:`update_target`.
    """

    def __init__(self, state_size, action_size,
                 name="DeepQNetwork",
                 #anatomy of the hidden layers, must add the object from tf.keras.layers directly
                 #None -> [layers.Dense(24), layers.Dense(24)]
                 anatomy=None,
                 #developer option,
                 compile_model=False,
                 lr=0.001,
                 loss_function=None, #defaults to MeanSquaredError. Is set below
                 optimizer=None, #defaults to adam. Is set below
                 #debug options
                 model_summary=True,
                 model_verbose=0,
                 #discount factor 0 <= gamma <= 1
                 gamma=0.95,
                 #exploration parameters
                 linear_decrease=False,
                 epsilon=1,
                 epsilon_min=0.0001,
                 epsilon_decay=0.995,
                 #replay options
                 batch_size=32,
                 max_memory_size=1e6,
                 compress_memory=False, #if enabled, 10% performance penalty for larger replay buffer EXP
                 state_dtype=np.uint8,
                 dtype_info_helper=np.iinfo,
                 disable_double=False, #if true, acts like a single dqn
                ):
        """Create the agent and build its network(s).

        Args:
            state_size: Shape of a single state, passed to ``layers.Input``.
            action_size: Number of discrete actions (width of the output layer).
            name: Base name used in ``__str__`` and in saved file names.
            anatomy: Hidden layers as ``tf.keras.layers`` objects. They are
                used as templates: every model gets its own copy. ``None``
                selects two ``Dense(24)`` layers.
            compile_model: If true, ``model.compile`` is called on each model.
            lr: Learning rate for the default Adam optimizer.
            loss_function: Keras loss; ``None`` selects ``MeanSquaredError``.
            optimizer: Keras optimizer; ``None`` selects Adam with
                ``clipnorm=1.0``.
            model_summary: Print the model summary (only when compiled).
            model_verbose: Verbosity forwarded to ``model.predict``.
            gamma: Discount factor.
            linear_decrease: Subtract ``epsilon_decay`` from epsilon instead
                of multiplying by it.
            epsilon: Initial exploration rate.
            epsilon_min: Lower bound of the exploration rate.
            epsilon_decay: Decay factor (or decrement if ``linear_decrease``).
            batch_size: Number of transitions sampled per gradient update.
            max_memory_size: Maximum number of stored transitions.
            compress_memory: Store states lz4-compressed (see
                :meth:`replay_compress`).
            state_dtype: dtype states are cast to when compressed.
            dtype_info_helper: ``np.iinfo`` or ``np.finfo``, matching
                ``state_dtype``.
            disable_double: If true, the target model is the live model.
        """
        #model parameters
        # Defaults are created per instance: layer/loss objects in the
        # signature would be shared by every Agent ever constructed.
        if anatomy is None:
            anatomy = [layers.Dense(24), layers.Dense(24)]
        if loss_function is None:
            loss_function = keras.losses.MeanSquaredError()
        self.model_anatomy = anatomy
        self.loss_function = loss_function
        self.default_name = name
        self.disable_double = disable_double
        self.compile_model = compile_model

        #tracking vars
        self.gradient_updates = 0
        self.greedy_actions = 0
        self.exploration_actions = 0
        self.target_updates = 0

        self.state_size = state_size
        self.action_size = action_size
        self.max_memory_size = max_memory_size

        #set default optimizer to Adam, if not provided
        if optimizer is not None:
            self.optimizer = optimizer
        else:
            self.optimizer = Adam(learning_rate=lr, clipnorm=1.0)
            print(f"Optimizer Adam: {lr=}")

        #initiate memory (parallel lists, one entry per stored transition)
        self.compress_memory = compress_memory
        self.state_dtype = state_dtype
        self.dtype_info = dtype_info_helper(state_dtype)
        self.action_hist = []
        self.state_hist = []
        self.state_next_hist = []
        self.reward_hist = []
        self.done_hist = []

        #hyperparameters
        self.learning_rate = lr
        self.gamma = gamma
        self.linear_decrease = linear_decrease
        self.exploration_rate = epsilon
        self.exploration_min = epsilon_min
        self.exploration_decay = epsilon_decay
        # Honour the batch_size argument (it used to be ignored in favour of 32).
        self.sample_batch_size = batch_size

        #debug options
        self.model_summary = model_summary
        self.model_verbose = model_verbose

        #create model
        self.model = self._build_model(name="live_model")
        if not disable_double:
            self.target_model = self._build_model(name="target_model")
        else:
            self.target_model = self.model

        print(self)

    def __str__(self):
        """Return a multi-line overview of the agent's configuration and counters."""
        #useful for debugging
        memory_ratio = (len(self.done_hist) / self.max_memory_size)*100
        ret_string = f"""
                    NAME: {self.default_name}
                    INPUT SHAPE: {self.state_size}, OUTPUT SHAPE: {self.action_size}
                    OPTIMIZER: {type(self.optimizer)}
                    LOSS FUNCTION: {type(self.loss_function)}
                    LEARNING RATE: {self.learning_rate}
                    TARGET_NETWORK: enabled-> {not self.disable_double}
                                    target_updates->{self.target_updates}
                    MEMORY: {len(self.done_hist)}/{self.max_memory_size:0.0f} ¦ {memory_ratio:0.2f}%
                    EPSILON: {self.exploration_rate:0.6f}
                             min-> {self.exploration_min}
                    ACTIONS TAKEN:  greedy-> {self.greedy_actions}
                                    exploration-> {self.exploration_actions}
                    REPLAY: batch_size-> {self.sample_batch_size}
                            gamma-> {self.gamma}
                            gradient_updates-> {self.gradient_updates}
                    """
        return ret_string

    def _build_model(self, name=None):
        """Build one network: input -> copies of the anatomy layers -> linear output.

        Each call instantiates fresh copies of the layers in
        ``self.model_anatomy``. Calling the very same layer objects for the
        live and the target model would make both models share their hidden
        weights, which defeats the purpose of a target network.

        Args:
            name: Name given to the Keras model.

        Returns:
            The (optionally compiled) ``keras.Model``.
        """
        inputs = layers.Input(shape=self.state_size)

        #create hidden layers from the templates
        x = inputs
        for template in self.model_anatomy:
            layer = type(template).from_config(template.get_config())
            x = layer(x)
        #output layer, one Q-value per action
        outputs = layers.Dense(self.action_size, activation="linear")(x)

        #create model
        model = keras.Model(inputs=inputs, outputs=outputs, name=name)

        if self.compile_model:
            model.compile(optimizer=self.optimizer,
                          loss=self.loss_function)
        #model summary if enabled (only printed for compiled models)
        if self.model_summary and self.compile_model:
            model.summary()
        return model

    def update_memory(self, state, reward, action, state_next, done):
        """Append one transition to the replay memory, evicting the oldest if full.

        Format of step: [state(t), reward(t+1), action(t), state(t+1), done?]
        States are stored compressed when ``compress_memory`` is enabled.
        """
        if self.compress_memory:
            state = self.replay_compress(state)
            state_next = self.replay_compress(state_next)

        self.reward_hist.append(reward)
        self.action_hist.append(action)
        self.done_hist.append(done)
        self.state_hist.append(state)
        self.state_next_hist.append(state_next)

        if len(self.state_hist) > self.max_memory_size:
            #drop the oldest transition from every parallel list
            for buffer in (self.state_hist, self.reward_hist, self.action_hist,
                           self.state_next_hist, self.done_hist):
                del buffer[:1]

    def pick_action(self, state):
        """Pick an action epsilon-greedily.

        Args:
            state: State passed to the live model through
                ``tf.convert_to_tensor``; only the first row of the
                prediction is used.

        Returns:
            A random action index with probability ``exploration_rate``,
            otherwise the argmax of the predicted Q-values (0-d numpy array).
        """
        if np.random.rand(1)[0] < self.exploration_rate:
            #return random move
            self.exploration_actions += 1
            return np.random.choice(self.action_size)

        q_values = self.predict(tf.convert_to_tensor(state))[0]
        #return action with the highest expected reward
        self.greedy_actions += 1
        return np.array(tf.argmax(q_values))

    def predict(self, state, main=True):
        """Run ``predict`` on the live model (``main=True``) or the target model."""
        network = self.model if main else self.target_model
        return network.predict(state, verbose=self.model_verbose)

    def update_target(self):
        """Copy the live model's weights into the target model."""
        self.target_updates += 1
        self.target_model.set_weights(self.model.get_weights())

    #compress states into a bytes
    def replay_compress(self, replay_obj):
        """Cast a state to ``state_dtype`` and return it as lz4-compressed bytes.

        NOTE: the cast is lossy if the state does not already fit
        ``state_dtype`` (e.g. floats cast to the default ``uint8``).
        """
        #replay_obj *= self.dtype_info.max #scale [0-1] to [0-max_dtype]
        as_dtype = np.asarray(replay_obj).astype(dtype=self.state_dtype)
        # tobytes() emits C order, which is what replay_unpack's reshape expects.
        return lz4f.compress(as_dtype.tobytes())

    #helper function for sampling the minibatch from the replay memory
    def replay_unpack(self, replay_obj):
        """Decompress bytes from :meth:`replay_compress` into a ``state_size`` array."""
        bytes_decompressed = lz4f.decompress(replay_obj)
        arr = np.frombuffer(bytes_decompressed, dtype=self.state_dtype)
        #reshaped_adj = reshaped / self.dtype_info.max
        return np.reshape(arr, self.state_size)

    def _decay_exploration(self):
        """Apply one decay step to epsilon without dropping below ``exploration_min``."""
        if self.exploration_rate <= self.exploration_min:
            return
        if self.linear_decrease:
            decayed = self.exploration_rate - self.exploration_decay
        else:
            decayed = self.exploration_rate * self.exploration_decay
        self.exploration_rate = max(self.exploration_min, decayed)

    def replay(self, debug=False):
        """Run one gradient update on a random minibatch from the replay memory.

        Does nothing until at least ``sample_batch_size`` transitions are
        stored. Afterwards the exploration rate is decayed by one step.

        Args:
            debug: Print a few sampled values and the loss.
        """
        if len(self.state_hist) < self.sample_batch_size:
            return

        self.gradient_updates += 1
        #samples random experiences from memory (with replacement)
        indices = np.random.choice(len(self.done_hist), size=self.sample_batch_size)

        if debug:
            print("++++++++++++")

        if self.compress_memory:
            state_batch = np.array([
                self.replay_unpack(self.state_hist[i]) for i in indices])
            next_state_batch = np.array([
                self.replay_unpack(self.state_next_hist[i]) for i in indices])
        else:
            state_batch = np.array([self.state_hist[i] for i in indices])
            next_state_batch = np.array([self.state_next_hist[i] for i in indices])

        action_batch = [self.action_hist[i] for i in indices]
        reward_batch = [self.reward_hist[i] for i in indices]
        done_batch = tf.convert_to_tensor(
            [float(self.done_hist[i]) for i in indices]
        )

        #get future rewards from target model
        future_rewards = self.target_model.predict(next_state_batch, verbose=self.model_verbose)
        max_future = tf.reduce_max(future_rewards, axis=1)

        #Q-learning target: reward + gamma * max_a' Q_target(s', a')
        updated_q_values = reward_batch + (self.gamma * max_future)
        #terminal transitions get a fixed target of -1
        updated_q_values = updated_q_values * (1 - done_batch) - done_batch

        #one-hot mask so only the Q-value of the action taken contributes
        masks = tf.one_hot(action_batch, self.action_size)

        with tf.GradientTape() as tape:
            q_values = self.model(state_batch)
            q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
            loss = self.loss_function(updated_q_values, q_action)

        # Backpropagation
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))

        if debug:
            for i in range(min(2, self.sample_batch_size)):
                print(action_batch[i])
                print(reward_batch[i])
                print(max_future[i])
                print(updated_q_values[i])
                print("-----")
            print(f"{loss=}")

        #update exploration rate
        self._decay_exploration()

    def build_name(self, name=None):
        """Return ``<default_name>-[<name>-]<dd-mm-YYYY-HH-MM>``.

        An empty or ``None`` name is omitted, so no double dash is produced.
        """
        infix = f"{name}-" if name else ""
        now = datetime.now().strftime("%d-%m-%Y-%H-%M")
        return f"{self.default_name}-{infix}{now}"

    def save_model(self, name="", save_memory=False):
        """Save the live model as ``./models/<build_name>.h5``.

        Args:
            name: Optional tag inserted into the file name.
            save_memory: Reserved; persisting the replay memory is not
                implemented yet.
        """
        import os

        # Build the name once so the saved file and the message always agree,
        # even if the clock rolls over to the next minute in between.
        path = f"./models/{self.build_name(name=name)}.h5"
        os.makedirs("./models", exist_ok=True)
        self.model.save(path)
        if save_memory:
            # TODO: persist the replay memory lists alongside the model.
            print("REPLAY MEMORY NOT SAVED (not implemented)")
        print(f"MODEL SAVED AS {path}")

    def load_model(self, overwrite_epsilon=-1):
        """Interactively load a model from ``./models`` and reset epsilon.

        Args:
            overwrite_epsilon: New exploration rate; ``-1`` selects
                ``exploration_min``.
        """
        #set an value for epsilon to overwrite it
        if overwrite_epsilon == -1:
            self.exploration_rate = self.exploration_min
        else:
            self.exploration_rate = overwrite_epsilon

        name = input("model file (in ./models):")
        self.model = keras.models.load_model(f"./models/{name}.h5")
        if self.disable_double:
            self.target_model = self.model
        else:
            # Keep the target network a separate model; aliasing the live
            # model would silently turn the agent into a single DQN.
            self.target_model = keras.models.clone_model(self.model)
            self.target_model.set_weights(self.model.get_weights())
        pkl_name = input("replay memory file (enter x if none): ")

        if pkl_name != "x":
            # TODO: restore the replay memory lists from the given file.
            print("REPLAY MEMORY NOT LOADED (not implemented)")
        print(f"LOADED ./models/{name}.h5")
        