In [10]:
#import os
#for AMD gpus
#os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"
from datetime import datetime
import gym
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import math
import random
import matplotlib.pyplot as plt
#import random
import pickle
import logging

In [73]:
#define the agent
class Agent:
    """Deep Q-Network agent with an optional target network and replay memory.

    The agent owns two Keras models built from the same layer "anatomy":
    ``model`` (trained in :meth:`replay`) and ``target_model`` (used to
    bootstrap the Q-targets and refreshed through :meth:`update_target`).
    Transitions are stored in five parallel lists that act as a FIFO replay
    memory.

    Args:
        state_size: Shape handed to ``keras.Input(shape=...)``.
        action_size: Number of discrete actions (units of the linear output layer).
        name: Prefix used by :meth:`build_name` for saved files.
        anatomy: Sequence of ``tf.keras.layers`` instances describing the hidden
            layers. They are used as *templates*: every model gets its own
            copies, so the live and the target network never share weights.
            Defaults to two ``Dense(24)`` layers.
        compile_model: When False the models are built but not compiled; the
            caller must then compile ``model`` before calling :meth:`replay`.
        lr: Learning rate of the default Adam optimizer.
        loss_function: Keras loss; defaults to ``MeanSquaredError()``.
        optimizer: Keras optimizer; defaults to ``Adam(lr, clipnorm=1.0)``.
        model_summary: Print ``model.summary()`` for every model that is built.
        model_verbose: ``verbose`` value forwarded to ``predict`` and ``fit``.
        gamma: Discount factor, 0 <= gamma <= 1.
        linear_decrease: If True ``epsilon_decay`` is subtracted from epsilon
            after each replay, otherwise epsilon is multiplied by it.
        epsilon: Initial exploration rate.
        epsilon_min: Lower bound of the exploration rate.
        epsilon_decay: Decay factor (or decrement when ``linear_decrease``).
        batch_size: Number of transitions sampled per :meth:`replay` call.
        max_memory_size: Maximum number of stored transitions.
        disable_double: If True no separate target network is kept and
            ``target_model`` is the live model itself.
    """

    def __init__(self, state_size, action_size,
                 name="DeepQNetwork",
                 #anatomy of the hidden layers, must add the object from tf.keras.layers directly
                 anatomy=None, #defaults to two Dense(24) layers. Is set below
                 #developer option,
                 compile_model=True,
                 lr=0.001,
                 loss_function=None, #defaults to MeanSquaredError. Is set below
                 optimizer=None, #defaults to adam. Is set below
                 #debug options
                 model_summary=True,
                 model_verbose=0,
                 #discount factor 0 <= gamma <= 1
                 gamma=0.95,
                 #exploration parameters
                 linear_decrease=False,
                 epsilon=1,
                 epsilon_min=0.0001,
                 epsilon_decay=0.995,
                 #replay options
                 batch_size=32,
                 max_memory_size=1e6,
                 disable_double=False, #if true, acts like a single dqn
                ):

        #model parameters
        # Fresh layer objects per agent: a list default in the signature would be
        # created once and shared by every Agent instance.
        if anatomy is None:
            anatomy = [layers.Dense(24), layers.Dense(24)]
        self.model_anatomy = list(anatomy)
        self.default_name = name
        self.disable_double = disable_double
        self.compile_model = compile_model
        #tracking vars
        self.gradient_updates = 0
        self.greedy_actions = 0
        self.exploration_actions = 0
        self.target_updates = 0

        self.state_size = state_size
        self.action_size = action_size
        self.max_memory_size = max_memory_size
        if loss_function is None:
            loss_function = keras.losses.MeanSquaredError()
        self.loss_function = loss_function

        #set default optimizer to Adam, if not provided
        if optimizer is not None:
            self.optimizer = optimizer
        else:
            self.optimizer = Adam(learning_rate=lr, clipnorm=1.0)
            print(f"Optimizer Adam: {lr=}")

        #initiate memory (parallel lists, index i describes one transition)
        self.action_hist = []
        self.state_hist = []
        self.state_next_hist = []
        self.reward_hist = []
        self.done_hist = []

        #hyperparameters
        self.learning_rate = lr
        self.gamma = gamma
        self.linear_decrease = linear_decrease
        self.exploration_rate = epsilon
        self.exploration_min = epsilon_min
        self.exploration_decay = epsilon_decay
        self.sample_batch_size = batch_size

        #debug options
        self.model_summary = model_summary
        self.model_verbose = model_verbose

        #create model
        self.model = self._build_model(name="live_model")
        if not disable_double:
            self.target_model = self._build_model(name="target_model")
        else:
            self.target_model = self.model

    def __str__(self):
        """Return a multi-line summary of hyperparameters and counters (debugging aid)."""
        capacity = self.max_memory_size
        # guard against a zero-sized memory instead of raising ZeroDivisionError
        memory_ratio = (len(self.done_hist) / capacity) * 100 if capacity else 0.0
        ret_string = f"""
                    NAME: {self.default_name}
                    INPUT SHAPE: {self.state_size}, OUTPUT SHAPE: {self.action_size}
                    OPTIMIZER: {type(self.optimizer)}
                    LOSS FUNCTION: {type(self.loss_function)}
                    LEARNING RATE: {self.learning_rate}
                    TARGET_NETWORK: enabled-> {not self.disable_double}
                                    target_updates->{self.target_updates}
                    MEMORY: {len(self.done_hist)}/{self.max_memory_size:0.0f} ¦ {memory_ratio:0.2f}%
                    EPSILON: {self.exploration_rate:0.6f}
                             min-> {self.exploration_min}
                    ACTIONS TAKEN:  greedy-> {self.greedy_actions}
                                    exploration-> {self.exploration_actions}
                    REPLAY: batch_size-> {self.sample_batch_size}
                            gamma-> {self.gamma}
                            gradient_updates-> {self.gradient_updates}
                    """
        return ret_string

    def _build_model(self, name=None):
        """Build (and optionally compile) one Q-network from the stored anatomy.

        Every call instantiates its own copies of the anatomy layers through
        ``get_config``/``from_config``. Calling the template instances directly
        would make all models built by this agent share the same hidden-layer
        weights, which defeats the purpose of a separate target network.

        Args:
            name: Name given to the resulting ``keras.Model``.

        Returns:
            The new ``keras.Model`` mapping states to one Q-value per action.
        """
        #input layer
        inputs = keras.Input(shape=self.state_size)

        #create hidden layers from fresh copies of the templates
        copies = {}  # id(template) -> copy, keeps intentional reuse inside one model
        x = inputs
        for template in self.model_anatomy:
            key = id(template)
            if key not in copies:
                copies[key] = type(template).from_config(template.get_config())
            x = copies[key](x)
        #output layer
        outputs = layers.Dense(self.action_size, activation="linear")(x)

        #create model
        model = keras.Model(inputs=inputs, outputs=outputs, name=name)
        if self.compile_model:
            model.compile(optimizer=self.optimizer,
                          loss=self.loss_function)
        #model summary if enabled
        if self.model_summary:
            model.summary()
        return model

    def update_memory(self, state, reward, action, state_next, done):
        """Append one transition and evict the oldest one when memory is full.

        Format of step: [state(t), reward(t+1), action(t), state(t+1), done?]
        """
        self.state_hist.append(state)
        self.reward_hist.append(reward)
        self.action_hist.append(action)
        self.state_next_hist.append(state_next)
        self.done_hist.append(done)

        if len(self.state_hist) > self.max_memory_size:
            # drop the oldest entry from every parallel list to keep them aligned
            for history in (self.state_hist, self.reward_hist, self.action_hist,
                            self.state_next_hist, self.done_hist):
                del history[:1]

    def pick_action(self, state):
        """Choose an action epsilon-greedily.

        With probability ``exploration_rate`` a uniformly random action index is
        returned; otherwise the arg-max of the live model's Q-values.

        Args:
            state: Passed unchanged to the model; only row 0 of the prediction
                is used (assumes ``state`` already carries a leading batch
                dimension -- confirm against the caller).
        """
        if np.random.rand(1)[0] < self.exploration_rate:
            #return random move
            self.exploration_actions += 1
            return np.random.choice(self.action_size)

        q_values = self.predict(tf.convert_to_tensor(state))[0]
        #return action with the highest expected reward
        return_val = np.array(tf.argmax(q_values))
        self.greedy_actions += 1
        return return_val

    def predict(self, state, main=True):
        """Run ``predict`` on the live model (``main=True``) or on the target model."""
        network = self.model if main else self.target_model
        return network.predict(state, verbose=self.model_verbose)

    def update_target(self):
        """Copy the live model's weights into the target model."""
        self.target_updates += 1
        self.target_model.set_weights(self.model.get_weights())

    def _decay_exploration(self):
        """Apply one decay step to the exploration rate, never going below the minimum."""
        if self.exploration_rate <= self.exploration_min:
            return
        if self.linear_decrease:
            decayed = self.exploration_rate - self.exploration_decay
        else:
            decayed = self.exploration_rate * self.exploration_decay
        # clamp: a single step (especially a linear one) may overshoot the floor
        self.exploration_rate = max(self.exploration_min, decayed)

    def replay(self, debug=False):
        """Sample a random batch from memory and fit the live model on it once.

        Does nothing until at least ``sample_batch_size`` transitions are
        stored. Targets are ``reward + gamma * max_a Q_target(s', a)``; for
        terminal transitions the target is forced to -1 (the stored reward is
        discarded). Afterwards the gradient-update counter is incremented and
        the exploration rate is decayed.

        Args:
            debug: Print the intermediate values of the first samples.
        """
        batch = self.sample_batch_size
        if len(self.state_hist) < batch:
            return

        #replay
        #samples random experiences from memory (with replacement)
        indices = np.random.choice(range(len(self.done_hist)), size=batch)

        if debug:
            print("++++++++++++")

        state_batch = np.array([self.state_hist[i] for i in indices])
        next_state_batch = np.array([self.state_next_hist[i] for i in indices])
        action_batch = [self.action_hist[i] for i in indices]
        reward_batch = [self.reward_hist[i] for i in indices]
        done_batch = tf.convert_to_tensor(
            [float(self.done_hist[i]) for i in indices]
        )

        #get future rewards from target model
        future_rewards = self.predict(tf.convert_to_tensor(next_state_batch), main=False)
        max_future = tf.reduce_max(future_rewards, axis=1)

        #update q_values for every action in state
        updated_q_values = reward_batch + (self.gamma * max_future)
        #set reward to -1 if done
        updated_q_values = (updated_q_values * (1 - done_batch)) - done_batch

        #get current rewards
        state_batch = tf.convert_to_tensor(state_batch)
        current_rewards = self.predict(state_batch)

        #generate indices to replace: (row, taken action) per sample
        mask = [[row, action_batch[row]] for row in range(batch)]
        #only the Q-value of the action actually taken is moved towards its target
        target_values = tf.tensor_scatter_nd_update(current_rewards, mask, updated_q_values)

        if debug:
            # show at most the first two samples (a batch may hold a single one)
            for k in range(min(2, batch)):
                if k:
                    print("++++")
                print(action_batch[k])
                print(current_rewards[k])
                print(max_future[k])
                print(updated_q_values[k])
                print(target_values[k])
        #update model
        self.model.fit(state_batch, target_values,
                       batch_size=batch,
                       verbose=self.model_verbose)
        # the whole sample is one mini-batch, so fit() performed one gradient step
        self.gradient_updates += 1

        #update exploration rate
        self._decay_exploration()

    def build_name(self, name=None):
        """Return ``<default_name>-[<name>-]<dd-mm-YYYY-HH-MM>``.

        ``None`` and the empty string both mean "no extra name", so the result
        never contains an empty segment.
        """
        infix = f"{name}-" if name else ""
        now = datetime.now().strftime("%d-%m-%Y-%H-%M")
        return f"{self.default_name}-{infix}{now}"

    def save_model(self, name="", save_memory=False):
        """Save the live model to ``./models/<build_name>.h5``.

        Args:
            name: Optional extra name segment, see :meth:`build_name`.
            save_memory: Also pickle the replay memory next to the model
                (same base name, ``.pkl`` extension).
        """
        # build the timestamped name once so file and message always agree
        base_path = f"./models/{self.build_name(name=name)}"
        self.model.save(f"{base_path}.h5")
        if save_memory:
            memory = {
                "state_hist": self.state_hist,
                "reward_hist": self.reward_hist,
                "action_hist": self.action_hist,
                "state_next_hist": self.state_next_hist,
                "done_hist": self.done_hist,
            }
            with open(f"{base_path}.pkl", "wb") as handle:
                pickle.dump(memory, handle)
        print(f"MODEL SAVED AS {base_path}.h5")

    def load_model(self, overwrite_epsilon=-1):
        """Interactively load a saved model (and optionally a replay memory).

        Args:
            overwrite_epsilon: Exploration rate to use after loading; the
                default -1 sets it to ``exploration_min``.
        """
        #set an value for epsilon to overwrite it
        if overwrite_epsilon == -1:
            self.exploration_rate = self.exploration_min
        else:
            self.exploration_rate = overwrite_epsilon

        name = input("model file (in ./models):")
        self.model = keras.models.load_model(f"./models/{name}.h5")
        if self.disable_double:
            self.target_model = self.model
        else:
            # keep an independent target network; aliasing the live model here
            # would silently turn the target network off
            self.target_model = keras.models.clone_model(self.model)
            self.target_model.set_weights(self.model.get_weights())
        pkl_name = input("replay memory file (enter x if none): ")

        if pkl_name != "x":
            # SECURITY: unpickling executes arbitrary code -- only load files
            # that were written by save_model() from a trusted source.
            with open(f"./models/{pkl_name}.pkl", "rb") as handle:
                memory = pickle.load(handle)
            self.state_hist = list(memory["state_hist"])
            self.reward_hist = list(memory["reward_hist"])
            self.action_hist = list(memory["action_hist"])
            self.state_next_hist = list(memory["state_next_hist"])
            self.done_hist = list(memory["done_hist"])
        print(f"LOADED ./models/{name}.h5")
        

In [74]:
# Smoke test: build an agent with a single state feature and a single action.
agent = Agent(state_size=1, action_size=1)

Optimizer Adam: lr=0.001
Model: "live_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_53 (InputLayer)       [(None, 1)]               0         
                                                                 
 dense_88 (Dense)            (None, 24)                48        
                                                                 
 dense_89 (Dense)            (None, 24)                600       
                                                                 
 dense_90 (Dense)            (None, 1)                 25        
                                                                 
Total params: 673
Trainable params: 673
Non-trainable params: 0
_________________________________________________________________
Model: "target_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_54 (

In [75]:
# Print the agent's summary string produced by Agent.__str__.
print(agent)


                    NAME: DeepQNetwork
                    INPUT SHAPE: 1, OUTPUT SHAPE: 1
                    OPTIMIZER: <class 'keras.optimizers.optimizer_v2.adam.Adam'>
                    LOSS FUNCTION: <class 'keras.losses.MeanSquaredError'>
                    LEARNING RATE: 0.001
                    TARGET_NETWORK: enabled-> True
                                    target_updates->0
                    MEMORY: 0/1000000 ¦ 0.00%
                    EPSILON: 1.000000
                             min-> 0.0001
                    ACTIONS TAKEN:  greedy-> 0
                                    exploration-> 0
                    REPLAY: batch_size-> 32
                            gamma-> 0.95
                            gradient_updates-> 0
                    
