In [None]:
#import os
#for AMD gpus
#os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"
from datetime import datetime
import gym
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import math
import random
import matplotlib.pyplot as plt
#import random
import pickle


In [None]:
#define the agent
class Agent:
    """Deep Q-Network agent with replay memory and a separate target network.

    Two Keras models with the same architecture are built: ``model`` (trained
    in ``replay``) and ``target_model`` (used to bootstrap the targets, synced
    with ``update_target``). Transitions are kept in five parallel lists.
    """

    def __init__(self, state_size, action_size,
                 #anatomy of the hidden layers, must add the object from tf.keras.layers directly
                 anatomy=None,
                 name="DeepQNetwork",
                 lr=0.001,
                 #discount factor 0 <= gamma <= 1
                 gamma=0.95,
                 #True: epsilon -= epsilon_decay, False: epsilon *= epsilon_decay
                 linear_decrease=False,
                 #probability that it takes an exploratory action
                 epsilon=1,
                 epsilon_min=0.0001,
                 epsilon_decay=0.995,
                 batch_size=32,
                 model_summary=True,
                 model_verbose=0,
                 max_memory_size=1e6,
                 loss="mse",
                 optimizer=None
                 ):
        """Store the hyperparameters and build the online and target models.

        Args:
            state_size: input shape of one state (tuple), or an int for a
                flat state vector.
            action_size: number of discrete actions (size of the output layer).
            anatomy: sequence of Keras layer objects used as *templates* for
                the hidden layers. Each model gets its own copies, so the
                layers must support ``get_config``/``from_config``. Defaults
                to two ``Dense(24)`` layers (no activation).
            name: prefix used by ``build_name`` for saved files.
            lr: learning rate of the default Adam optimizer.
            gamma: discount factor.
            linear_decrease: selects subtractive instead of multiplicative
                epsilon decay.
            epsilon: initial exploration rate.
            epsilon_min: lower bound of the exploration rate.
            epsilon_decay: decay factor (or decrement when ``linear_decrease``).
            batch_size: number of transitions sampled per ``replay`` call.
            model_summary: print ``model.summary()`` when a model is built.
            model_verbose: stored as ``self.model_verbose``; not used inside
                this class.
            max_memory_size: maximum number of stored transitions.
            loss: Keras loss identifier, loss function or ``Loss`` instance.
            optimizer: optional optimizer instance; overrides ``lr``.
        """
        #a fresh default per instance avoids the shared mutable default argument
        if anatomy is None:
            anatomy = [layers.Dense(24), layers.Dense(24)]

        #model parameters
        self.model_anatomy = list(anatomy)
        self.default_name = name
        self.state_size = state_size
        self.action_size = action_size
        self.max_memory_size = max_memory_size
        self.loss = loss
        #replay() needs a callable, "mse" and friends are only identifiers
        self._loss_fn = keras.losses.get(loss)
        if optimizer is not None:
            self.optimizer = optimizer
        else:
            self.optimizer = Adam(learning_rate=lr, clipnorm=1.0)

        #initiate memory
        self.action_hist = []
        self.state_hist = []
        self.state_next_hist = []
        self.reward_hist = []
        self.done_hist = []

        #hyperparameters
        self.learning_rate = lr
        self.gamma = gamma
        self.linear_decrease = linear_decrease
        self.exploration_rate = epsilon
        self.exploration_min = epsilon_min
        self.exploration_decay = epsilon_decay
        self.sample_batch_size = batch_size

        #debug options
        self.model_summary = model_summary
        self.model_verbose = model_verbose

        #create model
        self.model = self._build_model()
        self.target_model = self._build_model()

    def _build_model(self):
        """Build and compile one network from the layer templates.

        Every template layer is re-instantiated from its config so that the
        online and the target network never share weights.
        """
        #input layer
        if isinstance(self.state_size, (int, np.integer)):
            input_shape = (self.state_size,)
        else:
            input_shape = self.state_size
        inputs = keras.Input(shape=input_shape)

        #create hidden layers; a template listed twice maps to one copy
        copies = {}
        x = inputs
        for template in self.model_anatomy:
            key = id(template)
            if key not in copies:
                copies[key] = template.__class__.from_config(template.get_config())
            x = copies[key](x)

        #output layer
        outputs = layers.Dense(self.action_size, activation="linear")(x)

        #create model
        model = keras.Model(inputs=inputs, outputs=outputs)
        model.compile(optimizer=self.optimizer,
                      loss=self.loss)
        #model summary if enabled
        if self.model_summary:
            model.summary()
        return model

    def update_memory(self, state, reward, action, state_next, done):
        """Append one transition and drop the oldest one when memory is full."""
        #Format of step: [state(t), reward(t+1), action(t), state(t+1), done?]
        self.state_hist.append(state)
        self.reward_hist.append(reward)
        self.action_hist.append(action)
        self.state_next_hist.append(state_next)
        self.done_hist.append(done)

        if len(self.state_hist) > self.max_memory_size:
            for history in (self.state_hist, self.reward_hist, self.action_hist,
                            self.state_next_hist, self.done_hist):
                del history[0]

    def pick_action(self, state):
        """Return an action index (Python int) using an epsilon-greedy policy.

        NOTE(review): the greedy branch takes row 0 of the prediction, so
        ``state`` is assumed to already carry a leading batch dimension.
        """
        if random.random() < self.exploration_rate:
            #return random move
            return int(np.random.choice(self.action_size))

        q_values = self.predict(tf.convert_to_tensor(state))[0]

        #return action with the highest expected reward
        return int(tf.argmax(q_values).numpy())

    def predict(self, state, main=True):
        """Run ``state`` through the online model (``main``) or the target model."""
        network = self.model if main else self.target_model
        return network(state)

    def update_target(self):
        """Copy the weights of the online model into the target model."""
        self.target_model.set_weights(self.model.get_weights())

    def replay(self):
        """Train the online model on one random batch and decay epsilon.

        Does nothing until at least ``sample_batch_size`` transitions are stored.
        """
        if len(self.state_hist) < self.sample_batch_size:
            return

        #samples random experiences from memory (with replacement)
        indices = np.random.choice(len(self.done_hist), size=self.sample_batch_size)

        state_batch = np.array([self.state_hist[i] for i in indices])
        next_state_batch = np.array([self.state_next_hist[i] for i in indices])
        action_batch = [self.action_hist[i] for i in indices]

        #use target for stability
        fut_rewards = self.predict(next_state_batch, main=False)
        dtype = fut_rewards.dtype

        reward_batch = tf.cast(
            tf.convert_to_tensor([self.reward_hist[i] for i in indices]), dtype)
        done_batch = tf.cast(
            tf.convert_to_tensor([float(self.done_hist[i]) for i in indices]), dtype)

        #bootstrapped target: r + gamma * max_a' Q_target(s', a')
        updated_q_vals = reward_batch + self.gamma * tf.reduce_max(fut_rewards, axis=1)
        #terminal transitions get a fixed target of -1
        updated_q_vals = updated_q_vals * (1 - done_batch) - done_batch

        masks = tf.one_hot(action_batch, self.action_size, dtype=dtype)

        with tf.GradientTape() as tape:
            q_values = self.predict(state_batch)
            #apply the masks to get the Q-value of the action actually taken
            q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
            loss = self._loss_fn(updated_q_vals, q_action)

        #gradients are taken outside the tape so the tape does not record them
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))

        #update exploration rate
        self._decay_exploration()

    def _decay_exploration(self):
        """Apply one decay step to epsilon without dropping below its minimum."""
        if self.exploration_rate > self.exploration_min:
            if self.linear_decrease:
                decayed = self.exploration_rate - self.exploration_decay
            else:
                decayed = self.exploration_rate * self.exploration_decay
            self.exploration_rate = max(self.exploration_min, decayed)

    def build_name(self, name=""):
        """Return ``<default_name>-<name>-<dd-mm-YYYY-HH-MM>`` for the current time."""
        now = datetime.now().strftime("%d-%m-%Y-%H-%M")
        return f"{self.default_name}-{name}-{now}"

    def save_model(self, name="", save_memory=False):
        """Save the online model to ``./models`` and optionally the replay memory.

        Args:
            name: extra tag inserted into the file name.
            save_memory: also pickle the five history lists to a ``.pkl`` file
                with the same base name.
        """
        import os

        #build the timestamped name once so every path and the message agree
        base_path = f"./models/{self.build_name(name=name)}"
        os.makedirs("./models", exist_ok=True)

        self.model.save(f"{base_path}.h5")
        if save_memory:
            memory = {
                "state": self.state_hist,
                "reward": self.reward_hist,
                "action": self.action_hist,
                "state_next": self.state_next_hist,
                "done": self.done_hist,
            }
            with open(f"{base_path}.pkl", "wb") as handle:
                pickle.dump(memory, handle)
        print(f"MODEL SAVED AS {base_path}.h5")

    def load_model(self, overwrite_epsilon=-1):
        """Interactively load a saved model (and optionally replay memory).

        Args:
            overwrite_epsilon: exploration rate to use after loading; the
                default -1 sets it to ``exploration_min``.
        """
        #set an value for epsilon to overwrite it
        if overwrite_epsilon == -1:
            self.exploration_rate = self.exploration_min
        else:
            self.exploration_rate = overwrite_epsilon

        name = input("model file (in ./models):")
        self.model = keras.models.load_model(f"./models/{name}.h5")
        #the target must be an independent copy, not an alias of the online model
        self.target_model = keras.models.clone_model(self.model)
        self.target_model.set_weights(self.model.get_weights())

        pkl_name = input("replay memory file (enter x if none): ")
        if pkl_name != "x":
            #NOTE: pickle can execute arbitrary code, only load trusted files
            with open(f"./models/{pkl_name}.pkl", "rb") as handle:
                memory = pickle.load(handle)
            self.state_hist = list(memory["state"])
            self.reward_hist = list(memory["reward"])
            self.action_hist = list(memory["action"])
            self.state_next_hist = list(memory["state_next"])
            self.done_hist = list(memory["done"])
        print(f"LOADED ./models/{name}.h5")
        