# <font color='purple'>**Common**</font>

## Installation

In [None]:
# !pip install gym
# !pip install gym[classic_control]

## <font color='green'>Basics</font>

In [None]:
import gym
import random
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
from collections import deque
from tensorflow.keras.regularizers import l2
from IPython.display import clear_output
import matplotlib.pyplot as plt
import gc
from numba import cuda
import time
import h5py
import json
import os
import math

# Rolling buffer of the most recent messages shown by log().
log_buffer = []

def clearLog():
    """Empty the shared log buffer in place.

    The buffer is cleared rather than rebound so that every reference to
    `log_buffer` (including the one used by `log`) sees the empty list.
    """
    log_buffer.clear()

def log(message):
    """Append `message` to the rolling buffer and redraw the cell output.

    When the buffer already holds more than five entries the oldest one is
    dropped before the new message is added.
    """
    if len(log_buffer) > 5:
        del log_buffer[0]
    log_buffer.append(message)

    # Replace the previous output instead of scrolling the notebook cell.
    clear_output(wait=True)
    for entry in log_buffer:
        print(entry)

def releaseMemory():
    """Run the garbage collector, then select CUDA device 0 and close it via numba."""
    gc.collect()
    # numba: make device 0 current, then close it.
    cuda.select_device(0)
    cuda.close()

def bn(x):
    """Apply a newly created `BatchNormalization` layer to `x` and return the result."""
    return layers.BatchNormalization()(x)

def bn_relu(x, leaky=-1):
    """Batch-normalize `x`, then apply the activation chosen by `relu(…, leaky)`."""
    normalized = layers.BatchNormalization()(x)
    return relu(normalized, leaky)

def relu(x, leaky=-1):
    """Apply a ReLU-family activation to `x`.

    `leaky == -1` (the default) selects a plain `ReLU` layer; any other value
    is passed to `LeakyReLU` as its first argument.
    """
    activation = layers.ReLU() if leaky == -1 else layers.LeakyReLU(leaky)
    return activation(x)

def conv(x, filterNumb, kernel_size, strides=1, use_bias=True):
    """Apply a 'same'-padded `Conv2D` with an L2(0.01) kernel regularizer to `x`."""
    layer = layers.Conv2D(
        filterNumb,
        kernel_size,
        strides,
        padding='same',
        use_bias=use_bias,
        kernel_regularizer=l2(0.01),
    )
    return layer(x)

def residual_block(x, filterNumb, kernel_size=3, poolStride=1):
    """Two-convolution residual block: conv -> BN -> ReLU -> conv -> BN -> add -> ReLU.

    Args:
        x: input tensor; its last axis is treated as the channel axis.
        filterNumb: number of filters of both convolutions (output channels).
        kernel_size: kernel size of the two main-path convolutions.
        poolStride: stride of the first convolution and of the shortcut
            projection.

    Returns:
        The activated sum of the main path and the shortcut.
    """
    shortcut = x
    # Project the shortcut whenever its shape would differ from the main
    # path: either the spatial size changes (stride) or the channel count
    # changes. Without the channel check the Add below fails for stride-1
    # blocks that change the number of filters.
    if poolStride != 1 or x.shape[-1] != filterNumb:
        shortcut = conv(x, filterNumb, kernel_size=1, strides=poolStride)

    fx = conv(x, filterNumb, kernel_size=kernel_size, strides=poolStride)
    fx = bn_relu(fx)
    fx = conv(fx, filterNumb, kernel_size=kernel_size)
    fx = layers.BatchNormalization()(fx)
    fx = layers.Add()([fx, shortcut])  # skip connection
    fx = relu(fx)
    return fx

def getEnvInputOutputShape(env):
    """Report the image shape, observation-vector shape and action count of `env`.

    The environment is reset, rendered once, and reset again. The image shape
    is taken from the first channel of the rendered frame.

    Returns:
        Tuple `(inShape_img, inShape_vector, outShape)`.
    """
    env.reset()
    frame = env.render()
    env.reset()  # reset again after the probe render

    first_channel = frame[:, :, 0]
    inShape_img = first_channel.shape
    inShape_vector = env.observation_space.shape
    outShape = env.action_space.n

    print(f'[getEnvInputOutputShape] inShape_img={inShape_img} inShape_vector={inShape_vector} outShape={outShape}')
    return inShape_img, inShape_vector, outShape

class Serializable:
    """Mixin that saves and restores an object's instance attributes as JSON."""

    @staticmethod
    def _jsonDefault(value):
        """`json` fallback: convert NumPy arrays and scalars to plain Python values.

        Used as the `default=` hook, so it also covers NumPy values nested
        inside lists or dicts and every NumPy scalar type (not only a fixed
        handful of integer/float widths).
        """
        if isinstance(value, np.ndarray):
            return value.tolist()
        if isinstance(value, np.generic):
            return value.item()
        raise TypeError(f'Object of type {type(value).__name__} is not JSON serializable')

    def _applyAttributes(self, values):
        """Copy entries of `values` onto attributes that already exist on `self`."""
        for key, value in values.items():
            if hasattr(self, key):  # silently ignore unknown keys
                setattr(self, key, value)

    def toJson(self, attrList=None, isInclude=True, file=None):
        """Serialize the instance attributes to JSON.

        Args:
            attrList: optional collection of attribute names used as a filter.
            isInclude: when true, only names in `attrList` are serialized;
                when false, names in `attrList` are skipped.
            file: optional writable text file. When given, the JSON is written
                to it and nothing is returned.

        Returns:
            The JSON string when `file` is None, otherwise None.

        Raises:
            Whatever `json` raises for values it cannot encode; the collected
            attributes are printed first to help locate the offending value.
        """
        wanted = bool(isInclude)
        payload = {}
        for key, value in self.__dict__.items():
            if attrList is not None and (key in attrList) != wanted:
                continue
            payload[key] = value

        try:
            if file is None:
                return json.dumps(payload, default=self._jsonDefault)
            json.dump(payload, file, default=self._jsonDefault)
        except Exception:
            print(payload)
            raise

    def fromJson(self, jsonStr):
        """Restore attributes from a JSON string; unknown keys are ignored."""
        self._applyAttributes(json.loads(jsonStr))

    def fromJsonFile(self, file):
        """Restore attributes from an open JSON file; unknown keys are ignored."""
        self._applyAttributes(json.load(file))

class DQNBase:
    """Base class wrapping a compiled Keras model used as a Q-network.

    Subclasses implement `hiddenLayers(inputs)` and return the output tensor;
    the constructor builds and compiles the model around it.
    """

    def __init__(self, inputShape, outputShape, lr, loss_fn='mse'):
        """Build and compile the model.

        Args:
            inputShape: shape passed to `layers.Input`.
            outputShape: stored for subclasses to size their output layer.
            lr: initial Adam learning rate.
            loss_fn: a Keras loss name (resolved with `tf.keras.losses.get`)
                or a loss callable used as-is.
        """
        self.inputShape = inputShape
        self.outputShape = outputShape
        self.lr = lr
        self.loss = -1  # last recorded training loss; -1 until a step has run

        inputs = layers.Input(shape=self.inputShape)
        outputs = self.hiddenLayers(inputs)

        self.optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
        self.model = tf.keras.Model(inputs, outputs)

        if isinstance(loss_fn, str):
            self.loss_fn = tf.keras.losses.get(loss_fn)
        else:
            self.loss_fn = loss_fn

        self.model.compile(optimizer=self.optimizer, loss=self.loss_fn,  metrics=['mae'])

    def setLearningRate(self, lr):
        """Change the optimizer learning rate; a no-op when `lr` is unchanged."""
        if self.lr == lr:
            return
        self.lr = lr
        # Same `learning_rate` variable that clone() used to read, instead of
        # the `lr` alias.
        self.model.optimizer.learning_rate.assign(lr)

    def clone(self):
        """Return a new instance of the same class with copied weights and loss.

        The loss function is forwarded too, so the clone is compiled with the
        same loss as the original instead of falling back to 'mse'.
        """
        cloned_instance = type(self)(self.inputShape, self.outputShape, self.lr, self.loss_fn)
        cloned_instance.copyFrom(self)
        return cloned_instance

    def summary(self):
        """Print the Keras model summary."""
        self.model.summary()

    def trainOnBatch(self, batchX, targetY):
        """Run one `train_on_batch` step and remember the returned loss."""
        loss, _ = self.model.train_on_batch(batchX, targetY)
        self.loss = loss

    def predict(self, x):
        """Return the model output for the batch `x`."""
        return self.model.predict_on_batch(x)

    def copyFrom(self, otherDQN):
        """Copy the recorded loss and all model weights from `otherDQN`."""
        self.loss = otherDQN.loss
        self.model.set_weights(otherDQN.model.get_weights())

    def hiddenLayers(self, inputs):
        """Build the network body; must be overridden by subclasses."""
        raise NotImplementedError('[DQN] hiddenLayers not implemented')
    
class TemporalMemory:
    """Fixed-size ring buffer of arbitrary objects with chunked HDF5 persistence.

    Items are stored in an object array of length `maxlen`; once full, new
    items overwrite the oldest. The array is divided into chunks of
    `chunkSize` slots and `save` only rewrites chunks touched since the last
    save.
    """

    def __init__(self, maxlen, chunkSize=100):
        self.chunkSize = chunkSize
        self.maxlen = maxlen
        self.array = np.empty(maxlen, dtype=object)  # one Python object per slot
        self.tail = 0   # index of the next slot to write
        self.size = 0   # number of filled slots, capped at maxlen
        self.a = 0      # debug flag: set to True by save() when the file already had chunks

        chunkCount = math.ceil(self.maxlen / self.chunkSize)
        # True means "chunk content on disk is up to date".
        self.savedChunks = [True] * chunkCount
        self.cache_batch = None  # reusable output buffer for sampleBatch

    def push(self, value):
        """Store `value` at the tail, overwriting the oldest item when full."""
        if self.size < self.maxlen:
            self.size += 1

        # Mark the chunk that owns this slot as needing a rewrite.
        self.savedChunks[self.tail // self.chunkSize] = False

        self.array[self.tail] = value
        self.tail = (self.tail + 1) % self.maxlen

    def sampleBatch(self, batchSize):
        """Return `batchSize` distinct stored items stacked along axis 0.

        Returns None when fewer than `batchSize` items are stored. The
        returned array is a cached buffer that is overwritten by the next
        call with the same batch size.
        """
        filled = self.array if self.size == self.maxlen else self.array[:self.tail]
        if batchSize > len(filled):
            return None

        batch = np.random.choice(filled, batchSize, replace=False)

        # Reuse the cached buffer only when it matches the requested batch
        # size; stacking into a buffer of another length would raise.
        if self.cache_batch is None or len(self.cache_batch) != batchSize:
            self.cache_batch = np.stack(batch, axis=0)
        else:
            np.stack(batch, axis=0, out=self.cache_batch)

        return self.cache_batch

    def __len__(self):
        return self.size

    def save(self, filePath):
        """Write metadata and every modified chunk to the HDF5 file `filePath`.

        The file is opened in append mode, so chunks already present and not
        modified since the last save are left untouched.

        Raises:
            ValueError: if the file was written with a different `chunkSize`
                or `maxlen`.
        """
        with h5py.File(filePath, 'a') as f:
            metadata = [self.chunkSize, self.maxlen, self.tail, self.size]
            if 'metadata' in f:
                chunkSize, maxlen, _, _ = f['metadata'][()]
                if chunkSize != self.chunkSize:
                    raise ValueError(f'[TemporalMemory] current chunkSize = {self.chunkSize} and file chunkSize = {chunkSize} are different')
                if maxlen != self.maxlen:
                    raise ValueError(f'[TemporalMemory] current maxlen = {self.maxlen} and file maxlen = {maxlen} are different')
                f['metadata'][()] = metadata
            else:
                f['metadata'] = metadata

            if 'array_chunks' in f:
                chunks = f['array_chunks']
                self.a = True
            else:
                chunks = f.create_dataset('array_chunks', shape=len(self.savedChunks), dtype=h5py.string_dtype(encoding='utf-8'), chunks=1)

            for i, isSaved in enumerate(self.savedChunks):
                if isSaved:
                    continue
                start = i * self.chunkSize
                end = min(start + self.chunkSize, self.maxlen)
                # Each chunk is stored as one JSON string.
                chunks[i] = json.dumps(self.array[start:end].tolist())
                # Only mark the chunk clean once the write succeeded.
                self.savedChunks[i] = True

    def load(self, filePath):
        """Restore the buffer from `filePath`; prints a message if the file is missing."""
        if not os.path.exists(filePath):
            print(f'[TemporalMemory] fail to load filePath="{filePath}", file does not exist')
            return

        with h5py.File(filePath, 'r') as f:
            # Convert to plain ints so indices and sizes are not NumPy scalars.
            chunkSize, maxlen, tail, size = (int(v) for v in f['metadata'][()])
            self.chunkSize = chunkSize
            self.maxlen = maxlen
            self.tail = tail
            self.size = size

            if len(self.array) != self.maxlen:
                self.array = np.empty(self.maxlen, dtype=object)

            chunks = f['array_chunks']
            self.savedChunks = [True] * len(chunks)
            for i in range(len(chunks)):
                jsonStr = chunks[i].decode('utf-8')
                if jsonStr == '':
                    continue  # chunk was never written
                start = i * self.chunkSize
                # Assign slot by slot so each stored item stays one object,
                # independent of how NumPy would broadcast a nested list.
                for offset, item in enumerate(json.loads(jsonStr)):
                    self.array[start + offset] = item

class AgentBase(Serializable):
    """Do-nothing agent that defines the hooks the training loop calls."""

    def reset(self):
        """Hook called at the start of every episode; does nothing here."""
        return None

    def chooseAction(self, state_t):
        """Placeholder policy: print a notice and always pick action 0."""
        print('[AgentBase] chooseAction not implemented')
        return 0

    def train(self, batch, step):
        """Hook called with a sampled batch; does nothing here."""
        return None

class Plot(Serializable):
    """Growing line plot whose axis limits expand to include every added point."""

    def __init__(self, size=[0, 10, 0, 10], winTitle="Plot", xTitle="X Axis", yTitle="Y Axis"):
        """`size` is `[xmin, xmax, ymin, ymax]`, the initial axis limits."""
        # Assignment order is kept stable: it determines the key order
        # produced by toJson().
        self.winTitle = winTitle
        self.xTitle = xTitle
        self.yTitle = yTitle
        self.X = []
        self.Y = []
        self.xmin, self.xmax, self.ymin, self.ymax = size

    def add(self, x, y):
        """Append the point `(x, y)` and widen the axis limits to contain it."""
        self.xmin, self.xmax = min(self.xmin, x), max(self.xmax, x)
        self.ymin, self.ymax = min(self.ymin, y), max(self.ymax, y)
        self.X.append(x)
        self.Y.append(y)

    def show(self, msg):
        """Clear the cell output and redraw the curve with `msg` printed in red.

        The text is anchored 20% of the axis span below and to the left of
        the lower-left corner.
        """
        clear_output(wait=True)
        self.focus()
        plt.plot(self.X, self.Y)

        text_x = self.xmin-(self.xmax-self.xmin)*0.2
        text_y = self.ymin-(self.ymax-self.ymin)*0.2
        plt.text(text_x, text_y, msg, fontsize=10, color='red')
        plt.show(block=False)

    def focus(self):
        """Select this plot's figure and apply its limits, title and axis labels."""
        # hash(self) is used as the figure number.
        plt.figure(hash(self))
        limits = [self.xmin, self.xmax, self.ymin, self.ymax]
        plt.axis(limits)
        plt.title(self.winTitle)
        plt.xlabel(self.xTitle)
        plt.ylabel(self.yTitle)

## <font color='green'>DQNAgent</font> Class

In [None]:
class DQNAgent(AgentBase):
    """Epsilon-greedy DQN agent with a policy network and a periodically synced target network."""

    def __init__(self, dqn, decayGamma=0.9, exploreRate=(0.01, 1, 0.9996), syncRate=10, eagerMode=False):
        """Create the agent.

        Args:
            dqn: policy network (a `DQNBase`); the target network is its clone.
            decayGamma: reward discount factor.
            exploreRate: `(min, max, decay)`; exploration starts at `max` and
                is multiplied by `decay` on every action until it reaches `min`.
            syncRate: number of `train` calls between target-network syncs.
            eagerMode: when true, train directly on NumPy arrays without
                `tf.function`; otherwise train on cached `tf.Variable`s.
        """
        self.decayGamma = decayGamma # reward discount factor
        self.exploreRate_min, exploreRate_max, self.exploreRate_decay = exploreRate
        self.exploreRate = exploreRate_max
        self.isTraining = True
        self.syncRate = syncRate
        self.waitToSync = 0
        self.eagerMode = eagerMode

        self.dqn_policy = dqn
        self.dqn_target = dqn.clone()
        self.numActions = dqn.outputShape

        self.is_cache_init = False

    def CopyBatchToCache(self, batch):
        """Split a (state, action, reward, next_state, not_done) batch into cached arrays.

        `batch` is indexed as a 2-D array with one transition per row. The
        state buffers are allocated on the first call and reused afterwards;
        in graph mode the data is also copied into persistent `tf.Variable`s.
        """
        if self.is_cache_init:
            np.stack(batch[:,0], axis=0, out=self.state_t_batch)
            np.stack(batch[:,3], axis=0, out=self.state_t1_batch)
        else:
            self.state_t_batch = np.stack(batch[:,0], axis=0)
            self.state_t1_batch = np.stack(batch[:,3], axis=0)

        # The scalar columns may arrive as object-dtype slices; cast them to
        # concrete numeric dtypes before they are handed to TensorFlow.
        self.action_t_batch = batch[:,1].astype(np.int32)
        self.reward_t_batch = batch[:,2].astype(np.float32)
        self.e_batch = batch[:,4].astype(np.float32)

        if not self.eagerMode:
            if not self.is_cache_init:
                self.state_t_tensor = tf.Variable(self.state_t_batch, dtype=tf.float32)
                self.action_t_tensor = tf.Variable(self.action_t_batch, dtype=tf.int32)
                self.reward_t_tensor = tf.Variable(self.reward_t_batch, dtype=tf.float32)
                self.state_t1_tensor = tf.Variable(self.state_t1_batch, dtype=tf.float32)
                self.e_tensor = tf.Variable(self.e_batch, dtype=tf.float32)
            else:
                self.state_t_tensor.assign(self.state_t_batch)
                self.action_t_tensor.assign(self.action_t_batch)
                self.reward_t_tensor.assign(self.reward_t_batch)
                self.state_t1_tensor.assign(self.state_t1_batch)
                self.e_tensor.assign(self.e_batch)

        self.is_cache_init = True

    def reset(self):
        """Episode-start hook; nothing to reset for this agent."""
        pass

    def chooseAction(self, state_t):
        """Pick an action for `state_t` and decay the exploration rate.

        While training, a uniformly random action is chosen with probability
        `exploreRate`; otherwise the action with the highest predicted Q value.
        """
        if self.isTraining and np.random.uniform(0, 1) < self.exploreRate:
            action = random.randint(0, self.numActions-1)
        else:
            s = np.array([state_t])
            actionsVal = self.dqn_policy.predict(s)
            action = int(np.argmax(actionsVal, axis=1)[0])

        if self.exploreRate > self.exploreRate_min:
            self.exploreRate = max(self.exploreRate * self.exploreRate_decay, self.exploreRate_min)

        return action

    def train(self, batch, step):
        """Train on `batch` and sync the target network every `syncRate` calls."""
        if not self.isTraining:
            return

        self.trainOnBatch(batch)
        # self.trainOnBatch_old(batch)

        self.waitToSync += 1
        if self.waitToSync >= self.syncRate:
            self.waitToSync = 0
            self.dqn_target.copyFrom(self.dqn_policy)

    def trainOnBatch(self, batch):
        """Run one gradient step on `batch` and record the loss on the policy network."""
        self.CopyBatchToCache(batch)

        if self.eagerMode:
            # train directly on numpy arrays
            loss = self.trainOnTensor_eager(self.state_t_batch, self.action_t_batch, self.reward_t_batch, self.state_t1_batch, self.e_batch)
        else:
            # train on tensors
            loss = self.trainOnTensor(self.state_t_tensor, self.action_t_tensor, self.reward_t_tensor, self.state_t1_tensor, self.e_tensor)

        self.dqn_policy.loss = loss.numpy()

    def _trainStep(self, state_t_batch, action_t_batch, reward_t_batch, state_t1_batch, e_batch):
        """Shared body of the graph and eager train steps; returns the mean loss."""
        # calculate the target Q value
        Q_t1_batch = self.dqn_target.model(state_t1_batch, training=False)
        next_action_value_batch = tf.reduce_max(Q_t1_batch, axis=1)
        target_action_value_batch = reward_t_batch + e_batch * self.decayGamma * next_action_value_batch

        # convert the current selected actions into onehot form
        onehot_action_t_batch = tf.one_hot(action_t_batch, self.numActions, dtype=tf.float32)

        # record operations performed on tensors for later gradients computations
        with tf.GradientTape() as tape:
            # calculate the policy Q value of the selected actions
            Q_t_batch = self.dqn_policy.model(state_t_batch, training=True)
            predict_action_value_batch = tf.reduce_sum(Q_t_batch * onehot_action_t_batch, axis=1)

            # Keras losses take (y_true, y_pred): target first, prediction second.
            loss_value = self.dqn_policy.loss_fn(target_action_value_batch, predict_action_value_batch)

        # calculate the gradients and update the dqn_policy model
        trainable = self.dqn_policy.model.trainable_variables
        gradients = tape.gradient(loss_value, trainable)
        self.dqn_policy.optimizer.apply_gradients(zip(gradients, trainable))

        # return the mean loss
        return tf.reduce_mean(loss_value)

    @tf.function
    def trainOnTensor(self, state_t_batch, action_t_batch, reward_t_batch, state_t1_batch, e_batch):
        """Graph-compiled train step (wrapped in `tf.function`)."""
        return self._trainStep(state_t_batch, action_t_batch, reward_t_batch, state_t1_batch, e_batch)

    def trainOnTensor_eager(self, state_t_batch, action_t_batch, reward_t_batch, state_t1_batch, e_batch):
        """Train step without `tf.function`, called with NumPy arrays."""
        return self._trainStep(state_t_batch, action_t_batch, reward_t_batch, state_t1_batch, e_batch)

    def trainOnBatch_old(self, batch):
        """Earlier training path built on `train_on_batch` with full Q-value targets."""
        state_t_batch = np.stack(batch[:,0], axis=0)
        action_t_batch = batch[:,1]
        reward_t_batch = batch[:,2]
        state_t1_batch = np.stack(batch[:,3], axis=0)
        e_batch =  batch[:,4]

        Q_t_target = self.dqn_policy.predict(state_t_batch)
        Q_t1_batch = self.dqn_target.predict(state_t1_batch)
        next_action_value_batch = np.max(Q_t1_batch, axis=1)
        target_action_value_batch = reward_t_batch + e_batch * self.decayGamma * next_action_value_batch

        # Only the Q value of the action actually taken is replaced by its target.
        for i in range(0, len(batch)):
            action_t = action_t_batch[i]
            Q_t_target[i, action_t] = target_action_value_batch[i]

        self.dqn_policy.trainOnBatch(state_t_batch, Q_t_target)

    def toData(self):
        """Return `[exploreRate, numActions, lr]` for persistence."""
        return [self.exploreRate, self.numActions, self.dqn_policy.lr]

    def fromData(self, data):
        """Restore the values written by `toData`.

        `data` may be a float array (e.g. read back from HDF5), so the values
        are converted to plain Python types; in particular `numActions` must
        be an int for `random.randint` and `tf.one_hot`.
        """
        exploreRate, numActions, lr = data
        self.exploreRate = float(exploreRate)
        self.numActions = int(numActions)
        self.dqn_policy.setLearningRate(float(lr))

## <font color='green'>DeepQLearning</font> Class

In [None]:
import cv2
import json

class DeepQLearning:
    """Training/evaluation loop that connects a gym environment, an agent and a replay memory."""

    def __init__(self, env, agent, memSize=10000, batchSize=32, useImageInput=False):
        """Create the loop.

        Args:
            env: environment exposing `reset`, `step` and `render`.
            agent: object with `reset`, `chooseAction` and `train`.
            memSize: capacity of the replay memory.
            batchSize: number of transitions sampled per training call.
            useImageInput: when true, states are built from rendered frames
                passed through `onInputImage`; otherwise from the observation
                vector returned by the environment.
        """
        self.env = env
        self.agent = agent
        self.useImageInput = useImageInput
        self.cache_frames = None  # reusable buffer for stackFramesToChannel
        self.memory = TemporalMemory(memSize)
        self.batchSize = batchSize
        # Optional callbacks, assigned by the caller.
        self.onInputImage = self.__onInputImage__
        self.onEpisodeEnd = None
        self.onStepEnd = None
        self.onKeyPressed = None
        self.lastStep = 0

    def __onInputImage__(self, img):
        """Default frame preprocessing: keep only the first channel."""
        return img[:,:,0]

    def stackFramesToChannel(self, frames):
        """Stack `frames` along a new last axis and return the result as nested lists."""
        if self.cache_frames is None:
            self.cache_frames = np.stack(frames, axis=-1)
        else:
            np.stack(frames, axis=-1, out=self.cache_frames)
        return self.cache_frames.tolist()

    def play(self, frameSkipping=1, steps=-1, fromLastStep=False, frameWaitTime=10):
        """Run episodes until at least `steps` environment steps have been taken.

        Args:
            frameSkipping: number of consecutive frames stacked into one state;
                the chosen action is repeated for all of them.
            steps: step budget, checked between episodes; -1 runs forever.
            fromLastStep: continue counting from `self.lastStep`.
            frameWaitTime: when > 0, show each rendered frame with OpenCV and
                wait this many milliseconds for a key press.
        """
        episode = 1
        step = self.lastStep if fromLastStep else 0

        while step < steps or steps == -1:
            # episode starts
            state_t1_frames = []
            reward_t = 0
            gameEnd = False
            score = 0

            self.agent.reset()
            s_vector = self.env.reset()[0]
            s_img = self.env.render()
            s_frame = self.onInputImage(s_img) if self.useImageInput else s_vector
            # duplicate the first frame to create a complete (stacked) state
            state_t = self.stackFramesToChannel([s_frame for _ in range(frameSkipping)])

            while not gameEnd:
                # choose a new action once the previous frame group was consumed
                if len(state_t1_frames) == 0:
                    action_t = self.agent.chooseAction(state_t)

                # the current action repeats while frames are being skipped
                s_vector, r_frame, terminated, truncated, info = self.env.step(action_t)
                s_img = self.env.render()
                s_frame = self.onInputImage(s_img) if self.useImageInput else s_vector

                if frameWaitTime > 0:
                    cv2.imshow('Game', s_img)
                    key = cv2.waitKey(frameWaitTime)
                    # waitKey returns -1 when no key was pressed; only real
                    # key presses are forwarded to the callback.
                    if key != -1 and self.onKeyPressed is not None:
                        self.onKeyPressed(key)

                state_t1_frames.append(s_frame)
                reward_t += r_frame
                # The episode score is the sum of per-frame rewards; adding the
                # running frame-group reward would count frames several times.
                score += r_frame
                gameEnd = terminated or truncated

                # pad with the last frame so a terminal state is still complete
                if gameEnd and len(state_t1_frames) < frameSkipping:
                    for _ in range(frameSkipping - len(state_t1_frames)):
                        state_t1_frames.append(s_frame)

                # a full frame group forms the next state: store and train
                if len(state_t1_frames) == frameSkipping:
                    state_t1 = self.stackFramesToChannel(state_t1_frames)
                    self.memory.push([state_t, action_t, reward_t, state_t1, 0 if terminated else 1])
                    batch = self.memory.sampleBatch(self.batchSize)
                    if batch is not None:
                        self.agent.train(batch, step)

                    state_t = state_t1
                    state_t1_frames = []
                    reward_t = 0

                if self.onStepEnd is not None:
                    self.onStepEnd(episode, step, score, terminated)
                step += 1
                self.lastStep = step

            # do something when one episode ends
            if self.onEpisodeEnd is not None:
                self.onEpisodeEnd(episode, step, score)
            episode += 1

        self.env.reset()
        if frameWaitTime > 0:
            cv2.waitKey(frameWaitTime)
            cv2.destroyAllWindows()

    def save(self, filePath, extra_data=None):
        """Save replay memory, policy model and bookkeeping data under the prefix `filePath`.

        Writes `<filePath>_mem.h5` (memory), `<filePath>_mem_old.h5` (copy of
        the previous memory file) and `<filePath>.h5` (model plus custom data).

        Args:
            filePath: path prefix; must not be None.
            extra_data: optional data stored as `custom_data/extra_data`.

        Raises:
            ValueError: if `filePath` is None.
        """
        import shutil

        if filePath is None:
            raise ValueError('[DeepQLearning] save(filePath) filePath is None')
        if extra_data is None:
            extra_data = []

        mem_path_old = f'{filePath}_mem_old.h5'
        mem_path = f'{filePath}_mem.h5'
        h5_path = f'{filePath}.h5'

        # dirname is '' for a bare file name; makedirs('') would raise.
        mem_dir = os.path.dirname(mem_path)
        if mem_dir:
            os.makedirs(mem_dir, exist_ok=True)

        # Keep a backup by copying, not renaming: the memory save is
        # incremental and only rewrites modified chunks, so it needs the
        # existing file to still contain the unmodified ones.
        if os.path.exists(mem_path):
            shutil.copy2(mem_path, mem_path_old)

        self.memory.save(mem_path)

        # Save the model as an HDF5 file
        self.agent.dqn_policy.model.save(h5_path)
        # Add custom data to the same HDF5 file
        with h5py.File(h5_path, 'a') as f:
            f.create_dataset('custom_data/agent', data=self.agent.toData())
            f.create_dataset('custom_data/lastStep', data=self.lastStep)
            f.create_dataset('custom_data/extra_data', data=extra_data)

    def load(self, filePath):
        """Load memory, model weights and bookkeeping data saved by `save`.

        Returns:
            The stored `extra_data`, or None when the model file is missing.

        Raises:
            ValueError: if `filePath` is None.
        """
        if filePath is None:
            raise ValueError('[DeepQLearning] load(filePath) filePath is None')

        mem_path = f'{filePath}_mem.h5'
        h5_path = f'{filePath}.h5'

        if not os.path.exists(mem_path):
            print(f'[DeepQLearning] fail to load mem file, file="{mem_path}" does not exist')
        else:
            self.memory.load(mem_path)

        if not os.path.exists(h5_path):
            print(f'[DeepQLearning] fail to load h5 file, file="{h5_path}" does not exist')
            return None

        # Load the weights, then rebuild the target network from the policy.
        self.agent.dqn_policy.model.load_weights(h5_path)
        self.agent.dqn_target = self.agent.dqn_policy.clone()
        # Read the custom data back from the same HDF5 file
        with h5py.File(h5_path, 'r') as f:
            self.agent.fromData(f['custom_data/agent'][()])
            self.lastStep = int(f['custom_data/lastStep'][()])
            return f['custom_data/extra_data'][()]
        
        

## Test Classes

In [None]:
# Smoke test: run the loop with the do-nothing base agent, then print the
# environment's input/output shapes.
env = gym.make('CartPole-v1', render_mode="rgb_array")
agent = AgentBase()
b = DeepQLearning(env, agent)
b.play(frameSkipping=4, steps=2)

getEnvInputOutputShape(env)

# import cv2
# env.reset()
# img = env.render()
# img = img[170:-80,:,0] 
# print(img.shape)
# img = cv2.resize(img, (120, 80), interpolation=cv2.INTER_CUBIC) 
# print(img.dtype)

# cv2.imshow('Image', img)
# cv2.waitKey(1)
# cv2.destroyAllWindows()

# <font color='purple'>**Deep Q-Learning CartPole**</font>

## Test CartPole

In [None]:
# Random-policy baseline: play a few CartPole episodes and print each score.
env = gym.make('CartPole-v1', render_mode="rgb_array")
state_shape = env.observation_space.shape
action_count = env.action_space.n
print(f'state_shape:{state_shape}, action_count:{action_count}')

episodes = 5
for e in range(1, episodes+1):
    state = env.reset()
    terminated = truncated = False
    score = 0
    # An episode ends when the environment reports termination or truncation.
    while not (terminated or truncated):
        env.render()
        action = random.choice([0, 1])
        n_state, reward, terminated, truncated, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(e, score))
env.close()

## <font color='green'>DQN_DenseResnet</font> Class

In [None]:
class DQN_DenseResnet(DQNBase):
    """Fully connected residual Q-network with separate value and advantage heads."""

    def hiddenLayers(self, inputs):
        """Flatten the input, apply a 128-unit stem and two dense residual blocks,
        then combine the value and mean-centred advantage heads into Q values."""
        fx = layers.Flatten()(inputs)
        fx = relu(bn(layers.Dense(128)(fx)))
        for _ in range(2):
            fx = self.dense_res(fx, 128)

        advantage = layers.Dense(self.outputShape, activation='linear')(fx)
        value = layers.Dense(1, activation='linear')(fx)
        centred_advantage = advantage - tf.reduce_mean(advantage, axis=1, keepdims=True)
        return value + centred_advantage

    def dense_res(self, x, size):
        """Dense residual block: Dense-BN-ReLU-Dense-BN, add the input, then ReLU.

        The input is added unchanged, so `x` must already have `size` features.
        """
        branch = relu(bn(layers.Dense(size)(x)))
        branch = bn(layers.Dense(size)(branch))
        merged = layers.Add()([branch, x])
        return relu(merged)

In [None]:
#================== Data ==================
# releaseMemory()
env = gym.make('CartPole-v1', render_mode="rgb_array")

# Network input: `inputFrameCount` stacked observation vectors.
inputFrameCount = 4
input_count = env.observation_space.shape[0]
action_count = env.action_space.n
inputShape = (inputFrameCount, input_count)
outputShape = 2

# Hyper-parameters.
lr = 0.0006
decayGamma = 0.95
exploreRate = [0.01, 1, 0.9996]  # [min, max, decay]
steps = 80000
syncRate = 20
batchSize = 32
memSize = 1000

loss = tf.keras.losses.Huber(delta=0.005)
dqn_dense = DQN_DenseResnet(inputShape, outputShape, lr, loss)
dqn_dense.summary()
agent = DQNAgent(dqn_dense, decayGamma, exploreRate, syncRate)
dq_rl = DeepQLearning(env, agent, memSize, batchSize)

In [None]:
#============ Training ==============
maxScore = 0
next_print_time = 0
plot = Plot(size=[0,10000, 0, 500], xTitle='Steps', yTitle='Scores', winTitle='CartPole')
filePath = '../Data/DenseResnet/data'

# Resume from a previous run when saved data exists.
data = dq_rl.load(filePath)
if data is not None:
    [plotJson] = data
    plot.fromJson(plotJson)

def onStepEnd(episode, step, score, terminated):
    """Apply the step-based learning-rate schedule and checkpoint every 1000 steps."""
    if step < 2000:
        scheduled_lr = 0.000001
    elif step < 20000:
        scheduled_lr = 0.0006
    elif step < 30000:
        scheduled_lr = 0.0001
    else:
        scheduled_lr = 0.00005
    agent.dqn_policy.setLearningRate(scheduled_lr)

    if step > 1000 and step % 1000 == 0:
        dq_rl.save(filePath, [plot.toJson()])

def onEpisodeEnd(episode, step, score):
    """Record the episode score and refresh the plot at most every two seconds."""
    global maxScore, next_print_time, plot
    maxScore = max(score, maxScore)
    plot.add(x=step, y=score)
    if time.time() > next_print_time:
        next_print_time = time.time() + 2
        plot.show(f'[step={step}] a={dq_rl.memory.a} score={score} max={maxScore} lr={agent.dqn_policy.lr} loss={agent.dqn_policy.loss} explor={agent.exploreRate}')

dq_rl.onStepEnd = onStepEnd
dq_rl.onEpisodeEnd = onEpisodeEnd
dq_rl.play(frameSkipping=inputFrameCount, steps=steps, fromLastStep=True, frameWaitTime=0)

In [None]:
#============ Testing ==============
# Switch the agent to evaluation mode.
agent.isTraining = False
plot = Plot(size=[0,0, 0, 1300], xTitle='Steps', yTitle='Scores', winTitle='CartPole')

def onEpisodeEnd(episode, step, score):
    """Track the best score and redraw the plot after every episode."""
    global maxScore
    maxScore = max(score, maxScore)
    plot.add(x=step, y=score)
    plot.show(f'[step={step}] score={score} max={maxScore}')

dq_rl.onStepEnd = None
dq_rl.onEpisodeEnd = onEpisodeEnd
dq_rl.play(frameSkipping=inputFrameCount, steps=10000, frameWaitTime=1)

## <font color='green'>DQN_ConvResnet</font> Class

In [None]:
class DQN_ConvResnet(DQNBase):
    """Convolutional residual Q-network for image input.

    A strided stem convolution is followed by a stack of bottleneck residual
    blocks, a dense residual trunk, and separate value and advantage heads
    that are combined into Q values.
    """

    def hiddenLayers(self, inputs):
        """Build the network body and return the Q-value tensor.

        The shape comments below give the spatial size for a (32, 128) input.
        """
        fx = conv(inputs, 64, kernel_size=5, strides=2) # (32, 128) -> (16, 64)
        fx = bn(fx)
        fx = relu(fx)
        fx = self.bottleneck(fx, neck_num=16, out_num=64, kernel_size=3)
        fx = self.bottleneck(fx, neck_num=16, out_num=64, kernel_size=3)
        fx = self.bottleneck(fx, neck_num=32, out_num=128, kernel_size=3, poolStride=2) # (16, 64) -> (8, 32)
        fx = self.bottleneck(fx, neck_num=32, out_num=128, kernel_size=3)
        fx = self.bottleneck(fx, neck_num=32, out_num=128, kernel_size=3)
        fx = self.bottleneck(fx, neck_num=64, out_num=256, kernel_size=3, poolStride=2) # (8, 32) -> (4, 16)
        fx = self.bottleneck(fx, neck_num=64, out_num=256, kernel_size=2)
        fx = self.bottleneck(fx, neck_num=64, out_num=256, kernel_size=2)
        fx = self.bottleneck(fx, neck_num=128, out_num=512, kernel_size=2, poolStride=2) # (4, 16) -> (2, 8)
        fx = self.bottleneck(fx, neck_num=128, out_num=512, kernel_size=1)
        fx = self.bottleneck(fx, neck_num=128, out_num=512, kernel_size=1)
        fx = self.bottleneck(fx, neck_num=256, out_num=1024, kernel_size=2, poolStride=2) # (2, 8) -> (1, 4)
        fx = layers.Flatten()(fx)
        
        # Dense trunk: 128-unit stem followed by two dense residual blocks.
        fx = layers.Dense(128)(fx)
        fx = bn(fx)
        fx = relu(fx)
        fx = self.dense_res(fx, 128)
        fx = self.dense_res(fx, 128)

        # Q = value + (advantage - mean(advantage)), one Q value per action.
        advantage = layers.Dense(self.outputShape, activation='linear')(fx)
        value = layers.Dense(1, activation='linear')(fx)
        q_values = value + (advantage - tf.reduce_mean(advantage, axis=1, keepdims=True))

        return q_values

    def residual_block(self, x, num, kernel_size=3, poolStride=1):
        """Two-convolution residual block with `num` output channels.

        The shortcut is average-pooled when `poolStride != 1` and projected
        with a 1x1 convolution plus batch norm when its channel count differs
        from `num`. Not called by `hiddenLayers` in this class.
        """
        shortcut = x
        if poolStride != 1:
            shortcut = layers.AveragePooling2D(pool_size=poolStride, strides=poolStride, padding='same')(shortcut)
        if num != shortcut.shape[-1]:
            shortcut = conv(shortcut, num, kernel_size=1)
            shortcut = bn(shortcut)
        
        fx = conv(x, num, kernel_size=kernel_size, strides=poolStride)
        fx = bn_relu(fx)
        fx = conv(fx, num, kernel_size=kernel_size)
        fx = bn(fx)
        fx = layers.Add()([fx, shortcut]) # skip
        fx = relu(fx)
        return fx
        
    def bottleneck(self, x, neck_num, out_num, kernel_size=3, poolStride=1):
        """Bottleneck residual block: 1x1 reduce -> k x k conv -> 1x1 expand.

        Args:
            x: input tensor; its last axis is the channel axis.
            neck_num: channels of the two inner convolutions.
            out_num: channels of the block output.
            kernel_size: kernel size of the middle convolution.
            poolStride: stride of the middle convolution; the shortcut is
                average-pooled by the same factor.
        """
        shortcut = x
        if poolStride != 1:
            shortcut = layers.AveragePooling2D(pool_size=poolStride, strides=poolStride, padding='same')(shortcut)
        # Project the shortcut when the channel count changes.
        if out_num != shortcut.shape[-1]:
            shortcut = conv(shortcut, out_num, kernel_size=1)
            shortcut = bn(shortcut)
        
        fx = conv(x, neck_num, kernel_size=1)
        fx = bn_relu(fx)
        fx = conv(fx, neck_num, kernel_size=kernel_size, strides=poolStride)
        fx = bn_relu(fx)
        fx = conv(fx, out_num, kernel_size=1)
        fx = bn(fx)
        fx = layers.Add()([fx, shortcut]) # skip
        fx = relu(fx)
        return fx
        
    def dense_res(self, x, num):
        """Dense residual block with `num` units.

        The shortcut is projected with Dense + batch norm when the input
        feature count differs from `num`.
        """
        input_dim = x.shape[-1]
        shortcut = x
        
        if input_dim != num:
            shortcut = layers.Dense(num)(shortcut)
            shortcut = bn(shortcut)
            
        fx = layers.Dense(num)(x)
        fx = bn_relu(fx)
        fx = layers.Dense(num)(fx)
        fx = bn(fx)
        fx = layers.Add()([fx, shortcut])
        fx = relu(fx)
        return fx

In [None]:
#============ Training ==============
# releaseMemory()
env = gym.make('CartPole-v1', render_mode="rgb_array")

# Network input: `inputFrameCount` resized frames stacked on the channel axis.
inputFrameCount = 4
action_count = env.action_space.n
imWidth = 128
imHeight = 32
inputShape = (imHeight, imWidth, inputFrameCount)
outputShape = 2

# Hyper-parameters.
lr = 0.0006
decayGamma = 0.95
exploreRate = [0.02, 1, 0.9999]  # [min, max, decay]
steps = 500000
syncRate = 20
batchSize = 32
memSize = 20000

loss = tf.keras.losses.Huber(delta=1.0)
dqn_dense = DQN_ConvResnet(inputShape, outputShape, lr, loss)
dqn_dense.summary()
agent = DQNAgent(dqn_dense, decayGamma, exploreRate, syncRate)
dq_rl = DeepQLearning(env, agent, memSize, batchSize, useImageInput=True)

In [None]:
maxScore = 0
next_print_time = 0
plot = Plot(size=[0,10000, 0, 300], xTitle='Steps', yTitle='Scores', winTitle='CartPole')
filePath = 'CartPole/DQN_ConvResnet_data'
data = dq_rl.load(filePath)
if data is not None:
    [plotJson] = data
    plot.fromJson(plotJson)

def onInputImage(img):
    """Crop a rendered frame and scale it to the network's input resolution.

    Keeps rows 170 up to 80 before the bottom and only channel 0 of `img`,
    then resizes the crop to imWidth x imHeight with bicubic interpolation.
    """
    cropped = img[170:-80, :, 0]
    targetSize = (imWidth, imHeight)  # cv2.resize expects (width, height)
    return cv2.resize(cropped, targetSize, interpolation=cv2.INTER_CUBIC)

def onStepEnd(episode, step, score, terminated):
    """Step hook: apply the step-based learning-rate schedule and checkpoint.

    The learning rate of the policy network is set on every call according to
    `step`; progress is saved once `step` exceeds 1000 and is a multiple of
    3000 or of 100000.

    NOTE(review): because the rate is set on every call, a rate chosen through
    onKeyPressed is replaced the next time this hook runs.
    """
    # (exclusive upper bound on step, learning rate) in ascending order.
    schedule = (
        (2000, 0.00000001),
        (10000, 0.00001),
        (50000, 0.000005),
    )
    scheduledRate = 0.000003  # used once step has passed every bound
    for upperBound, rate in schedule:
        if step < upperBound:
            scheduledRate = rate
            break
    agent.dqn_policy.setLearningRate(scheduledRate)

    if step <= 1000:
        return
    if step % 3000 == 0 or step % 100000 == 0:
        dq_rl.save(filePath, [plot.toJson()])

def onEpisodeEnd(episode, step, score):
    """Episode hook: track the best recent score and refresh the plot.

    Every 10th episode the best score seen since the previous point is added
    to the plot and the tracker is reset. The plot itself is redrawn at most
    once every 6 seconds.
    """
    global maxScore, next_print_time, plot
    maxScore = max(score, maxScore)

    isPlotEpisode = episode % 10 == 0
    if isPlotEpisode:
        plot.add(x=step, y=maxScore)
        maxScore = 0

    # Throttle redrawing: nothing more to do until the deadline has passed.
    if time.time() <= next_print_time:
        return
    next_print_time = time.time() + 6
    plot.show(f'[step={step}] score={score} lr={agent.dqn_policy.lr} loss={agent.dqn_policy.loss} explor={agent.exploreRate}')

def onKeyPressed(key):
    """Key hook: 'w' multiplies the global learning rate by 10, 's' by 0.1.

    The (possibly unchanged) rate is pushed to the policy network on every
    key press, whichever key it was.
    """
    global lr
    raiseRate = key == ord('w')
    lowerRate = key == ord('s')
    if raiseRate:
        lr *= 10
    elif lowerRate:
        lr *= 0.1
    agent.dqn_policy.setLearningRate(lr)

# Register the callbacks defined in this cell on the trainer, then start training.
dq_rl.onInputImage = onInputImage
dq_rl.onStepEnd = onStepEnd
dq_rl.onEpisodeEnd = onEpisodeEnd
dq_rl.onKeyPressed = onKeyPressed
# NOTE(review): fromLastStep=True presumably resumes from the step restored by
# dq_rl.load -- confirm in DeepQLearning.play.
dq_rl.play(frameSkipping=inputFrameCount, steps=steps, fromLastStep=True, frameWaitTime=0)

In [None]:
# Evaluation run: switch the agent out of training mode and reset the statistics.
agent.isTraining = False
maxScore = 0
sumScore = 0
evalEpisodes = 0  # episodes finished during this evaluation run

def onEpisodeEnd(episode, step, score):
    """Episode hook for evaluation: print score, best score and running mean.

    The mean is taken over the episodes counted by this hook rather than over
    the `episode` index handed in by the trainer, so it stays correct (and
    cannot divide by zero) whether that index starts at 0, at 1, or continues
    from an earlier run.
    """
    global maxScore, sumScore, evalEpisodes
    maxScore = max(score, maxScore)
    sumScore += score
    evalEpisodes += 1
    print(f'[step={step}] score={score} max={maxScore} mean={sumScore/evalEpisodes}')

# Remove the training step hook (learning-rate schedule and checkpointing) so
# that the evaluation run only reports scores through onEpisodeEnd.
dq_rl.onStepEnd = None
dq_rl.onEpisodeEnd = onEpisodeEnd
dq_rl.play(frameSkipping=inputFrameCount, steps=5000, frameWaitTime=10)

In [None]:
import math
import h5py
import json

class TemporalMemory:
    """Fixed-capacity ring buffer of samples with chunk-wise HDF5 persistence.

    Samples are kept in a numpy object array of length `maxlen`; once the
    buffer is full, new samples overwrite the oldest ones. For persistence the
    buffer is split into chunks of `chunkSize` slots, and `save` only rewrites
    the chunks that were modified since the previous save.

    Samples have to be equally shaped array-likes, because `sampleBatch`
    stacks them with `np.stack`.
    """

    def __init__(self, maxlen, chunkSize=100):
        """Create an empty buffer.

        Args:
            maxlen: Capacity of the buffer in samples.
            chunkSize: Number of consecutive slots stored together on disk.
        """
        self.chunkSize = chunkSize
        self.maxlen = maxlen
        self.array = np.empty(maxlen, dtype=object)  # one slot per sample
        self.tail = 0  # index of the slot the next push writes to
        self.size = 0  # number of filled slots, never more than maxlen

        chunkCount = math.ceil(self.maxlen / self.chunkSize)
        # savedChunks[i] is False while chunk i holds data not yet written by save().
        self.savedChunks = [True] * chunkCount
        self.cache_batch = None  # buffer reused by sampleBatch between calls

    def push(self, value):
        """Store `value` in the next slot, overwriting the oldest sample when full."""
        if self.size < self.maxlen:
            self.size += 1

        # The chunk that contains the written slot has to be saved again.
        self.savedChunks[self.tail // self.chunkSize] = False

        self.array[self.tail] = value
        self.tail = (self.tail + 1) % self.maxlen

    def sampleBatch(self, batchSize):
        """Return `batchSize` distinct samples stacked along a new first axis.

        Args:
            batchSize: Number of samples to draw without replacement.

        Returns:
            An array of shape (batchSize, *sampleShape), or None when
            `batchSize` is smaller than 1 or larger than the number of stored
            samples. The returned array is an internal buffer that is
            overwritten by the next call with the same batch shape; copy it
            if it has to outlive that call.
        """
        # While the buffer is not full, only the slots before `tail` are filled.
        available = self.maxlen if self.size == self.maxlen else self.tail
        if batchSize < 1 or batchSize > available:
            return None

        picked = np.random.choice(available, batchSize, replace=False)
        samples = [np.asarray(self.array[i]) for i in picked]

        # Reuse the cached buffer only if it still fits; handing np.stack an
        # `out` array of another shape (e.g. after the batch size changed)
        # raises ValueError.
        expectedShape = (len(samples),) + samples[0].shape
        if self.cache_batch is None or self.cache_batch.shape != expectedShape:
            self.cache_batch = np.stack(samples, axis=0)
        else:
            np.stack(samples, axis=0, out=self.cache_batch)

        return self.cache_batch

    def __len__(self):
        """Return the number of stored samples."""
        return self.size

    @staticmethod
    def _jsonDefault(obj):
        """Convert numpy arrays and scalars into JSON-serializable Python values."""
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.generic):
            return obj.item()
        raise TypeError(f'Object of type {type(obj).__name__} is not JSON serializable')

    def save(self, filePath):
        """Write metadata and all modified chunks to `{filePath}.h5`.

        The file is created if it does not exist. Each chunk is stored as one
        JSON string in the `array_chunks` dataset.

        Raises:
            ValueError: If the file was written with a different chunkSize or
                maxlen than this instance uses.
        """
        h5_path = f'{filePath}.h5'
        metadata = [self.chunkSize, self.maxlen, self.tail, self.size]

        with h5py.File(h5_path, 'a') as f:
            if 'metadata' in f:
                # The stored metadata has four entries; only the first two
                # describe the layout that has to match.
                chunkSize, maxlen = f['metadata'][()][:2]
                if chunkSize != self.chunkSize:
                    raise ValueError(f'[TemporalMemory] currrent chunkSize = {self.chunkSize} and file chunkSize = {chunkSize} are different')
                if maxlen != self.maxlen:
                    raise ValueError(f'[TemporalMemory] currrent maxlen = {self.maxlen} and file maxlen = {maxlen} are different')
                f['metadata'][...] = metadata
            else:
                f['metadata'] = metadata

            if 'array_chunks' in f:
                chunks = f['array_chunks']
            else:
                chunks = f.create_dataset(
                    'array_chunks',
                    shape=(len(self.savedChunks),),
                    dtype=h5py.string_dtype(encoding='utf-8'),
                    chunks=(1,),
                )

            for i, isSaved in enumerate(self.savedChunks):
                if isSaved:
                    continue
                start = i * self.chunkSize
                end = min(start + self.chunkSize, self.maxlen)
                # tolist() on an object array keeps numpy elements as they are,
                # hence the explicit converter.
                chunks[i] = json.dumps(self.array[start:end].tolist(), default=self._jsonDefault)
                # Mark the chunk as saved only after the write went through.
                self.savedChunks[i] = True

    def load(self, filePath):
        """Restore the buffer from `{filePath}.h5`.

        chunkSize, maxlen, tail and size are replaced by the values stored in
        the file. If the file does not exist a message is printed and the
        instance is left unchanged. Samples come back as the (nested) lists
        produced by JSON decoding.
        """
        h5_path = f'{filePath}.h5'
        if not os.path.exists(h5_path):
            print(f'[TemporalMemory] fail to load filePath="{filePath}", file does not exist')
            return

        with h5py.File(h5_path, 'r') as f:
            # Convert to plain ints so no numpy scalars end up in the attributes.
            chunkSize, maxlen, tail, size = (int(v) for v in f['metadata'][()])
            self.chunkSize = chunkSize
            self.maxlen = maxlen
            self.tail = tail
            self.size = size

            if len(self.array) != self.maxlen:
                self.array = np.empty(self.maxlen, dtype=object)

            chunks = f['array_chunks']
            self.savedChunks = [True] * len(chunks)
            for i in range(len(chunks)):
                raw = chunks[i]
                # Depending on the h5py version strings are read as bytes or str.
                jsonStr = raw.decode('utf-8') if isinstance(raw, bytes) else raw
                if jsonStr == '':
                    continue  # chunk was never written
                start = i * self.chunkSize
                end = min(start + self.chunkSize, self.maxlen)
                # Assign slot by slot: assigning a list of equally long lists
                # to an object-array slice is treated as a 2-D array and fails
                # to broadcast.
                for slot, item in zip(range(start, end), json.loads(jsonStr)):
                    self.array[slot] = item


# Manual smoke test: load a buffer from 'test.h5' (if that file exists, load()
# replaces the maxlen/chunkSize given here by the stored values) and print a
# batch of 5 samples; sampleBatch returns None when fewer than 5 are stored.
mem1 = TemporalMemory(19,90)
mem1.load('test')
print(mem1.sampleBatch(5))