In [2]:
import pygame,random,gymnasium as gym,numpy as np,matplotlib.pyplot as plt
import tensorflow as tf,os,warnings
from tensorflow.keras import optimizers, losses
from tensorflow.keras import Model
from collections import deque
from tensorflow.python.framework import random_seed
from IPython.display import clear_output
warnings.filterwarnings("ignore", message="Model's `__init__()` arguments contain non-serializable objects.")

seed = 1
np.random.seed(seed)
np.random.default_rng(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
random_seed.set_seed(seed)
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

In [3]:
class Network(Model):
    def __init__(self, state_size: int, action_size: int, ):
        """
        Initialization.
        :param state_size: The size of the state space.
        :param action_size: The size of the action space.
        :param hidden_size: The size of the hidden layers.
        """
        super(Network, self).__init__()
        
        self.num_action = action_size
        self.layer1 = tf.keras.layers.Dense(hidden_size, activation='relu')# Define the first hidden layer with ReLU activation
        self.layer2 = tf.keras.layers.Dense(hidden_size, activation='relu')# Define the second hidden layer with ReLU activation
        self.state = tf.keras.layers.Dense(self.num_action)# Define the output layer for state values
        self.action = tf.keras.layers.Dense(self.num_action)# Define the output layer for action values

    def call(self, state):
        """
        Forward pass of the network.
        :param state: Input state.
        :return: Value function Q(s, a).
        """
        layer1 = self.layer1(state) # Pass the input state through the first hidden layer      
        layer2 = self.layer2(layer1)  # Pass the result through the second hidden layer
        state = self.state(layer2) # Compute the state values       
        action = self.action(layer2) # Compute the action values        
        mean = tf.keras.backend.mean(action, keepdims=True)# Calculate the mean of the action values 
        advantage = (action - mean)# Calculate the advantage by subtracting the mean action value      
        value = state + advantage # Compute the final Q-values by adding state values and advantages 

        return value



In [4]:




loss_history=[]
class DQNAgent:
    def __init__(
        self, 
        env: gym.Env,
        batch_size: int,
        target_update: int,
    ):
        """Initialization.
        
        Args:
            env (gym.Env): openAI Gym environment
            memory_size (int): length of memory
            batch_size (int): batch size for sampling
            target_update (int): period for target model's hard update
            epsilon_decay (float): step size to decrease epsilon
            lr (float): learning rate
            max_epsilon (float): max value of epsilon
            min_epsilon (float): min value of epsilon
            gamma (float): discount factor
        """
        
        # CREATING THE Q-Network
        self.env = env
        self.action_space = env.action_space
        self.action_space.seed(seed)
        self.state_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n
        
        self.batch_size = batch_size
        # hyper parameters
        memory_size = 100000
        self.lr = 0.001
        self.target_update = target_update
        self.gamma = 0.99
        
        self.dqn = Network(self.state_size, self.action_size)
        self.dqn_target = Network(self.state_size, self.action_size)
        self.train_start = 1000

        self.optimizers = optimizers.Adam(learning_rate=self.lr, )
        
        self.memory = deque(maxlen=memory_size)
        self.Soft_Update = False # use soft parameter update

        self.TAU = 0.1 # target network soft update hyperparameter
        
        self._target_hard_update()
        
    # EXPLORATION VS EXPLOITATION
    def get_action(self, state, epsilon):
        q_value = self.dqn(tf.convert_to_tensor([state], dtype=tf.float32))[0]
        # Choose an action a in the current world state (s)
        # If this number < greater than epsilon doing a random choice --> exploration
        if np.random.rand() <= epsilon:
            action = np.random.choice(self.action_size)

        ## Else --> exploitation (taking the biggest Q value for this state)
        else:
            action = np.argmax(q_value) 

        return action
    
    def append_sample(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    # UPDATING THE Q-VALUE
    def train_step(self):
        mini_batch = random.sample(self.memory, self.batch_size)

        states      = [i[0] for i in mini_batch]
        actions     = [i[1] for i in mini_batch]
        rewards     = [i[2] for i in mini_batch]
        next_states = [i[3] for i in mini_batch]
        dones       = [i[4] for i in mini_batch]
        
        dqn_variable = self.dqn.trainable_variables
        with tf.GradientTape() as tape:
            tape.watch(dqn_variable)
            
            states      = tf.convert_to_tensor(np.vstack(states), dtype=tf.float32)
            actions     = tf.convert_to_tensor(actions, dtype=tf.int32)
            rewards     = tf.convert_to_tensor(rewards, dtype=tf.float32)
            next_states = tf.convert_to_tensor(np.vstack(next_states), dtype=tf.float32)
            dones       = tf.convert_to_tensor(dones, dtype=tf.float32)
            
            next_Qs = self.dqn(next_states)
            next_Qs = tf.stop_gradient(next_Qs)
            next_Q_targs = self.dqn_target(next_states)
            next_action = tf.argmax(next_Qs, axis=1)
            target_value = tf.reduce_sum(tf.one_hot(next_action, self.action_size) * next_Q_targs, axis=1)
            
            mask = 1 - dones
            target_value = rewards + self.gamma * target_value * mask 
            
            curr_Qs = self.dqn(states)
            
            main_value = tf.reduce_sum(tf.one_hot(actions, self.action_size) * curr_Qs, axis=1)
            error = tf.square(main_value - target_value) * 0.5
            loss  = tf.reduce_mean(error)
            loss_history.append(loss)
            
        dqn_grads = tape.gradient(loss, dqn_variable)
        self.optimizers.apply_gradients(zip(dqn_grads, dqn_variable))
        
    # after some time interval update the target model to be same with model
    def _target_hard_update(self):
        if not self.Soft_Update:
            self.dqn_target.set_weights(self.dqn.get_weights())
            return
        if self.Soft_Update:
            q_model_theta = self.dqn.get_weights()
            dqn_target_theta = self.dqn_target.get_weights()
            counter = 0
            for q_weight, target_weight in zip(q_model_theta, dqn_target_theta):
                target_weight = target_weight * (1-self.TAU) + q_weight * self.TAU
                dqn_target_theta[counter] = target_weight
                counter += 1
            self.dqn_target.set_weights(dqn_target_theta)
    
    def update_Gamma(self):
        self.gamma = 1 - 0.985 * (1 - self.gamma)
    def load(self, phat):
        
        self.dqn = tf.keras.models.load_model(phat, custom_objects={'Network': Network})
    def save(self, phat):
        self.dqn.save(phat)

Train=True


env = gym.make("LunarLander-v2",render_mode="human" if not Train else None)

# parameters
target_update = 20


# INITIALIZING THE Q-PARAMETERS
hidden_size = 86
max_episodes = 300  # Set total number of episodes to train agent on.
batch_size =128#128

# Exploration parameters
epsilon = 1.0                 # Exploration rate
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.01            # Minimum exploration probability 
decay_rate = 0.025            # Exponential decay rate for exploration prob

# train
agent = DQNAgent(
    env, 
#     memory_size, 
    batch_size, 
    target_update, 
#     epsilon_decay,
)
save_path='./weights-and-plot/final_weights'
load_path='./weights-and-plot/final_weights' + '_' + '300'
def plot_training(episode):
        agent.save(save_path + '_' + f'{episode}')
        print('\n~~~~~~Interval Save: Model saved.\n')
        sma_reward = np.convolve(reward_history, np.ones(50)/50, mode='valid')
        max_reward=np.max(reward_history)
        min_reward=np.min(reward_history)
        #normalized_loss = np.interp(loss_history, (np.min(loss_history), np.max(loss_history)), (min_reward/2,max_reward))
        normalized_epsilon = np.interp(epsilon_history, (np.min(epsilon_history), np.max(epsilon_history)), (min_reward/4,max_reward))
        plt.plot(loss_history, label='Loss', color='#CB291A', alpha=0.8)
        
        plt.title("Training Progress")
        plt.xlabel("Episode")
        plt.ylabel("loss")

        plt.xlim(0, len(loss_history))
        
        plt.legend()
        plt.grid(True)

        if episode == max_episodes:
            plt.savefig('./weights-and-plot/loss-of-training_progress.png', format='png', dpi=600, bbox_inches='tight')
            
        plt.tight_layout()
        plt.show()
        plt.clf()
        plt.close()

        plt.figure(figsize=(10, 6))
        
        #Plot Rewards,SMA 50 Reward ,Normalized Loss and Normalized Epsilon
        plt.plot(reward_history, label='Raw Reward', color='#F6CE3B', alpha=0.8)

        plt.plot(sma_reward, label='SMA 50 Reward', color='#385DAA')

        plt.plot(normalized_epsilon, label='Normalized Epsilon', color='green', alpha=0.8)
        
        #plt.plot(normalized_loss, label='Normalized Loss', color='#CB291A', alpha=0.8)
        
        plt.title("Training Progress")
        plt.xlabel("Episode")
        plt.ylabel("Reward")
        plt.legend()
        plt.grid(True)

        # Save as file if last episode
        if episode == max_episodes:
            plt.savefig('./weights-and-plot/training_progress.png', format='png', dpi=600, bbox_inches='tight')
            
        plt.tight_layout()
        plt.show()
        plt.clf()
        plt.close()
if __name__ == "__main__":
    
    save_intervalve=100
    if Train:

        update_cnt    = 0
        # TRAINING LOOP
        #List to contain all the rewards of all the episodes given to the agent
        scores = []
        reward_history=[]
        epsilon_history=[]
        # EACH EPISODE    
        for episode in range(1,max_episodes+1):
            ## Reset environment and get first new observation
            state = agent.env.reset(seed=1)
            state=state[0]
            episode_reward = 0
            done = False  # has the enviroment finished?
            
                
            # EACH TIME STEP    
            while not done :
            # for step in range(max_steps):  # step index, maximum step is 200
                update_cnt += 1
                # EXPLORATION VS EXPLOITATION
                # Take the action (a) and observe the outcome state(s') and reward (r)
                action = agent.get_action(state, epsilon)
            
                # TAKING ACTION
                next_state, reward, done, _ ,_= agent.env.step(action)
                if isinstance(state, tuple): 
                        next_state = next_state[0]
                agent.append_sample(state, action, reward, next_state, done)
                
                # Our new state is state
                state = next_state

                episode_reward += reward

                # if episode ends
                if done:
                    scores.append(episode_reward)
                    print("episode: {}/{}, score: {}, e: {:.4}".format(episode+1, max_episodes, episode_reward, epsilon,)) 
                    break
                # if training is ready
                if (update_cnt >= agent.batch_size):
                    # UPDATING THE Q-VALUE
                    agent.train_step()
                    #agent.update_Gamma()
                
                    # if hard update is needed
                    if update_cnt % agent.target_update == 0:
                        agent._target_hard_update()
            
            reward_history.append(episode_reward)   
            epsilon_history.append(epsilon)
            epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode) 
            
            if episode % save_intervalve==0:
                plot_training(episode)

    else:
        agent.load(load_path)
        scores = []
        for episode in range(5):
            state = agent.env.reset(seed=1)
            state=state[0]
            episode_reward = 0
            done = False  
            while not done:
                action = agent.get_action(state,0.01)
                next_state, reward, done, _ ,_= agent.env.step(action)
                if isinstance(state, tuple): 
                        next_state = next_state[0]
                agent.append_sample(state, action, reward, next_state, done)
                state = next_state
                episode_reward += reward
                if done:
                    scores.append(episode_reward)
                    print("episode: {}/{}, score: {}, e: {:.4}".format(episode+1, max_episodes, episode_reward, 0.01)) 
                    break
        pygame.quit()
        

I0000 00:00:1759402423.710708  132847 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1355 MB memory:  -> device: 0, name: NVIDIA GeForce MX550, pci bus id: 0000:02:00.0, compute capability: 7.5


ValueError: Passed in object <Variable path=network/dense/kernel, shape=(8, 86), dtype=float32, value=[[-0.12177525  0.18314123  0.22151998 -0.07921028 -0.23674747 -0.19689164
   0.02276471 -0.24431936  0.1498181  -0.11006273  0.07424611 -0.19026557
   0.06082076 -0.07555687  0.01466727 -0.21164581 -0.09841512  0.00384176
   0.23363367 -0.1216822  -0.1892071  -0.06211522  0.06369936  0.2110743
   0.10493085 -0.13438752  0.1942491  -0.11770347 -0.1626687  -0.15875685
  -0.21273488  0.121694    0.01320517 -0.21739553 -0.23257032 -0.08973484
  -0.04147279  0.03971833  0.17834142  0.10765892  0.1385093  -0.06631562
  -0.07206383  0.16575754 -0.0205761  -0.05700642 -0.01305674  0.12107947
   0.22262198  0.21571663 -0.0537004   0.15945354  0.10266799 -0.11028276
  -0.05725797  0.11838901 -0.10737008  0.06135362 -0.22396992 -0.11250931
   0.02572948  0.02296016 -0.24522705  0.1500127  -0.04462624 -0.24389549
   0.07436252 -0.00794445  0.07689652  0.24256265 -0.05014296  0.05073175
   0.16694927  0.09456477  0.0524115  -0.23819697 -0.12754256  0.12188542
  -0.05940963  0.10782391 -0.17253068 -0.23796898  0.11526176 -0.1365698
   0.19163585  0.11507943]
 [ 0.01822212 -0.22318065  0.00099176 -0.22465527  0.15304056  0.10136515
   0.07709149  0.19694379 -0.00486882 -0.03921349 -0.03510302 -0.14570498
  -0.13211393 -0.01091766 -0.03252387  0.18016016 -0.17245334 -0.06526229
  -0.22043121 -0.20465393  0.01399532 -0.00604944  0.0410406   0.00878593
  -0.24769393 -0.08068912  0.07811368  0.18974441  0.0124076   0.2507775
  -0.1746847   0.11611897 -0.01764391  0.1831049   0.04923359 -0.15712862
  -0.17947355 -0.0365871   0.23931056  0.12822008 -0.010648   -0.11843394
   0.16025129  0.01790839  0.01655179  0.05212885 -0.03550075 -0.1922599
  -0.24147817  0.07613629 -0.063704    0.09914505  0.00686643 -0.06111044
   0.00593007  0.153864   -0.188716    0.01101363 -0.15607005 -0.24164534
   0.06703982 -0.04366843 -0.01531623 -0.03549924 -0.21278721  0.1723324
   0.09118271  0.00533336 -0.25169954 -0.13520715 -0.04183915  0.16663086
   0.10494477  0.16904986  0.24174887  0.06605998  0.23237419 -0.2133244
  -0.04574867 -0.24796759  0.10016948  0.02668476  0.17573225 -0.05756287
   0.17998686 -0.06462897]
 [-0.09011993  0.15041056  0.1612156   0.10748297  0.01893699  0.12058911
   0.08031324 -0.04349881  0.03871828  0.05678698 -0.00542143  0.03606245
  -0.2506222  -0.02028301  0.0915637   0.10492837  0.22616231  0.14146158
   0.04563501 -0.03196993  0.06141883  0.10913816  0.24873999  0.17870915
  -0.2452992   0.24055207  0.04015955  0.17865735 -0.01312017  0.03088227
  -0.0476515   0.13845402  0.09029269 -0.22745526  0.18901145  0.02123696
  -0.05795652 -0.01110657  0.19185969 -0.22218111  0.04982516 -0.11247166
  -0.18545294 -0.06191374 -0.18326892  0.0010049   0.07328269 -0.22836415
  -0.07456781 -0.02088915  0.22077021 -0.12104732  0.23900273  0.10462406
   0.18514657 -0.22407106  0.03285787  0.12901303 -0.09699754  0.14210701
   0.12314785  0.15272745 -0.09270559 -0.1382229   0.01311848 -0.12568092
  -0.08169071  0.14782763  0.25171426 -0.18476306  0.19359994  0.08452302
   0.12625712 -0.20169789  0.12945604 -0.2351022  -0.00768429  0.00678611
   0.21791399  0.03492606 -0.16121776 -0.14565963 -0.00046134 -0.17883384
   0.07212567  0.19639981]
 [-0.01203685 -0.09778687 -0.05690035 -0.15934005  0.02140048 -0.10451522
   0.23858115  0.23774558  0.01818982 -0.10321474 -0.01851244 -0.12770224
  -0.21638326 -0.21071614  0.23707044  0.24401507  0.13946918  0.19581017
  -0.11360408  0.12433749  0.00502157  0.03370172  0.01523143 -0.18929613
   0.16460505 -0.09795566 -0.22703247  0.17616674 -0.21139848 -0.0278161
  -0.09902459 -0.10938032  0.08957234 -0.09796023  0.20604128 -0.0258605
  -0.10543188 -0.13307181  0.08816877 -0.23636612 -0.05831528 -0.23544873
  -0.0281271  -0.17967805 -0.22171043  0.24258295  0.09254742 -0.23957384
   0.20502281  0.18392932 -0.13955441 -0.1768653   0.05603126  0.01727739
  -0.08518682 -0.09404872 -0.03360653 -0.06674431  0.0067113   0.2428419
  -0.14591682 -0.1238644   0.2116467  -0.04706     0.2365776  -0.24047737
   0.21256337 -0.04035597 -0.21547486 -0.02412723  0.09534016  0.02004826
  -0.14234126 -0.20197381 -0.00653505 -0.03466831 -0.1046357  -0.06574392
  -0.04261076  0.12628654  0.0204719   0.13052896 -0.2188987   0.05146891
  -0.1395712   0.17425302]
 [-0.06583127  0.21566531  0.17511317 -0.21122874  0.17603904  0.11863405
  -0.06856373 -0.20278785  0.23767191 -0.02684668 -0.02425337 -0.16409822
   0.09366989  0.16449687 -0.17174053 -0.05869283 -0.2215767  -0.18173146
   0.0034633   0.17951289 -0.15691972  0.03115159 -0.11603723  0.1390971
   0.01732489  0.05326459  0.13490146 -0.08057159 -0.18274337 -0.13174735
   0.22411269  0.12380177 -0.01464991  0.05072832  0.05883324 -0.08673802
   0.21026397  0.23654309 -0.20982237 -0.2370478   0.16677526 -0.07829222
   0.08839744 -0.04742134 -0.11977497 -0.02884674 -0.01355647  0.05290282
   0.07490671  0.05633467  0.02699414 -0.16529347 -0.2514204  -0.05254039
  -0.03320025  0.12363046 -0.00567333 -0.04485297 -0.06404643 -0.12396812
   0.19796461 -0.24766417  0.2358849   0.2479848  -0.11032854 -0.22712655
  -0.02032788  0.12691978  0.1297279   0.19211823  0.09839532  0.13595894
   0.23673868 -0.11685789  0.18605226 -0.03901471 -0.04006179 -0.02561347
   0.2525182   0.2329145   0.07604665  0.20687422 -0.24770339  0.01424783
  -0.02234143 -0.07072744]
 [ 0.25216958 -0.11856972 -0.02559209 -0.17452478  0.05272362  0.13935924
   0.00906223 -0.24495359  0.11695787 -0.09889111  0.1687293  -0.05772069
   0.20845875 -0.19402467 -0.06265578 -0.02383153 -0.14467417 -0.2031295
  -0.18675336 -0.04087026 -0.03845398 -0.14817336 -0.15707749 -0.10193451
   0.15518343  0.04730588  0.25178853 -0.04947175  0.00225472  0.18784174
  -0.17377497  0.0352748  -0.13980721 -0.01965679 -0.07046789 -0.05265664
   0.00086528 -0.08944072 -0.18435201 -0.17708069 -0.16709986  0.04789153
   0.08292651 -0.1139555  -0.0311944  -0.0920659  -0.13314801 -0.13075069
   0.02702063  0.00688401 -0.00925192 -0.1692231   0.23685482  0.19631681
   0.17542091  0.24865791  0.10149804 -0.10721673 -0.13029851 -0.01119602
   0.03912848 -0.2396163   0.07526353  0.22064066 -0.19775939  0.06211644
  -0.21042702 -0.02652851  0.1490013  -0.1159087  -0.20678747 -0.18784434
  -0.22460227  0.22076279  0.2213881  -0.22813965  0.18593651 -0.22508428
   0.17617458 -0.2430116  -0.1448127  -0.17874637  0.10344869  0.0751574
   0.14603698  0.10991701]
 [-0.10486434  0.24944773 -0.15728475 -0.20532073 -0.12136777  0.06977728
   0.16237855 -0.20957793 -0.00915542  0.12084854  0.02681047  0.02206567
   0.15423906  0.05591002 -0.1365913  -0.19522166  0.20344716 -0.23187931
   0.12737834 -0.19136178  0.12236974  0.20175782  0.09709194 -0.19440964
   0.07987726 -0.00303358 -0.01322921  0.06884244 -0.21029763  0.0519011
   0.0862616   0.03578362  0.11208054  0.17353386  0.10800171  0.08482951
  -0.09520181 -0.19377169  0.06149024  0.01748219 -0.23629408  0.13304222
  -0.1385366   0.21721059 -0.10737328  0.1302737  -0.13235807 -0.23137966
  -0.17684259 -0.20701943  0.24706447  0.0193305  -0.24291377 -0.14673458
   0.05832365  0.22432601  0.04299495  0.21643302 -0.01073527 -0.08380841
   0.06327    -0.07684176  0.12953666 -0.10824825  0.13609678 -0.22365747
  -0.09214088 -0.02023061 -0.03089811  0.08401188 -0.25179836 -0.20610012
  -0.11481807 -0.19693428 -0.03579049  0.1711851   0.02820516  0.13704959
   0.0747405  -0.0552908   0.09051296 -0.21876474 -0.09940268  0.09842205
  -0.10783438 -0.13868725]
 [-0.02765925 -0.17101908  0.14404333 -0.08614945 -0.21330066 -0.22129974
  -0.14709154 -0.15094486 -0.08822367 -0.20184907 -0.03604136 -0.08796357
   0.2000747   0.13439435  0.03045899 -0.0673898   0.0461002   0.00997734
   0.22000575  0.24005935  0.10327691 -0.04651541 -0.24279794  0.14672476
   0.05103546 -0.22155285  0.01688078  0.15313008 -0.16663074  0.23720473
  -0.24934143 -0.17626227  0.2446706   0.14809728 -0.08985791  0.11552739
   0.11441141 -0.02961275 -0.04016207 -0.21154648 -0.24377368 -0.14061268
  -0.18494496  0.20537096 -0.114454    0.05548698  0.09304321 -0.18785259
  -0.03008391 -0.24100424 -0.10002136 -0.03176183 -0.144358    0.01995257
  -0.12738016  0.09064561  0.02157879 -0.14994633  0.14587212  0.09508669
  -0.01006834 -0.07879151  0.06032231 -0.13897988  0.0088681  -0.05166692
   0.13555735  0.14900273  0.22535309 -0.06129831 -0.12843114 -0.22831795
  -0.02537471  0.06602675 -0.14675705 -0.21264133  0.0955036  -0.12455404
   0.05705366  0.19863251 -0.03538276  0.16939586 -0.06838104  0.16421881
   0.09305635 -0.01924972]]> of type 'Variable', not tf.Tensor or tf.Variable or ExtensionType.