In [1]:
import os

import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

2023-06-21 17:32:55.782083: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-21 17:32:57.097785: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/arpan/.mujoco/mujoco210/bin:/usr/lib/nvidia
2023-06-21 17:32:57.097870: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/arpan/.mujoco/mujoco210/bin:/usr/lib/nvidia


In [2]:
class CriticNetwork(keras.Model):
    """Critic network producing one value per (state, action) row.

    The state and action tensors are concatenated along axis 1, passed through
    two ReLU hidden layers and reduced to a single linear output unit.

    Args:
        layer1_dim: Number of units in the first hidden layer.
        layer2_dim: Number of units in the second hidden layer.
        chkpt_dir: Directory part of the weight-file path.
        name: Prefix of the weight-file name (``<name>_DDPG.h5``). It is only
            used for the path and is not forwarded to ``keras.Model``.
    """

    def __init__(self,layer1_dim=512,layer2_dim=512,chkpt_dir='weightFiles/DDPG',name = "Critic"):
        super().__init__()

        # Layers are created in the same order in which they are applied.
        self.layer1 = Dense(layer1_dim, activation='relu')
        self.layer2 = Dense(layer2_dim, activation='relu')
        self.v = Dense(1, activation=None)

        weight_file_name = name + "_DDPG.h5"
        self.checkpoint_file = os.path.join(chkpt_dir, weight_file_name)

    def call(self,state,action):
        """Return the critic output for the given states and actions."""
        joint_input = tf.concat([state, action], axis=1)
        hidden = self.layer2(self.layer1(joint_input))
        return self.v(hidden)
    

In [3]:
class ActorNetwork(keras.Model):
    """Actor network mapping states to tanh-squashed action vectors.

    Two ReLU hidden layers feed an output layer with ``n_action_dim`` units and
    a ``tanh`` activation, so every output component lies in [-1, 1].

    Args:
        layer1_dim: Number of units in the first hidden layer.
        layer2_dim: Number of units in the second hidden layer.
        n_action_dim: Number of output units (one per action component).
        chkpt_dir: Directory part of the weight-file path.
        name: Prefix of the weight-file name (``<name>_DDPG.h5``). It is only
            used for the path and is not forwarded to ``keras.Model``.
    """

    def __init__(self,layer1_dim=512,layer2_dim=512,n_action_dim = 2,chkpt_dir="weightFiles/DDPG",name = "Actor"):
        super().__init__()

        # Layers are created in the same order in which they are applied.
        self.layer1 = Dense(layer1_dim, activation='relu')
        self.layer2 = Dense(layer2_dim, activation='relu')
        self.actions = Dense(n_action_dim, activation='tanh')

        weight_file_name = name + "_DDPG.h5"
        self.checkpoint_file = os.path.join(chkpt_dir, weight_file_name)

    def call(self,state):
        """Return the actor output for the given batch of states."""
        hidden = self.layer2(self.layer1(state))
        return self.actions(hidden)

In [4]:
class ReplayBuffer:
    """Fixed-capacity circular store of environment transitions.

    Each transition is a ``(state, next_state, action, reward, done)`` tuple
    kept in pre-allocated NumPy arrays. Once ``max_size`` transitions have been
    stored, new ones overwrite the oldest slots.

    Args:
        max_size: Number of transitions the buffer can hold.
        state_shape: Shape tuple of a single state.
        action_shape: Shape tuple of a single action.
    """

    def __init__(self,max_size,state_shape,action_shape):
        # Total number of transitions stored so far (not capped at max_size).
        self.mem_cnt = 0
        self.max_size = max_size
        self.states = np.zeros((max_size, *state_shape))
        self.next_states = np.zeros((max_size, *state_shape))
        self.actions_memory = np.zeros((max_size, *action_shape))
        self.rewards_memory = np.zeros((max_size,))
        self.dones = np.zeros((max_size,))

    def store_transition(self,state,next_state,action,reward,done):
        """Write one transition into the next slot, wrapping around when full."""
        # The slot must be derived from the count *before* it is incremented:
        # the first transition goes to slot 0, so the slots written so far are
        # exactly range(min(mem_cnt, max_size)), which is what sample_buffer
        # draws from.
        index = self.mem_cnt % self.max_size
        self.states[index] = state
        self.next_states[index] = next_state
        self.actions_memory[index] = action
        self.rewards_memory[index] = reward
        self.dones[index] = done
        self.mem_cnt += 1

    def sample_buffer(self,batch_size):
        """Return a random batch of distinct stored transitions.

        Args:
            batch_size: Number of transitions to draw.

        Returns:
            Tuple ``(states, next_states, actions, rewards, dones)`` of arrays
            whose first dimension is ``batch_size``.

        Raises:
            ValueError: Raised by ``np.random.choice`` when ``batch_size``
                exceeds the number of filled slots (sampling is without
                replacement).
        """
        max_mem = min(self.mem_cnt, self.max_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)

        states = self.states[batch]
        next_states = self.next_states[batch]
        actions = self.actions_memory[batch]
        rewards = self.rewards_memory[batch]
        dones = self.dones[batch]
        return states, next_states, actions, rewards, dones

In [5]:

class Agent:
    """DDPG agent holding actor/critic networks, target copies and a replay buffer.

    Args:
        env: Environment exposing ``action_space`` (with ``high``, ``low``,
            ``shape``) and ``observation_space.shape``.
        action_dim: Number of output units of the actor networks.
        lr_actor: Learning rate of the actor optimizers.
        lr_critic: Learning rate of the critic optimizers.
        gamma: Discount factor applied to the bootstrapped next-state value.
        max_size: Capacity of the replay buffer.
        batch_size: Number of transitions sampled per ``learn`` call.
        noise: Standard deviation of the Gaussian exploration noise.
        tau: Interpolation factor used for the soft target update.
    """

    def __init__(self,env,action_dim,lr_actor=0.001,lr_critic=0.002,gamma=0.99,max_size=1000000,batch_size=64,noise=0.1,tau=0.005):
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.gamma = gamma
        self.max_size = max_size
        self.batch_size = batch_size
        self.noise = noise
        self.tau = tau
        # For a gym Box space, `high` / `low` are arrays holding one bound per
        # action dimension. Only the first entry is kept, so every dimension is
        # assumed to share the same limits -- verify for other environments.
        self.max_action = env.action_space.high[0]
        self.min_action = env.action_space.low[0]
        self.n_actions = env.action_space.shape[0]
        self.memory = ReplayBuffer(max_size,env.observation_space.shape,env.action_space.shape)

        self.actor = ActorNetwork(n_action_dim = action_dim)
        self.critic = CriticNetwork()

        self.target_actor = ActorNetwork(n_action_dim = action_dim,name = "Target_actor")
        self.target_critic = CriticNetwork(name = "Target_critic")

        self.actor.compile(optimizer = Adam(learning_rate=lr_actor))
        self.critic.compile(optimizer = Adam(learning_rate = lr_critic))
        self.target_actor.compile(optimizer = Adam(learning_rate=lr_actor))
        self.target_critic.compile(optimizer = Adam(learning_rate = lr_critic))

        # Subclassed Keras models create their variables lazily on the first
        # call. Without this, every `weights` list is empty here and the hard
        # copy below would silently do nothing, leaving the targets with
        # their own random initialisation.
        self._build_networks(env.observation_space.shape, action_dim)

        # tau = 1 copies the online weights into the targets verbatim.
        self.update_weights(tau = 1)

    def _build_networks(self, state_shape, action_dim):
        """Run one dummy forward pass so all four networks create their variables."""
        dummy_state = tf.zeros((1, *state_shape), dtype=tf.float32)
        dummy_action = tf.zeros((1, action_dim), dtype=tf.float32)
        for actor_net in (self.actor, self.target_actor):
            actor_net(dummy_state)
        for critic_net in (self.critic, self.target_critic):
            critic_net(dummy_state, dummy_action)

    def update_weights(self,tau=None):
        """Blend the online weights into the target networks.

        Each target weight becomes ``(1 - tau) * target + tau * online``.

        Args:
            tau: Interpolation factor; defaults to ``self.tau`` when ``None``.
        """
        if tau is None:
            tau = self.tau

        network_pairs = (
            (self.actor, self.target_actor),
            (self.critic, self.target_critic),
        )
        for online_net, target_net in network_pairs:
            blended = [
                (1-tau)*target_weight + tau*online_weight
                for online_weight, target_weight in zip(online_net.weights, target_net.weights)
            ]
            target_net.set_weights(blended)

    def remember(self,state,next_state,action,reward,done):
        """Store one transition in the replay buffer."""
        self.memory.store_transition(state,next_state,action,reward,done)

    def save_models(self):
        """Write the weights of all four networks to their checkpoint files."""
        print("...saving models...")
        for network in (self.actor, self.critic, self.target_actor, self.target_critic):
            # Create the checkpoint directory on demand so the first save does
            # not fail on a missing folder.
            directory = os.path.dirname(network.checkpoint_file)
            if directory:
                os.makedirs(directory, exist_ok=True)
            network.save_weights(network.checkpoint_file)

    def load_models(self):
        """Load the weights of all four networks from their checkpoint files."""
        print("...loading weights...")
        for network in (self.actor, self.critic, self.target_actor, self.target_critic):
            network.load_weights(network.checkpoint_file)

    def choose_action(self,observation,evaluation = False):
        """Return the actor's action for a single observation.

        Args:
            observation: One environment observation (wrapped into a batch of 1).
            evaluation: When ``False``, zero-mean Gaussian noise with standard
                deviation ``self.noise`` is added before clipping.

        Returns:
            Tensor with the action, clipped to ``[min_action, max_action]``.
        """
        state = tf.convert_to_tensor([observation],dtype = tf.float32)
        actions = self.actor(state)

        if not evaluation:
            actions+=tf.random.normal(shape=[self.n_actions],mean=0.0,stddev=self.noise)

        actions = tf.clip_by_value(actions,self.min_action,self.max_action)

        # Drop the batch dimension that was added above.
        return actions[0]

    def learn(self):
        """Run one critic update, one actor update and a soft target update.

        Returns immediately (``None``) while the buffer holds fewer than
        ``batch_size`` transitions.
        """
        if self.memory.mem_cnt < self.batch_size:
            return
        states,next_states,actions,rewards,dones = self.memory.sample_buffer(self.batch_size)

        states = tf.convert_to_tensor(states,dtype=tf.float32)
        next_states = tf.convert_to_tensor(next_states,dtype=tf.float32)
        actions = tf.convert_to_tensor(actions,dtype=tf.float32)
        rewards = tf.convert_to_tensor(rewards,dtype=tf.float32)
        dones = tf.convert_to_tensor(dones,dtype=tf.float32)

        # The bootstrap target only involves the target networks, so it does
        # not need to be recorded on the tape; gradients are taken with respect
        # to the online critic's variables only.
        target_actions = self.target_actor(next_states)
        next_critic_val = tf.squeeze(self.target_critic(next_states,target_actions),1)
        # (1 - dones) removes the bootstrapped value for terminal transitions.
        # This must use the sampled `dones` batch, not a scalar `done`.
        target = rewards+self.gamma*next_critic_val*(1-dones)

        with tf.GradientTape() as tape:
            critic_val = tf.squeeze(self.critic(states,actions),1)
            critic_loss = keras.losses.MSE(target,critic_val)

        critic_network_gradient = tape.gradient(critic_loss,self.critic.trainable_variables)
        self.critic.optimizer.apply_gradients(zip(critic_network_gradient,self.critic.trainable_variables))

        with tf.GradientTape() as tape:
            new_policy_actions = self.actor(states)
            # Minimising the negated critic output maximises the critic's
            # value of the actor's own actions.
            actor_loss = -self.critic(states,new_policy_actions)
            actor_loss = tf.math.reduce_mean(actor_loss)

        actor_network_gradient = tape.gradient(actor_loss,self.actor.trainable_variables)
        self.actor.optimizer.apply_gradients(zip(actor_network_gradient,self.actor.trainable_variables))

        self.update_weights()
        

        
        

In [14]:
import gym
import numpy as np 

# Switches for this cell. The defaults reproduce a render-only rollout: no
# transitions are stored, no learning step is run and no weights are saved.
TRAIN = False            # True: store transitions, learn, save on a new best average
RENDER = True            # True: draw every environment step
MAX_STEPS_PER_GAME = 150 # hard cap on the number of steps per game
SCORE_WINDOW = 20        # number of most recent games averaged into avg_score

env = gym.make('HalfCheetah-v2')

# Reuse an agent that already lives in the kernel (e.g. one trained earlier);
# otherwise build a fresh, untrained one so the cell also runs on a new kernel.
if "agent" not in globals():
    agent = Agent(env = env,action_dim = env.action_space.shape[0])

n_games=250

scores=[]
best_avg_score=-2000
avg_score=-2000
try:
    for i in range(n_games):
        state = env.reset()
        done = False
        score=0
        step=0
        # Stop at the step cap or as soon as the environment ends the episode.
        while step<MAX_STEPS_PER_GAME and not done:
            step+=1
            # Exploration noise is only added while collecting training data.
            action = agent.choose_action(state,evaluation=not TRAIN)
            next_state,reward,done,_ = env.step(action)
            score+=reward
            if TRAIN:
                agent.remember(state,next_state,action,reward,done)
                agent.learn()
            state = next_state
            if RENDER:
                env.render()
        scores.append(score)
        # avg_score keeps its initial value until a full window is available.
        if len(scores)>=SCORE_WINDOW:
            avg_score = np.mean(scores[-SCORE_WINDOW:])
        if best_avg_score<avg_score:
            best_avg_score = avg_score
            if TRAIN:
                agent.save_models()
        print(f"game no ={i} score={score} avg_score={avg_score} best_avg_score={best_avg_score}")
finally:
    # Release the environment (and its render window) even if the loop is interrupted.
    env.close()
    
    
        
        

Creating window glfw
game no =0 score=231.6189282619259 avg_score=-2000 best_avg_score=-2000
game no =1 score=201.43405401735504 avg_score=-2000 best_avg_score=-2000
game no =2 score=226.5650524107698 avg_score=-2000 best_avg_score=-2000
game no =3 score=201.48754449686035 avg_score=-2000 best_avg_score=-2000
game no =4 score=208.08700964749949 avg_score=-2000 best_avg_score=-2000
game no =5 score=257.2616618225298 avg_score=-2000 best_avg_score=-2000
game no =6 score=210.36753510830738 avg_score=-2000 best_avg_score=-2000
game no =7 score=231.6123823305953 avg_score=-2000 best_avg_score=-2000
game no =8 score=210.70308247596543 avg_score=-2000 best_avg_score=-2000
game no =9 score=243.988581529572 avg_score=-2000 best_avg_score=-2000
game no =10 score=219.12942704739297 avg_score=-2000 best_avg_score=-2000
game no =11 score=240.75423382119791 avg_score=-2000 best_avg_score=-2000
game no =12 score=192.2543581673887 avg_score=-2000 best_avg_score=-2000
game no =13 score=289.325567696956

SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
