In [1]:
import gym
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense,Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras import backend as K
from tensorflow.keras import losses
from tensorflow.keras import utils 


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [591]:
class Critic:
    def __init__(self,state_size,action_size,path=None):
            self.path=path
            self.state_size=state_size
            self.action_size=action_size
            self.gamma=0.95
            self.sess=tf.Session()
            #self.sess.run(tf.global_variables_initializer())
            self.critic_model=self.build_model()
            self.output_dim=self.critic_model.get_output_shape_at(-1)[1]
            self.input_dim=self.critic_model.get_input_at(0).shape
            self.lr=0.001
            if (self.path!=None):
                self.critic_model.load_weights(self.path)
            
            
    def build_model(self):
        model_input=Input(shape=(self.state_size,))
        y1=Dense(50,activation="relu")(model_input)
        y2=Dense(self.action_size,activation="linear")(y1)
        model=Model(inputs=[model_input],outputs=[y2])
        return(model)
    def update_critic(self,reward,state,action,next_state,next_action):
        Q_input_placeholder=tf.placeholder(shape=self.input_dim,dtype=tf.float32)
        Q_next_input_placeholder=tf.placeholder(shape=self.input_dim,dtype=tf.float32)
        Q=self.critic_model(Q_input_placeholder)
        Q_next=self.critic_model(Q_next_input_placeholder)
        action_one_hot_placeholder=tf.placeholder(shape=(None,self.output_dim),dtype=tf.float32)
        next_action_one_hot_placeholder=tf.placeholder(shape=(None,self.output_dim),dtype=tf.float32)
        Q_action=Q*action_one_hot_placeholder
        Q_action=tf.keras.backend.sum(Q_action,axis=1)
        Q_next_action=Q_next*next_action_one_hot_placeholder
        Q_next_action=tf.keras.backend.sum(Q_next_action,axis=1)
        td_err=reward+self.gamma*Q_next_action-Q_action
        grads=tf.gradients(Q_action,self.critic_model.trainable_weights)
        self.sess.run(tf.global_variables_initializer())
        td_zero_error,my_grads,value_action=self.sess.run([td_err,grads,Q_action],feed_dict={
            Q_input_placeholder:state.reshape((1,state.shape[0])),Q_next_input_placeholder:next_state.reshape((1,next_state.shape[0])),action_one_hot_placeholder:
            utils.to_categorical(action, num_classes=self.output_dim).reshape(1,self.output_dim),
            next_action_one_hot_placeholder:utils.to_categorical(next_action, num_classes=self.output_dim).reshape(1,self.output_dim)}) 
        new_weights=[self.lr*td_zero_error[0]*x+y for x,y in zip(my_grads,self.critic_model.get_weights())]
        self.critic_model.set_weights(new_weights)
        return(value_action[0])
    
        
 
    
        
        

In [595]:
class Actor:
    def __init__(self,state_size,action_size,path=None):
            self.path=path
            self.state_size=state_size
            self.action_size=action_size
            self.sess=tf.Session()
            self.actor_model=self.build_model()
            self.output_dim=self.actor_model.get_output_shape_at(-1)[1]
            self.input_dim=self.actor_model.get_input_at(0).shape
            self.lr=0.001
            if (self.path!=None):
                self.critic_model.load_weights(self.path)

    def predict(self,state):
        return(self.actor_model.predict(state.reshape((1,state.shape[0]))))
    
    def build_model(self):
        model_input=Input(shape=(self.state_size,))
        y1=Dense(50,activation="relu")(model_input)
        y2=Dense(self.action_size,activation="softmax")(y1)
        model=Model(inputs=[model_input],outputs=[y2])
        return(model)
    def update_actor(self,state,action,action_value):
        state_placeholder=tf.placeholder(shape=self.input_dim,dtype=tf.float32)
        actions_prob=self.actor_model(state_placeholder)
        action_one_hot_placeholder=tf.placeholder(shape=(None,self.output_dim),dtype=tf.float32)
        action_prob=actions_prob*action_one_hot_placeholder
        action_prob=tf.keras.backend.sum(action_prob,axis=1)
        action_prob_log=tf.keras.backend.log(action_prob)
        grads=tf.gradients(action_prob_log,self.actor_model.trainable_weights)
        self.sess.run(tf.global_variables_initializer())
        my_grads=self.sess.run([grads],feed_dict={
            state_placeholder:state.reshape((1,state.shape[0])),action_one_hot_placeholder:
            utils.to_categorical(action, num_classes=self.output_dim).reshape(1,self.output_dim)
})      
        
        
        new_weights=[self.lr*action_value*x+y for x,y in zip(my_grads[0],self.actor_model.get_weights())]
        self.actor_model.set_weights(new_weights)
        
        

In [596]:
class Actor_Critic:
    def __init__(self,Actor,Critic):
        self.actor=Actor
        self.critic=Critic
        self.env=gym.make("MountainCar-v0")
        self.actor_path="D:/RL_CartPole_agent_weights/MountainCar_actor.h5"
        self.critic_path="D:/RL_CartPole_agent_weights/MountainCar_critic.h5"
    def generate_episode(self,steps=200):
        print("generate_episode_begin")
        state=self.env.reset()
        self.max_pos=-0.4
        actions=[]
        visited_states=[]
        rewards=[]
        for step in range(steps):
            prediction=self.actor.predict(state)
            action=np.random.choice([0,1,2],p=prediction[0].reshape((self.actor.output_dim,)))
            next_state,reward,done,_=self.env.step(action)
            # Adjust reward based on car position
            if (next_state[0]>self.max_pos):
                self.max_pos=next_state[0]
                #print("new max_pos={}".format(self.max_pos))
                reward+=1
    
 
        
            if (next_state[0]>=0.5):
                print("goal achieved")
                reward+=10
                self.max_pos=-0.4
                break

            actions.append(action)
            rewards.append(reward)
            visited_states.append(state)
            state=next_state
        print("new max_pos={}".format(self.max_pos))
        return (visited_states,actions,rewards)
    
    def train_AC(self,n_episodes=1000):
        for episode in range(n_episodes):
            states,actions,rewards=self.generate_episode(steps=200)
            
            for i in range(len(states)-1):
                action_value=self.critic.update_critic(rewards[i],states[i],actions[i],states[i+1],actions[i+1])
                self.actor.update_actor(states[i],actions[i],action_value)
                if(i%50==0):
                    print("step{}/{}".format(i,len(states)))
            print("episode {}/{}".format(episode+1,n_episodes))
            if episode % 5 == 0:
                self.actor.actor_model.save_weights(self.actor_path)
                self.critic.critic_model.save_weights(self.critic_path)
            
        return(states,actions,rewards)
    
        

In [597]:
state_size=env.observation_space.shape[0]
action_size=env.action_space.n
actor=Actor(state_size,action_size)
critic=Critic(state_size,action_size)
AC_agent=Actor_Critic(actor,critic)
states,actions,rewards=AC_agent.train_AC()


generate_episode_begin
new max_pos=-0.4
step0/200


KeyboardInterrupt: 

#### This implementation is so slow, it's almost impossible to use it to train anything. update_actor and update_critic are building the graph in every call. One improvement would be to find a way to build the graph only once. 