In [1]:
import tensorflow as tf
import tensorflow.keras as keras
import numpy as np
import os
import gym
import datetime

In [2]:
keras.backend.set_floatx('float64')

In [3]:
%load_ext tensorboard

In [7]:
class PolicyGredientAgent:
    
    def __init__(self, n_features, n_actions, gamma, learning_rate):
        self.n_features = n_features
        self.n_actions = n_actions
        self.gamma = gamma
        self.learning_rate = learning_rate
        
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%S%M")
        rewards_log_dir = 'logs/pg_agent/cartpole/' + current_time + '/rewards'
        self.rewards_summary_writer = tf.summary.create_file_writer(rewards_log_dir)
        self.rewards_summary = keras.metrics.Mean('rewards', dtype=tf.float32)
        
        self.states = []
        self.actions = []
        self.rewards = []
        
        self._build_net()
    
    def _build_net(self):
        model_input = keras.Input(shape=(self.n_features,), name='model_input')
        x = keras.layers.Dense(32, activation='relu')(model_input)
        x = keras.layers.Dense(32, activation='relu')(x)
        model_output = keras.layers.Dense(self.n_actions, activation='softmax' , name='model_output')(x)
        self.model = keras.Model(model_input, model_output)
        self.optimizer = keras.optimizers.Adam(self.learning_rate)
    
    def choose_action(self, obs):
        obs = obs[np.newaxis, :]
        probs = self.model(obs)[0]
        action = np.random.choice(self.n_actions, p=probs)
        return action
    
    def store_experience(self, obs, a, r):
        self.states.append(obs)
        self.actions.append(a)
        self.rewards.append(r)
            
    def train(self):
        states = np.array(self.states)
        actions = np.array(self.actions)
        rewards = np.array(self.rewards)
        
        des_rew = np.zeros_like(rewards, dtype=np.float32)
        for t in range(len(self.rewards)):
            cur_sum = 0
            discount = 1
            for k in range(t, len(rewards)):
                cur_sum += rewards[k]*discount
                discount *= self.gamma
            des_rew[t] = cur_sum
        mean = des_rew.mean()
        std = des_rew.std() if des_rew.std() > 0 else 1
        des_rew = (des_rew - mean) / std
        
        with tf.GradientTape() as tape:
            neg_log_probs = keras.losses.sparse_categorical_crossentropy(y_true=actions, y_pred=self.model(states))
            loss = neg_log_probs * des_rew
        gradients = tape.gradient(loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))
        
        self.states = []
        self.actions = []
        self.rewards = []
        
    def load(self):
        self.model.load_weights('./model/pg_agent_cartpole.h5')
    
    def save(self):
        self.model.save_weights('./model/pg_agent_cartpole.h5')
    
    def write_rewards(self, episode, total_reward):
        self.rewards_summary(total_reward)
        with self.rewards_summary_writer.as_default():
            tf.summary.scalar('rewards', self.rewards_summary.result(), step=episode)

In [11]:
env = gym.make('CartPole-v0')
pg_agent = PolicyGredientAgent(env.observation_space.shape[0], env.action_space.n, 0.99, 0.001)

In [12]:
episodes = 1000
last_500_rewards, last_100_rawards = np.zeros((500,)), np.zeros((100,))
for episode in range(episodes):
    steps = 0
    total_reward = 0
    done = False
    obs = env.reset()
    while not done:
        #env.render()
        a = pg_agent.choose_action(obs)
        obs_, r, done, _ = env.step(a)
        x, vel, angle, angle_vel = obs_
        r1 = (env.x_threshold - abs(x))/env.x_threshold-0.8
        r2 = (env.theta_threshold_radians - abs(angle))/env.theta_threshold_radians - 0.5
        r = r1 + r2
        pg_agent.store_experience(obs, a, r)
        if done:
            pg_agent.train()
            pg_agent.write_rewards(episode, total_reward)
            break
        obs = obs_
        total_reward += r
        steps += 1
    last_100_rawards[episode%100] = total_reward
    last_500_rewards[episode%500] = total_reward
    print('Episode {}, Reward: {}, Steps: {}, Last 100 rewards Mean:{}, 500: {}'.format(episode+1, total_reward, steps,
                                                                                        last_100_rawards.mean(), 
                                                                                        last_500_rewards.mean()), end='\r')
env.close()
pg_agent.save()

Episode 1000, Reward: 72.40082946040363, Steps: 199, Last 100 rewards Mean:76.2297351076961, 500: 67.21628717272448646

In [13]:
episodes = 10
pg_agent.load()
for episode in range(episodes):
    steps = 0
    done = False
    obs = env.reset()
    while not done:
        env.render()
        a = pg_agent.choose_action(obs)
        obs_, r, done, _ = env.step(a)
        if done:
            break
        steps += 1
        obs = obs_
    print('Episode {}, Steps: {}'.format(episode+1, steps), 
          end='\r')
env.close()

Episode 10, Steps: 199

In [10]:
tensorboard --logdir './logs/pg_agent/cartpole'

Reusing TensorBoard on port 6006 (pid 13812), started 0:10:12 ago. (Use '!kill 13812' to kill it.)