# Use Natural Policy Gradient to Play Acrobot-v1

TensorFlow version

In [1]:
%matplotlib inline

import sys
import logging
import imp
import itertools

import numpy as np
np.random.seed(0)
import pandas as pd
import scipy.signal as signal
import gym
import matplotlib.pyplot as plt
import tensorflow.compat.v2 as tf
tf.random.set_seed(0)
from tensorflow import keras
from tensorflow import nn
from tensorflow import optimizers
from tensorflow import losses
from tensorflow.keras import layers


# Reset the logging module before configuring it.  Presumably this is done so
# that basicConfig() takes effect even when the root logger already has
# handlers (e.g. when the cell is re-run) -- basicConfig() is otherwise a no-op
# in that case.  importlib.reload is the supported replacement for the
# deprecated imp.reload.
import importlib

importlib.reload(logging)
logging.basicConfig(level=logging.DEBUG,
        format='%(asctime)s [%(levelname)s] %(message)s',
        stream=sys.stdout, datefmt='%H:%M:%S')

In [2]:
# Create the seeded Acrobot environment and log every instance attribute.
env = gym.make('Acrobot-v1')
env.seed(0)
for key, value in vars(env).items():
    logging.info('%s: %s', key, value)

15:25:36 [INFO] env: <AcrobotEnv<Acrobot-v1>>
15:25:36 [INFO] action_space: Discrete(3)
15:25:36 [INFO] observation_space: Box(-28.274333953857422, 28.274333953857422, (6,), float32)
15:25:36 [INFO] reward_range: (-inf, inf)
15:25:36 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 15}
15:25:36 [INFO] _max_episode_steps: 500
15:25:36 [INFO] _elapsed_steps: None


In [3]:
class PPOReplayer:
    """DataFrame-backed buffer of per-step training records.

    Each row holds the columns listed in ``self.fields``:
    state, action, prob, advantage and return.
    """

    def __init__(self):
        self.fields = ['state', 'action', 'prob', 'advantage', 'return']
        self.memory = pd.DataFrame(columns=self.fields)

    def store(self, df):
        """Append the ``self.fields`` columns of *df* to the buffer.

        Any other columns of *df* are dropped; the buffer index is renumbered.
        """
        selected = df[self.fields]
        self.memory = pd.concat([self.memory, selected], ignore_index=True)

    def sample(self, size):
        """Draw *size* rows (np.random.choice, i.e. with replacement).

        Returns:
            A generator yielding one stacked numpy array per buffer column,
            in column order.
        """
        row_count = self.memory.shape[0]
        indices = np.random.choice(row_count, size=size)
        return (np.stack(self.memory.loc[indices, column])
                for column in self.memory.columns)

In [4]:
def conjugate_gradient(f, b, iter_count=10, epsilon=1e-12, tol=1e-6):
    """Approximately solve ``f(x) = b`` with the conjugate gradient method.

    Args:
        f: callable applying the linear operator to a vector.
        b: right-hand side vector; the iteration starts from ``b * 0``.
        iter_count: maximum number of iterations.
        epsilon: small constant added to the step-size denominator.
        tol: the loop stops once the squared residual norm drops below this.

    Returns:
        Tuple ``(x, f(x))`` of the approximate solution and the operator
        applied to it.
    """
    solution = b * 0.
    residual = tf.identity(b)
    direction = tf.identity(b)
    residual_sq = tf.reduce_sum(residual * residual)
    for _ in range(iter_count):
        f_direction = f(direction)
        curvature = tf.reduce_sum(direction * f_direction) + epsilon
        step = residual_sq / curvature
        solution += step * direction
        residual -= step * f_direction
        next_residual_sq = tf.reduce_sum(residual * residual)
        # new search direction: residual plus a multiple of the previous one
        direction = residual + (next_residual_sq / residual_sq) * direction
        residual_sq = next_residual_sq
        if residual_sq < tol:
            break
    return solution, f(solution)

In [5]:
class NPGAgent:
    """Actor-critic agent whose actor is updated by natural policy gradient steps.

    The actor is a softmax policy network and the critic a scalar-output
    network fitted to discounted returns.  In 'train' mode the agent records
    each step of the episode; on close() the episode is converted into
    replayer rows, and once the replayer holds at least 1000 rows the agent
    learns five times and then starts a fresh replayer.
    """

    def __init__(self, env):
        """Build the networks and buffers.

        Args:
            env: environment whose ``action_space.n`` is the action count.
        """
        self.action_n = env.action_space.n
        self.gamma = 0.99

        self.replayer = PPOReplayer()
        self.trajectory = []
        # Defined here so step()/close() do not fail when called before reset().
        self.mode = None

        self.max_kl = 0.0005
        self.actor_net = self.build_net(hidden_sizes=[100,],
                output_size=self.action_n, output_activation=nn.softmax)
        self.critic_net = self.build_net(hidden_sizes=[100,],
                learning_rate=0.002)

    def build_net(self, input_size=None, hidden_sizes=None, output_size=1,
                activation=nn.relu, output_activation=None,
                loss=losses.mse, learning_rate=0.001):
        """Build and compile a dense feed-forward Keras model.

        Args:
            input_size: not used by this method; kept for interface
                compatibility.
            hidden_sizes: iterable of hidden layer widths; ``None`` means no
                hidden layer.
            output_size: number of output units.
            activation: activation of the hidden layers.
            output_activation: activation of the output layer.
            loss: loss passed to ``model.compile``.
            learning_rate: learning rate of the Adam optimizer.

        Returns:
            The compiled ``keras.Sequential`` model.
        """
        model = keras.Sequential()
        # `or ()` lets the default hidden_sizes=None build a single-layer net
        # instead of raising TypeError.
        for hidden_size in hidden_sizes or ():
            model.add(layers.Dense(units=hidden_size,
                    activation=activation))
        model.add(layers.Dense(units=output_size,
                activation=output_activation))
        optimizer = optimizers.Adam(learning_rate)
        model.compile(optimizer=optimizer, loss=loss)
        return model

    def reset(self, mode=None):
        """Start a new episode; in 'train' mode the trajectory is cleared."""
        self.mode = mode
        if self.mode == 'train':
            self.trajectory = []

    def step(self, observation, reward, done):
        """Sample an action from the actor's output for *observation*.

        In 'train' mode the tuple (observation, reward, done, action) is
        appended to the flat trajectory list.

        Returns:
            The sampled action index.
        """
        probs = self.actor_net.predict(observation[np.newaxis])[0]
        action = np.random.choice(self.action_n, p=probs)
        if self.mode == 'train':
            self.trajectory += [observation, reward, done, action]
        return action

    def close(self):
        """Finish the episode; in 'train' mode store it and possibly learn."""
        if self.mode == 'train':
            self.save_trajectory_to_replayer()
            if len(self.replayer.memory) >= 1000:
                for _ in range(5): # learn multiple times
                    self.learn()
                self.replayer = PPOReplayer() # reset replayer after the agent changes itself

    def save_trajectory_to_replayer(self):
        """Turn the recorded trajectory into replayer rows.

        Computes, per step, the probability the actor assigns to the taken
        action, the advantage (discounted backward sum of TD errors) and the
        discounted return, then stores them in the replayer.  Does nothing
        when no step has been recorded.
        """
        if not self.trajectory:
            # np.stack below fails on an empty sequence; nothing to store.
            return
        df = pd.DataFrame(
                np.array(self.trajectory, dtype=object).reshape(-1, 4),
                columns=['state', 'reward', 'done', 'action'], dtype=object)
        states = np.stack(df['state'])
        df['v'] = self.critic_net.predict(states)
        pis = self.actor_net.predict(states)
        df['prob'] = [pi[action] for pi, action in zip(pis, df['action'])]
        # the value after the last recorded step is taken as 0
        df['next_v'] = df['v'].shift(-1).fillna(0.)
        # NOTE(review): each row pairs a state with the reward passed to
        # step() together with it; confirm this alignment is the intended
        # one for the TD target below.
        df['u'] = df['reward'] + self.gamma * df['next_v']
        df['delta'] = df['u'] - df['v']
        # lfilter over the reversed series computes y[t] = x[t] + gamma * y[t+1]
        df['advantage'] = signal.lfilter([1.,], [1., -self.gamma],
                df['delta'][::-1])[::-1]
        df['return'] = signal.lfilter([1.,], [1., -self.gamma],
                df['reward'][::-1])[::-1]
        self.replayer.store(df)

    def learn(self):
        """Run one actor update and one critic fit on 64 sampled rows.

        The actor moves along x scaled by sqrt(2 * max_kl / (x^T F x)), where
        x approximately solves F x = g by conjugate gradient, g is the
        gradient of the surrogate objective and F is the damped Hessian of
        the KL divergence.  The actor update is skipped when x^T F x is not
        a positive finite number, so that NaN never reaches the weights.
        """
        states, actions, old_pis, advantages, returns = \
                self.replayer.sample(size=64)
        state_tensor = tf.convert_to_tensor(states, dtype=tf.float32)
        action_tensor = tf.convert_to_tensor(actions, dtype=tf.int32)
        old_pi_tensor = tf.convert_to_tensor(old_pis, dtype=tf.float32)
        advantage_tensor = tf.convert_to_tensor(advantages, dtype=tf.float32)

        # train actor
        # ... calculate first order gradient of the surrogate objective
        with tf.GradientTape() as tape:
            all_pi_tensor = self.actor_net(state_tensor)
            pi_tensor = tf.gather(all_pi_tensor, action_tensor, batch_dims=1)
            surrogate_tensor = (pi_tensor / old_pi_tensor) * advantage_tensor
        # the target is a vector; tape.gradient differentiates its sum
        actor_grads = tape.gradient(surrogate_tensor, self.actor_net.variables)
        loss_grad = tf.concat([tf.reshape(grad, (-1,)) for grad in actor_grads], axis=0)

        # ... calculate conjugate gradient: Fx = g
        def f(x): # calculate Fx
            with tf.GradientTape() as tape2: # tape for 2nd-order gradient
                with tf.GradientTape() as tape1: # tape for 1st-order gradient
                    prob_tensor = self.actor_net(state_tensor)
                    prob_old_tensor = tf.stop_gradient(prob_tensor)
                    kld_tensor = tf.reduce_sum(prob_old_tensor * (tf.math.log(
                            prob_old_tensor) - tf.math.log(prob_tensor)), axis=1)
                    kld_loss_tensor = tf.reduce_mean(kld_tensor)
                grads = tape1.gradient(kld_loss_tensor, self.actor_net.variables)
                flatten_grad_tensor = tf.concat(
                        [tf.reshape(grad, (-1,)) for grad in grads], axis=-1)
                grad_matmul_x = tf.tensordot(flatten_grad_tensor, x, axes=[[-1], [-1]])
            grad_grads = tape2.gradient(grad_matmul_x, self.actor_net.variables)
            flatten_grad_grad = tf.stop_gradient(tf.concat(
                    [tf.reshape(grad_grad, (-1,)) for grad_grad in grad_grads], axis=-1))
            fx = flatten_grad_grad + x * 1e-2 # damping term
            return fx
        x, fx = conjugate_gradient(f, loss_grad)

        # ... calculate natural gradient
        quad_form = tf.reduce_sum(fx * x)
        quad_value = float(quad_form)
        if np.isfinite(quad_value) and quad_value > 0.:
            natural_gradient_tensor = tf.sqrt(2 * self.max_kl / quad_form) * x
            # ....... refactor the flatten gradient into un-flatten version
            flatten_natural_gradient = natural_gradient_tensor.numpy()
            weights = []
            begin = 0
            for weight in self.actor_net.get_weights():
                end = begin + weight.size
                weight += flatten_natural_gradient[begin:end].reshape(weight.shape)
                weights.append(weight)
                begin = end
            self.actor_net.set_weights(weights)
        else:
            # A zero or non-finite x^T F x would turn the step into NaN/inf.
            logging.warning('skip actor update: x^T F x = %s', quad_value)

        # train critic
        self.critic_net.fit(states, returns, verbose=0)


# Instantiate the agent; its constructor reads env.action_space.n.
agent = NPGAgent(env)

In [None]:
def play_episode(env, agent, max_episode_steps=None, mode=None, render=False):
    """Run one episode of *agent* in *env*.

    The agent is asked for an action on every observation, including the
    final one (whose action is not executed).  agent.close() is called once
    the loop ends.

    Args:
        env: environment offering reset(), step(action) and render().
        agent: object offering reset(mode=...), step(observation, reward,
            done) and close().
        max_episode_steps: if truthy, stop after this many environment steps.
        mode: forwarded to agent.reset().
        render: whether to call env.render() after each agent step.

    Returns:
        Tuple ``(episode_reward, elapsed_steps)``.
    """
    observation = env.reset()
    reward, done = 0., False
    agent.reset(mode=mode)
    total_reward = 0.
    steps_taken = 0
    hit_step_limit = False
    while not hit_step_limit:
        action = agent.step(observation, reward, done)
        if render:
            env.render()
        if done:
            break
        observation, reward, done, _ = env.step(action)
        total_reward += reward
        steps_taken += 1
        hit_step_limit = (bool(max_episode_steps)
                and steps_taken >= max_episode_steps)
    agent.close()
    return total_reward, steps_taken


logging.info('==== train ====')
episode_rewards = []
# Train on the unwrapped env; the step cap is handed to play_episode instead.
raw_env = env.unwrapped
step_cap = env._max_episode_steps
for episode in itertools.count():
    episode_reward, elapsed_steps = play_episode(
            raw_env, agent, max_episode_steps=step_cap, mode='train')
    episode_rewards.append(episode_reward)
    # mean reward over the (up to) ten most recent episodes
    recent_mean = np.mean(episode_rewards[-10:])
    logging.debug('train episode %d: reward = %.2f, steps = %d %d',
            episode, episode_reward, elapsed_steps, recent_mean)
    if recent_mean > -120:
        break
plt.plot(episode_rewards)


logging.info('==== test ====')
episode_rewards = []
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    episode_rewards.append(episode_reward)
    logging.debug('test episode %d: reward = %.2f, steps = %d',
            episode, episode_reward, elapsed_steps)
reward_mean = np.mean(episode_rewards)
reward_std = np.std(episode_rewards)
logging.info('average episode reward = %.2f ± %.2f',
        reward_mean, reward_std)

15:25:37 [INFO] ==== train ====
15:26:13 [INFO] NumExpr defaulting to 8 threads.
15:26:13 [DEBUG] train episode 0: reward = -500.00, steps = 500 -500
15:26:45 [DEBUG] train episode 1: reward = -500.00, steps = 500 -500
15:27:16 [DEBUG] train episode 2: reward = -500.00, steps = 500 -500
15:27:44 [DEBUG] train episode 3: reward = -500.00, steps = 500 -500
15:28:08 [DEBUG] train episode 4: reward = -422.00, steps = 423 -484
15:28:30 [DEBUG] train episode 5: reward = -388.00, steps = 389 -468
15:28:54 [DEBUG] train episode 6: reward = -411.00, steps = 412 -460
15:29:19 [DEBUG] train episode 7: reward = -427.00, steps = 428 -456
15:29:46 [DEBUG] train episode 8: reward = -500.00, steps = 500 -460
15:30:15 [DEBUG] train episode 9: reward = -500.00, steps = 500 -464
15:30:31 [DEBUG] train episode 10: reward = -279.00, steps = 280 -442
15:31:00 [DEBUG] train episode 11: reward = -500.00, steps = 500 -442
15:31:14 [DEBUG] train episode 12: reward = -237.00, steps = 238 -416
15:31:30 [DEBUG] tr

16:04:56 [DEBUG] train episode 116: reward = -284.00, steps = 285 -272
16:05:05 [DEBUG] train episode 117: reward = -173.00, steps = 174 -275
16:05:19 [DEBUG] train episode 118: reward = -274.00, steps = 275 -265
16:05:32 [DEBUG] train episode 119: reward = -270.00, steps = 271 -274
16:05:42 [DEBUG] train episode 120: reward = -198.00, steps = 199 -269
16:05:55 [DEBUG] train episode 121: reward = -261.00, steps = 262 -260
16:06:05 [DEBUG] train episode 122: reward = -198.00, steps = 199 -253
16:06:13 [DEBUG] train episode 123: reward = -139.00, steps = 140 -220
16:06:24 [DEBUG] train episode 124: reward = -225.00, steps = 226 -227
16:06:33 [DEBUG] train episode 125: reward = -180.00, steps = 181 -220
16:06:40 [DEBUG] train episode 126: reward = -143.00, steps = 144 -206
16:07:00 [DEBUG] train episode 127: reward = -407.00, steps = 408 -229
16:07:11 [DEBUG] train episode 128: reward = -197.00, steps = 198 -221
16:07:19 [DEBUG] train episode 129: reward = -169.00, steps = 170 -211
16:07:

16:24:02 [DEBUG] train episode 232: reward = -159.00, steps = 160 -169
16:24:13 [DEBUG] train episode 233: reward = -216.00, steps = 217 -176
16:24:21 [DEBUG] train episode 234: reward = -144.00, steps = 145 -174
16:24:30 [DEBUG] train episode 235: reward = -182.00, steps = 183 -176
16:24:44 [DEBUG] train episode 236: reward = -294.00, steps = 295 -179
16:24:52 [DEBUG] train episode 237: reward = -156.00, steps = 157 -178
16:25:05 [DEBUG] train episode 238: reward = -243.00, steps = 244 -192
16:25:11 [DEBUG] train episode 239: reward = -112.00, steps = 113 -188
16:25:22 [DEBUG] train episode 240: reward = -219.00, steps = 220 -196
16:25:29 [DEBUG] train episode 241: reward = -131.00, steps = 132 -185
16:25:37 [DEBUG] train episode 242: reward = -173.00, steps = 174 -187
16:25:45 [DEBUG] train episode 243: reward = -148.00, steps = 149 -180
16:25:51 [DEBUG] train episode 244: reward = -130.00, steps = 131 -178
16:26:07 [DEBUG] train episode 245: reward = -302.00, steps = 303 -190
16:26:

In [None]:
# Close the environment once training and evaluation have finished.
env.close()