In [1]:
import numpy as np
import matplotlib.pyplot as plt
import moviepy.editor as mpy
import skimage.transform
from IPython.display import Image, display

import tensorflow as tf
import tensorflow_probability as tfp
import tensorflow.keras.losses as kls

In [2]:
# Configure TensorFlow's GPU usage: expose only one GPU and enable on-demand memory growth.
gpus = tf.config.list_physical_devices("GPU") 
if gpus:
    try:
        # Restrict TensorFlow to only use the first GPU (index 0 of the detected list)
        tf.config.set_visible_devices(gpus[0], 'GPU')

        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


In [3]:
import os
# NOTE(review): the variable is set before the ple imports below — presumably it must be
# in place before the game library initialises its display; keep this ordering.
os.environ["SDL_VIDEODRIVER"] = "dummy"  # this line prevents the pop-up game window from appearing
from ple.games.flappybird import FlappyBird
from ple import PLE

# Training environment: used to collect rollouts.
game = FlappyBird()
env = PLE(game, fps=30, display_screen=False)  # environment interface to game
env.reset_game()

# Separate environment for evaluation, so testing does not disturb the training game state.
test_game = FlappyBird()
test_env = PLE(test_game, fps=30, display_screen=False)
test_env.reset_game()

couldn't import doomish
Couldn't import doom


In [4]:
# Directory that receives the demo videos written during training.
path = './movie_f'
# exist_ok=True makes the cell idempotent and avoids the check-then-create race.
os.makedirs(path, exist_ok=True)


In [5]:
# Hyper-parameters shared by the network, the PPO agent and the training loop.
hparas = {
    'image_size': 84,  # side length of the resized frame produced by preprocess_screen
    'num_stack': 4,  # NOTE(review): not referenced in the visible code; frames_to_state hard-codes 4 frames
    'action_dim': len(env.getActionSet()),  # number of actions; width of the actor output layer
    'hidden_size': 256,  # units of the dense embedding layer in the feature extractor
    'lr': 0.0001,  # learning rate passed to the Adam optimizer
    'gamma': 0.99,  # discount factor passed to compute_gae
    'lambda': 0.95,  # GAE lambda passed to compute_gae
    'clip_val': 0.2,  # PPO clipping range: ratio is clipped to [1 - clip_val, 1 + clip_val]
    'ppo_epochs': 8,  # number of passes over a rollout in each ppo_update call
    'test_epochs': 1,  # number of test games averaged per evaluation
    'num_steps': 512,  # environment steps collected per training iteration
    'mini_batch_size': 64,  # samples per gradient step in ppo_update
    'target_reward': 200,  # NOTE(review): not referenced in the visible code
    'max_episode': 30000,  # upper bound on the number of training iterations
}

In [6]:
# Please do not modify this method
def make_anim(images, fps=60, true_image=False):
    """Build a moviepy ``VideoClip`` that plays ``images`` at ``fps`` frames per second.

    Args:
        images: Sequence of frames; each frame must support ``astype``
            (e.g. a NumPy array).
        fps: Playback frame rate; also used to derive the clip duration.
        true_image: If True, frames are cast to ``uint8`` unchanged. Otherwise
            each frame is mapped through ``(x + 1) / 2 * 255`` before the cast
            (assumes pixel values in [-1, 1] — TODO confirm).

    Returns:
        The ``mpy.VideoClip`` with its ``fps`` attribute set.
    """
    duration = len(images) / fps

    def make_frame(t):
        # Map the time t (seconds) to a frame index; any failure (e.g. an index
        # past the end) falls back to the last frame.
        try:
            x = images[int(len(images) / duration * t)]
        except:
            x = images[-1]

        if true_image:
            return x.astype(np.uint8)
        else:
            return ((x + 1) / 2 * 255).astype(np.uint8)

    clip = mpy.VideoClip(make_frame, duration=duration)
    clip.fps = fps
    
    return clip

In [7]:
def preprocess_screen(screen):
    """Turn a raw grayscale screen into a square, single-channel float32 frame.

    The screen is rotated by -90 degrees (the output shape grows to fit), only
    the first 400 rows are kept, and the result is resized to
    ``(image_size, image_size, 1)`` using ``hparas['image_size']``.
    """
    side = hparas['image_size']
    upright = skimage.transform.rotate(screen, -90, resize=True)
    cropped = upright[:400, :]
    frame = skimage.transform.resize(cropped, [side, side, 1])
    return frame.astype(np.float32)

def frames_to_state(input_frames):
    """Stack the most recent frames along the last axis into a 4-frame state.

    With fewer than four frames available the history is padded by repetition:
      * 1 frame  -> f0, f0, f0, f0
      * 2 frames -> f0, f0, f1, f1
      * 3 frames -> f0, f1, f2, f2
    With four or more frames the last four are used.
    """
    available = len(input_frames)

    if available == 1:
        stacked = [input_frames[0]] * 4
    elif available == 2:
        oldest, newest = input_frames
        stacked = [oldest, oldest, newest, newest]
    elif available == 3:
        stacked = list(input_frames) + [input_frames[2]]
    else:
        stacked = input_frames[-4:]

    return np.concatenate(stacked, axis=-1)

In [8]:
class ActorCriticNetwork(tf.keras.Model):
    """Actor-critic model: a shared convolutional feature extractor with two dense heads.

    The actor head ends in a softmax, so it outputs action probabilities
    (not raw logits); the critic head outputs a single unactivated value.
    """

    def __init__(self, hparas):
        super().__init__()

        # (filters, kernel_size, strides) of each convolution, in order.
        conv_specs = [(32, 8, 4), (64, 4, 2), (64, 3, 1)]

        trunk = []
        # Convolutional Layers, each followed by a ReLU
        for n_filters, kernel, stride in conv_specs:
            trunk.append(tf.keras.layers.Conv2D(filters=n_filters, kernel_size=kernel, strides=stride))
            trunk.append(tf.keras.layers.ReLU())
        # Embedding Layers
        trunk.append(tf.keras.layers.Flatten())
        trunk.append(tf.keras.layers.Dense(hparas['hidden_size']))
        trunk.append(tf.keras.layers.ReLU())
        self.feature_extractor = tf.keras.Sequential(trunk)

        # Actor Network: one probability per action
        self.actor = tf.keras.layers.Dense(hparas['action_dim'], activation='softmax')
        # Critic Network: scalar state value
        self.critic = tf.keras.layers.Dense(1, activation=None)

    def call(self, input):
        """Return ``(action_probabilities, value)`` for a batch of stacked-frame states."""
        features = self.feature_extractor(input)
        action_probs = self.actor(features)
        state_value = self.critic(features)
        return action_probs, state_value

In [9]:
class Agent():
    """PPO agent holding one actor-critic network and a single Adam optimizer."""

    def __init__(self, hparas):
        self.gamma = hparas['gamma']
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=hparas['lr'])
        self.actor_critic = ActorCriticNetwork(hparas)
        self.clip_pram = hparas['clip_val']

    def ppo_iter(self, mini_batch_size, states, actions, log_probs, returns, advantage):
        """Yield ``batch_size // mini_batch_size`` random mini-batches of the rollout.

        Indices are drawn uniformly with ``np.random.randint``, i.e. with
        replacement; the same indices are applied to every rollout tensor.
        """
        batch_size = states.shape[0]
        rollout = (states, actions, log_probs, returns, advantage)
        num_batches = batch_size // mini_batch_size
        for _ in range(num_batches):
            picked = np.random.randint(0, batch_size, mini_batch_size)
            rand_ids = tf.convert_to_tensor(picked, dtype=tf.int32)
            yield tuple(tf.gather(tensor, rand_ids) for tensor in rollout)

    def ppo_update(self, ppo_epochs, mini_batch_size, states, actions, log_probs, discount_rewards, advantages):
        """Run ``ppo_epochs`` passes of clipped-surrogate PPO updates over the rollout.

        Returns:
            ``(total_actor_loss, total_critic_loss)``: the losses summed over
            every mini-batch of every epoch.
        """
        actor_loss_sum = 0
        critic_loss_sum = 0
        clip_low = 1.0 - self.clip_pram
        clip_high = 1.0 + self.clip_pram

        for _ in range(ppo_epochs):
            batches = self.ppo_iter(mini_batch_size, states, actions, log_probs, discount_rewards, advantages)
            for state, action, old_log_probs, target, advantage in batches:
                # Give the targets a trailing axis so they line up with the critic output.
                target = tf.expand_dims(target, axis=-1)

                with tf.GradientTape() as tape:
                    prob, value = self.actor_critic(state, training=True)
                    policy = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)
                    entropy = tf.math.reduce_mean(policy.entropy())
                    new_log_probs = policy.log_prob(action)

                    # PPO ratio between the new and the old policy
                    ratio = tf.math.exp(new_log_probs - old_log_probs)
                    unclipped = ratio * advantage
                    clipped = tf.clip_by_value(ratio, clip_low, clip_high) * advantage
                    surrogate = tf.math.reduce_mean(tf.math.minimum(unclipped, clipped))

                    # Entropy bonus (weight 0.1) is folded into the actor loss.
                    actor_loss = tf.math.negative(surrogate) - 0.1 * entropy
                    critic_loss = 0.5 * tf.math.reduce_mean(kls.mean_squared_error(target, value))

                    total_loss = actor_loss + critic_loss

                # single optimizer for both heads and the shared trunk
                variables = self.actor_critic.trainable_variables
                grads = tape.gradient(total_loss, variables)
                self.optimizer.apply_gradients(zip(grads, variables))

                actor_loss_sum += actor_loss
                critic_loss_sum += critic_loss

        return actor_loss_sum, critic_loss_sum

In [10]:
# https://arxiv.org/pdf/1506.02438.pdf
# Equation 16
def compute_gae(rewards, masks, values, gamma, LAMBDA):
    """Compute GAE-based return targets for a rollout.

    Args:
        rewards: Per-step rewards, length T.
        masks: Per-step continuation flags (0 where the episode ended), length T.
        values: Value estimates, length T + 1 (the last entry is the bootstrap value).
        gamma: Discount factor.
        LAMBDA: GAE smoothing factor.

    Returns:
        A list of length T whose entry t is ``advantage_t + values[t]``.
    """
    horizon = len(rewards)
    targets = [None] * horizon
    running_advantage = 0
    # Walk backwards through time so each step can reuse the advantage of the next one.
    for t in range(horizon - 1, -1, -1):
        td_error = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        running_advantage = td_error + gamma * LAMBDA * masks[t] * running_advantage
        targets[t] = running_advantage + values[t]

    return targets

## Testing Environment

In [11]:
def test_reward(test_env, agent):
    """Play one full game greedily and return the accumulated reward.

    Args:
        test_env: Game environment; reset at the start and played until
            ``game_over()`` is true.
        agent: Object exposing ``actor_critic``, called on a batched state and
            returning ``(action_probabilities, value)``.

    Returns:
        The sum of the rewards returned by ``test_env.act`` over the episode.
    """
    total_reward = 0
    # Reset the environment
    test_env.reset_game()
    input_frames = [preprocess_screen(test_env.getScreenGrayscale())]

    while not test_env.game_over():

        state = frames_to_state(input_frames)
        state = tf.expand_dims(state, axis=0)
        prob, _ = agent.actor_critic(state)

        # Greedy evaluation: always take the most probable action.
        action = np.argmax(prob[0].numpy())
        reward = test_env.act(test_env.getActionSet()[action])
        total_reward += reward

        input_frames.append(preprocess_screen(test_env.getScreenGrayscale()))
        # frames_to_state only ever reads the last 4 frames, so drop older ones
        # to keep memory bounded during long episodes.
        del input_frames[:-4]

    return total_reward

## Training

In [12]:
# Build the agent and the bookkeeping values used by the training loop.
agent = Agent(hparas)
max_episode = hparas['max_episode']
test_per_n_episode = 10  # evaluate the agent every this many training iterations
force_save_per_n_episode = 1000  # save model, checkpoint and a demo video every this many iterations
early_stop_reward = 10  # training stops once the best test reward reaches this value

start_s = 0  # NOTE(review): not referenced in the visible code
best_reward = -5.0  # best test reward so far; a test must exceed it to trigger a save

# Checkpoint tracking both the network weights and the optimizer state.
checkpoint = tf.train.Checkpoint(
    actor_critic = agent.actor_critic,
    optimizer = agent.optimizer,
)

# Load from old checkpoint
# checkpoint.restore('ckpt_dir/ckpt-?')

In [13]:
# Main PPO training loop: collect a fixed-length rollout, compute GAE return targets,
# update the agent, then periodically evaluate, checkpoint and record a demo video.
ep_reward = []
total_avgr = []
early_stop = False
avg_rewards_list = []  # test average reward, appended at every evaluation

env.reset_game()

for s in range(0, max_episode):
    if early_stop:
        break

    # Rollout buffers for this iteration.
    rewards = []
    states = []
    actions = []
    log_probs = []
    masks = []
    values = []

    display_frames = [env.getScreenRGB()]
    input_frames = [preprocess_screen(env.getScreenGrayscale())]

    for step in range(hparas['num_steps']):

        state = frames_to_state(input_frames)
        state = tf.expand_dims(state, axis=0)
        prob, value = agent.actor_critic(state)

        dist = tfp.distributions.Categorical(probs=prob[0], dtype=tf.float32)
        action = dist.sample(1)
        log_prob = dist.log_prob(action)

        # `action` has shape (1,); index it explicitly, because converting a
        # non-0-d array straight to a Python scalar is deprecated in NumPy.
        reward = env.act(env.getActionSet()[int(action.numpy()[0])])

        done = env.game_over()

        states.append(state)
        actions.append(action)
        values.append(value[0])
        log_probs.append(log_prob)
        rewards.append(tf.convert_to_tensor(reward, dtype=tf.float32))
        # Mask is 0 on terminal steps so that no value is bootstrapped across episodes.
        masks.append(tf.convert_to_tensor(1-int(done), dtype=tf.float32))

        display_frames.append(env.getScreenRGB())
        input_frames.append(preprocess_screen(env.getScreenGrayscale()))

        if done:
            env.reset_game()
            input_frames = [preprocess_screen(env.getScreenGrayscale())]

    # Bootstrap from the observation that FOLLOWS the last collected step.
    # (Re-using `state` here would evaluate the observation from before the
    # final action, making values[-1] a copy of values[-2].)
    next_state = tf.expand_dims(frames_to_state(input_frames), axis=0)
    _, next_value = agent.actor_critic(next_state)
    values.append(next_value[0])

    returns = compute_gae(rewards, masks, values, hparas['gamma'], hparas['lambda'])

    returns = tf.concat(returns, axis=0)
    log_probs = tf.concat(log_probs, axis=0)
    values = tf.concat(values, axis=0)
    states = tf.concat(states, axis=0)
    actions = tf.concat(actions, axis=0)
    # `values` holds one extra bootstrap entry at the end; drop it to align with `returns`.
    advantage = returns - values[:-1]

    a_loss, c_loss = agent.ppo_update(hparas['ppo_epochs'], hparas['mini_batch_size'], states, actions, log_probs, returns, advantage)
    print('[Episode %d]  Actor loss: %.5f, Critic loss: %.5f' % (s, a_loss, c_loss))

    if s % test_per_n_episode == 0:
        # test agent hparas['test_epochs'] times to get the average reward
        avg_reward = np.mean([test_reward(test_env, agent) for _ in range(hparas['test_epochs'])])
        print("Test average reward is %.1f, Current best average reward is %.1f\n" % (avg_reward, best_reward))
        avg_rewards_list.append(avg_reward)

        if avg_reward > best_reward:
            best_reward = avg_reward
            agent.actor_critic.save('./save/Actor/model_actor_{}_{}'.format(s, avg_reward), save_format="tf")
            checkpoint.save(file_prefix = './save/checkpoints/ckpt')

    # `avg_reward` below is always defined because force_save_per_n_episode is a
    # multiple of test_per_n_episode, so an evaluation ran in this same iteration.
    if s % force_save_per_n_episode == 0:
        agent.actor_critic.save('./save/Actor/model_actor_{}_{}'.format(s, avg_reward), save_format="tf")
        checkpoint.save(file_prefix = './save/checkpoints/ckpt')
        clip = make_anim(display_frames, fps=60, true_image=True).rotate(-90)
        clip.write_videofile("movie_f/{}_demo-{}.webm".format('Lab15', s), fps=60)
        display(clip.ipython_display(fps=60, autoplay=1, loop=1, maxduration=120))

    if best_reward >= early_stop_reward:
        early_stop = True

[Episode 0]  Actor loss: 78.67571, Critic loss: 60.07767
Test average reward is -5.0, Current best average reward is -5.0

INFO:tensorflow:Assets written to: ./save/Actor/model_actor_0_-5.0\assets
Moviepy - Building video movie_f/Lab15_demo-0.webm.
Moviepy - Writing video movie_f/Lab15_demo-0.webm



                                                               

Moviepy - Done !
Moviepy - video ready movie_f/Lab15_demo-0.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                               

Moviepy - Done !
Moviepy - video ready __temp__.mp4




[Episode 1]  Actor loss: 48.18402, Critic loss: 35.33224
[Episode 2]  Actor loss: 40.89743, Critic loss: 24.75761
[Episode 3]  Actor loss: 22.85589, Critic loss: 13.12137
[Episode 4]  Actor loss: 18.16325, Critic loss: 7.77806
[Episode 5]  Actor loss: 20.16643, Critic loss: 6.71530
[Episode 6]  Actor loss: 1.91814, Critic loss: 3.23884
[Episode 7]  Actor loss: 4.25463, Critic loss: 4.61538
[Episode 8]  Actor loss: -0.09294, Critic loss: 3.66436
[Episode 9]  Actor loss: -10.12521, Critic loss: 2.54103
[Episode 10]  Actor loss: -6.89476, Critic loss: 2.17800
Test average reward is -5.0, Current best average reward is -5.0

[Episode 11]  Actor loss: -9.71891, Critic loss: 2.66344
[Episode 12]  Actor loss: 0.64489, Critic loss: 2.76453
[Episode 13]  Actor loss: -3.38575, Critic loss: 2.18189
[Episode 14]  Actor loss: -3.92528, Critic loss: 2.11146
[Episode 15]  Actor loss: -3.46103, Critic loss: 2.14489
[Episode 16]  Actor loss: -0.70099, Critic loss: 2.84826
[Episode 17]  Actor loss: -2.7

                                                               

Moviepy - Done !
Moviepy - video ready movie_f/Lab15_demo-1000.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                               

Moviepy - Done !
Moviepy - video ready __temp__.mp4


[Episode 1001]  Actor loss: -0.96469, Critic loss: 1.44172
[Episode 1002]  Actor loss: -14.00193, Critic loss: 1.41611
[Episode 1003]  Actor loss: -2.04839, Critic loss: 1.17102
[Episode 1004]  Actor loss: -6.88600, Critic loss: 1.54063
[Episode 1005]  Actor loss: -1.97290, Critic loss: 1.01043
[Episode 1006]  Actor loss: -2.34662, Critic loss: 0.91127
[Episode 1007]  Actor loss: -7.63708, Critic loss: 0.79355
[Episode 1008]  Actor loss: -23.80127, Critic loss: 3.18299
[Episode 1009]  Actor loss: -4.46175, Critic loss: 1.50225
[Episode 1010]  Actor loss: -1.11366, Critic loss: 1.25641
Test average reward is -5.0, Current best average reward is -4.0

[Episode 1011]  Actor loss: 3.52346, Critic loss: 2.36637
[Episode 1012]  Actor loss: 1.14741, Critic loss: 1.83857
[Episode 1013]  Actor loss: -10.05801, Critic loss: 2.00828
[Episode 1014]  Actor loss: -19.35230, Critic loss: 1.97282
[Episode 1015]  Actor loss: -9.22731, Critic loss: 3.08048
[Episode 1016]  Actor loss: 2.05718, Critic los

                                                               

Moviepy - Done !
Moviepy - video ready movie_f/Lab15_demo-2000.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                               

Moviepy - Done !
Moviepy - video ready __temp__.mp4




[Episode 2001]  Actor loss: -16.79783, Critic loss: 3.66185
[Episode 2002]  Actor loss: -12.37404, Critic loss: 2.55379
[Episode 2003]  Actor loss: -8.17080, Critic loss: 2.16940
[Episode 2004]  Actor loss: -17.60952, Critic loss: 2.61466
[Episode 2005]  Actor loss: -12.11849, Critic loss: 4.20179
[Episode 2006]  Actor loss: -17.79030, Critic loss: 2.14000
[Episode 2007]  Actor loss: -10.17881, Critic loss: 5.88368
[Episode 2008]  Actor loss: -9.77929, Critic loss: 5.07131
[Episode 2009]  Actor loss: -14.23838, Critic loss: 10.07444
[Episode 2010]  Actor loss: 11.69010, Critic loss: 5.54216
Test average reward is -5.0, Current best average reward is 2.0

[Episode 2011]  Actor loss: 11.29829, Critic loss: 7.48804
[Episode 2012]  Actor loss: 7.76369, Critic loss: 2.96006
[Episode 2013]  Actor loss: -21.31937, Critic loss: 3.58374
[Episode 2014]  Actor loss: -14.35164, Critic loss: 5.56582
[Episode 2015]  Actor loss: -4.55442, Critic loss: 3.17529
[Episode 2016]  Actor loss: -8.72077, Cri

# Report

我在第2050個Episode時，Test average reward超過early_stop_reward，也就是10，來到了 26 Test average reward。

在現實環境當中，有非常多的state跟action，要計算並儲存全部的state-action pairs是不現實的。因此需要利用deep reinforcement learning來evaluate value或action。在這個lab主要在實作GAE與PPO，以下會依照我的理解講解:


## GAE
首先是GAE（Generalized Advantage Estimation），GAE是一種用來估計advantage、進而優化reward的方法。使用GAE不只可以看到近期的reward，也可以看到長期的reward，也就是說即使執行action不能立即獲得好的reward，多走幾步之後就可以獲得reward。在 **compute_gae( )** 這個function中:
1. 首先會先定義delta(會考慮discount)
2. 並再用delta去更新GAE
3. 最後再拿更新好的GAE加上value而得到return
4. 最後的return值要reverse，是因為我們是先從最後的time step開始往回計算，因此在return前，要先reverse。

## PPO
PPO是一種優化loss function的方法，根據paper，有些model一旦學習到不好的Policy，它之後就只會產生出不好的action，因此無法幫助Policy進步，PPO試著解決這樣的問題。PPO會讓policy在更新的時候，只走一小步，因此可以穩定訓練過程。在 **ppo_update( )** 中：
1. 首先會計算ratio，也就是exp(new_log_prob - old_log_prob)，即新舊policy對同一個action的機率比值，代表policy改變的多寡
2. 並利用ratio去計算surr1與surr2，再取當中的最小值，去計算**actor loss**
3. 並利用MSE去計算return(由GAE得到的discounted reward)與value的差異，並以此當做**critic loss**
4. 最後再將actor loss與critic loss相加，當作total loss
