In [1]:
import numpy as np
import matplotlib.pyplot as plt
import moviepy.editor as mpy
import skimage.transform
from IPython.display import Image, display

import tensorflow as tf
import tensorflow_probability as tfp
import tensorflow.keras.losses as kls

In [2]:
# cd PyGame-Learning-Environment

In [3]:
# pip install tensorflow_probability==0.12.2

In [4]:
gpus = tf.config.list_physical_devices("GPU") 
if gpus:
    try:
        # Restrict TensorFlow to only use the fourth GPU
        tf.config.set_visible_devices(gpus[0], 'GPU')

        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


In [5]:
import os
os.environ["SDL_VIDEODRIVER"] = "dummy"  # this line make pop-out window not appear
from ple.games.flappybird import FlappyBird
from ple import PLE

game = FlappyBird()
env = PLE(game, fps=30, display_screen=False)  # environment interface to game
env.reset_game()

test_game = FlappyBird()
test_env = PLE(test_game, fps=30, display_screen=False)
test_env.reset_game()

couldn't import doomish
Couldn't import doom


In [6]:
path = './movie_f' 
if not os.path.exists(path):
    os.makedirs(path)

In [7]:
hparas = {
    'image_size': 84,
    'num_stack': 4,
    'action_dim': len(env.getActionSet()),
    'hidden_size': 256,
    'lr': 0.0001,
    'gamma': 0.99,
    'lambda': 0.95,
    'clip_val': 0.2,
    'ppo_epochs': 8,
    'test_epochs': 1,
    'num_steps': 512,
    'mini_batch_size': 64,
    'target_reward': 200,
    'max_episode': 30000,
}

In [8]:
# Please do not modify this method
def make_anim(images, fps=60, true_image=False):
    duration = len(images) / fps

    def make_frame(t):
        try:
            x = images[int(len(images) / duration * t)]
        except:
            x = images[-1]

        if true_image:
            return x.astype(np.uint8)
        else:
            return ((x + 1) / 2 * 255).astype(np.uint8)

    clip = mpy.VideoClip(make_frame, duration=duration)
    clip.fps = fps
    
    return clip

In [9]:
def preprocess_screen(screen):
    screen = skimage.transform.rotate(screen, -90, resize=True)
    screen = screen[:400, :]
    screen = skimage.transform.resize(screen, [hparas['image_size'], hparas['image_size'], 1])
    return screen.astype(np.float32)

def frames_to_state(input_frames):
    if(len(input_frames) == 1):
        state = np.concatenate(input_frames*4, axis=-1)
    elif(len(input_frames) == 2):
        state = np.concatenate(input_frames[0:1]*2 + input_frames[1:]*2, axis=-1)
    elif(len(input_frames) == 3):
        state = np.concatenate(input_frames + input_frames[2:], axis=-1)
    else:
        state = np.concatenate(input_frames[-4:], axis=-1)

    return state

In [10]:
class ActorCriticNetwork(tf.keras.Model):
    def __init__(self, hparas):
        super().__init__()

        self.feature_extractor = tf.keras.Sequential([
          # Convolutional Layers
          tf.keras.layers.Conv2D(filters=32, kernel_size=8, strides=4),
          tf.keras.layers.ReLU(),
          tf.keras.layers.Conv2D(filters=64, kernel_size=4, strides=2),
          tf.keras.layers.ReLU(),
          tf.keras.layers.Conv2D(filters=64, kernel_size=3, strides=1),
          tf.keras.layers.ReLU(),
          # Embedding Layers
          tf.keras.layers.Flatten(),
          tf.keras.layers.Dense(hparas['hidden_size']),
          tf.keras.layers.ReLU(),
        ])

        # Actor Network
        self.actor = tf.keras.layers.Dense(hparas['action_dim'], activation='softmax')
        # Critic Network
        self.critic = tf.keras.layers.Dense(1, activation = None)

    def call(self, input):
        x = self.feature_extractor(input)
        action_logits = self.actor(x)
        value = self.critic(x)
        return action_logits, value

In [11]:
class Agent():
    def __init__(self, hparas):
        self.gamma = hparas['gamma']
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=hparas['lr'])
        self.actor_critic = ActorCriticNetwork(hparas)
        self.clip_pram = hparas['clip_val']
    
    def ppo_iter(self, mini_batch_size, states, actions, log_probs, returns, advantage):
        batch_size = states.shape[0]
        for _ in range(batch_size // mini_batch_size):
            rand_ids = tf.convert_to_tensor(np.random.randint(0, batch_size, mini_batch_size), dtype=tf.int32)
            yield tf.gather(states, rand_ids), tf.gather(actions, rand_ids), tf.gather(log_probs, rand_ids), \
             tf.gather(returns, rand_ids), tf.gather(advantage, rand_ids)
    
    def ppo_update(self, ppo_epochs, mini_batch_size, states, actions, log_probs, discount_rewards, advantages):       
        total_actor_loss = 0
        total_critic_loss = 0
        for _ in range(ppo_epochs):
            for state, action, old_log_probs, reward, advantage in self.ppo_iter(mini_batch_size, states, actions, log_probs, discount_rewards, advantages):
                reward = tf.expand_dims(reward, axis=-1)

                with tf.GradientTape() as tape:
                    prob, value = self.actor_critic(state, training=True)
                    dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)
                    entropy = tf.math.reduce_mean(dist.entropy())
                    new_log_probs = dist.log_prob(action)

                    # PPO ratio
                    ratio = tf.math.exp(new_log_probs - old_log_probs)
                    surr1 = ratio * advantage
                    surr2 = tf.clip_by_value(ratio, 1.0 - self.clip_pram, 1.0 + self.clip_pram) * advantage

                    actor_loss = tf.math.negative(tf.math.reduce_mean(tf.math.minimum(surr1, surr2))) - 0.1 * entropy
                    critic_loss = 0.5 * tf.math.reduce_mean(kls.mean_squared_error(reward, value))

                    total_loss = actor_loss + critic_loss
            
                # single optimizer
                grads = tape.gradient(total_loss, self.actor_critic.trainable_variables)
                self.optimizer.apply_gradients(zip(grads, self.actor_critic.trainable_variables))
      
                total_actor_loss += actor_loss
                total_critic_loss += critic_loss
        return total_actor_loss, total_critic_loss

In [12]:
# https://arxiv.org/pdf/1506.02438.pdf
# Equation 16
def compute_gae(rewards, masks, values, gamma, LAMBDA):
    gae = 0
    returns = []
    for i in reversed(range(len(rewards))):
        delta = rewards[i] + gamma * values[i + 1] * masks[i] - values[i]
        gae = delta + gamma * LAMBDA * masks[i] * gae
        returns.append(gae + values[i])

    returns.reverse()
    return returns

## Testing Environment

In [13]:
def test_reward(test_env, agent):
    total_reward = 0
    # Reset the environment
    test_env.reset_game()
    input_frames = [preprocess_screen(test_env.getScreenGrayscale())]

    while not test_env.game_over():

        state = frames_to_state(input_frames)
        state = tf.expand_dims(state, axis=0)
        prob, value = agent.actor_critic(state)

        action = np.argmax(prob[0].numpy())
        reward = test_env.act(test_env.getActionSet()[action])
        total_reward += reward

        input_frames.append(preprocess_screen(test_env.getScreenGrayscale()))

    return total_reward

## Training

In [14]:
agent = Agent(hparas)
max_episode = hparas['max_episode']
test_per_n_episode = 10
force_save_per_n_episode = 1000
early_stop_reward = 10

start_s = 0
best_reward = -5.0

checkpoint = tf.train.Checkpoint(
    actor_critic = agent.actor_critic,
    optimizer = agent.optimizer,
)

# Load from old checkpoint
# checkpoint.restore('ckpt_dir/ckpt-?')

In [15]:
ep_reward = []
total_avgr = []
early_stop = False
avg_rewards_list = []

env.reset_game()

for s in range(0, max_episode):
    if early_stop == True:
        break

    rewards = []
    states = []
    actions = []
    log_probs = []
    masks = []
    values = []

    display_frames = [env.getScreenRGB()]
    input_frames = [preprocess_screen(env.getScreenGrayscale())]

    for step in range(hparas['num_steps']):

        state = frames_to_state(input_frames)
        state = tf.expand_dims(state, axis=0)
        prob, value = agent.actor_critic(state)

        dist = tfp.distributions.Categorical(probs=prob[0], dtype=tf.float32)
        action = dist.sample(1)
        log_prob = dist.log_prob(action)

        reward = env.act(env.getActionSet()[int(action.numpy())])

        done = env.game_over()

        states.append(state)
        actions.append(action)
        values.append(value[0])
        log_probs.append(log_prob)
        rewards.append(tf.convert_to_tensor(reward, dtype=tf.float32))
        masks.append(tf.convert_to_tensor(1-int(done), dtype=tf.float32))

        display_frames.append(env.getScreenRGB())
        input_frames.append(preprocess_screen(env.getScreenGrayscale()))

        if done:
            env.reset_game()
            input_frames = [preprocess_screen(env.getScreenGrayscale())]
  
    _, next_value = agent.actor_critic(state)
    values.append(next_value[0])

    returns = compute_gae(rewards, masks, values, hparas['gamma'], hparas['lambda'])

    returns = tf.concat(returns, axis=0)
    log_probs = tf.concat(log_probs, axis=0)
    values = tf.concat(values, axis=0)
    states = tf.concat(states, axis=0)
    actions = tf.concat(actions, axis=0)
    advantage = returns - values[:-1]

    a_loss, c_loss = agent.ppo_update(hparas['ppo_epochs'], hparas['mini_batch_size'], states, actions, log_probs, returns, advantage)
    print('[Episode %d]  Actor loss: %.5f, Critic loss: %.5f' % (s, a_loss, c_loss))

    if s % test_per_n_episode == 0:
        # test agent hparas['test_epochs'] times to get the average reward
        avg_reward = np.mean([test_reward(test_env, agent) for _ in range(hparas['test_epochs'])])
        print("Test average reward is %.1f, Current best average reward is %.1f\n" % (avg_reward, best_reward))
        avg_rewards_list.append(avg_reward)

        if avg_reward > best_reward:
            best_reward = avg_reward
            agent.actor_critic.save('./save/Actor/model_actor_{}_{}'.format(s, avg_reward), save_format="tf")
            checkpoint.save(file_prefix = './save/checkpoints/ckpt')

    if s % force_save_per_n_episode == 0:
        agent.actor_critic.save('./save/Actor/model_actor_{}_{}'.format(s, avg_reward), save_format="tf")
        checkpoint.save(file_prefix = './save/checkpoints/ckpt')
        clip = make_anim(display_frames, fps=60, true_image=True).rotate(-90)
        clip.write_videofile("movie_f/{}_demo-{}.webm".format('Lab15', s), fps=60)
        display(clip.ipython_display(fps=60, autoplay=1, loop=1, maxduration=120))

    if best_reward >= early_stop_reward:
        early_stop = True

[Episode 0]  Actor loss: 82.66204, Critic loss: 57.21462
Test average reward is -5.0, Current best average reward is -5.0





INFO:tensorflow:Assets written to: ./save/Actor/model_actor_0_-5.0\assets


INFO:tensorflow:Assets written to: ./save/Actor/model_actor_0_-5.0\assets


Moviepy - Building video movie_f/Lab15_demo-0.webm.
Moviepy - Writing video movie_f/Lab15_demo-0.webm



                                                                                                                       

Moviepy - Done !
Moviepy - video ready movie_f/Lab15_demo-0.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                                                                                       

Moviepy - Done !
Moviepy - video ready __temp__.mp4


[Episode 1]  Actor loss: 33.04207, Critic loss: 27.90795
[Episode 2]  Actor loss: 24.99636, Critic loss: 20.49699
[Episode 3]  Actor loss: 11.08672, Critic loss: 8.11304
[Episode 4]  Actor loss: 12.08294, Critic loss: 7.23562
[Episode 5]  Actor loss: 5.87113, Critic loss: 5.43597
[Episode 6]  Actor loss: -2.42483, Critic loss: 3.21686
[Episode 7]  Actor loss: -1.10203, Critic loss: 4.15109
[Episode 8]  Actor loss: -0.89661, Critic loss: 3.66200
[Episode 9]  Actor loss: -3.09770, Critic loss: 3.32815
[Episode 10]  Actor loss: -5.76861, Critic loss: 2.11939
Test average reward is -5.0, Current best average reward is -5.0

[Episode 11]  Actor loss: -3.07419, Critic loss: 2.74555
[Episode 12]  Actor loss: -6.54952, Critic loss: 2.27168
[Episode 13]  Actor loss: -2.81601, Critic loss: 1.79866
[Episode 14]  Actor loss: -10.88801, Critic loss: 2.49479
[Episode 15]  Actor loss: -11.01849, Critic loss: 1.84192
[Episode 16]  Actor loss: -13.62716, Critic loss: 1.62625
[Episode 17]  Actor loss: -



INFO:tensorflow:Assets written to: ./save/Actor/model_actor_90_-4.0\assets


INFO:tensorflow:Assets written to: ./save/Actor/model_actor_90_-4.0\assets


[Episode 91]  Actor loss: -2.24275, Critic loss: 0.79165
[Episode 92]  Actor loss: -11.00323, Critic loss: 0.63464
[Episode 93]  Actor loss: -2.54622, Critic loss: 0.74062
[Episode 94]  Actor loss: -6.84256, Critic loss: 0.39983
[Episode 95]  Actor loss: -8.39817, Critic loss: 1.04214
[Episode 96]  Actor loss: -4.42707, Critic loss: 1.17817
[Episode 97]  Actor loss: -4.01366, Critic loss: 0.76096
[Episode 98]  Actor loss: -1.51171, Critic loss: 0.69540
[Episode 99]  Actor loss: -7.56101, Critic loss: 0.45522
[Episode 100]  Actor loss: -3.60336, Critic loss: 0.50906
Test average reward is -5.0, Current best average reward is -4.0

[Episode 101]  Actor loss: -3.93672, Critic loss: 0.89570
[Episode 102]  Actor loss: 0.08583, Critic loss: 0.49431
[Episode 103]  Actor loss: -6.80880, Critic loss: 0.29096
[Episode 104]  Actor loss: -5.24069, Critic loss: 0.61687
[Episode 105]  Actor loss: 0.72490, Critic loss: 0.49424
[Episode 106]  Actor loss: -4.67957, Critic loss: 0.44128
[Episode 107]  A



INFO:tensorflow:Assets written to: ./save/Actor/model_actor_1000_-5.0\assets


INFO:tensorflow:Assets written to: ./save/Actor/model_actor_1000_-5.0\assets


Moviepy - Building video movie_f/Lab15_demo-1000.webm.
Moviepy - Writing video movie_f/Lab15_demo-1000.webm



                                                                                                                       

Moviepy - Done !
Moviepy - video ready movie_f/Lab15_demo-1000.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                                                                                       

Moviepy - Done !
Moviepy - video ready __temp__.mp4


[Episode 1001]  Actor loss: -4.51606, Critic loss: 2.08424
[Episode 1002]  Actor loss: -12.91636, Critic loss: 2.61166
[Episode 1003]  Actor loss: -3.35236, Critic loss: 1.15195
[Episode 1004]  Actor loss: -6.30238, Critic loss: 2.34173
[Episode 1005]  Actor loss: -3.97713, Critic loss: 1.43814
[Episode 1006]  Actor loss: -14.22979, Critic loss: 2.84977
[Episode 1007]  Actor loss: -8.05304, Critic loss: 1.65933
[Episode 1008]  Actor loss: -2.84086, Critic loss: 1.38348
[Episode 1009]  Actor loss: 1.34266, Critic loss: 3.03719
[Episode 1010]  Actor loss: -4.42271, Critic loss: 1.35069
Test average reward is -5.0, Current best average reward is -4.0

[Episode 1011]  Actor loss: -14.01669, Critic loss: 2.20580
[Episode 1012]  Actor loss: -3.09845, Critic loss: 1.85434
[Episode 1013]  Actor loss: -4.78894, Critic loss: 2.02506
[Episode 1014]  Actor loss: -12.11761, Critic loss: 2.33880
[Episode 1015]  Actor loss: 1.24621, Critic loss: 2.82857
[Episode 1016]  Actor loss: -3.13070, Critic lo



INFO:tensorflow:Assets written to: ./save/Actor/model_actor_1810_-1.0\assets


INFO:tensorflow:Assets written to: ./save/Actor/model_actor_1810_-1.0\assets


[Episode 1811]  Actor loss: -14.94451, Critic loss: 2.30417
[Episode 1812]  Actor loss: -1.22264, Critic loss: 1.53401
[Episode 1813]  Actor loss: -10.09165, Critic loss: 1.89091
[Episode 1814]  Actor loss: -23.67672, Critic loss: 3.10034
[Episode 1815]  Actor loss: -13.43042, Critic loss: 3.37354
[Episode 1816]  Actor loss: -1.66389, Critic loss: 1.10940
[Episode 1817]  Actor loss: -2.33308, Critic loss: 3.83372
[Episode 1818]  Actor loss: -1.34885, Critic loss: 3.05367
[Episode 1819]  Actor loss: 0.20951, Critic loss: 3.82538
[Episode 1820]  Actor loss: 4.21705, Critic loss: 2.65465
Test average reward is -5.0, Current best average reward is -1.0

[Episode 1821]  Actor loss: -2.25416, Critic loss: 3.13530
[Episode 1822]  Actor loss: -30.65239, Critic loss: 3.50908
[Episode 1823]  Actor loss: -7.99861, Critic loss: 3.02884
[Episode 1824]  Actor loss: -15.69331, Critic loss: 4.80413
[Episode 1825]  Actor loss: -1.14354, Critic loss: 1.21361
[Episode 1826]  Actor loss: 1.61736, Critic l



INFO:tensorflow:Assets written to: ./save/Actor/model_actor_2000_-5.0\assets


INFO:tensorflow:Assets written to: ./save/Actor/model_actor_2000_-5.0\assets


Moviepy - Building video movie_f/Lab15_demo-2000.webm.
Moviepy - Writing video movie_f/Lab15_demo-2000.webm



                                                                                                                       

Moviepy - Done !
Moviepy - video ready movie_f/Lab15_demo-2000.webm
Moviepy - Building video __temp__.mp4.
Moviepy - Writing video __temp__.mp4



                                                                                                                       

Moviepy - Done !
Moviepy - video ready __temp__.mp4


[Episode 2001]  Actor loss: -3.61285, Critic loss: 3.09800
[Episode 2002]  Actor loss: -8.77566, Critic loss: 1.89493
[Episode 2003]  Actor loss: -16.03132, Critic loss: 8.06832
[Episode 2004]  Actor loss: 10.93362, Critic loss: 5.88829
[Episode 2005]  Actor loss: -2.15576, Critic loss: 4.22245
[Episode 2006]  Actor loss: 9.42213, Critic loss: 3.60194
[Episode 2007]  Actor loss: 7.32265, Critic loss: 2.91885
[Episode 2008]  Actor loss: -3.04329, Critic loss: 1.59919
[Episode 2009]  Actor loss: 1.91267, Critic loss: 1.10542
[Episode 2010]  Actor loss: -16.21532, Critic loss: 2.52642
Test average reward is -5.0, Current best average reward is -1.0

[Episode 2011]  Actor loss: -5.43506, Critic loss: 3.57966
[Episode 2012]  Actor loss: -31.78074, Critic loss: 2.33755
[Episode 2013]  Actor loss: -22.13005, Critic loss: 3.34902
[Episode 2014]  Actor loss: -26.48980, Critic loss: 2.96099
[Episode 2015]  Actor loss: -14.93004, Critic loss: 5.82400
[Episode 2016]  Actor loss: 13.48191, Critic l



INFO:tensorflow:Assets written to: ./save/Actor/model_actor_2240_3.0\assets


INFO:tensorflow:Assets written to: ./save/Actor/model_actor_2240_3.0\assets


[Episode 2241]  Actor loss: -8.63216, Critic loss: 5.18127
[Episode 2242]  Actor loss: -9.47008, Critic loss: 3.10097
[Episode 2243]  Actor loss: -8.64857, Critic loss: 3.26968
[Episode 2244]  Actor loss: -12.87561, Critic loss: 3.70579
[Episode 2245]  Actor loss: -13.70187, Critic loss: 4.07873
[Episode 2246]  Actor loss: -9.61087, Critic loss: 4.76933
[Episode 2247]  Actor loss: -3.38351, Critic loss: 3.41667
[Episode 2248]  Actor loss: -6.51401, Critic loss: 3.36907
[Episode 2249]  Actor loss: -4.45511, Critic loss: 2.05866
[Episode 2250]  Actor loss: -11.23508, Critic loss: 5.17214
Test average reward is -5.0, Current best average reward is 3.0

[Episode 2251]  Actor loss: -12.30083, Critic loss: 4.24265
[Episode 2252]  Actor loss: 13.73034, Critic loss: 3.70525
[Episode 2253]  Actor loss: -16.17175, Critic loss: 3.31060
[Episode 2254]  Actor loss: 3.09770, Critic loss: 2.42356
[Episode 2255]  Actor loss: -9.79830, Critic loss: 2.02296
[Episode 2256]  Actor loss: -17.44547, Critic 



INFO:tensorflow:Assets written to: ./save/Actor/model_actor_2320_8.0\assets


INFO:tensorflow:Assets written to: ./save/Actor/model_actor_2320_8.0\assets


[Episode 2321]  Actor loss: -8.76221, Critic loss: 5.62396
[Episode 2322]  Actor loss: -0.48429, Critic loss: 9.03010
[Episode 2323]  Actor loss: -14.51315, Critic loss: 4.61566
[Episode 2324]  Actor loss: -12.44077, Critic loss: 5.86300
[Episode 2325]  Actor loss: 1.84738, Critic loss: 4.35277
[Episode 2326]  Actor loss: 9.88568, Critic loss: 3.81443
[Episode 2327]  Actor loss: -24.22935, Critic loss: 4.52216
[Episode 2328]  Actor loss: 8.10558, Critic loss: 2.90058
[Episode 2329]  Actor loss: 4.52024, Critic loss: 1.81345
[Episode 2330]  Actor loss: -5.73290, Critic loss: 4.16394
Test average reward is -5.0, Current best average reward is 8.0

[Episode 2331]  Actor loss: -7.54423, Critic loss: 4.05846
[Episode 2332]  Actor loss: -19.92382, Critic loss: 5.53893
[Episode 2333]  Actor loss: -18.28278, Critic loss: 4.43546
[Episode 2334]  Actor loss: -26.43729, Critic loss: 4.81393
[Episode 2335]  Actor loss: -27.62400, Critic loss: 3.61319
[Episode 2336]  Actor loss: -13.04067, Critic l



INFO:tensorflow:Assets written to: ./save/Actor/model_actor_2740_11.0\assets


INFO:tensorflow:Assets written to: ./save/Actor/model_actor_2740_11.0\assets


## Brief report

- 最後跑了 2740 個 episodes，可以達到11.0的Test average reward和8.0的Current best average reward，最終有達到通過1pipe的要求。
- 從影片可以觀察出 agent 都是在最後快撞到 pipe 時才移動
- 相較於上禮拜的lab(Q-learning、SARSA)，這次的lab training成效較差，其中一個原因可能是因為利用raw frames作為input。