#  Implementação de um Agente em Flappy Bird

### Imports

In [2]:
import gym
from gym import spaces
import gym_ple
import numpy as np

pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html
couldn't import doomish
Couldn't import doom


## Flappy Bird Sem Visão Computacional

### Ambiente

Modelagem do ambiente do Flappy Bird usando o GameState como observação.

#### Observação:
 * Posição Y do pássaro.
 * Velocidade Y do pássaro.
 * Distância do pássaro até o próximo cano.
 * Posição Y da parte de cima do próximo cano.
 * Posição Y da parte de baixo do próximo cano.
 * Distância do pássaro até o cano depois do próximo cano.
 * Posição Y da parte de cima do cano depois do próximo cano.
 * Posição Y da parte de baixo do cano depois do próximo cano.

In [3]:
class FlappyBirdEnv(gym.Env):
  metadata = {'render.modes': ['human', 'rgb_array']}

  def __init__(self):
    self.n_pipes = 0
    self.env = gym.make("FlappyBird-v0")
    # Define action and observation space
    # They must be gym.spaces objects
    # Example when using discrete actions:
    self.action_space = self.env.action_space
    # Example for using image as input:
    self.observation_space = spaces.Box(low=np.array([0, -10.0, 0, 0, 0, 0, 0, 0]),
                                        high=np.array([512, 10.0, 588.0, 512, 512, 588.0, 512, 512]),
                                        dtype=np.float32)

  def step(self, action):
    observation, reward, done, info = self.env.step(action)
    observation = np.array(list(self.env.game_state.getGameState().values()))
    if reward > 0:
        self.n_pipes += reward
    if done:
        reward = -1
    reward += 0.1
    reward += (75 - abs(observation[0] - (observation[3] + observation[4])/2))*max((300 - observation[2])/300, 0)/750
    return observation, reward, done, info

  def reset(self):
    self.env.reset()
    self.n_pipes = 0
    observation = np.array(list(self.env.game_state.getGameState().values()))
    return observation  # reward, done, info can't be included

  def render(self, mode='human'):
    self.env.render(mode='human')
    
  def close (self):
    self.env.close()

Checando se o nosso ambiente satisfaz as propriedades do Gym.

In [4]:
from stable_baselines.common.env_checker import check_env

env = FlappyBirdEnv()
check_env(env)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Criando um vetor com 16 ambientes para multiprocessamento.

In [5]:
import gym

from stable_baselines.common import set_global_seeds, make_vec_env
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize

env = make_vec_env(FlappyBirdEnv, n_envs = 32)
env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.)

### Agente

Criando o modelo de PPO2 com a biblioteca Stable Baselines.

In [6]:
from stable_baselines.common.policies import MlpPolicy
from stable_baselines import PPO2

model = PPO2(MlpPolicy, env, n_steps = 1024, learning_rate=0.0001, nminibatches = 64, lam = 0.95, gamma = 0.99, noptepochs= 15, ent_coef= 0.0, cliprange= 0.2, verbose=1)





Instructions for updating:
Use keras.layers.flatten instead.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




Você tem a opção de usar um modelo pré-treinado:

In [12]:
from stable_baselines import PPO2

model = PPO2.load("trained_models/PPO2_1280")
model.set_env(env)

Loading a model without an environment, this model cannot be trained until it has a valid environment.


Criando um Callback

In [31]:
from stable_baselines.common.callbacks import EvalCallback

EvalEnv = make_vec_env(FlappyBirdEnv, n_envs = 1)
EvalEnv = VecNormalize(EvalEnv, norm_obs=True, norm_reward=False, clip_obs=10.)

eval_callback = EvalCallback(EvalEnv, best_model_save_path='./eval_models/',
                             log_path='./logs/', n_eval_episodes = 15,
                             eval_freq=1024,deterministic=True, render=False)

#### Treinamento do Modelo:

In [33]:
# model.learn(total_timesteps=524288, callback=eval_callback)
model.learn(total_timesteps=131072, callback=eval_callback)

Eval num_timesteps=5920, episode_reward=720.11 +/- 578.65
Episode length: 3833.87 +/- 3054.72
New best mean reward!
--------------------------------------
| approxkl           | 0.0027168556  |
| clipfrac           | 0.026800537   |
| ep_len_mean        | 904           |
| ep_reward_mean     | 162           |
| explained_variance | -0.578        |
| fps                | 63            |
| n_updates          | 1             |
| policy_entropy     | 0.34100726    |
| policy_loss        | -0.0019917767 |
| serial_timesteps   | 1024          |
| time_elapsed       | 0             |
| total_timesteps    | 32768         |
| value_loss         | 2.4974778     |
--------------------------------------
Eval num_timesteps=38688, episode_reward=341.18 +/- 288.56
Episode length: 1829.67 +/- 1518.14
-------------------------------------
| approxkl           | 0.0021577924 |
| clipfrac           | 0.021164957  |
| ep_len_mean        | 987          |
| ep_reward_mean     | 178          |
| explained_va

<stable_baselines.ppo2.ppo2.PPO2 at 0x1f4860c4dc8>

#### Testando o Modelo: 

In [24]:
TestEnv = FlappyBirdEnv()

TestEnv = make_vec_env(FlappyBirdEnv, n_envs = 1)
TestEnv = VecNormalize(TestEnv, norm_obs=True, norm_reward=False, clip_obs=10.)

In [30]:
import itertools

obs = TestEnv.reset()
dones = 0
for t in itertools.count():
    TestEnv.render()
    action, _states = model.predict(obs)
    obs, rewards, dones, info = TestEnv.step(action)
    if dones:
        break
    
print(t)
# print(TestEnv.n_pipes)
TestEnv.close()

61


#### Salvando o Modelo:

In [60]:
model.save("trained_models/New_PPO2_")

#### Gravando um Episódio: (Not Working)

Criando o ambiente gravado.

In [11]:
from stable_baselines.common.vec_env import VecVideoRecorder, DummyVecEnv

video_folder = 'videos/'
video_length = 100

VideoEnv = DummyVecEnv([lambda: FlappyBirdEnv()])

obs = VideoEnv.reset()

VideoEnv = VecVideoRecorder(VideoEnv, video_folder,
                       record_video_trigger=lambda x: x == 0, video_length=video_length,
                       name_prefix="FlappyBird")

Rodando o episódio.

In [13]:
obs = VideoEnv.reset()

for _ in range(video_length + 1):
    VideoEnv.render()
    action, _ = model.predict(obs)
    obs, _, _, _ = VideoEnv.step(action)

VideoEnv.close()




Saving video to  D:\Codigos\RL\FlappyBirdPPO2\videos\FlappyBird-step-101-to-step-201.mp4


## Flappy Bird com Visão Computacional

A implementar

In [11]:
ENV_NAME = "FlappyBird-v0"

import gym

from stable_baselines.common.policies import CnnLstmPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

env_vision = gym.make(ENV_NAME)
# Optional: PPO2 requires a vectorized environment to run
# the env is now wrapped automatically when passing it to the constructor
env_vision = DummyVecEnv([lambda: env_vision])

In [None]:
modelVision = PPO2(CnnLstmPolicy, env_vision, n_steps = 512, nminibatches = 1, lam = 0.98, gamma = 0.999, noptepochs= 15, ent_coef= 0.01, verbose=1)

In [None]:
modelVision.learn(total_timesteps=10000)

In [None]:
obs = env_vision.reset()
dones = 0
while not dones:
    action, _states = modelVision.predict(obs)
    obs, rewards, dones, info = env_vision.step(action)
    env.render()

env_vision.close()