#  Implementação de um Agente em Flappy Bird

### Imports

In [44]:
import gym
from gym import spaces
import gym_ple
import numpy as np

## Flappy Bird Sem Visão Computacional

### Ambiente

Modelagem do ambiente do Flappy Bird usando o GameState como observação.

#### Observação:
 * Posição Y do pássaro.
 * Velocidade Y do pássaro.
 * Distância do pássaro até o próximo cano.
 * Posição Y da parte de cima do próximo cano.
 * Posição Y da parte de baixo do próximo cano.
 * Distância do pássaro até o cano depois do próximo cano.
 * Posição Y da parte de cima do cano depois do próximo cano.
 * Posição Y da parte de baixo do cano depois do próximo cano.

In [51]:
class FlappyBirdEnv(gym.Env):
    """Gym environment exposing FlappyBird's game state (not pixels) as the observation.

    Wraps the ``FlappyBird-v0`` environment and replaces its observation with
    the values of ``game_state.getGameState()``, in dict order:

        0. bird Y position
        1. bird Y velocity
        2. horizontal distance to the next pipe
        3. top Y of the next pipe
        4. bottom Y of the next pipe
        5. horizontal distance to the pipe after the next one
        6. top Y of the pipe after the next one
        7. bottom Y of the pipe after the next one

    The reward is reshaped in :meth:`step`, and an episode is cut off after
    ``MAX_PIPES`` positive rewards from the wrapped environment.
    """

    metadata = {'render.modes': ['human', 'rgb_array']}

    # Episode is ended artificially once this many positive rewards were seen.
    MAX_PIPES = 500

    def __init__(self):
        # Number of positive rewards received in the current episode.
        self.n_pipes = 0
        self.env = gym.make("FlappyBird-v0")
        # Actions are passed straight through to the wrapped environment.
        self.action_space = self.env.action_space
        # Bounds for the 8 game-state values listed in the class docstring.
        self.observation_space = spaces.Box(
            low=np.array([0, -10.0, 0, 0, 0, 0, 0, 0], dtype=np.float32),
            high=np.array([512, 10.0, 588.0, 512, 512, 588.0, 512, 512], dtype=np.float32),
            dtype=np.float32)

    def _game_state(self):
        """Return the wrapped env's current game-state values as a float64 vector."""
        return np.array(list(self.env.game_state.getGameState().values()), dtype=np.float64)

    def step(self, action):
        """Advance one step and return ``(observation, shaped_reward, done, info)``.

        Reward shaping:
          * the wrapped reward is replaced by -1 when the episode ends;
          * a constant +0.1 is added on every step;
          * a bonus proportional to how close the bird is (vertically) to the
            centre of the next pipe gap, weighted by horizontal proximity to
            that pipe (zero weight at 300 or more units away).
        """
        # The wrapped env's own observation is discarded in favour of the game state.
        _, reward, done, info = self.env.step(action)
        state = self._game_state()

        if reward > 0:
            self.n_pipes += 1
        if done:
            reward = -1
        reward += 0.1

        gap_centre = (state[3] + state[4]) / 2
        proximity = max((300 - state[2]) / 300, 0)
        reward += (75 - abs(state[0] - gap_centre)) * proximity / 750

        if self.n_pipes >= self.MAX_PIPES:
            done = True

        # Reward is computed in float64; the observation is cast to the
        # dtype declared by observation_space.
        return state.astype(np.float32), reward, done, info

    def reset(self):
        """Reset the wrapped env and the pipe counter; return the initial observation."""
        self.env.reset()
        self.n_pipes = 0
        return self._game_state().astype(np.float32)

    def render(self, mode='human'):
        """Delegate rendering to the wrapped environment."""
        return self.env.render(mode=mode)

    def close(self):
        """Close the wrapped environment."""
        self.env.close()

Checando se o nosso ambiente satisfaz as propriedades do Gym.

In [52]:
from stable_baselines.common.env_checker import check_env

# Build a single instance of the custom environment and run the Stable
# Baselines environment checker on it.
# NOTE: `env` is rebound to a vectorised environment in the next code cell.
env = FlappyBirdEnv()
check_env(env)

Criando um vetor com 8 ambientes (via `make_vec_env`) e normalizando as observações com `VecNormalize`.

In [53]:
import os.path
from stable_baselines.common import set_global_seeds, make_vec_env
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize

# Eight copies of the custom environment behind one vectorised interface.
env = make_vec_env(FlappyBirdEnv, n_envs = 8)

if not os.path.isfile('VecNormalize.pkl'):
    # No saved statistics yet: start a fresh normaliser (observations only)
    # and write it to disk straight away.
    env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.)
    env.save("VecNormalize.pkl")
    print("Normalizing Environment")
else:
    # Reuse the normalisation statistics stored by a previous run.
    env = VecNormalize.load("VecNormalize.pkl", env)
    print ("Importing Environment")

Importing Environment


### Agente

Criando o modelo de PPO2 com a biblioteca Stable Baselines.

In [5]:
from stable_baselines import PPO2
from stable_baselines.common.policies import MlpPolicy

# PPO2 agent with an MLP policy acting on the normalised game-state vector.
model = PPO2(
    MlpPolicy,
    env,
    gamma=0.99,
    lam=0.95,
    n_steps=1024,
    nminibatches=64,
    noptepochs=15,
    learning_rate=0.0001,
    cliprange=0.2,
    ent_coef=0.0,
    verbose=1,
)





Instructions for updating:
Use keras.layers.flatten instead.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




Você tem a opção de usar um modelo pré-treinado:

In [54]:
from stable_baselines import PPO2

# Replace `model` with the weights stored under trained_models/ and attach the
# training environment afterwards (the load itself is done without an env, as
# the recorded output of this cell reports).
model = PPO2.load("trained_models/FrozenEnvPPO2")
model.set_env(env)

Loading a model without an environment, this model cannot be trained until it has a valid environment.


Criando um Callback

In [68]:
from stable_baselines.common.callbacks import EvalCallback

# Separate single-environment vec env used only for periodic evaluation.
EvalEnv = make_vec_env(FlappyBirdEnv, n_envs = 1)

if not os.path.isfile('VecNormalize.pkl'):
    # No stored statistics: wrap with a fresh normaliser.
    EvalNormEnv = VecNormalize(EvalEnv, norm_obs=True, norm_reward=False, clip_obs=10.)
    print("Normalizing Environment")
else:
    # Evaluate with the statistics saved on disk.
    EvalNormEnv = VecNormalize.load("VecNormalize.pkl", EvalEnv)
    print ("Importing Environment")

# Evaluation must not update the running normalisation statistics.
EvalNormEnv.training = False

eval_callback = EvalCallback(
    EvalNormEnv,
    best_model_save_path='./eval_models/',
    log_path='./logs/',
    n_eval_episodes = 10,
    eval_freq=1024,
    deterministic=True,
    render=False,
)

Importing Environment


In [70]:
# TODO: add a callback that calls VecNormalize.save() during training

#### Treinamento do Modelo:

In [69]:
# Before training, rebuild the evaluation wrapper from the statistics currently
# on disk and point the existing callback at it.
if os.path.isfile('VecNormalize.pkl'):
    EvalNormEnv = VecNormalize.load("VecNormalize.pkl", EvalEnv)
    EvalNormEnv.training = False  # do not update statistics while evaluating
    eval_callback.eval_env = EvalNormEnv

model.learn(total_timesteps=16384, callback=eval_callback)
# Longer run, kept for reference:
# model.learn(total_timesteps=65536, callback=eval_callback)
# Persist the training wrapper's state to VecNormalize.pkl once learning finishes.
env.save("VecNormalize.pkl")

Eval num_timesteps=8192, episode_reward=653.89 +/- 496.08
Episode length: 3517.80 +/- 2638.20
New best mean reward!
--------------------------------------
| approxkl           | 0.0012977485  |
| clipfrac           | 0.011995443   |
| ep_len_mean        | 688           |
| ep_reward_mean     | 124           |
| explained_variance | 0.0832        |
| fps                | 30            |
| n_updates          | 1             |
| policy_entropy     | 0.30509627    |
| policy_loss        | -0.0010997339 |
| serial_timesteps   | 1024          |
| time_elapsed       | 0             |
| total_timesteps    | 8192          |
| value_loss         | 2.7651675     |
--------------------------------------
Eval num_timesteps=16384, episode_reward=775.90 +/- 551.30
Episode length: 4187.90 +/- 2950.30
New best mean reward!
--------------------------------------
| approxkl           | 0.0020303985  |
| clipfrac           | 0.023852538   |
| ep_len_mean        | 755           |
| ep_reward_mean     | 136

#### Testando o Modelo: 

In [71]:
# Single-environment vec env for watching the trained agent play.
TestEnv = make_vec_env(FlappyBirdEnv, n_envs = 1)

if not os.path.isfile('VecNormalize.pkl'):
    # No stored statistics: fall back to a fresh normaliser.
    TestNormEnv = VecNormalize(TestEnv, norm_obs=True, norm_reward=False, clip_obs=10.)
else:
    # Use the normalisation statistics saved on disk.
    TestNormEnv = VecNormalize.load("VecNormalize.pkl", TestEnv)

# Keep the statistics frozen while testing.
TestNormEnv.training = False

In [72]:
import itertools

# Play one episode on the normalised test environment, rendering every frame.
obs = TestNormEnv.reset()

t = 0  # index of the step on which the episode ended
while True:
    TestNormEnv.render()
    action, _states = model.predict(obs)
    obs, rewards, dones, info = TestNormEnv.step(action)
    if dones:
        break
    t += 1

print(t)
# print(TestNormEnv.get_attr('n_pipes'))
TestNormEnv.close()

6918


#### Salvando o Modelo:

In [73]:
# Write the current model to the same path the pretrained-model cell loads from.
model.save("trained_models/FrozenEnvPPO2")

#### Gravando um Episódio: (Not Working)

Criando o ambiente gravado.

In [88]:
from stable_baselines.common.vec_env import VecVideoRecorder, DummyVecEnv

video_folder = 'videos/'
video_length = 100

obs = EvalEnv.reset()

# Wrap the evaluation vec env in a recorder whose trigger fires when its
# argument equals 0.
# NOTE(review): the next code cell does not use VideoEnv, so this wrapper
# is never stepped by the notebook as written.
VideoEnv = VecVideoRecorder(EvalEnv, video_folder,
                       record_video_trigger=lambda x: x == 0, video_length=video_length,
                       name_prefix="FlappyBird")

Rodando o episódio.

In [92]:
# imageio is used below but was never imported anywhere in the notebook.
import imageio

images = []

# The policy was trained on VecNormalize'd observations, so the rollout must go
# through the normalised wrapper (as the test cell does); feeding it the raw
# EvalEnv observations gives the model inputs on a different scale.
obs = EvalNormEnv.reset()
img = EvalNormEnv.render(mode='rgb_array')

for t in itertools.count():
    images.append(img)
    EvalNormEnv.render()
    action, _ = model.predict(obs)
    obs, _, dones, _ = EvalNormEnv.step(action)
    img = EvalNormEnv.render(mode='rgb_array')
    if dones:
        break

EvalNormEnv.close()

# NOTE(review): the output file name is unrelated to this project
# and is kept unchanged on purpose.
imageio.mimsave('lander_a2c.gif', images, fps=29)

## Flappy Bird com Visão Computacional

A implementar

In [71]:
ENV_NAME = "FlappyBird-v0"

from stable_baselines.common.vec_env import VecFrameStack
from stable_baselines.common import set_global_seeds, make_vec_env
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize

# Pixel-based variant: 8 copies of the stock environment, with the last 4
# frames stacked into each observation. ENV_NAME is the single source of the
# environment id instead of a repeated string literal.
env_vision = make_vec_env(ENV_NAME, n_envs = 8)
env_vision = VecFrameStack(env_vision, n_stack=4)

In [74]:
from stable_baselines import PPO2
from stable_baselines.common.policies import CnnPolicy

# PPO2 with a CNN policy on the frame-stacked pixel environment.
# NOTE(review): the traceback recorded in this notebook shows an out-of-memory
# error for a [512, 288, 12] float tensor created while building this model's
# observation input — the frames are presumably too large as-is; confirm and
# consider downscaling before training.
modelVision = PPO2(CnnPolicy, env_vision, n_steps = 128, nminibatches = 4, lam = 0.98, gamma = 0.999, noptepochs= 8, ent_coef= 0.01, verbose=1)

In [75]:
# Train the pixel-based agent. The recorded output of this cell is a
# ResourceExhaustedError (OOM), so this step did not complete in the saved run.
modelVision.learn(total_timesteps=10000)

ResourceExhaustedError: OOM when allocating tensor of shape [512,288,12] and type float
	 [[node input/sub/y (defined at D:\ProgramData\Anaconda3\lib\site-packages\stable_baselines\common\input.py:33) ]]

Original stack trace for 'input/sub/y':
  File "D:\ProgramData\Anaconda3\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "D:\ProgramData\Anaconda3\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "D:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "D:\ProgramData\Anaconda3\lib\site-packages\traitlets\config\application.py", line 664, in launch_instance
    app.start()
  File "D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 583, in start
    self.io_loop.start()
  File "D:\ProgramData\Anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 153, in start
    self.asyncio_loop.run_forever()
  File "D:\ProgramData\Anaconda3\lib\asyncio\base_events.py", line 538, in run_forever
    self._run_once()
  File "D:\ProgramData\Anaconda3\lib\asyncio\base_events.py", line 1782, in _run_once
    handle._run()
  File "D:\ProgramData\Anaconda3\lib\asyncio\events.py", line 88, in _run
    self._context.run(self._callback, *self._args)
  File "D:\ProgramData\Anaconda3\lib\site-packages\tornado\ioloop.py", line 690, in <lambda>
    lambda f: self._run_callback(functools.partial(callback, future))
  File "D:\ProgramData\Anaconda3\lib\site-packages\tornado\ioloop.py", line 743, in _run_callback
    ret = callback()
  File "D:\ProgramData\Anaconda3\lib\site-packages\tornado\gen.py", line 787, in inner
    self.run()
  File "D:\ProgramData\Anaconda3\lib\site-packages\tornado\gen.py", line 748, in run
    yielded = self.gen.send(value)
  File "D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 361, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "D:\ProgramData\Anaconda3\lib\site-packages\tornado\gen.py", line 209, in wrapper
    yielded = next(result)
  File "D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 268, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "D:\ProgramData\Anaconda3\lib\site-packages\tornado\gen.py", line 209, in wrapper
    yielded = next(result)
  File "D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 541, in execute_request
    user_expressions, allow_stdin,
  File "D:\ProgramData\Anaconda3\lib\site-packages\tornado\gen.py", line 209, in wrapper
    yielded = next(result)
  File "D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 300, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "D:\ProgramData\Anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 536, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "D:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2858, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "D:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2886, in _run_cell
    return runner(coro)
  File "D:\ProgramData\Anaconda3\lib\site-packages\IPython\core\async_helpers.py", line 68, in _pseudo_sync_runner
    coro.send(None)
  File "D:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3063, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "D:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3254, in run_ast_nodes
    if (await self.run_code(code, result,  async_=asy)):
  File "D:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-74-287b8138bfab>", line 4, in <module>
    modelVision = PPO2(CnnPolicy, env_vision, n_steps = 128, nminibatches = 4, lam = 0.98, gamma = 0.999, noptepochs= 8, ent_coef= 0.01, verbose=1)
  File "D:\ProgramData\Anaconda3\lib\site-packages\stable_baselines\ppo2\ppo2.py", line 96, in __init__
    self.setup_model()
  File "D:\ProgramData\Anaconda3\lib\site-packages\stable_baselines\ppo2\ppo2.py", line 130, in setup_model
    n_batch_step, reuse=False, **self.policy_kwargs)
  File "D:\ProgramData\Anaconda3\lib\site-packages\stable_baselines\common\policies.py", line 602, in __init__
    feature_extraction="cnn", **_kwargs)
  File "D:\ProgramData\Anaconda3\lib\site-packages\stable_baselines\common\policies.py", line 541, in __init__
    scale=(feature_extraction == "cnn"))
  File "D:\ProgramData\Anaconda3\lib\site-packages\stable_baselines\common\policies.py", line 222, in __init__
    scale=scale)
  File "D:\ProgramData\Anaconda3\lib\site-packages\stable_baselines\common\policies.py", line 118, in __init__
    self._obs_ph, self._processed_obs = observation_input(ob_space, n_batch, scale=scale)
  File "D:\ProgramData\Anaconda3\lib\site-packages\stable_baselines\common\input.py", line 33, in observation_input
    processed_observations = ((processed_observations - ob_space.low) / (ob_space.high - ob_space.low))
  File "D:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\math_ops.py", line 888, in binary_op_wrapper
    y, dtype_hint=x.dtype.base_dtype, name="y")
  File "D:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 1145, in convert_to_tensor_v2
    as_ref=False)
  File "D:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 1209, in internal_convert_to_tensor
    value, dtype=preferred_dtype, name=name, as_ref=as_ref)
  File "D:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\constant_op.py", line 305, in _constant_tensor_conversion_function
    return constant(v, dtype=dtype, name=name)
  File "D:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\constant_op.py", line 246, in constant
    allow_broadcast=True)
  File "D:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\constant_op.py", line 290, in _constant_impl
    name=name).outputs[0]
  File "D:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\util\deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "D:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 3616, in create_op
    op_def=op_def)
  File "D:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 2005, in __init__
    self._traceback = tf_stack.extract_stack()


In [None]:
# Roll out the pixel-based agent until any of the parallel environments
# finishes an episode.
obs = env_vision.reset()
dones = 0
# env_vision holds 8 environments, so `dones` becomes an array after the first
# step; `not dones` on a multi-element array raises ValueError, hence np.any.
while not np.any(dones):
    action, _states = modelVision.predict(obs)
    obs, rewards, dones, info = env_vision.step(action)
    # Render the environment being stepped (previously rendered `env`,
    # the unrelated state-based training environment).
    env_vision.render()

env_vision.close()