# Dependencies

In [None]:
# Ray is held below 2.0 because this notebook imports the legacy `ray.rllib.agents` API,
# which Ray 2.x deprecates and later removes. `%pip` installs into the running kernel's environment.
%pip install 'ray[rllib]<2' torch

In [None]:
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install "gym[atari,accept-rom-license]==0.21.0"

# Imports

In [None]:
import numpy as np
import random
import matplotlib.pyplot as plt
from skimage.transform import resize

import ray
import gym
import ray.rllib.agents.ppo as ppo
from ray.rllib.agents.impala import impala
from ray.tune.logger import pretty_print

# Start the Ray runtime. ignore_reinit_error=True lets this cell be re-run
# when Ray has already been initialized instead of raising an error.
ray.init(ignore_reinit_error=True)


2022-04-13 14:17:26,730	INFO worker.py:879 -- Calling ray.init() again after it has already been called.


In [None]:
from ray.rllib.env.env_context import EnvContext
from ray.rllib.models import ModelCatalog



In [None]:
# convert to gray scale
def convert_to_gray(img):
    """Reduce the last axis of ``img`` to one value per pixel.

    The last axis is combined by a dot product with the fixed weights
    (0.2989, 0.5870, 0.1140), so it must have length 3
    (presumably R, G, B channels -- confirm against the caller).

    Args:
        img: Array-like whose last axis has three entries.

    Returns:
        The weighted sum over the last axis, as computed by ``np.dot``.
    """
    channel_weights = [0.2989, 0.5870, 0.1140]
    return np.dot(img, channel_weights)

def normalize(img):
    """Divide ``img`` by 255.

    Args:
        img: Number or array to scale (presumably intensities in 0..255,
            which this maps to 0..1 -- confirm against the caller).

    Returns:
        ``img / 255`` using true division.
    """
    max_intensity = 255
    return img / max_intensity

# putting everything together
def preprocess(img):
    """Apply the full frame pipeline to ``img``.

    Steps, in order: ``convert_to_gray``, ``resize`` to 84x84, ``normalize``.

    Args:
        img: Frame accepted by ``convert_to_gray``.

    Returns:
        The 84x84 result of the three steps above.
    """
    target_shape = (84, 84)
    gray_frame = convert_to_gray(img)
    small_frame = resize(gray_frame, target_shape)
    return normalize(small_frame)

In [None]:
# Gym id of the environment created on the next line.
ENV_NAME = "Breakout-v0"
# Instantiate the environment once at notebook (module) level.
env = gym.make(ENV_NAME)

In [None]:
# 84x84 array filled with zeros.
# NOTE(review): not read by any other visible cell -- confirm whether it is still needed.
zeros = np.zeros((84,84))

# Environment Wrapper

In [None]:
from gym.spaces import Discrete, Box

class AtariEnv(gym.Env):
    """Wrap the Gym Atari Breakout environment to make it compatible with RLlib.

    Every frame returned by the wrapped environment is converted to a single
    channel, resized to ``input_shape`` (84x84), divided by 255 and cast to
    ``float32`` so that it matches the dtype declared by ``observation_space``.
    """

    def __init__(self, config: EnvContext):
        """Create the wrapped environment and declare the action/observation spaces.

        Args:
            config: Environment context passed in by RLlib. It is accepted for
                interface compatibility; no keys are read from it.
        """
        self.env = gym.make("Breakout-v0")
        self.input_shape = (84, 84)
        # Use this instance's own environment rather than a notebook-level
        # global, so the class also works when it is constructed in a fresh
        # process (e.g. an RLlib rollout worker) or after a kernel restart.
        self.action_space = self.env.action_space
        self.observation_space = Box(
            low=0.0, high=1.0, shape=self.input_shape, dtype=np.float32
        )

    def reset(self):
        """Reset the wrapped environment and return the preprocessed first frame."""
        return self.preprocess(self.env.reset())

    def step(self, action):
        """Advance the wrapped environment by one step.

        Args:
            action: Index of the action to take; must be one of
                ``0 .. action_space.n - 1``.

        Returns:
            Tuple ``(obs, reward, done, info)`` where ``obs`` is the
            preprocessed frame and ``info`` is always an empty dict (the
            wrapped environment's info is discarded).

        Raises:
            AssertionError: If ``action`` is not a valid action index.
        """
        assert action in range(self.env.action_space.n), f"invalid action: {action!r}"
        obs, reward, done, _ = self.env.step(action)
        return self.preprocess(obs), reward, done, {}

    def seed(self, seed=None):
        """Seed Python's ``random`` module and the wrapped environment.

        Args:
            seed: Seed value, or ``None``.

        Returns:
            Whatever the wrapped environment's ``seed`` method returns.
        """
        random.seed(seed)
        # Seeding only ``random`` does not reach the wrapped environment, so
        # forward the seed to it as well.
        return self.env.seed(seed)

    def preprocess(self, obs):
        """Convert a raw frame into the observation format of this wrapper.

        Args:
            obs: Raw frame whose last axis has three entries
                (presumably an RGB image with values 0..255 -- confirm).

        Returns:
            ``float32`` array of shape ``input_shape``.
        """
        gray = np.dot(obs, [0.2989, 0.5870, 0.1140])
        resized = resize(gray, self.input_shape)
        # Cast explicitly: the pipeline above yields float64, while the
        # declared observation space uses float32.
        return (resized / 255).astype(np.float32)

In [None]:
# Start from a copy of IMPALA's default settings, then override the entries
# this notebook cares about in a single update.
config = impala.DEFAULT_CONFIG.copy()
config.update(
    {
        "env": AtariEnv,  # the wrapper class defined in this notebook
        "framework": "torch",
        "num_gpus": 1,
        "num_workers": 8,
    }
)

#print(config)


In [None]:
# Print the complete configuration (defaults plus the overrides set above) in readable form.
print(pretty_print(config))

_disable_action_flattening: false
_disable_execution_plan_api: false
_disable_preprocessor_api: false
_fake_gpus: false
_lr_vf: 0.0005
_separate_vf_optimizer: false
_tf_policy_handles_more_than_one_loss: false
actions_in_input_normalized: false
always_attach_evaluation_results: false
batch_mode: truncate_episodes
broadcast_interval: 1
callbacks: <class 'ray.rllib.agents.callbacks.DefaultCallbacks'>
clip_actions: false
collect_metrics_timeout: -1
compress_observations: false
create_env_on_driver: false
custom_resources_per_worker: {}
decay: 0.99
eager_max_retraces: 20
eager_tracing: false
entropy_coeff: 0.01
env: <class '__main__.AtariEnv'>
env_config: {}
epsilon: 0.1
evaluation_config: {}
evaluation_duration: 10
evaluation_duration_unit: episodes
evaluation_num_episodes: -1
evaluation_num_workers: 0
evaluation_parallel_to_training: false
exploration_config:
  type: StochasticSampling
explore: true
extra_python_environs_for_driver: {}
extra_python_environs_for_worker: {}
fake_sampler: f

In [None]:
#config['model']['fcnet_activation'] = 'relu'
#config['use_critic'] = True

# Training

In [None]:
# Build the IMPALA trainer from the configuration dict.
trainer = impala.ImpalaTrainer(config=config)

 

In [None]:
for i in range(100):
    # Run one IMPALA training iteration and report the episode reward statistics.
    result = trainer.train()
    print(
        "Mean rewards:", result["episode_reward_mean"],
        "Min rewards:", result["episode_reward_min"],
        "Max rewards:", result["episode_reward_max"],
    )

    #print(pretty_print(result))




Mean rewards: 0.0 Min rewards: 0.0 Max rewards: 0.0
Mean rewards: 1.564102564102564 Min rewards: 0.0 Max rewards: 5.0
Mean rewards: 1.6282051282051282 Min rewards: 0.0 Max rewards: 5.0
Mean rewards: 1.67 Min rewards: 0.0 Max rewards: 5.0
Mean rewards: 1.64 Min rewards: 0.0 Max rewards: 4.0
Mean rewards: 1.77 Min rewards: 0.0 Max rewards: 4.0
Mean rewards: 1.83 Min rewards: 0.0 Max rewards: 4.0
Mean rewards: 1.89 Min rewards: 0.0 Max rewards: 8.0
Mean rewards: 1.93 Min rewards: 0.0 Max rewards: 8.0
Mean rewards: 1.87 Min rewards: 0.0 Max rewards: 8.0
Mean rewards: 2.01 Min rewards: 0.0 Max rewards: 7.0
Mean rewards: 2.01 Min rewards: 0.0 Max rewards: 7.0
Mean rewards: 2.43 Min rewards: 0.0 Max rewards: 7.0
Mean rewards: 2.53 Min rewards: 0.0 Max rewards: 6.0
Mean rewards: 2.4 Min rewards: 0.0 Max rewards: 8.0
Mean rewards: 2.29 Min rewards: 0.0 Max rewards: 8.0
Mean rewards: 2.03 Min rewards: 0.0 Max rewards: 8.0
Mean rewards: 2.18 Min rewards: 0.0 Max rewards: 6.0
Mean rewards: 2.31 Mi

# Continue Training


In [None]:
for i in range(400):
    # Continue training the same trainer: one more IMPALA iteration per pass.
    result = trainer.train()
    print("Epoch:", i)
    print(
        "Mean rewards:", result["episode_reward_mean"],
        "Min rewards:", result["episode_reward_min"],
        "Max rewards:", result["episode_reward_max"],
    )

    #print(pretty_print(result))




Epoch: 0
Mean rewards: 37.29 Min rewards: 7.0 Max rewards: 282.0
Epoch: 1
Mean rewards: 37.37 Min rewards: 7.0 Max rewards: 282.0
Epoch: 2
Mean rewards: 37.51 Min rewards: 7.0 Max rewards: 282.0
Epoch: 3
Mean rewards: 38.33 Min rewards: 7.0 Max rewards: 282.0
Epoch: 4
Mean rewards: 38.26 Min rewards: 7.0 Max rewards: 282.0
Epoch: 5
Mean rewards: 35.88 Min rewards: 7.0 Max rewards: 67.0
Epoch: 6
Mean rewards: 35.57 Min rewards: 7.0 Max rewards: 67.0
Epoch: 7
Mean rewards: 35.87 Min rewards: 7.0 Max rewards: 67.0
Epoch: 8
Mean rewards: 36.01 Min rewards: 7.0 Max rewards: 67.0
Epoch: 9
Mean rewards: 36.43 Min rewards: 7.0 Max rewards: 67.0
Epoch: 10
Mean rewards: 36.63 Min rewards: 7.0 Max rewards: 67.0
Epoch: 11
Mean rewards: 37.69 Min rewards: 14.0 Max rewards: 67.0
Epoch: 12
Mean rewards: 36.08 Min rewards: 11.0 Max rewards: 74.0
Epoch: 13
Mean rewards: 35.62 Min rewards: 11.0 Max rewards: 74.0
Epoch: 14
Mean rewards: 35.45 Min rewards: 11.0 Max rewards: 74.0
Epoch: 15
Mean rewards: 36