In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense

import gym
import argparse
import numpy as np

In [2]:
tf.keras.backend.set_floatx('float64')

In [3]:
! wget http://www.atarimania.com/roms/Roms.rar
! mkdir /content/ROM/
! unrar e /content/Roms.rar /content/ROM/
! python -m atari_py.import_roms /content/ROM/

--2021-09-25 09:20:33--  http://www.atarimania.com/roms/Roms.rar
Resolving www.atarimania.com (www.atarimania.com)... 195.154.81.199
Connecting to www.atarimania.com (www.atarimania.com)|195.154.81.199|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11128004 (11M) [application/x-rar-compressed]
Saving to: ‘Roms.rar’


2021-09-25 09:21:19 (243 KB/s) - ‘Roms.rar’ saved [11128004/11128004]


UNRAR 5.50 freeware      Copyright (c) 1993-2017 Alexander Roshal


Extracting from /content/Roms.rar

Extracting  /content/ROM/HC ROMS.zip                                      36%  OK 
Extracting  /content/ROM/ROMS.zip                                         74% 99%  OK 
All OK
copying adventure.bin from ROMS/Adventure (1980) (Atari, Warren Robinett) (CX2613, CX2613P) (PAL).bin to /usr/local/lib/python3.7/dist-packages/atari_py/atari_roms/adventure.bin
copying air_raid.bin from ROMS/Air Raid (Men-A-Vision) (PAL) ~.bin to /usr/local/lib/python3.7/dist-pac

In [4]:
class Arguments:
    def __init__(self, actor_lr=0.0005, critic_lr=0.001, update_interval=5, gamma=0.99):
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.update_interval = update_interval
        self.gamma = gamma

args = Arguments()

In [5]:
def pre_process_image(frame):
    if frame.shape == (80, 80):
        return frame
    
    processed = np.mean(frame, 2, keepdims=False)
    cropped = processed[35:195]
    result = cropped[::2, ::2]

    return result

In [6]:
from tensorflow.keras.layers import Flatten

In [7]:
IMG_SHAPE = (80, 80)

In [8]:
class Actor:
    def __init__(self, state_dim, action_dim):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.model = self.create_model()
        self.opt = tf.keras.optimizers.Adam(args.actor_lr)

    def create_model(self):
        return tf.keras.Sequential([
            # Input(self.state_dim),
            Input(IMG_SHAPE),
            Flatten(),
            Dense(32, activation='relu'),
            Dense(16, activation='relu'),
            Dense(self.action_dim, activation='softmax')
        ])

    def compute_loss(self, actions, logits, advantages):
        ce_loss = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True)
        actions = tf.cast(actions, tf.int32)
        policy_loss = ce_loss(
            actions, logits, sample_weight=tf.stop_gradient(advantages))
        return policy_loss

    def train(self, states, actions, advantages):
        with tf.GradientTape() as tape:
            logits = self.model(states, training=True)
            loss = self.compute_loss(
                actions, logits, advantages)
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
        return loss

In [9]:
class Critic:
    def __init__(self, state_dim):
        self.state_dim = state_dim
        self.model = self.create_model()
        self.opt = tf.keras.optimizers.Adam(args.critic_lr)

    def create_model(self):
        return tf.keras.Sequential([
            # Input(self.state_dim), (n, 80, 80)
            Input(IMG_SHAPE),
            Flatten(),
            Dense(32, activation='relu'),
            Dense(16, activation='relu'),
            Dense(16, activation='relu'),
            Dense(1, activation='linear')
        ])

    def compute_loss(self, v_pred, td_targets):
        mse = tf.keras.losses.MeanSquaredError()
        return mse(td_targets, v_pred)

    def train(self, states, td_targets):
        with tf.GradientTape() as tape:
            v_pred = self.model(states, training=True)
            assert v_pred.shape == td_targets.shape
            loss = self.compute_loss(v_pred, tf.stop_gradient(td_targets))
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
        return loss

In [14]:
class Agent:
    def __init__(self, env):
        self.env = env
        self.state_dim = self.env.observation_space.shape
        self.action_dim = self.env.action_space.n
        self.actor = Actor(self.state_dim, self.action_dim)
        self.critic = Critic(self.state_dim)

    def td_target(self, reward, next_state, done):
        if done:
            return reward
        # v_value = self.critic.model.predict(
        #     np.reshape(next_state, [1, self.state_dim]))
        v_value = self.critic.model.predict(next_state)
        return np.reshape(reward + args.gamma * v_value[0], [1, 1])

    def advatnage(self, td_targets, baselines):
        return td_targets - baselines

    def list_to_batch(self, list):
        batch = list[0]
        for elem in list[1:]:
            batch = np.append(batch, elem, axis=0)
        return batch

    def train(self, max_episodes=1000):
        for ep in range(max_episodes):
            state_batch = []
            action_batch = []
            td_target_batch = []
            advatnage_batch = []
            episode_reward, done = 0, False

            state = self.env.reset()

            while not done:
                probs = self.actor.model.predict(np.array([pre_process_image(state)]))

                action = np.random.choice(self.action_dim, p=probs[0])

                next_state, reward, done, _ = self.env.step(action)

                state = np.array([pre_process_image(state)])
                action = np.reshape(action, [1, 1])
                next_state = np.array([pre_process_image(next_state)])
                reward = np.reshape(reward, [1, 1])

                td_target = self.td_target(reward * 0.01, next_state, done)
                advantage = self.advatnage(
                    td_target, self.critic.model.predict(state))
                
                state_batch.append(state)
                action_batch.append(action)
                td_target_batch.append(td_target)
                advatnage_batch.append(advantage)

                if len(state_batch) >= args.update_interval or done:
                    states = self.list_to_batch(state_batch)
                    actions = self.list_to_batch(action_batch)
                    td_targets = self.list_to_batch(td_target_batch)
                    advantages = self.list_to_batch(advatnage_batch)

                    actor_loss = self.actor.train(states, actions, advantages)
                    critic_loss = self.critic.train(states, td_targets)

                    state_batch = []
                    action_batch = []
                    td_target_batch = []
                    advatnage_batch = []

                episode_reward += reward[0][0]
                state = next_state[0]

            print('EP{} EpisodeReward={}'.format(ep, episode_reward))

In [11]:
env_name = 'YarsRevenge-v0'
#env_name = 'CartPole-v1'
env = gym.make(env_name)
print(env.observation_space.shape)
print(env.action_space.n)
#agent = Agent(env)
#agent.train()

(210, 160, 3)
18


In [15]:
agent = Agent(env)
agent.train()

  '"`sparse_categorical_crossentropy` received `from_logits=True`, but '


EP0 EpisodeReward=3639.0
EP1 EpisodeReward=2535.0
EP2 EpisodeReward=1690.0
EP3 EpisodeReward=1352.0
EP4 EpisodeReward=1690.0
EP5 EpisodeReward=1859.0
EP6 EpisodeReward=2535.0
EP7 EpisodeReward=1183.0
EP8 EpisodeReward=1183.0
EP9 EpisodeReward=2873.0
EP10 EpisodeReward=1859.0
EP11 EpisodeReward=2873.0
EP12 EpisodeReward=1183.0
EP13 EpisodeReward=2366.0
EP14 EpisodeReward=1521.0
EP15 EpisodeReward=1352.0
EP16 EpisodeReward=2535.0
EP17 EpisodeReward=1014.0
EP18 EpisodeReward=1521.0
EP19 EpisodeReward=1014.0
EP20 EpisodeReward=2366.0
EP21 EpisodeReward=1690.0
EP22 EpisodeReward=1014.0
EP23 EpisodeReward=2028.0


KeyboardInterrupt: ignored