<a href="https://colab.research.google.com/github/DunkleCat/a3c-tensorflow/blob/master/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Header


## Libraries


In [None]:
!pip install tf-agents

import csv

import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers
from tf_agents.utils import value_ops

import gym
import numpy as np
from threading import Thread, Lock
from multiprocessing import cpu_count

tf.keras.backend.set_floatx('float64')

# device_name = tf.test.gpu_device_name()
# if device_name != '/device:GPU:0':
#   raise SystemError('GPU device not found')
# print('Found GPU at: {}'.format(device_name))

## Global variables

In [2]:
def getNumberOfWorkers():
  """Return the number of worker threads to spawn (one per CPU from cpu_count())."""
  # Swap this for a literal 1 to debug with a single worker.
  n_workers = cpu_count()
  return n_workers
  
def getMaxEpisodes():
  """Return the episode budget that bounds the workers' training loop."""
  max_episodes = 10000
  return max_episodes

def getBatchSize():
  """Return how many environment steps are collected before a model update."""
  batch_size = 5
  return batch_size

def env_name():
  """Return the Gym environment id used by every cell of the notebook."""
  name = "CartPole-v1"
  return name

def getTypology():
  """Return the tuple describing the environment family and observation kind.

  Options:
    ("classic",)
    ("atari", "visual")
    ("atari", "ram")
  """
  typology = ("classic",)
  return typology

# Episode counter; WorkerAgent.train declares it `global` and increments it
# once per finished episode.
CUR_EPISODE = 0

# Random Agent


In [None]:
import gym

# Baseline: play 20 episodes with uniformly sampled actions and print the
# total reward of each one.
env = gym.make(env_name())
try:
  for i_episode in range(20):
    observation = env.reset()
    episode_reward, done = 0, False
    while not done:
      # env.render()
      action = env.action_space.sample()
      observation, reward, done, info = env.step(action)
      episode_reward += reward
    print("Episode finished with a reward of {}".format(episode_reward))
finally:
  # Release the environment even when an episode raises.
  env.close()

## Main

In [4]:
def main():
  """Create the Agent and run its training loop."""
  Agent().train()

# ActorCritic


## Helper functions

In [5]:
def get_input_classic(input, single):
  """Shape classic-control observations for the model.

  Args:
    input: one observation (when ``single`` is truthy) or a batch of them.
    single: whether ``input`` is a lone observation.

  Returns:
    ``input`` itself for a batch; otherwise a new float array of shape
    ``(1, input.shape[0])`` whose only row holds the observation.
  """
  if single:
    batch = np.empty((1, input.shape[0]))
    batch[0] = input
    return batch
  return input

def get_input_atari(input, single):
  """Copy Atari observations into an array with an extra length-1 axis.

  The result has shape ``(n_samples, 1, *observation_shape)``: three
  observation dimensions for the "visual" typology, one for "ram".

  Args:
    input: one observation (when ``single`` is truthy) or a sequence of them.
    single: whether ``input`` is a lone observation.

  Returns:
    A new float array holding the observation(s) at index ``[k][0]``.

  Raises:
    ValueError: if ``getTypology()[1]`` is neither "visual" nor "ram".
  """
  kind = getTypology()[1]

  if single:
    shape = input.shape
    n_samples = 1
  else:
    shape = input[0].shape
    n_samples = len(input)

  # Compare by value: `is` on string literals only works by interning accident.
  if kind == "visual":
    new_input = np.ndarray((n_samples,
                            1,
                            shape[0],
                            shape[1],
                            shape[2]))
  elif kind == "ram":
    new_input = np.ndarray((n_samples,
                            1,
                            shape[0]))
  else:
    raise ValueError("Unknown atari typology: {!r}".format(kind))

  if single:
    new_input[0][0] = input
  else:
    for k, sample in enumerate(input):
      new_input[k][0] = sample

  return new_input

def init_state_shape(state_shape):
  """Return the state shape the model works with for the current typology.

  Args:
    state_shape: shape tuple of a raw environment observation.

  Returns:
    ``state_shape`` unchanged for "classic"; the shape prefixed with a
    length-1 axis for atari "visual" (three dims) or "ram" (one dim);
    ``None`` for any other atari variant.
  """
  typology = getTypology()
  # String comparison must use ==; identity (`is`) depends on interning.
  if typology[0] == "classic":
    return state_shape

  variant = typology[1]
  if variant == "visual":
    # Build the tuple directly instead of allocating an array to read .shape.
    return (1,
            int(state_shape[0]),
            int(state_shape[1]),
            int(state_shape[2]))
  if variant == "ram":
    return (1, int(state_shape[0]))
  return None

## ActorCritic

In [6]:
class ActorCritic:
  """Bundle a policy/value model with its Adam optimizer and update step."""

  def __init__(self, state_shape, action_shape):
    """Create the model and optimizer.

    Args:
      state_shape: shape of a raw environment observation.
      action_shape: number of discrete actions.
    """
    self.state_shape = init_state_shape(state_shape)
    self.action_shape = action_shape
    self.model = create_model(state_shape, action_shape)
    self.opt = tf.keras.optimizers.Adam(0.0005)

  @staticmethod
  def _prepare_input(input, single):
    """Run ``input`` through the ``get_input_<typology[0]>`` helper."""
    # Look the helper up by name rather than eval()-ing a built source string.
    adapter = globals()["get_input_" + getTypology()[0]]
    return adapter(input, single = single)

  @staticmethod
  def _compute_loss(actions, action_dist, advantages, v_pred, discounted_rewards):
    """Return policy loss + value loss - entropy bonus for one batch."""
    # Policy loss: cross-entropy of the taken actions, weighted by the
    # advantages (treated as constants via stop_gradient).
    scc = tf.keras.losses.SparseCategoricalCrossentropy()
    policy_loss = 0.5 * scc(actions, action_dist, sample_weight = tf.stop_gradient(advantages))
    # Entropy: cross-entropy of the action distribution with itself.
    cc = tf.keras.losses.CategoricalCrossentropy()
    entropy = 0.01 * cc(action_dist, action_dist)
    # Value loss: squared error between predicted values and returns.
    mse = tf.keras.losses.MeanSquaredError()
    value_loss = mse(v_pred, discounted_rewards)

    return policy_loss + value_loss - entropy

  def get_action(self, input):
    """Sample an action index from the policy output for one observation."""
    input = self._prepare_input(input, single = True)
    action_dist, _ = self.model.predict(input)
    return np.random.choice(self.action_shape, p = action_dist[0])

  def get_value(self, input, single):
    """Return the model's value output for one observation or a batch."""
    input = self._prepare_input(input, single = single)
    _, v = self.model.predict(input)
    return v

  def train(self, input, actions, advantages, discounted_rewards):
    """Apply one gradient step on a batch and return the loss.

    Args:
      input: batch of observations.
      actions: actions taken for each observation.
      advantages: per-sample weights for the policy loss.
      discounted_rewards: regression targets for the value output.
    """
    with tf.GradientTape() as tape:
      input = self._prepare_input(input, single = False)
      action_dist, v_pred = self.model(input, training = True)
      loss = self._compute_loss(actions, action_dist, advantages, v_pred, discounted_rewards)
    grads = tape.gradient(loss, self.model.trainable_variables)
    self.opt.apply_gradients(zip(grads, self.model.trainable_variables))

    return loss

## Model

In [7]:
def create_model(input_shape, output_shape):
  """Instantiate the model class matching the configured typology.

  The typology entries are checked in order, so ("classic",) selects
  ClassicModel while ("atari", "ram") and ("atari", "visual") select
  RamModel and VisualModel through their second entry.

  Args:
    input_shape: forwarded to the model constructor.
    output_shape: number of actions, forwarded to the model constructor.

  Raises:
    ValueError: if no typology entry names a known model.
  """
  builders = {
      "classic": ClassicModel,
      "ram": RamModel,
      "visual": VisualModel,
  }
  # Compare by value (dict lookup), not with `is`, and look past the family
  # name so the atari variants are reachable.
  for key in getTypology()[:2]:
    if key in builders:
      return builders[key](input_shape, output_shape)
  raise ValueError("Bad model typology")

def create_model_atari(input_shape, output_shape, activation_function):
  """Build a small Sequential head for the atari "visual" typology.

  Args:
    input_shape: unused by this function.
    output_shape: number of units of the final Dense layer.
    activation_function: activation of the final Dense layer.

  Returns:
    A keras.Sequential (Dropout -> Dense(256, relu) -> Dense(output_shape))
    when ``getTypology()[1]`` equals "visual"; otherwise ``None``.
  """
  # Value comparison; `is` on string literals depends on interning.
  if getTypology()[1] == "visual":
    return keras.Sequential(
        [
         layers.Dropout(0.5),
         layers.Dense(256,
                      activation="relu"),
         layers.Dense(output_shape,
                      activation = activation_function)
        ]
    )
  return None

class Vision(layers.Layer):
  """Feature extractor: two ConvLSTM2D layers, batch norm, pooling, flatten."""

  def __init__(self, name = "visual", **kwargs):
    super().__init__(name = name, **kwargs)
    # Options shared by both recurrent convolution layers.
    recurrent_options = dict(data_format = 'channels_last',
                             dropout = 0.5,
                             return_sequences = True)
    self.rnn1 = layers.ConvLSTM2D(filters = 16, kernel_size = 4, strides = 2,
                                  **recurrent_options)
    self.rnn2 = layers.ConvLSTM2D(filters = 32, kernel_size = 4, strides = 1,
                                  **recurrent_options)
    self.norm = layers.BatchNormalization()
    self.avrg = layers.AveragePooling2D()
    self.flat = layers.Flatten()

  def call(self, inputs):
    """Pass ``inputs`` through every stage in order and return the result."""
    # NOTE(review): both ConvLSTM2D layers keep return_sequences=True, so the
    # tensor reaching AveragePooling2D presumably still carries a time axis;
    # confirm the pooling layer accepts that rank.
    features = inputs
    for stage in (self.rnn1, self.rnn2, self.norm, self.avrg, self.flat):
      features = stage(features)
    return features

class VisualModel(tf.keras.Model):
  """Actor-critic model: Vision features, a Dense layer, policy and value heads."""

  def __init__(self, input_shape, output_shape):
    """Create the layers.

    Args:
      input_shape: accepted for signature parity with the other models; unused.
      output_shape: number of actions (units of the softmax head).
    """
    # Zero-argument super(): naming RamModel here raised TypeError because
    # RamModel is not a base class of VisualModel.
    super().__init__()
    self.eye = Vision()
    self.dens = layers.Dense(256, activation = "relu")
    self.probs = layers.Dense(output_shape, activation = "softmax")
    self.value = layers.Dense(1, activation = "linear")

  def call(self, inputs):
    """Return ``(action_probabilities, value)`` for ``inputs``."""
    x = self.eye(inputs)
    x = self.dens(x)
    p = self.probs(x)
    v = self.value(x)
    return p, v

class RamModel(tf.keras.Model):
  """Actor-critic model: an LSTM, three Dense layers, policy and value heads."""

  def __init__(self, input_shape, output_shape):
    # input_shape is not used; output_shape sizes the softmax head.
    super().__init__()
    self.lstm1 = layers.LSTM(32)
    self.dens1 = layers.Dense(32, activation = "relu")
    self.dens2 = layers.Dense(64, activation = "relu")
    self.dens3 = layers.Dense(32, activation = "relu")
    self.probs = layers.Dense(output_shape, activation = "softmax")
    self.value = layers.Dense(1, activation = "linear")

  def call(self, inputs, training = False):
    """Return ``(action_probabilities, value)``; ``training`` is not read here."""
    hidden = inputs
    for layer in (self.lstm1, self.dens1, self.dens2, self.dens3):
      hidden = layer(hidden)
    policy = self.probs(hidden)
    value = self.value(hidden)
    return policy, value
    
class ClassicModel(tf.keras.Model):
  """Actor-critic model: Dense + batch norm (+ dropout), policy and value heads."""

  def __init__(self, input_shape, output_shape):
    # input_shape is not used; output_shape sizes the softmax head.
    super().__init__()
    self.dens1 = layers.Dense(32, activation = "relu")
    self.norm1 = layers.BatchNormalization()
    self.drop1 = layers.Dropout(0.2)
    # dens2 and dens3 are created but currently not applied in call().
    self.dens2 = layers.Dense(32, activation = "relu")
    self.dens3 = layers.Dense(16, activation = "relu")
    self.probs = layers.Dense(output_shape, activation = "softmax")
    self.value = layers.Dense(1, activation = "linear")

  def call(self, inputs, training = False):
    """Return ``(action_probabilities, value)``; dropout runs only if ``training``."""
    hidden = self.norm1(self.dens1(inputs))
    if training:
      hidden = self.drop1(hidden)
    policy = self.probs(hidden)
    value = self.value(hidden)
    return policy, value

# Agent

In [8]:
class Agent:
    """Owns the global ActorCritic and drives the worker threads."""

    def __init__(self):
        # A throw-away environment is opened only to read the space sizes.
        probe_env = gym.make(env_name())
        self.state_shape = probe_env.observation_space.shape
        self.action_shape = probe_env.action_space.n
        probe_env.close()

        self.global_actor_critic = ActorCritic(self.state_shape, self.action_shape)

    def train(self):
        """Reset the CSV logs, then start every worker and wait for all of them."""
        # (file-name suffix, value column) for each log; opening with "w"
        # leaves the file holding just its header row.
        log_specs = (("_loss", 'loss'), ("_reward", 'reward'))
        for suffix, column in log_specs:
            log_path = "/content/drive/My Drive/Machine Learning/logs/" + str(env_name()) + suffix
            with open(log_path, "w") as csv_file:
                header_writer = csv.DictWriter(csv_file, fieldnames = ['episode', column])
                header_writer.writeheader()

        workers = [WorkerAgent(self.global_actor_critic)
                   for _ in range(getNumberOfWorkers())]

        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

# WorkerAgent

In [9]:
class WorkerAgent(Thread):
    """Worker thread: plays episodes with a local model copy and trains the global one."""

    # One lock for all workers. A lock created per instance would never be
    # contended and so could not serialise updates of the shared global model.
    _global_lock = Lock()

    def __init__(self, global_actor_critic):
        """Create the worker's environment and a local copy of the global model.

        Args:
            global_actor_critic: the ActorCritic shared by all workers.
        """
        Thread.__init__(self)
        self.lock = WorkerAgent._global_lock
        self.env = gym.make(env_name())
        # Read the spaces from this worker's own environment.
        self.state_shape = self.env.observation_space.shape
        self.action_shape = self.env.action_space.n

        self.global_actor_critic = global_actor_critic
        self.actor_critic = ActorCritic(self.state_shape, self.action_shape)

        self.actor_critic.model.set_weights(self.global_actor_critic.model.get_weights())

    @staticmethod
    def _discounted_returns(rewards, bootstrap_value, gamma = 0.99):
        """Return the discounted return of every step as an ``(n, 1)`` array.

        Args:
            rewards: sequence of per-step scalar rewards, oldest first.
            bootstrap_value: value estimate of the state after the last step
                (0 when the episode ended there).
            gamma: discount factor.
        """
        returns = np.zeros((len(rewards), 1))
        running = bootstrap_value
        # Walk backwards over every step, including the first and the last.
        for i in reversed(range(len(rewards))):
            running = rewards[i] + gamma * running
            returns[i, 0] = running
        return returns

    @staticmethod
    def _append_log(suffix, column, episode, value):
        """Append one ``episode,value`` row to the CSV log ending in ``suffix``."""
        log_path = "/content/drive/My Drive/Machine Learning/logs/" + str(env_name()) + suffix
        # Append mode keeps the header and the rows written earlier.
        with open(log_path, "a", newline = "") as csv_file:
            csv_writer = csv.DictWriter(csv_file, fieldnames = ['episode', column])
            csv_writer.writerow({'episode': episode, column: value})

    def train(self):
        """Play episodes until CUR_EPISODE exceeds getMaxEpisodes().

        Every getBatchSize() steps (or at episode end) the collected batch is
        used to update the global model, whose weights are then copied back
        into the local model. Every 10th episode is printed, logged to CSV and
        the global model is saved.
        """
        global CUR_EPISODE

        while getMaxEpisodes() >= CUR_EPISODE:
            state_batch = []
            action_batch = []
            reward_batch = []
            episode_reward, episode_loss, done = 0, 0, False

            state = self.env.reset()

            while not done:
                # self.env.render()
                action = self.actor_critic.get_action(state)
                next_state, reward, done, _ = self.env.step(action)
                episode_reward += reward

                state_batch.append(state)
                action_batch.append(action)
                reward_batch.append(reward)

                if len(state_batch) >= getBatchSize() or done:
                    actions = np.array(action_batch)
                    states = np.array(state_batch)

                    # Bootstrap from the value of next_state unless the
                    # episode is over, in which case nothing follows.
                    if done:
                        bootstrap = 0.0
                    else:
                        final_value = self.actor_critic.get_value(next_state, single = True)
                        bootstrap = float(np.ravel(final_value)[0])
                    discounted_rewards = self._discounted_returns(reward_batch, bootstrap)

                    # The returns are (n, 1); subtracting the value batch is
                    # meant to be element-wise, not a broadcast.
                    values = self.actor_critic.get_value(states, single = False)
                    advantages = discounted_rewards - values

                    with self.lock:
                        loss = self.global_actor_critic.train(states, actions, advantages, discounted_rewards)
                        self.actor_critic.model.set_weights(self.global_actor_critic.model.get_weights())

                        episode_loss += loss

                    state_batch = []
                    action_batch = []
                    reward_batch = []

                state = next_state

            # Logging, saving and the counter update touch state shared by all
            # workers, so they run under the shared lock as well.
            with self.lock:
                if CUR_EPISODE % 10 == 0:
                    print("EP{}: Reward = {}, Loss = {}".format(CUR_EPISODE,
                                                                episode_reward,
                                                                episode_loss))
                    self.global_actor_critic.model.save("/content/drive/My Drive/Machine Learning/models/" + str(env_name()))

                    self._append_log("_loss", 'loss', CUR_EPISODE, float(episode_loss))
                    self._append_log("_reward", 'reward', CUR_EPISODE, episode_reward)

                CUR_EPISODE += 1

    def run(self):
        """Thread entry point: run the training loop."""
        self.train()

# Entrypoint

In [None]:
# Start training unconditionally: main() runs both when this file is executed
# directly (or as a notebook cell) and when it is imported.
main()

# Load model