In [None]:
#more details on the code can be found here https://www.tensorflow.org/agents/tutorials/1_dqn_tutorial
#!!! if this code was to be used by anyone other than the original author, certain lines might have to be tweaked (such as the google drive path for examples)
!pip install gym
!pip install 'imageio==2.4.0'
!pip install PILLOW
!pip install -pyglet
!pip install tf-agents
!pip install optuna
!pip install -q 'xvfbwrapper==0.2.9'


In [None]:
from __future__ import absolute_import, division, print_function

import base64
import imageio
import IPython
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import PIL.Image
import gym
import os
from functools import partial
import tempfile

from google.colab import files

import tensorflow as tf

import tensorflow_datasets as tfds

import tf_agents

from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import q_network
from tf_agents.policies import random_tf_policy, epsilon_greedy_policy, policy_saver
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common

from gym import envs


In [None]:
# Mount Google Drive into the Colab file system under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
import sys
# Put the Drive folder first on the module search path so that gym_Jass can be imported from it.
# NOTE(review): presumably importing gym_Jass registers the "Jass-v0" environment id that is
# loaded further down — confirm against the package.
sys.path.insert(0, "/content/drive/My Drive/Jass_RL/JassOpenAIGym/")
import gym_Jass

In [None]:
# Set up the hyperparameters.
# The trailing "# @param" markers are kept verbatim on every assignment.

# Number of iterations of the training loop (one collect phase + one agent.train call each).
num_iterations = 500000 # @param {type:"integer"}

# Steps collected with the random policy before training, steps collected per training
# iteration, and the max_length passed to the replay buffer.
initial_collect_steps = 100  # @param {type:"integer"} 
collect_steps_per_iteration = 1  # @param {type:"integer"}
replay_buffer_max_length = 25000  # @param {type:"integer"}

# Sample batch size of the replay dataset, Adam learning rate, and how often (in train steps)
# the loss is printed.
batch_size = 128  # @param {type:"integer"}
learning_rate = 1e-2  # @param {type:"number"}
log_interval = 200  # @param {type:"integer"}

# Episodes per evaluation and how often (in train steps) an evaluation is run.
num_eval_episodes = 10  # @param {type:"integer"}
eval_interval = 1000  # @param {type:"integer"}


In [None]:
# Initialize the TPUs.
# The TPU address is read from the COLAB_TPU_ADDR environment variable; a KeyError on the
# next line means that variable is not set in this runtime.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
tf.config.experimental_connect_to_cluster(resolver)
# This is the TPU initialization code that has to be at the beginning.
tf.tpu.experimental.initialize_tpu_system(resolver)
print("All devices: ", tf.config.list_logical_devices('TPU'))

# Distribution strategy; its scope() is entered later when the Q-network and the agent are built.
strategy = tf.distribute.TPUStrategy(resolver)


In [None]:
# Definition of the epsilon decay schedule.
train_step = tf.Variable(0, trainable=False, name='global_step', dtype=tf.int64)
def exponentially_decaying_epsilon(train_step, start_epsilon = 1, end_epsilon =0.01, decay = 0.999995, step_number = num_iterations):
  """Return the exploration rate for the step stored in `train_step`.

  The rate is `start_epsilon * decay ** step`, clipped from below at `end_epsilon`.

  Args:
    train_step: object exposing `.numpy()` that yields the current step count
      (a tf.Variable in this notebook).
    start_epsilon: exploration rate at step 0.
    end_epsilon: lower bound of the exploration rate.
    decay: multiplicative decay applied once per step.
    step_number: accepted but not used by the computation.

  Returns:
    The larger of the decayed rate and `end_epsilon`.
  """
  # Pull the plain numeric step value out of the variable.
  current_step = train_step.numpy()
  decayed_epsilon = start_epsilon * decay**current_step
  return max(decayed_epsilon, end_epsilon)
exponentially_decaying_epsilon(train_step)


In [None]:
# Loads the gym, sets up the environments, the Q-network, the optimizer and the learning agent.
train_py_env = suite_gym.load("Jass-v0")
eval_py_env = suite_gym.load("Jass-v0")

# Wrap the Python environments so they can be driven through the TF-Agents TF interface.
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

# There are 180 input nodes and 9 output nodes; fc_layer_params sets the number of nodes in the hidden layers.
fc_layer_params = (128, 36)

# The Q-network is created inside the strategy scope.
with strategy.scope():
  q_net = q_network.QNetwork(
      train_env.observation_spec(),
      train_env.action_spec(),
      fc_layer_params=fc_layer_params)

optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)

with strategy.scope():
  global_step = tf.compat.v1.train.get_or_create_global_step()
  # Fixed: the epsilon schedule used to be bound to a separate, freshly created `train_step`
  # variable. Nothing ever incremented that variable (the agent counts on `global_step`),
  # so the schedule always evaluated at step 0 and epsilon stayed at its start value for
  # the whole training run. The schedule now reads the counter the agent actually
  # increments; `train_step` is kept as an alias of it.
  train_step = global_step
  partial_exponentially_decaying_eps = partial(exponentially_decaying_epsilon, global_step)

  agent = dqn_agent.DqnAgent(
      train_env.time_step_spec(),
      train_env.action_spec(),
      q_network=q_net,
      epsilon_greedy= partial_exponentially_decaying_eps,
      optimizer=optimizer,
      gamma = 0.99,
      td_errors_loss_fn=common.element_wise_squared_loss,
      train_step_counter=global_step)

  agent.initialize()

# agent.policy is used for evaluation, agent.collect_policy for data collection.
eval_policy = agent.policy
collect_policy = agent.collect_policy

# Random policy: used for the initial replay-buffer fill and as a baseline.
random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
                                                train_env.action_spec())
example_environment = tf_py_environment.TFPyEnvironment(
    suite_gym.load('Jass-v0'))

time_step = example_environment.reset()

# Last expression of the cell: shows one action step produced by the random policy.
random_policy.action(time_step)











In [None]:
# Print a summary of the Q-network that was set up above.
q_net.summary()

In [None]:
# Computes the average return over a number of episodes (for testing purposes).
def compute_avg_return(environment, policy, num_episodes=10):
  """Run `policy` in `environment` for `num_episodes` episodes and average the returns.

  Args:
    environment: object with `reset()` and `step(action)`, both returning a time step
      that offers `is_last()` and `reward`.
    policy: object whose `action(time_step)` result carries the chosen action in `.action`.
    num_episodes: number of episodes to play.

  Returns:
    Element 0 of `(sum of episode returns / num_episodes).numpy()`.
    NOTE(review): assumes rewards are batched tensors with a single entry — confirm.
  """
  return_sum = 0.0
  for _episode in range(num_episodes):
    current_step = environment.reset()
    episode_total = 0.0

    # Play until the environment reports the final step of the episode.
    while True:
      if current_step.is_last():
        break
      chosen = policy.action(current_step)
      current_step = environment.step(chosen.action)
      episode_total += current_step.reward

    return_sum += episode_total

  mean_return = return_sum / num_episodes
  return mean_return.numpy()[0]
# Baseline: average return of the random policy on the evaluation environment (displayed as the cell output).
compute_avg_return(eval_env, random_policy, num_eval_episodes)



In [None]:
# Testing of step and episode handling; not relevant for the training of the models.
env = suite_gym.load('Jass-v0')
tf_env = tf_py_environment.TFPyEnvironment(env)

time_step = tf_env.reset()
rewards = []
steps = []
num_episodes = 5

# Play num_episodes episodes with uniformly sampled actions and record, per episode,
# the summed reward and the number of steps taken.
for _ in range(num_episodes):
  episode_reward = 0
  episode_steps = 0
  while not time_step.is_last():
    # Samples an integer in [0, 2), i.e. only the actions 0 and 1.
    # NOTE(review): the network comment elsewhere in this notebook mentions 9 output nodes;
    # confirm whether sampling only two actions is intended here.
    action = tf.random.uniform([1], 0, 2, dtype=tf.int32)
    time_step = tf_env.step(action)
    episode_steps += 1
    episode_reward += time_step.reward.numpy()
  rewards.append(episode_reward)
  steps.append(episode_steps)
  # Start the next episode from a fresh initial time step.
  time_step = tf_env.reset()

# Aggregate statistics over all played episodes.
num_steps = np.sum(steps)
avg_length = np.mean(steps)
avg_reward = np.mean(rewards)

print('num_episodes:', num_episodes, 'num_steps:', num_steps)
print('avg_length', avg_length, 'avg_reward:', avg_reward)


In [None]:
# Connects Google Colab and Google Cloud, for authentication purposes.
from google.colab import auth

auth.authenticate_user()

In [None]:
# Set up the replay buffer.
# It stores items matching the agent's collect_data_spec, uses the training environment's
# batch size, and is limited to replay_buffer_max_length via max_length.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=replay_buffer_max_length)

def collect_step(environment, policy, buffer):
  """Take one step in `environment` with `policy` and store the transition in `buffer`.

  Args:
    environment: environment offering `current_time_step()` and `step(action)`.
    policy: policy whose `action(time_step)` result carries the action in `.action`.
    buffer: replay buffer receiving the transition through `add_batch`.
  """
  current = environment.current_time_step()
  decision = policy.action(current)
  following = environment.step(decision.action)

  # Pack (time step, action step, next time step) into a trajectory and add it to the replay buffer.
  buffer.add_batch(trajectory.from_transition(current, decision, following))

def collect_data(env, policy, buffer, steps):
  """Call `collect_step(env, policy, buffer)` `steps` times in a row.

  Args:
    env: environment handed on to `collect_step`.
    policy: policy handed on to `collect_step`.
    buffer: replay buffer handed on to `collect_step`.
    steps: number of single steps to collect.
  """
  for _step_index in range(steps):
    collect_step(env, policy, buffer)

# Seed the replay buffer with initial_collect_steps steps taken by the random policy.
collect_data(train_env, random_policy, replay_buffer, initial_collect_steps)

# Expose the buffer as a dataset that yields batches of batch_size items with num_steps=2
# each, and prefetch 3 elements.
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3, 
    sample_batch_size=batch_size, 
    num_steps=2).prefetch(3)

# The training loops pull their batches from this iterator via next(iterator).
iterator = iter(dataset)



In [None]:
# Set up the Google Cloud environment so that the models can be saved there.
project_id = "gentle-ally-303519"
bucket_name = "rl_jass"
# Shell command; {project_id} is interpolated from the Python variable above.
!gcloud config set project {project_id}

In [None]:
# Implement the checkpointer and the policy saver so the model can be loaded later.
checkpoint_dir = "gs://rl_jass"
# The checkpointer tracks the agent, its policy, the replay buffer and the global step;
# max_to_keep=1 is passed to it.
train_checkpointer = common.Checkpointer(
    ckpt_dir=checkpoint_dir,
    max_to_keep=1,
    agent=agent,
    policy=agent.policy,
    replay_buffer=replay_buffer,
    global_step=global_step
)

# NOTE(review): policy_dir is the same location as checkpoint_dir, so the saved policy and
# the checkpoints end up side by side in the bucket root — confirm this is intended.
policy_dir = "gs://rl_jass"
tf_policy_saver = policy_saver.PolicySaver(agent.policy)


In [None]:
#training of the RL-algorithm
try:
  %%time
except:
  pass


# (Optional) Optimize by wrapping some of the code in a graph using TF function.
agent.train = common.function(agent.train)

# Reset the train step
agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns = [avg_return]

loss = []

for _ in range(num_iterations):

  # Collect a few steps using collect_policy and save to the replay buffer.
  collect_data(train_env, agent.collect_policy, replay_buffer, collect_steps_per_iteration)

  # Sample a batch of data from the buffer and update the agent's network.
  experience, unused_info = next(iterator)
  train_loss = agent.train(experience).loss



  step = agent.train_step_counter.numpy()

  if step % log_interval == 0:
    print('step = {0}: loss = {1}'.format(step, train_loss))
  if step % eval_interval == 0:
    avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
    print('step = {0}: Average Return = {1}'.format(step, avg_return))
    print(exponentially_decaying_epsilon(agent.train_step_counter))
    loss.append(train_loss.numpy())
    returns.append(avg_return)
  if step % 50000 == 0:
    train_checkpointer.save(global_step)
    np.savez("outfile.npz", loss, returns)
    !cp outfile.npz "/content/drive/My Drive/"
  if step == 500000:
    tf_policy_saver.save(policy_dir)

In [None]:
# Loss and average returns are saved in outfile.npz so that they don't get lost after training.
# np.savez stores positional arrays under the keys "arr_0", "arr_1", ...; here arr_0 is the
# loss list and arr_1 the returns list, matching the order used when saving.
outfile_npz= np.load("outfile.npz")
loss = outfile_npz["arr_0"].tolist()
returns = outfile_npz["arr_1"].tolist()


In [None]:
# Visualization of the average returns.
# `returns` holds one value from before training plus one per evaluation interval, so the
# matching x positions are 0, eval_interval, 2 * eval_interval, ...
# Fixed: the x-axis is now derived from len(returns) instead of num_iterations, so the plot
# also works when training stopped early or was continued (the lengths used to mismatch
# and plt.plot raised).
iterations = range(0, len(returns) * eval_interval, eval_interval)
plt.plot(iterations, returns)
plt.ylabel('Average Return')
plt.xlabel('Iterations')
plt.ylim(top=1)


In [None]:
# Visualization of the loss.
# Fixed: losses are recorded when the train step is a multiple of eval_interval, i.e. the
# first one at step eval_interval, not at step 0. The x-axis previously started at 0 and was
# therefore shifted by one interval; it is also derived from len(loss) now so that partial
# or continued runs still plot.
iterations = range(eval_interval, (len(loss) + 1) * eval_interval, eval_interval)
plt.plot(iterations, loss)
plt.ylabel('Loss')
plt.xlabel('Iterations')
#plt.ylim(top=15000000)
# Restrict the view to x in [0, 100000] and y in [0, 100000].
plt.axis([0, 100000, 0, 100000])


In [None]:
#loading the modell from Google cloud / initializing the checkpointer from Google Cloud if training has to be continued and then continuing the training process
policy = agent.policy

global_step = tf.compat.v1.train.get_global_step()

policy_checkpointer = common.Checkpointer(ckpt_dir= checkpoint_dir, global_step = global_step, policy=policy, agent = agent)
policy_checkpointer.initialize_or_restore()

outfile_npz= np.load("outfile.npz")
loss = outfile_npz["arr_0"].tolist()
returns = outfile_npz["arr_1"].tolist()

try:
  %%time
except:
  pass

my_policy = agent.collect_policy


# (Optional) Optimize by wrapping some of the code in a graph using TF function.
agent.train = common.function(agent.train)

# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)

for _ in range(num_iterations):

  # Collect a few steps using collect_policy and save to the replay buffer.
  collect_data(train_env, agent.collect_policy, replay_buffer, collect_steps_per_iteration)

  # Sample a batch of data from the buffer and update the agent's network.
  experience, unused_info = next(iterator)
  train_loss = agent.train(experience).loss



  step = agent.train_step_counter.numpy()

  if step % log_interval == 0:
    print('step = {0}: loss = {1}'.format(step, train_loss))
  if step % eval_interval == 0:
    avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
    print('step = {0}: Average Return = {1}'.format(step, avg_return))
    print(exponentially_decaying_epsilon(agent.train_step_counter))
    loss.append(train_loss.numpy())
    returns.append(avg_return)
  if step % 50000 == 0:
    train_checkpointer.save(global_step)
    tf_policy_saver.save(policy_dir)
    np.savez("outfile.npz", loss, returns)
    !cp outfile.npz "/content/drive/My Drive/"



In [None]:
# Evaluate the saved policy for another 10,000 games and report certain key statistics.
eval_py_env = suite_gym.load("Jass-v0")
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

# Load the policy that was exported to policy_dir by the policy saver.
saved_policy = tf.compat.v2.saved_model.load(policy_dir)
# NOTE(review): this overwrites the num_eval_episodes hyperparameter; re-running a training
# cell afterwards would evaluate with 10000 episodes per evaluation unless the
# hyperparameter cell is executed again.
num_eval_episodes = 10000
avg_return = compute_avg_return(eval_env, saved_policy, num_eval_episodes)

# game_dict is read from the Python environment; presumably it holds per-game statistics
# maintained by the Jass environment — verify against gym_Jass.
print(eval_py_env.game_dict)
print(avg_return)

In [None]:
# Plotting the exponential decay.
def exponentially_decaying_epsilon_plot(start_epsilon = 1, end_epsilon = 0.05, decay = 0.999995, step_number = num_iterations):
  """Plot an exponentially decaying epsilon over `step_number` steps.

  Starting from `start_epsilon`, the value is multiplied by `decay` once per step and
  clipped from below at `end_epsilon`. Steps are numbered from 1 to `step_number`.

  Note that the default `end_epsilon` here (0.05) is not the same as the default of
  `exponentially_decaying_epsilon` (0.01).

  Args:
    start_epsilon: value before the first decay step.
    end_epsilon: lower bound of the plotted curve.
    decay: multiplicative factor applied per step.
    step_number: number of steps to plot.
  """
  step_axis = list(range(1, step_number + 1))
  epsilon_values = []
  current_epsilon = start_epsilon
  for _ in step_axis:
    current_epsilon = max(current_epsilon * decay, end_epsilon)
    epsilon_values.append(current_epsilon)

  plt.plot(step_axis, epsilon_values)
exponentially_decaying_epsilon_plot()


In [None]:
# Plotting a linear decay.
def linearly_decaying_epsilon(start_epsilon = 1, end_epsilon = 0.01, step_number = num_iterations, plot = False, train = True):
  """Build (and optionally plot) a linearly decaying epsilon schedule.

  Epsilon is lowered by a constant amount per step so that it reaches `end_epsilon` after
  85% of `step_number` steps, and is held at `end_epsilon` afterwards.

  Fixed: the schedule length used to come from the global `num_iterations`, so the
  `step_number` argument was silently ignored. The default still equals `num_iterations`,
  so calls that rely on the default behave as before.

  Args:
    start_epsilon: value the schedule starts from.
    end_epsilon: lower bound of the schedule.
    step_number: total number of steps covered by the schedule.
    plot: when true, plot epsilon against the recorded step values.
    train: when true, the x value of every point is read from
      `agent.train_step_counter`; otherwise a local counter starting at 1 is used.

  Returns:
    None. The function only has an effect when `plot` is true.
  """
  # Number of steps over which epsilon goes from start_epsilon to end_epsilon
  # (85% of the whole schedule).
  num_learn_iterations = step_number * 0.85
  epsilon = start_epsilon
  steps = []
  step_counter = 0
  epsilon_arr = []
  for _ in range(step_number):
    if train:
      step_counter = agent.train_step_counter.numpy()
    else:
      step_counter += 1
    steps.append(step_counter)
    epsilon = max(end_epsilon, epsilon - ((start_epsilon-end_epsilon) / num_learn_iterations))
    epsilon_arr.append(epsilon)
  if plot:
    plt.plot(steps, epsilon_arr)
linearly_decaying_epsilon(plot = True, train = False)
