# Reversi AI using ACM (Actor Critic Model)

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import random

from game import Reversi

In [2]:
SEED = 42 # deterministic randomness
GAMMA = 0.999 # past reward multiplier
MAX_STEPS_PER_EPISODE = 200 # epochs
EPS = np.finfo(np.float32).eps.item() # smallest number to prevent divided by zero

In [3]:
NUM_INPUTS = 64
NUM_ACTIONS = 64
NUM_LAYERS = 3
NUM_HIDDEN = 512

inputs = layers.Input(shape=(NUM_INPUTS,))
for i in range(NUM_LAYERS):
    common = layers.Dense(NUM_HIDDEN, activation="relu")(inputs)
action = layers.Dense(NUM_ACTIONS, activation="softmax")(common)
critic = layers.Dense(1)(common)

model = keras.Model(inputs=inputs, outputs=[action, critic])

In [4]:
optimizer = keras.optimizers.Adam(learning_rate=0.001)
huber_loss = keras.losses.Huber()
action_probs_history = []
critic_value_history = []
rewards_history = []
running_reward = 0
episode_count = 0
reversi = Reversi()

while True:
    reversi.reset()
    state = reversi.encode_board1()
    episode_reward = 0
    with tf.GradientTape() as tape:
        for timestep in range(1, MAX_STEPS_PER_EPISODE):
            state = tf.convert_to_tensor(state)
            state = tf.expand_dims(state, 0)

            # Predict action probabilities and estimated future rewards
            # from environment state
            action_probs, critic_value = model(state)
#             print('np squeese action probs\n', sum(np.squeeze(action_probs)))
#             print('critic value\n', critic_value)
            critic_value_history.append(critic_value[0, 0])

            # Sample action from action probability distribution
            action = random.choices([i for i in range(NUM_ACTIONS)], weights=np.squeeze(action_probs))[0]
#             print('action\n', action)
#             print('\n\n')
#             print(action)
#             print(np.random.randint(0, NUM_ACTIONS - 1))
#             action = (action + np.random.randint(0, NUM_ACTIONS - 1)) // NUM_ACTIONS
#             print(action)
            action_probs_history.append(tf.math.log(action_probs[0, action]))

            # Apply the sampled action in our environment
            state, reward, done = reversi.place_ai(action)
            rewards_history.append(reward)
            episode_reward += reward

            if done:
                break

        # Update running reward to check condition for solving
        running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward

        # Calculate expected value from rewards
        # - At each timestep what was the total reward received after that timestep
        # - Rewards in the past are discounted by multiplying them with gamma
        # - These are the labels for our critic
        returns = []
        discounted_sum = 0
        for r in rewards_history[::-1]:
            discounted_sum = r + GAMMA * discounted_sum
            returns.insert(0, discounted_sum)

        # Normalize
        returns = np.array(returns)
        returns = (returns - np.mean(returns)) / (np.std(returns) + EPS)
        returns = returns.tolist()

        # Calculating loss values to update our network
        history = zip(action_probs_history, critic_value_history, returns)
        actor_losses = []
        critic_losses = []
        for log_prob, value, ret in history:
            # At this point in history, the critic estimated that we would get a
            # total reward = `value` in the future. We took an action with log probability
            # of `log_prob` and ended up recieving a total reward = `ret`.
            # The actor must be updated so that it predicts an action that leads to
            # high rewards (compared to critic's estimate) with high probability.
            diff = ret - value
            actor_losses.append(-log_prob * diff)  # actor loss

            # The critic must be updated so that it predicts a better estimate of
            # the future rewards.
            critic_losses.append(
                huber_loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0))
            )

        # Backpropagation
        loss_value = sum(actor_losses) + sum(critic_losses)
        grads = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Clear the loss and reward history
        action_probs_history.clear()
        critic_value_history.clear()
        rewards_history.clear()

    # Log details
    episode_count += 1
    if episode_count == 5000:
        break
    if episode_count % 100 == 0:
        template = "running reward: {:.2f} at episode {}"
        print(reversi.board)
        print(template.format(running_reward, episode_count))
        print(action_probs)
        print(action)

    if episode_count == 1000000:  # Condition to consider the task solved
        print("Solved at episode {}!".format(episode_count))
        break

[[ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  1 -1  0  0  0]
 [ 0  0  0 -1  1  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]]
running reward: -993.97 at episode 100
tf.Tensor(
[[0.0002469  0.00128174 0.00523652 0.00054493 0.00033599 0.12370772
  0.00195288 0.01254548 0.00321688 0.00151973 0.00126542 0.00054767
  0.01632736 0.02367902 0.00158527 0.00037296 0.00411024 0.01311124
  0.00109243 0.02143705 0.02591346 0.00392206 0.00112102 0.04282488
  0.00321869 0.00141718 0.00096082 0.01262287 0.00337403 0.00144326
  0.03306351 0.00027819 0.00353406 0.01810935 0.02988353 0.00036076
  0.00349793 0.00610184 0.04675406 0.00390733 0.14559722 0.00272624
  0.2177346  0.04340715 0.00028375 0.00132516 0.00226774 0.00102777
  0.00046138 0.0038288  0.0277439  0.00396839 0.00043477 0.00044318
  0.0012087  0.00130397 0.00964708 0.00045058 0.00297774 0.00069303
  0.0031023  0.00625435 0.00187887 0.04480708]], shap

[[ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  1 -1  0  0  0]
 [ 0  0  0 -1  1  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]]
running reward: -999.97 at episode 800
tf.Tensor(
[[5.3951351e-07 2.8359243e-06 1.9761866e-05 1.9087763e-06 6.5521021e-07
  9.4649220e-01 1.4136185e-05 1.3456016e-03 9.5684154e-06 4.0029304e-06
  3.7363334e-06 1.3816486e-06 1.2665747e-05 2.5994438e-04 6.1811018e-04
  6.2413602e-07 1.0238857e-05 6.8731293e-05 2.2632137e-06 2.2355404e-03
  7.1957236e-04 1.0229722e-05 3.6670094e-06 8.5308676e-04 6.3687959e-03
  7.4866507e-06 2.3316090e-06 8.5309533e-05 9.8925018e-05 2.2524841e-06
  1.0055448e-04 5.7789356e-07 3.0727457e-05 4.3709519e-05 3.0713115e-04
  8.2237671e-07 2.3798519e-03 6.8571479e-03 2.8326618e-04 1.3575100e-05
  1.4803586e-04 1.7673232e-05 5.5321580e-04 1.0682909e-04 8.7881745e-07
  3.3513263e-06 4.3108407e-06 2.2082977e-06 8.8348963e-07 3.5960542e-03
  9.8481064e-

[[ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  1 -1  0  0  0]
 [ 0  0  0 -1  1  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]]
running reward: -1000.00 at episode 1500
tf.Tensor(
[[3.4757879e-06 1.3725642e-05 7.6478864e-05 1.1876186e-05 3.7301213e-06
  6.2831616e-01 8.6362248e-05 3.7964855e-03 3.9904698e-05 1.9832936e-05
  1.7976819e-05 7.5883663e-06 4.6482517e-05 6.3994335e-04 1.5948983e-03
  3.4317311e-06 3.9354916e-05 2.4212693e-04 1.1493079e-05 4.4224574e-03
  1.3333109e-03 4.2496969e-05 1.7952440e-05 3.5128898e-03 1.2038311e-02
  4.0038307e-05 1.1344003e-05 3.1943372e-04 2.6718731e-04 1.0627890e-05
  3.4676021e-04 3.3801578e-06 2.1619760e-04 1.4553970e-04 9.1102871e-04
  4.5911956e-06 2.9727952e-03 7.9762340e-03 4.1912796e-04 5.0349150e-05
  3.6992444e-04 9.9897465e-05 1.4573617e-03 3.1586381e-04 5.8079240e-06
  1.6043046e-05 2.0954749e-05 1.0766980e-05 5.0517124e-06 4.1855774e-03
  1.5475166

[[ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  1 -1  0  0  0]
 [ 0  0  0 -1  1  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]]
running reward: -1000.00 at episode 2200
tf.Tensor(
[[2.59504134e-07 1.22158326e-06 1.47839710e-05 2.09784116e-06
  3.53040036e-07 1.57457578e-03 4.59698378e-04 3.56965786e-04
  4.52554559e-06 1.81673192e-06 1.59393790e-06 8.48627565e-07
  2.98497071e-06 1.97992213e-05 3.68297420e-04 2.59510557e-07
  5.06806600e-06 5.44585273e-05 9.47103558e-07 2.15592645e-05
  1.20367649e-05 5.65754135e-06 1.90700280e-06 3.07843620e-05
  3.45044304e-04 1.93096821e-05 9.90703370e-07 1.02660255e-04
  2.36624874e-05 9.48899981e-07 9.82036072e-05 2.98860073e-07
  9.87163007e-01 1.73400840e-05 4.20416181e-05 5.04995626e-07
  3.56854289e-05 7.19304895e-04 1.45308841e-05 4.15648719e-06
  1.55826747e-05 8.46050607e-05 3.33378994e-05 9.70662950e-06
  9.13726296e-07 1.73211322e-06 2.42809961e-06 8.

[[ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  1 -1  0  0  0]
 [ 0  0  0 -1  1  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]]
running reward: -1000.00 at episode 2900
tf.Tensor(
[[2.99575845e-07 1.40907582e-06 1.73746994e-05 2.44280000e-06
  4.07055666e-07 1.68134039e-03 6.97249547e-04 3.99918179e-04
  5.23561130e-06 2.09600762e-06 1.83499662e-06 9.80621166e-07
  3.40849556e-06 2.18883597e-05 4.20451368e-04 2.98553914e-07
  5.87273962e-06 6.42072118e-05 1.09077632e-06 2.34286508e-05
  1.31915558e-05 6.57940882e-06 2.19523395e-06 3.37382517e-05
  3.77587014e-04 2.32355997e-05 1.14116631e-06 1.23104604e-04
  2.69983502e-05 1.09301504e-06 1.16705698e-04 3.44546407e-07
  9.85927939e-01 2.00002378e-05 4.68053368e-05 5.84983525e-07
  3.85045460e-05 8.09529738e-04 1.63022960e-05 4.76526020e-06
  1.75739769e-05 1.04708335e-04 3.70099006e-05 1.10122073e-05
  1.06008997e-06 2.00309796e-06 2.81757252e-06 9.

[[ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  1 -1  0  0  0]
 [ 0  0  0 -1  1  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]]
running reward: -1000.00 at episode 3600
tf.Tensor(
[[2.13972683e-07 1.00026182e-06 1.27574367e-05 1.77834465e-06
  2.91488902e-07 1.12352520e-03 1.00162318e-02 2.66687886e-04
  3.75230752e-06 1.49869948e-06 1.31116678e-06 7.00851842e-07
  2.40682562e-06 1.49446550e-05 2.38307694e-04 2.12720366e-07
  4.23240090e-06 4.77121066e-05 7.77131731e-07 1.59965130e-05
  9.03091313e-06 4.73196633e-06 1.55470025e-06 2.31594313e-05
  2.53683043e-04 1.84797573e-05 8.10586016e-07 9.48792585e-05
  1.91820363e-05 7.81161305e-07 8.84535548e-05 2.46758560e-07
  9.81269717e-01 1.41730570e-05 3.16190162e-05 4.16766795e-07
  2.64422179e-05 5.05719043e-04 1.09825687e-05 3.39380244e-06
  1.21873572e-05 1.02775484e-04 2.49842815e-05 7.71278701e-06
  7.69821725e-07 1.44215699e-06 2.03497962e-06 6.

[[ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  1 -1  0  0  0]
 [ 0  0  0 -1  1  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]]
running reward: -1000.00 at episode 4300
tf.Tensor(
[[1.5299597e-07 7.0251002e-07 9.5271726e-06 1.3280629e-06 2.1259929e-07
  6.2454632e-04 9.2402595e-01 1.6375008e-04 2.6563298e-06 1.0583234e-06
  9.3457925e-07 5.0619155e-07 1.6742815e-06 9.6185113e-06 1.5530079e-04
  1.5390519e-07 3.1010834e-06 3.5625693e-05 5.5329332e-07 9.9503613e-06
  5.6598315e-06 3.3731192e-06 1.1050338e-06 1.4329367e-05 1.5691953e-04
  1.5701182e-05 5.7568803e-07 7.6031640e-05 1.3356947e-05 5.6084508e-07
  6.9039823e-05 1.8032780e-07 7.0614576e-02 1.0079517e-05 2.0057045e-05
  3.0077877e-07 1.6476493e-05 3.1882070e-04 7.1108093e-06 2.3735868e-06
  8.1904100e-06 7.9650992e-05 1.5886171e-05 5.2511682e-06 5.7091989e-07
  1.0406782e-06 1.4881545e-06 4.7505341e-07 2.3257194e-07 1.8971456e-05
  2.1237116