##### Copyright 2020 The TensorFlow Authors.

In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Actor-Critic Method for Explainability


**Actor-Critic methods**

Actor-Critic methods are [temporal difference (TD) learning](https://en.wikipedia.org/wiki/Temporal_difference_learning) methods that represent the policy function independent of the value function. 

A policy function (or policy) returns a probability distribution over actions that the agent can take based on the given state.
A value function determines the expected return for an agent starting at a given state and acting according to a particular policy forever after.

In the Actor-Critic method, the policy is referred to as the *actor* that proposes a set of possible actions given a state, and the estimated value function is referred to as the *critic*, which evaluates actions taken by the *actor* based on the given policy.

In this tutorial, both the *Actor* and *Critic* will be represented using one neural network with two outputs.


In [2]:
import collections
import numpy as np
import tensorflow as tf
import tqdm
import pandas as pd
from matplotlib import pyplot as plt
from tensorflow.keras import layers
from typing import Any, List, Sequence, Tuple

ModuleNotFoundError: No module named 'tensorflow'

## Model

The *Actor* and *Critic* will be modeled using one neural network that generates the action probabilities and critic value respectively. We use model subclassing to define the model. 

During the forward pass, the model will take in the state as the input and will output both action probabilities and critic value $V$, which models the state-dependent [value function](https://spinningup.openai.com/en/latest/spinningup/rl_intro.html#value-functions). The goal is to train a model that chooses actions based on a policy $\pi$ that maximizes expected [return](https://spinningup.openai.com/en/latest/spinningup/rl_intro.html#reward-and-return).

For CartPole-v0, there are four values representing the state: cart position, cart velocity, pole angle, and pole velocity, respectively. The agent can take one of two actions: push the cart left (0) or push it right (1).

Refer to [Barto, Sutton, and Anderson (1983)](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf), the paper describing the cart-pole problem that OpenAI Gym's CartPole-v0 is based on, for more information.


In [3]:
class ActorCritic(tf.keras.Model):
  """Two-headed network: one shared hidden layer feeding an actor and a critic.

  The actor head emits one unnormalised value per action; the critic head
  emits a single scalar value estimate for the same input.
  """

  def __init__(self, num_actions: int, num_hidden_units: int):
    """Creates the shared hidden layer and the two output heads.

    Args:
      num_actions: Width of the actor head's output.
      num_hidden_units: Width of the shared ReLU hidden layer.
    """
    super().__init__()

    # Keep this creation order: another cell reaches the actor head
    # positionally through `model.layers[1]`.
    self.common = layers.Dense(units=num_hidden_units, activation="relu")
    self.actor = layers.Dense(units=num_actions)
    self.critic = layers.Dense(units=1)

  def call(self, inputs: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]:
    """Returns `(actor_output, critic_output)` for a batch of states."""
    # Shared representation of width `num_hidden_units`, consumed by both heads.
    hidden = self.common(inputs)
    actor_out = self.actor(hidden)
    critic_out = self.critic(hidden)
    return actor_out, critic_out

NameError: name 'tf' is not defined

In [51]:
# Load the match statistics and standardise the modelling columns.
from scipy.stats import zscore

# Positional indices of the columns to keep; the recorded output shows five
# feature columns followed by `performance`.
cols = [2,4,7,9,16,27]

df = (
    pd.read_csv("soccerdatacsv.csv")
    .dropna(subset=['Shots'])  # drop the rows (not columns) with no Shots value
    .iloc[:, cols]
    .apply(zscore)             # per-column z-score: centred and scaled, not a 0-1 range
)
print(df)

       Shots  KeyPasses  Offsides  UnsTouches    Passes  performance
0  -0.619834  -0.644531 -0.392232   -0.822179 -0.433577     0.098511
1   0.021374  -0.644531 -0.392232    0.298974  0.387482     1.098106
3  -0.619834  -0.644531 -0.392232   -0.822179 -0.087868    -1.846353
5  -0.619834  -0.644531 -0.392232    0.298974 -0.044654    -0.249174
7  -0.619834  -0.644531  2.549510    1.420127  0.906046    -0.607725
9  -0.619834   0.373149 -0.392232    0.298974  0.473909     0.674364
11 -0.619834  -0.644531 -0.392232    0.298974  1.381396    -0.042736
13  0.021374  -0.644531 -0.392232    1.420127 -0.952140    -0.901084
15  0.662581   0.373149 -0.392232    2.541279  0.257841     0.543982
17  0.021374  -0.644531 -0.392232    1.420127 -0.217509     1.358870
19  0.662581   1.390830  2.549510    2.541279  0.949259     0.141971
21 -0.619834  -0.644531 -0.392232    0.298974 -1.773199    -1.129252
23 -0.619834  -0.644531 -0.392232   -0.822179 -1.341063    -0.672916
25 -0.619834  -0.644531 -0.392232 

In [30]:
# Set up the actor-critic network: 5 input features, 5 action outputs.
num_actions = 5
num_hidden_units = 32

model = ActorCritic(num_actions, num_hidden_units)
model.build((None,5))
model.summary()
# print(model.layers[0].get_weights())
# print(model.layers[1].weights)

# Shift every parameter of the second layer (the actor head) by +0.1.
# `get_weights()` returns a list [kernel, bias] whose arrays have different
# shapes. Wrapping that list in `np.array(...)` builds a ragged array, which
# recent NumPy versions reject, so each array is offset individually instead.
a = model.layers[1].get_weights()
model.layers[1].set_weights([weights + 0.1 for weights in a])

# Start modelling with 5 attributes, each with an initial exponent of 1.
start_state = np.array([1,1,1,1,1])
tensor_state = tf.constant([1,1,1,1,1], dtype=tf.float32)

Model: "actor_critic_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              multiple                  192       
_________________________________________________________________
dense_4 (Dense)              multiple                  165       
_________________________________________________________________
dense_5 (Dense)              multiple                  33        
Total params: 390
Trainable params: 390
Non-trainable params: 0
_________________________________________________________________


In [5]:
# Scratch check: the `performance` value of the first row.
df['performance'].iloc[0]
# print(model.layers[1].weights)

7.18

In [52]:
# Collect training data by running a simulation; the DataFrame `df` read from
# the CSV plays the role of the environment.
# Reward: an action adjusts the per-feature exponents (the state), and the
# reward grows as the resulting prediction gets closer to the row's true
# `performance` value.
# TODO: decide what "done" means here; the state acts like a hidden state.
# Actions are the rounded tanh outputs of the actor head (-1, 0 or +1 per
# feature); it is the CSV feature values, not the actions, that are multiplied
# by 10 before exponentiation.
# Original note: "time maps to place in state" -- presumably the timestep
# selects the row of `df`; confirm.
# NOTE(review): the original note listed shots, tackles, passes, dribbles and
# disp as the features, but the recorded output shows Shots, KeyPasses,
# Offsides, UnsTouches and Passes -- confirm which set is intended.
# Global timestep: the row of `df` that custom_env_step2 reads. Nothing in the
# visible cells increments it.
step = 0
# state = np.array([1,1,1,1,1])
# # action is wether to increase or decrease the state exponet
# def custom_env_step(action: np.array) -> (np.array, float, bool):
#   # check if done
#   if step > len(df):
#     return np.array([1,1,1]), 2.0, True
#   # action affects state
#   tempstate = [state[i] + 1 if x == 1 else (state[i] -1 if x == -1 else state[i]) for i,x in enumerate(action) ] 
#   # calculate the reward, mutliply values by 10 so exponents work bc .5^2 is smaller than .5
#   # total = sum(np.power(b,f) for f, b in zip(start_state,save.numpy()[0] ))
#   model_predict_perf = sum([np.power(value*10,state[index]) for index,value in enumerate(df.iloc[[step],[2,4,7,9,16]].values[0]) ])
#   true_perf = df.iloc[step]['performance']
#   reward = 10/ np.absolute(model_predict_perf-true_perf)
#   reward = tf.convert_to_tensor(reward)
#   reward = tf.cast(reward, tf.float32)
#   return tempstate, reward, False
def custom_env_step2(
    action: tf.Tensor,
    state: tf.Tensor = None) -> Tuple[tf.Tensor, tf.Tensor, bool]:
  """Applies `action` to the exponent state and scores it against row `step`.

  Reads the module-level `df` and `step`. The first five columns of row `step`
  are multiplied by 10 and raised element-wise to the updated exponents; the
  sum of those terms is the predicted performance. The reward is
  `10 / |prediction - true performance|`.

  Args:
    action: Per-feature exponent adjustment; added to the current state.
    state: Current exponents. Defaults to the module-level `tensor_state`,
      which is what this function always used before the parameter existed.
      The recorded driver output shows the stepped state restarting from
      `tensor_state` every episode, so callers that track their own state
      should pass it here.

  Returns:
    `(next_state, reward, done)`. When `step` is past the last row of `df`
    the state is returned unchanged with a fixed reward of 2.0 and
    `done=True`.
  """
  base_state = tensor_state if state is None else state

  # `>=` rather than `>`: `df.iloc[len(df)]` is already out of bounds.
  if step >= len(df):
    # Return the unchanged state as a tensor, so the terminal state has the
    # same number of elements as the one the model was built for.
    return base_state, tf.constant(2.0, dtype=tf.float32), True

  # action affects state
  tempstate = base_state + action
  print('temp', tempstate)

  # Multiply the features by 10 so the exponents behave as intended
  # (e.g. 0.5 ** 2 is smaller than 0.5).
  fromcsv = tf.cast(df.iloc[[step],[0,1,2,3,4]].values[0], tf.float32)
  model_predict_perf = tf.pow(fromcsv * 10, tempstate)
  true_perf = tf.cast(df.iloc[step]['performance'], tf.float32)

  # Floor the absolute error so an exact prediction yields a large finite
  # reward instead of a division by zero.
  error = tf.reduce_sum(model_predict_perf) - true_perf
  reward = 10.0 / tf.maximum(tf.abs(error), 1e-6)
  return tempstate, reward, False

# s,r,b = custom_env_step2(tensor_state)
# print('s',s)
# print('r',r)
# Accumulator for loss values; no visible cell appends to or reads from it yet.
losses = []
# simulation for train data
def run_perfomance_episode(
    initial_state: tf.Tensor,
    model: tf.keras.Model,
    max_steps: int) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
  """Runs a single episode to collect training data.

  Each step feeds the current state to `model`, turns the actor output into a
  rounded tanh action, applies it with `custom_env_step2`, and records the
  action, the critic value and the reward for that step.

  Args:
    initial_state: Starting exponents, unbatched or already batched.
    model: Network returning `(actor_output, critic_output)` for a batch.
    max_steps: Upper bound on the number of environment steps.

  Returns:
    `(actions, critic_values, rewards)`, each stacked along a leading time
    axis with one entry per executed step. With one step the shapes are
    `(1, 1, 5)`, `(1,)` and `(1,)`, matching the recorded output.

  NOTE(review): the first returned tensor holds rounded tanh outputs (values
  in {-1, 0, 1}), not probabilities. `compute_loss` takes `tf.math.log` of
  its first argument, so passing these values there would evaluate log(0) or
  log of a negative number -- confirm the intended action representation.
  """
  action_probs = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
  critic_values = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
  rewards = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)

  state = initial_state

  for t in range(max_steps):
    # Batch of one. Reshape instead of expand_dims: the state handed back by
    # the environment is already batched, and expanding it again would add
    # an extra axis on every iteration.
    state = tf.reshape(state, (1, -1))

    # Run the model to get the raw action values and the critic value.
    action_values, critic_value = model(state)
    b = tf.round(tf.keras.activations.tanh(action_values))
    print('model actions',b)

    # Index by `t` so each step gets its own slot; writing to slot 0 every
    # time would keep only the most recent step.
    critic_values = critic_values.write(t, tf.squeeze(critic_value))
    action_probs = action_probs.write(t, b)

    # Apply the action to the environment to get the next state and reward.
    state, reward, done = custom_env_step2(b)
    rewards = rewards.write(t, reward)

    if done:
      break

  # Stack only after the loop: stacking inside it replaces the TensorArrays
  # with plain tensors, which have no `.write` for the next iteration.
  return action_probs.stack(), critic_values.stack(), rewards.stack()

# Smoke test: run one single-step episode from the all-ones state and keep the
# (actions, values, rewards) tuple in a global for inspection.
print('tensor state', tensor_state)
episoderesults = run_perfomance_episode(
    initial_state=tensor_state, model=model, max_steps=1)
print(episoderesults)

tensor state tf.Tensor([1. 1. 1. 1. 1.], shape=(5,), dtype=float32)
model actions tf.Tensor([[1. 0. 1. 1. 0.]], shape=(1, 5), dtype=float32)
temp tf.Tensor([[2. 1. 2. 2. 1.]], shape=(1, 5), dtype=float32)
(<tf.Tensor: shape=(1, 1, 5), dtype=float32, numpy=array([[[1., 0., 1., 1., 0.]]], dtype=float32)>, <tf.Tensor: shape=(1,), dtype=float32, numpy=array([-0.10710674], dtype=float32)>, <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.09047959], dtype=float32)>)


In [8]:
# Actor-critic loss. The critic term is a Huber loss summed (not averaged)
# over all elements.
huber_loss = tf.keras.losses.Huber(reduction=tf.keras.losses.Reduction.SUM)


def compute_loss(
    action_probs: tf.Tensor,
    values: tf.Tensor,
    returns: tf.Tensor) -> tf.Tensor:
  """Returns the actor loss plus the critic loss.

  Args:
    action_probs: Values whose logarithm is weighted by the advantage.
    values: Critic estimates.
    returns: Targets computed from the rewards.
  """
  # Critic term: summed Huber loss between the estimates and the returns.
  critic_term = huber_loss(values, returns)

  # Actor term: negative sum of log-values weighted by (returns - values).
  advantage = returns - values
  weighted_log_probs = tf.math.log(action_probs) * advantage
  actor_term = -tf.math.reduce_sum(weighted_log_probs)

  return actor_term + critic_term


# Expected return = discounted sum of rewards. Rewards received now are worth
# more than rewards received later, which helps the function converge.
def get_expected_return(
    rewards: tf.Tensor,
    gamma: float,
    standardize: bool = True) -> tf.Tensor:
  """Compute expected returns per timestep.

  Args:
    rewards: Per-step rewards, ordered first step to last.
    gamma: Discount factor applied to each later reward.
    standardize: If true, shift and scale the returns to zero mean and unit
      standard deviation.

  Returns:
    A float32 tensor with one discounted return per timestep.
  """
  # Small constant that keeps the standardisation from dividing by zero.
  # Defined locally because no visible cell defines a module-level `eps`,
  # so the default `standardize=True` path would otherwise raise NameError.
  eps = np.finfo(np.float32).eps.item()

  n = tf.shape(rewards)[0]
  returns = tf.TensorArray(dtype=tf.float32, size=n)

  # Start from the end of `rewards` and accumulate reward sums
  # into the `returns` array
  rewards = tf.cast(rewards[::-1], dtype=tf.float32)
  discounted_sum = tf.constant(0.0)
  discounted_sum_shape = discounted_sum.shape
  for i in tf.range(n):
    reward = rewards[i]
    discounted_sum = reward + gamma * discounted_sum
    # Pin the static shape so it stays a scalar across loop iterations.
    discounted_sum.set_shape(discounted_sum_shape)
    returns = returns.write(i, discounted_sum)
  # Undo the reversal so entry k is the return from timestep k onwards.
  returns = returns.stack()[::-1]

  if standardize:
    returns = ((returns - tf.math.reduce_mean(returns)) /
               (tf.math.reduce_std(returns) + eps))

  return returns

In [9]:
# Optimizer used to update the network weights.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)


@tf.function
def train_step(
    initial_state: tf.Tensor,
    model: tf.keras.Model,
    optimizer: tf.keras.optimizers.Optimizer,
    gamma: float,
    max_steps_per_episode: int) -> tf.Tensor:
  """Runs one episode and applies a single gradient update to `model`.

  Args:
    initial_state: State the episode starts from.
    model: Actor-critic network to train.
    optimizer: Optimizer that applies the gradients.
    gamma: Discount factor used when turning rewards into returns.
    max_steps_per_episode: Upper bound on the episode length.

  Returns:
    The sum of the rewards collected during the episode.
  """
  with tf.GradientTape() as tape:
    # Run the model for one episode to collect training data. Everything is
    # kept as tensors so the gradient chain is not broken.
    action_probs, values, rewards = run_perfomance_episode(
        initial_state, model, max_steps_per_episode)

    # Train against discounted returns using the caller's `gamma`.
    # Standardising a single-step episode would map its only return to zero,
    # so standardise only when more than one step is possible.
    returns = get_expected_return(
        rewards, gamma, standardize=max_steps_per_episode > 1)

    # Flatten to (steps, features) and (steps, 1) so the advantage broadcasts
    # across the action dimension for any episode length.
    num_steps = tf.shape(rewards)[0]
    action_probs = tf.reshape(action_probs, [num_steps, -1])
    values = tf.reshape(values, [num_steps, 1])
    returns = tf.reshape(returns, [num_steps, 1])

    lossey = compute_loss(action_probs, values, returns)

  grads = tape.gradient(lossey, model.trainable_variables)
  optimizer.apply_gradients(zip(grads, model.trainable_variables))

  episode_reward = tf.math.reduce_sum(rewards)

  return episode_reward

In [53]:
# Driver cell: roll the state forward by hand over a few one-step episodes.
max_episodes = 10000
max_steps_per_episode = 1000

# Leftovers from the CartPole tutorial (Cartpole-v0 counts as solved when the
# average reward is >= 195 over 100 consecutive trials); the manual loop
# below does not read them.
reward_threshold = 195
running_reward = 0

# Discount factor for future rewards
gamma = 0.99

# Manual loop for now; the tqdm-driven loop over `max_episodes` is not wired up.
new_state = tf.constant([1,1,1,1,1], dtype=tf.float32)
for i in range(5):
  print('episode', i)
  print('OG new', new_state)
  episoderesults = run_perfomance_episode(
      initial_state=new_state, model=model, max_steps=1)
  print(episoderesults)
  # train_step(new_state,model,optimizer,0.8, max_steps_per_episode  )
  print(episoderesults[0][0])
  # The (1, 5) action plus the (5,) state broadcasts to (1, 5); `[0]` drops
  # the batch axis again.
  new_state = (new_state + episoderesults[0][0])[0]
  print('new state' , new_state)

episode 0
OG new tf.Tensor([1. 1. 1. 1. 1.], shape=(5,), dtype=float32)
model actions tf.Tensor([[1. 0. 1. 1. 0.]], shape=(1, 5), dtype=float32)
temp tf.Tensor([[2. 1. 2. 2. 1.]], shape=(1, 5), dtype=float32)
(<tf.Tensor: shape=(1, 1, 5), dtype=float32, numpy=array([[[1., 0., 1., 1., 0.]]], dtype=float32)>, <tf.Tensor: shape=(1,), dtype=float32, numpy=array([-0.10710674], dtype=float32)>, <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.09047959], dtype=float32)>)
tf.Tensor([[1. 0. 1. 1. 0.]], shape=(1, 5), dtype=float32)
new state tf.Tensor([2. 1. 2. 2. 1.], shape=(5,), dtype=float32)
episode 1
OG new tf.Tensor([2. 1. 2. 2. 1.], shape=(5,), dtype=float32)
model actions tf.Tensor([[1. 1. 1. 1. 0.]], shape=(1, 5), dtype=float32)
temp tf.Tensor([[2. 2. 2. 2. 1.]], shape=(1, 5), dtype=float32)
(<tf.Tensor: shape=(1, 1, 5), dtype=float32, numpy=array([[[1., 1., 1., 1., 0.]]], dtype=float32)>, <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.55251276], dtype=float32)>, <tf.Tensor:

In [56]:
# Scratch check of the reward arithmetic for row `step`.
value = df.iloc[[step],[0,1,2,3,4]].values[0]
print(value)
print(df.iloc[0]['performance'])
state = np.array([6., 5.,  6.,  6.0,  1.0])
# Positive features are scaled by 10 and raised to their exponent; every
# other feature contributes a flat 1.
sum(np.power(feature*10, state[position]) if feature > 0 else 1
    for position, feature in enumerate(value))

[-0.6198336  -0.64453085 -0.39223227 -0.8221786  -0.43357672]
0.09851079871438467


5