In [2]:
# Neural Network
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, ReLU
from tensorflow.keras.optimizers import Adam
# Environment
import gym
# Further support
import numpy as np
import time
import scipy.signal
from tqdm import tqdm

Init Plugin
Init Graph Optimizer
Init Kernel


# TrajectoryStorage


In [3]:
class Storage:
    '''
    Contains all information the agent collects interacting with the environment.
    '''


    def __init__(self):
        '''
        Initializes empty lists as storages all observation variables during trajectory
        '''
        # Saves information about the current state of the agent at each step
        self.observations = []

        # Saves actions made and rewards achieved
        self.actions = []
        self.logits = []
        self.rewards = []
        self.BaselineEstimate = []

        # finished episodes will be completely stored in this list 
        self.episodes = []


    def store(self, observation, action, logits, reward, BaselineEstimate):
        '''
        Adds given information to the storage.

        Args:
        observation(obj): information (e.g. pixel values) about current state of agent
        action(float): Output of the actor network. Describes the action taken
        logits():
        reward(floats): Rewards collected by agent
        BaselineEstimate():
        '''
        self.observations.append(observation)
        self.actions.append(action)
        self.logits.append(logits)
        self.rewards.append(reward)
        self.BaselineEstimate.append(BaselineEstimate) # value of critics network
        

    def conclude_episode(self):
        '''
        Append all collect values to episodes list once one episode is finished.
        Computes all rewards collected in one episode. Prepares storage for next episode.
        '''
        self.episodes.append(
            [self.observations,
             self.actions, 
             self.logits,
             self.rewards,
             self.BaselineEstimate,
             # Get the return of the whole episode 
             sum(self.rewards))
             
        # Empty the arrays for new trajectory
        self.observations.clear()
        self.actions.clear()
        self.logits.clear()
        self.rewards.clear()
        self.BaselineEstimate.clear()

     
    def get_episodes(self):
        '''
        Returns list containing finished trajectories stored in self.episodes
        and the amount of episodes passed.
        '''
        return self.episodes, len(self.episodes)
        
        

# Actor Model


In [4]:
class Actor(Model):
    '''
    Neural network computing the actions the agent will take
    '''


    def __init__(self):
        '''
        Initialize layer architecture for Actor Network.
        '''
        # Subclassing API
        super(Actor, self).__init__()
        
        self.l = [
            # Three Dense Layers with random initial parameters having a standart deviation of 0.01
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            
            # Output layer with softmax activation function applied to for neurons.
            # Outputs prpobability for each of our for actions 
            # (Do nothing, fire left orientation engine, fire main engine, fire right orientation engine)
            Dense(4, activation="softmax")
        ]


    #@tf.function        
    def call(self, x):
        '''
        Iterates input x through network to create softmax ouutput.

        Args:
        x(): Network input. Pixel values representing the current state of the agent
        '''
        for l in self.l:
            x = l(x)
        return x


#####  logits = actor(observation) -> actor must be in capitol, gets instantiated twice, maybe idea is wrong
#@tf.function
def sample_action(observation):
    '''
    Calls the actor network with state of the agent and returns the network object + the samnpled action

    Args:
    observation(): Representation of actors state. Same as x in the call function. 
    '''
    # Create actor object
    logits = actor(observation)
   # tf.print(type(logits))
    # Sample action from the Softmax output of the network
    action = tf.squeeze(tf.random.categorical(logits, 1), axis=1)
   # tf.print(action)
    return logits, action

# Critic Model

In [5]:
class Critic(Model):
    '''
    Represents the value function of the network. 
    Input is a certain state and output a float value for that state.
    '''


    def __init__(self):
        '''
        Initialize layer architecture for Actor Network.
        '''
        # Subclassing API
        super(Critic, self).__init__()

        self.l = [
            # Three Dense Layers with ReLu activation function
            # Random initial parameters having a standart deviation of 0.01
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(128, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),
            Dense(64, activation="relu", kernel_initializer=tf.random_normal_initializer
            (stddev=0.01)),

            # Output layer with Tanh activation function to get float output value ([-1;1])
            # Random initial parameters having a standart deviation of 0.01
            Dense(1, activation="tanh", kernel_regularizer=tf.random_normal_initializer(stddev=0.01))
        ]


    #@tf.function 
    def call(self, x):
        '''
        Iterates input x through network to create tanh output between -1 and 1 
        giving input state x a value.

        Args:
        x(): Network input. Pixel values representing the current state of the agent.
        '''
        for l in self.l:
            x = l(x)
        return x

In [6]:
'''
Define Hyperparameters
'''

# Number of iterations
epochs = 1
# Leads to ~10 Episodes per epoch, then compute new parameters (smaller batching)
steps_per_epoch = 1000 

# Learning rate for actor and critic
lr_actor = 3e-4
lr_critic = 3e-4

# Movements in environment (state-space) to collect training data
train_policy_iterations = 80
train_value_iterations = 80

# Parameter to decide how strongly the policy ratio gets clipped therefore how much policy (actor network)
#  updates we allow
# The selected 0.2 is the number proposed by the original paper by OpenAI
clip_ratio = 0.2

#
target_kl = 0.01


# Update weights with Adam optimizer
optimizer = Adam()

# To toggle displaying of environment
render = False

# Discount variable for rewards to whey immediate rewards stronger
gamma = 0.99

In [7]:
# Reset all states generated by Keras
tf.keras.backend.clear_session()

# Define environment
env = gym.make("LunarLander-v2")
# Get dimensions of state and amount of possible actions (4 for LunarLander-v2)
observation_dimensions = env.observation_space.shape[0]
num_actions = env.action_space.n

# create Storage object to save observations, actions, rewards etc. during trajectory
T = Storage()

# initialize actor and critics model
observation_input = Input(shape=(observation_dimensions,), dtype=tf.float32)
actor = Actor()
critic = Critic()

# Initialize: observation(agent state), 
# episode return(summed rewards for singe ) and 
# episode length(amount of steps taken (=frames) before agent finished)
observation, episode_return, episode_length = env.reset(), 0, 0

Metal device set to: Apple M1

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2022-04-01 12:47:34.348637: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-04-01 12:47:34.348758: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [8]:
'''
Agent takes steps in environment according to current policy. Information gets saved to update policy.
'''


episodes_total = 0
# Iteration of whole training process
for epoch in range(epochs):

    # Initialize values for return, length and episodes
    sum_return = 0
    sum_length = 0
    num_episodes = 0

    # Each timestep t of steps_per_epoch (in paper denoted as capital T)
    #  allows takes on action in a state and saves the information in storage object
    for t in tqdm(range(steps_per_epoch)):

        # Toggles displaying of environment
        if render:
            env.render()

        # Reshaping observation to fit as input for Actor network (policy)
        observation = observation.reshape(1,-1)

        # Obtain action and logits for this observation by our actor
        logits, action = sample_action(observation=observation)
        
        # Take action in environment and obtain the rewards for it
        # Variable done represents wether agent has finished 
        # The last variable would be diagnostic information, not needed for training
        observation_new, reward, done, _ = env.step(action[0].numpy())

        # Sum up rewards over this episode and count amount of frames
        episode_return += reward
        episode_length += 1

        # Get the Base-Estimate from the Critics network
        base_estimate = critic(observation)

        # Store Variables collected in this timestep t
        T.store(observation=observation, action=action, logits=logits, reward=reward, BaselineEstimate=base_estimate)
        # Save the new state of our agent
        observation = observation_new
        
        # check if terminal state is reached in environment
        if done:
            # Save information about episode
            T.conclude_episode()
            # Refresh environment and reset return and length value
            observation, episode_return, episode_length = env.reset(), 0, 0

    # obtain all episodes saved in storage
    episodes, amount_episodes = T.get_episodes()

  

        


100%|██████████| 1000/1000 [00:03<00:00, 271.54it/s]


In [None]:
# episodes [episode][particular values // 0: Observations, 1: actions, 2: logits, 3, rewards, 4: BaselineEstimates from Critics]
#print(episodes[0][4])
#print(f'Number of Episodes = {amount_episodes}')



### Advantagefunction


# for i in b_estimates:
#   print(i.numpy())

# Discounted sum of rewards
# Saves list of all rewards in new variable 
#rewards = episodes[0][3]


    



In [10]:
### MIGHT NOT WORK
#  output for: discounted_reward([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 0.99)
#  -> [8.91, 7.920000000000001, 6.930000000000001, 5.94, 4.95, 3.96, 2.9699999999999998, 1.98, 0.99, 0]
# 
#  ###

def discounted_reward(rewards, gamma):
    '''
    weighs all rewards in a way such that immediate rewards have a stronger impact than possible future rewards.

    Args:
    rewards(list): list of all rewards collected by the agent in episode t (?)
    gamma(float): Hyperparameter determining how much future rewards should be weighed in
    '''
    # To select the next reward
    i = 0
    discounted_rewards = []

    # Iterates through every reward and appends a discounted version to the output
    for r in rewards:
        disc = 0
        for t in rewards[i:-1]:
            discount_t = gamma ** t
            disc += t * discount_t
        i += 1
        discounted_rewards.append(disc)

    # returns list of discounted rewards.
    return discounted_rewards
    
    


In [11]:
'''
Computes advantages.
'''

# Saves list of all rewards in new variable 
rewards = episodes[0][3]
# Get discounted sum of rewards 
disc_sum = discounted_reward(rewards, gamma)


# Estimated Value of the current situtation from the critics network
b_estimates = episodes[0][4] 

# Convert lists to np arrays and flatten
disc_sum_np = np.array(disc_sum)
b_estimates_np = np.array(b_estimates)
b_estimates_np = b_estimates_np.flatten()

# substract arrays to obtain advantages
advantages = np.subtract(disc_sum_np, b_estimates_np)
print(advantages)





[-2.35283260e+02 -2.34935956e+02 -2.33778553e+02 -2.30530271e+02
 -2.31134611e+02 -2.31665327e+02 -2.30913749e+02 -2.31794131e+02
 -2.33008851e+02 -2.31881619e+02 -2.31563902e+02 -2.31108577e+02
 -2.29256318e+02 -2.28340632e+02 -2.26305194e+02 -2.25007255e+02
 -2.25642241e+02 -2.25227027e+02 -2.23800056e+02 -2.21611360e+02
 -2.20228909e+02 -2.19945791e+02 -2.19777143e+02 -2.17261652e+02
 -2.14520389e+02 -2.11413892e+02 -2.09469299e+02 -2.08439508e+02
 -2.07871874e+02 -2.05125732e+02 -2.04040228e+02 -2.02319820e+02
 -2.02136934e+02 -2.02197151e+02 -2.00515737e+02 -1.97576560e+02
 -1.96587467e+02 -1.93456302e+02 -1.91390592e+02 -1.90252301e+02
 -1.88193467e+02 -1.87124362e+02 -1.84108562e+02 -1.82056672e+02
 -1.80022864e+02 -1.78774245e+02 -1.76945881e+02 -1.75911674e+02
 -1.77898696e+02 -1.77353453e+02 -1.74853551e+02 -1.75222053e+02
 -1.74639334e+02 -1.71780002e+02 -1.69993101e+02 -1.68227590e+02
 -1.67326396e+02 -1.64657098e+02 -1.63422462e+02 -1.60809907e+02
 -1.59814817e+02 -1.58160

# LogProbs and Ratio Computation

We need the ratio of probabilities for an action at state t of the 'new' model vs the old model (maybe because of entropy there is a difference)

In [31]:

#creating oneHot vector with size actions space and getting the log for the probability of choosing this action

print(f'log old: {logits_old}')
print(f'log new: {logits_new}')


### this function currently only takes one single action and 2 sets of logits and computes the ratio of that

def get_ratio(action, logits_old, logits_new):

    #get the Logarithmic version of all logits for computational efficiency
    log_prob_old = tf.nn.log_softmax(logits_old)
    log_prob_new = tf.nn.log_softmax(logits_new)

    # encode in OneHotVector and reduce to sum, giving the log_prob for the action the agent took for both policies
    logprobability_old = tf.reduce_sum(
        tf.one_hot(action, num_actions) * log_prob_old, axis=1
    )
    logprobability_new = tf.reduce_sum(
        tf.one_hot(action, num_actions) * log_prob_new, axis=1
    )
    # get the ratio of new over old prob
    ratio = tf.exp(logprobability_new - logprobability_old)

    print(ratio)




# using one Episode as example to get the prob ratio of old vs new
logits_old = episodes[0][2][0]
action = episodes[0][1][0].numpy()
obs = episodes[0][0]
logits_new = []
for i in obs:
    tensor = tf.convert_to_tensor(i)
    new, _ = sample_action(tensor)
    logits_new.append(new) 

logits_new = logits_new[0]

get_ratio(action, logits_old, logits_new)



log old: [[0.24999921 0.24999891 0.24999474 0.25000718]]
log new: [[0.24999921 0.24999891 0.24999474 0.25000718]]
tf.Tensor([1.], shape=(1,), dtype=float32)
