In [1]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Configuration parameters for the whole setup
seed = 42
gamma = 0.99  # Discount factor applied to future rewards when computing returns
max_steps_per_episode = 10000
env = gym.make("CartPole-v0")  # Create the environment
env.seed(seed)
# Seeding the environment alone does not make a run reproducible: actions are
# sampled with np.random.choice (NumPy's global RNG) and the Dense layers are
# initialised from TensorFlow's global RNG. Seed both, and do it before any
# layer is created so the initial weights are covered too.
np.random.seed(seed)
tf.random.set_seed(seed)
# float32 machine epsilon; added to the standard deviation when the returns
# are normalised so that the division can never be by zero.
eps = np.finfo(np.float32).eps.item()  # Smallest number such that 1.0 + eps != 1.0

num_inputs = 4  # Size of the observation vector fed to the network
num_actions = 2  # Number of discrete actions the policy chooses between
num_hidden = 128  # Width of the hidden layer shared by actor and critic

# Actor-critic network: one shared hidden layer feeding two heads.
inputs = layers.Input(shape=(num_inputs,))
common = layers.Dense(num_hidden, activation="relu")(inputs)
# Actor head: a probability for each action (softmax output).
action = layers.Dense(num_actions, activation="softmax")(common)
# Critic head: a single scalar value estimate for the input state.
critic = layers.Dense(1)(common)

model = keras.Model(inputs=inputs, outputs=[action, critic])

optimizer = keras.optimizers.Adam(learning_rate=0.007)
huber_loss = keras.losses.Huber()
# Per-episode buffers; the training loop fills them during a rollout and
# clears them after each optimizer step.
action_probs_history = []  # log-probability of each action that was taken
critic_value_history = []  # critic's value estimate at each step
rewards_history = []  # reward received at each step
running_reward = 0  # exponentially smoothed episode reward
episode_count = 0

# Training loop: play one episode, compute a combined actor + critic loss over
# the whole episode, take one optimizer step, and repeat until the smoothed
# episode reward exceeds the "solved" threshold.
while True:  # Run until solved
    state = env.reset()
    episode_reward = 0
    # The tape only needs to see the forward passes and the loss computation;
    # the gradient computation and optimizer step happen after the context closes.
    with tf.GradientTape() as tape:
        # range(max_steps_per_episode) allows exactly max_steps_per_episode
        # steps (the previous range(1, N) stopped one step short of the cap).
        for _step in range(max_steps_per_episode):
            # env.render(); Adding this line would show the attempts
            # of the agent in a pop up window.

            # Add a leading batch dimension so the model sees a batch of one.
            state = tf.convert_to_tensor(state)
            state = tf.expand_dims(state, 0)

            # Predict action probabilities and estimated future rewards
            # from environment state
            action_probs, critic_value = model(state)
            critic_value_history.append(critic_value[0, 0])

            # Sample action from action probability distribution.
            # A dedicated name is used so the module-level `action` tensor
            # (the actor output layer) is not overwritten by an integer.
            sampled_action = np.random.choice(num_actions, p=np.squeeze(action_probs))
            action_probs_history.append(tf.math.log(action_probs[0, sampled_action]))

            # Apply the sampled action in our environment
            state, reward, done, _ = env.step(sampled_action)
            rewards_history.append(reward)
            episode_reward += reward

            if done:
                break

        # Update running reward to check condition for solving
        # (exponential moving average with a 0.05 weight on the newest episode).
        running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward

        # Calculate expected value from rewards
        # - At each timestep what was the total reward received after that timestep
        # - Rewards further in the future are discounted by multiplying them with gamma
        # - These are the labels for our critic
        # Accumulate from the last step backwards with append, then reverse once;
        # inserting at index 0 on every step would be quadratic in episode length.
        returns = []
        discounted_sum = 0
        for r in reversed(rewards_history):
            discounted_sum = r + gamma * discounted_sum
            returns.append(discounted_sum)
        returns.reverse()

        # Normalize to zero mean / unit variance; eps guards against a zero std.
        returns = np.array(returns)
        returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
        returns = returns.tolist()

        # Calculating loss values to update our network
        history = zip(action_probs_history, critic_value_history, returns)
        actor_losses = []
        critic_losses = []
        for log_prob, value, ret in history:
            # At this point in history, the critic estimated that we would get a
            # total reward = `value` in the future. We took an action with log probability
            # of `log_prob` and ended up receiving a total reward = `ret`.
            # The actor must be updated so that it predicts an action that leads to
            # high rewards (compared to critic's estimate) with high probability.
            diff = ret - value
            actor_losses.append(-log_prob * diff)  # actor loss

            # The critic must be updated so that it predicts a better estimate of
            # the future rewards.
            critic_losses.append(
                huber_loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0))
            )

        # Single scalar objective for the whole episode.
        loss_value = sum(actor_losses) + sum(critic_losses)

    # Backpropagation
    grads = tape.gradient(loss_value, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # Clear the loss and reward history so the next episode starts empty
    action_probs_history.clear()
    critic_value_history.clear()
    rewards_history.clear()

    # Log details
    episode_count += 1
    template = "running reward: {:.2f} at episode {}"
    print(template.format(running_reward, episode_count))

    if running_reward > 195:  # Condition to consider the task solved
        print("Solved at episode {}!".format(episode_count))
        break


2022-02-27 16:35:58.080883: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-02-27 16:35:58.080912: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
  logger.warn(
2022-02-27 16:35:59.874397: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-27 16:35:59.875445: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-02-27 16:35:59.875669: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dler

running reward: 1.10 at episode 1
running reward: 1.65 at episode 2
running reward: 3.31 at episode 3
running reward: 4.00 at episode 4
running reward: 8.10 at episode 5
running reward: 8.84 at episode 6
running reward: 9.90 at episode 7
running reward: 10.86 at episode 8
running reward: 12.11 at episode 9
running reward: 13.56 at episode 10
running reward: 14.63 at episode 11
running reward: 14.45 at episode 12
running reward: 15.93 at episode 13
running reward: 15.68 at episode 14
running reward: 15.74 at episode 15
running reward: 15.66 at episode 16
running reward: 15.42 at episode 17
running reward: 15.15 at episode 18
running reward: 14.90 at episode 19
running reward: 14.80 at episode 20
running reward: 16.31 at episode 21
running reward: 17.20 at episode 22
running reward: 17.59 at episode 23
running reward: 17.46 at episode 24
running reward: 19.03 at episode 25
running reward: 19.08 at episode 26
running reward: 18.73 at episode 27
running reward: 20.49 at episode 28
running 

running reward: 112.03 at episode 226
running reward: 113.73 at episode 227
running reward: 111.89 at episode 228
running reward: 113.10 at episode 229
running reward: 115.30 at episode 230
running reward: 114.93 at episode 231
running reward: 115.88 at episode 232
running reward: 115.24 at episode 233
running reward: 115.83 at episode 234
running reward: 116.74 at episode 235
running reward: 118.50 at episode 236
running reward: 118.17 at episode 237
running reward: 119.42 at episode 238
running reward: 121.94 at episode 239
running reward: 118.05 at episode 240
running reward: 120.10 at episode 241
running reward: 121.49 at episode 242
running reward: 123.12 at episode 243
running reward: 125.36 at episode 244
running reward: 128.19 at episode 245
running reward: 129.78 at episode 246
running reward: 133.29 at episode 247
running reward: 136.63 at episode 248
running reward: 139.80 at episode 249
running reward: 137.46 at episode 250
running reward: 140.58 at episode 251
running rewa

running reward: 161.59 at episode 442
running reward: 163.51 at episode 443
running reward: 165.34 at episode 444
running reward: 167.07 at episode 445
running reward: 168.72 at episode 446
running reward: 170.28 at episode 447
running reward: 171.77 at episode 448
running reward: 173.18 at episode 449
running reward: 174.52 at episode 450
running reward: 175.79 at episode 451
running reward: 177.00 at episode 452
running reward: 178.15 at episode 453
running reward: 179.25 at episode 454
running reward: 180.28 at episode 455
running reward: 181.27 at episode 456
running reward: 182.21 at episode 457
running reward: 183.10 at episode 458
running reward: 183.94 at episode 459
running reward: 184.74 at episode 460
running reward: 185.51 at episode 461
running reward: 186.23 at episode 462
running reward: 186.92 at episode 463
running reward: 187.57 at episode 464
running reward: 188.19 at episode 465
running reward: 188.78 at episode 466
running reward: 189.35 at episode 467
running rewa

ImportError: cannot import name 'plot_model' from 'keras.utils' (/home/fulton/.local/lib/python3.9/site-packages/keras/utils/__init__.py)