In [1]:
from collections import deque
import gym
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow import keras

2023-02-01 14:44:34.846139: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-01 14:44:34.955327: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-10.1/lib64${LD_LIBRARY_PATH:+:}
2023-02-01 14:44:34.955342: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-02-01 14:44:35.519840: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.

In [2]:
def epsilon_greedy_policy(state, model, epsilon=0, n_outputs=2):
    """Choose an action with an epsilon-greedy rule.

    Args:
        state: Observation to evaluate; a batch axis is prepended before it
            is handed to ``model.predict``.
        model: Object whose ``predict`` returns one row of Q-values per input.
            It is only consulted on the greedy branch.
        epsilon: Probability of picking a uniformly random action instead of
            the greedy one. With the default of 0 the choice is always greedy.
        n_outputs: Number of discrete actions to draw from when exploring.
            Defaults to 2, the value that used to be hard-coded here.

    Returns:
        Index of the chosen action.
    """
    if np.random.rand() < epsilon:
        # Explore: uniform draw over every available action.
        return np.random.randint(n_outputs)
    # Exploit: take the action with the highest predicted Q-value.
    Q_values = model.predict(state[np.newaxis], verbose=0)[0]
    return np.argmax(Q_values)

def play_one_step(env, state, epsilon, model):
    """Select an action epsilon-greedily, apply it to `env`, and report the outcome.

    Returns:
        A tuple ``(next_state, reward, done, action)`` where ``done`` is 1 if
        the environment flagged the step as terminated or truncated, else 0.
    """
    chosen_action = epsilon_greedy_policy(state, model, epsilon)
    # env.step is expected to yield exactly five values; `info` is discarded.
    observation, step_reward, terminated, truncated, _info = env.step(chosen_action)
    # Collapse both end-of-episode signals into one integer flag.
    episode_over = int(bool(terminated or truncated))
    return observation, step_reward, episode_over, chosen_action

def sample_experiences(batch_size, replay_buffer):
    """Draw `batch_size` experiences uniformly at random, with replacement.

    Each experience is indexed positionally; fields 0-4 are stacked into one
    NumPy array per field.

    Returns:
        A tuple ``(states, actions, rewards, next_states, dones)`` of arrays,
        each with `batch_size` entries along the first axis.
    """
    picks = np.random.randint(len(replay_buffer), size=batch_size)
    sampled = [replay_buffer[pick] for pick in picks]

    def column(field_index):
        # Gather one field across the whole sampled batch.
        return np.array([experience[field_index] for experience in sampled])

    return column(0), column(1), column(2), column(3), column(4)

def training_step(
    batch_size,
    n_outputs,
    discount_factor,
    replay_buffer,
    model,
    loss_fn,
    optimizer,
):
    """Run one Q-learning gradient update on a random mini-batch.

    Args:
        batch_size: Number of experiences to sample from the buffer.
        n_outputs: Number of discrete actions (width of the one-hot mask).
        discount_factor: Weight applied to the best next-state Q-value.
        replay_buffer: Sequence of
            ``(state, action, reward, next_state, done)`` tuples.
        model: Keras model mapping a batch of states to per-action Q-values.
        loss_fn: Callable ``loss_fn(targets, predictions)``; its result is
            averaged with ``tf.reduce_mean``.
        optimizer: Optimizer exposing ``apply_gradients``.

    Returns:
        None. ``model``'s trainable variables are updated in place.
    """
    states, actions, rewards, next_states, dones = sample_experiences(
        batch_size=batch_size, replay_buffer=replay_buffer
    )
    # Bootstrapped target: r + gamma * max_a' Q(s', a'), with the bootstrap
    # term zeroed for transitions whose `done` flag is set.
    next_Q_values = model.predict(next_states, verbose=0)
    max_next_Q_values = np.max(next_Q_values, axis=1)
    target_Q_values = rewards + (1 - dones) * discount_factor * max_next_Q_values
    # Q_values below has shape (batch, 1) because of keepdims=True. Give the
    # targets the same shape; otherwise an elementwise loss would broadcast
    # the 1-D targets against it into a (batch, batch) matrix and compare
    # every prediction with every target in the batch.
    target_Q_values = target_Q_values.reshape(-1, 1)
    # One-hot mask selects the Q-value of the action that was actually taken.
    mask = tf.one_hot(actions, n_outputs)
    with tf.GradientTape() as tape:
        all_Q_values = model(states)
        Q_values = tf.reduce_sum(all_Q_values * mask, axis=1, keepdims=True)
        loss = tf.reduce_mean(loss_fn(target_Q_values, Q_values))
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

In [3]:
# NOTE: this cell is written against the Gym >= 0.26 API: `gym.make` accepts
# `render_mode`, `env.reset()` returns `(observation, info)` and `env.step()`
# returns five values. Older Gym releases reject `render_mode` with a
# TypeError, so upgrade Gym rather than dropping the argument.
env = gym.make("CartPole-v1", render_mode="human")
input_shape = [4]  # == env.observation_space.shape
n_outputs = 2  # == env.action_space.n
model = keras.models.Sequential(
    [
        keras.layers.Dense(32, activation="elu", input_shape=input_shape),
        keras.layers.Dense(32, activation="elu"),
        keras.layers.Dense(n_outputs),
    ]
)

replay_buffer = deque(maxlen=2000)
batch_size = 32
discount_factor = 0.95
optimizer = keras.optimizers.Nadam(learning_rate=1e-2)
loss_fn = keras.losses.mean_squared_error
rewards = []  # sum of rewards collected in each episode
best_score = 0
try:
    for episode in range(100):
        # Exploration rate depends only on the episode index, so compute it
        # once per episode instead of once per step.
        epsilon = max(1 - episode / 500, 0.01)
        state, info = env.reset()
        episode_reward = 0
        for step in range(200):
            next_state, reward, done, action = play_one_step(
                env, state, epsilon, model
            )
            replay_buffer.append((state, action, reward, next_state, done))
            episode_reward += reward
            state = next_state
            if done:
                break
        # extra code – displays debug info, stores data for the next figure, and
        # keeps track of the best model weights so far
        print(
            f"\rEpisode: {episode + 1}, Steps: {step + 1}, eps: {epsilon:.3f}", end=""
        )
        # Record the actual return so the figure matches its "Sum of rewards"
        # label (the 0-based `step` index under-counts by one).
        rewards.append(episode_reward)
        if episode_reward >= best_score:
            best_weights = model.get_weights()
            best_score = episode_reward
        # Only start learning once the buffer holds some episodes of experience.
        if episode > 50:
            training_step(
                batch_size,
                n_outputs,
                discount_factor,
                replay_buffer,
                model,
                loss_fn,
                optimizer,
            )
finally:
    # Always release the environment (and its render window), even when
    # training is interrupted or fails.
    env.close()

model.set_weights(best_weights)

plt.figure(figsize=(8, 4))
plt.plot(rewards)
plt.xlabel("Episode", fontsize=14)
plt.ylabel("Sum of rewards", fontsize=14)
plt.grid(True)
plt.show()

TypeError: __init__() got an unexpected keyword argument 'render_mode'