In [1]:
pip install gym 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
pip install numpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
pip install tensorflow 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense

In [3]:
# Build a small fully connected policy network
def create_policy_network(input_dim, output_dim):
    """Assemble the policy model used to pick actions.

    The model is a Keras ``Sequential`` stack of two 64-unit ReLU layers
    followed by a softmax layer, so its output for each input row is a
    probability distribution over ``output_dim`` actions.

    Args:
        input_dim: Number of features in a single state vector.
        output_dim: Number of discrete actions (width of the softmax layer).

    Returns:
        The constructed ``tf.keras.Sequential`` model (not compiled).
    """
    policy = tf.keras.Sequential()
    policy.add(Dense(64, activation='relu', input_shape=(input_dim,)))
    policy.add(Dense(64, activation='relu'))
    policy.add(Dense(output_dim, activation='softmax'))
    return policy

In [4]:
# Choose an action according to the policy network output
def choose_action(policy_network, state):
    """Sample one discrete action from the policy's output distribution.

    Args:
        policy_network: Callable that takes a batch of states and returns an
            object exposing ``.numpy()`` (e.g. a Keras model returning a
            tensor) holding one row of action probabilities.
        state: A single 1-D observation; any array-like is accepted.

    Returns:
        The index of the sampled action.
    """
    # Accept lists/tuples as well as arrays, then add the batch axis.
    batch = np.asarray(state)[np.newaxis, :]
    action_probs = policy_network(batch).numpy().flatten()

    # The softmax output is typically float32; re-normalise in float64 so
    # accumulated rounding error cannot trip np.random.choice's
    # "probabilities do not sum to 1" check.
    action_probs = action_probs.astype(np.float64)
    action_probs /= action_probs.sum()

    return np.random.choice(len(action_probs), p=action_probs)

# Compute the discounted rewards
def discounted_rewards(rewards, gamma):
    """Compute the discounted return for every step of one episode.

    Element ``i`` of the result is
    ``rewards[i] + gamma * rewards[i+1] + gamma**2 * rewards[i+2] + ...``.

    Args:
        rewards: Per-step rewards in time order (array-like).
        gamma: Discount factor applied per step.

    Returns:
        A NumPy array with the same shape as ``rewards``. Floating-point
        inputs keep their dtype; integer/bool/list-of-int inputs produce a
        float64 result so that fractional returns are not truncated.
    """
    rewards = np.asarray(rewards)
    # zeros_like() would inherit an integer dtype and silently truncate the
    # discounted values, so only reuse the input dtype when it is floating.
    if np.issubdtype(rewards.dtype, np.floating):
        out_dtype = rewards.dtype
    else:
        out_dtype = np.float64
    discounted = np.zeros(rewards.shape, dtype=out_dtype)

    # Walk backwards so each step reuses the return of the step after it.
    cumulative = 0.0
    for i in reversed(range(len(rewards))):
        cumulative = cumulative * gamma + rewards[i]
        discounted[i] = cumulative
    return discounted


In [5]:
# Train the policy network using policy gradient
def train_policy_network(policy_network, states, actions, rewards, gamma=0.99):
    """Apply one policy-gradient update from a single episode.

    The per-step discounted returns are standardised (zero mean, unit
    variance) and used to weight the log-probability of each action taken.
    The gradient of the negated weighted sum is applied to the network.

    Args:
        policy_network: Model whose output rows are action probabilities.
        states: Batch of visited states, one row per step.
        actions: Integer action index taken at each step.
        rewards: Reward received at each step, in time order.
        gamma: Discount factor passed to ``discounted_rewards``.

    Note:
        Uses the module-level ``optimizer`` (created in the training cell)
        and the ``discounted_rewards`` helper defined in this notebook.
    """
    eps = 1e-8

    returns = np.asarray(discounted_rewards(rewards, gamma), dtype=np.float64)
    returns = returns - returns.mean()
    # A one-step or constant-return episode has zero std; the epsilon keeps
    # the division finite instead of filling the weights with NaN.
    returns = returns / (returns.std() + eps)

    with tf.GradientTape() as tape:
        # The network ends in a softmax, so these are probabilities.
        action_probs = policy_network(states)
        returns_t = tf.cast(returns, action_probs.dtype)
        # Take the action count from the network output rather than a global.
        action_masks = tf.one_hot(
            actions, action_probs.shape[-1], dtype=action_probs.dtype
        )
        # eps guards against log(0) when a probability underflows to zero.
        log_probs = tf.reduce_sum(
            action_masks * tf.math.log(action_probs + eps), axis=1
        )
        loss = -tf.reduce_sum(log_probs * returns_t)

    grads = tape.gradient(loss, policy_network.trainable_variables)
    optimizer.apply_gradients(zip(grads, policy_network.trainable_variables))


In [None]:
# Set up the environment, policy and optimizer, then train one episode at a time.
# The environment id is versioned explicitly so the same task is loaded on every Gym release.
env = gym.make('CartPole-v1')
num_actions = env.action_space.n
state_dim = env.observation_space.shape[0]
print(state_dim)
print(num_actions)

policy_network = create_policy_network(state_dim, num_actions)
# `lr` is a deprecated alias that newer Keras releases reject; use `learning_rate`.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

num_episodes = 1000
for episode in range(num_episodes):
    # Newer Gym returns (observation, info) from reset(); older Gym returns the observation only.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    episode_states, episode_actions, episode_rewards = [], [], []

    done = False
    while not done:
        # Sample only when a step will actually be taken (no wasted
        # forward pass after the terminal transition).
        action = choose_action(policy_network, state)

        # Newer Gym returns a 5-tuple with separate terminated/truncated flags;
        # older Gym returns a 4-tuple with a single done flag.
        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            next_state, reward, done, _ = step_result

        episode_states.append(state)
        episode_actions.append(action)
        episode_rewards.append(reward)

        state = next_state

    # Train the policy network using the collected data
    states = np.array(episode_states, dtype=np.float32)
    actions = np.array(episode_actions, dtype=np.int32)
    rewards = np.array(episode_rewards, dtype=np.float32)

    train_policy_network(policy_network, states, actions, rewards)

    print(f'Episode {episode + 1}, '
          f'Total reward: {sum(episode_rewards):.2f}, '
          f'Episode length: {len(episode_rewards)}')

# Release the environment's resources once training is finished.
env.close()



4
2
Episode 1, Total reward: 18.00, Episode length: 18
Episode 2, Total reward: 29.00, Episode length: 29
Episode 3, Total reward: 21.00, Episode length: 21
Episode 4, Total reward: 53.00, Episode length: 53
Episode 5, Total reward: 25.00, Episode length: 25
Episode 6, Total reward: 19.00, Episode length: 19
Episode 7, Total reward: 10.00, Episode length: 10
Episode 8, Total reward: 12.00, Episode length: 12
Episode 9, Total reward: 25.00, Episode length: 25
Episode 10, Total reward: 19.00, Episode length: 19
Episode 11, Total reward: 21.00, Episode length: 21
Episode 12, Total reward: 42.00, Episode length: 42
Episode 13, Total reward: 54.00, Episode length: 54
Episode 14, Total reward: 13.00, Episode length: 13
Episode 15, Total reward: 25.00, Episode length: 25
Episode 16, Total reward: 32.00, Episode length: 32
Episode 17, Total reward: 16.00, Episode length: 16
Episode 18, Total reward: 37.00, Episode length: 37
Episode 19, Total reward: 23.00, Episode length: 23
Episode 20, Total