# Policy Gradients Implementations in Python

#### Policy Gradient Methods
Policy gradient methods are a class of reinforcement learning algorithms that learn a policy directly by optimising the parameters of a policy network. Instead of learning Q-values, as Q-learning and DQN do, policy gradient methods search for the action-selection strategy that maximises cumulative reward. A popular approach is the REINFORCE algorithm, in which actions are sampled from the policy's probability distribution and the policy is updated with gradients weighted by the rewards obtained.

In [None]:
# import necessary libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import gym

# Compatibility shim: the run recorded at the end of this notebook stops with
# "module 'numpy' has no attribute 'bool8'". The code in this cell never uses
# that name itself, so the lookup presumably happens inside gym while stepping
# the environment. Restoring the alias when it is missing lets the episode
# loop run on NumPy releases that no longer ship it.
# NOTE(review): the durable fix is pinning a NumPy version compatible with the
# installed gym (or moving to a maintained environment package); confirm which
# is preferred for this project.
if not hasattr(np, "bool8"):
    np.bool8 = np.bool_

# set up the environment
env = gym.make("CartPole-v1")
state_shape = env.observation_space.shape[0]  # length of one observation vector
num_actions = env.action_space.n  # size of the discrete action set

# parameters
learning_rate = 0.01  # step size handed to the Adam optimizer
gamma = 0.99  # discount factor


# policy network
def build_policy_model():
    """Build and compile the softmax policy network.

    The network takes a vector of ``state_shape`` features, passes it through
    two 24-unit ReLU layers and ends in a softmax over ``num_actions`` outputs.

    Returns:
        A ``tf.keras.Sequential`` model compiled with an Adam optimizer that
        uses the module-level ``learning_rate``.
    """
    model = tf.keras.Sequential(
        [
            # Declare the input with an explicit Input object instead of
            # passing ``input_shape=`` to the first Dense layer; recent Keras
            # versions warn about the latter form.
            tf.keras.Input(shape=(state_shape,)),
            layers.Dense(24, activation="relu"),
            layers.Dense(24, activation="relu"),
            layers.Dense(num_actions, activation="softmax"),
        ]
    )
    # No loss is given: compile() is only used here to attach the optimizer,
    # which the training step reads back through ``model.optimizer``.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate))
    return model

# Single module-level policy network: choose_action samples from it and
# train_on_episode updates its weights through its attached optimizer.
policy_model = build_policy_model()

# function to select an action based on the policy
def choose_action(state):
    """Sample an action index from the current policy for ``state``.

    Args:
        state: Array-like observation holding ``state_shape`` values.

    Returns:
        An action index in ``range(num_actions)``, drawn at random according
        to the probabilities the policy network outputs for ``state``.
    """
    # reshape state to a one-row batch of shape (1, state_shape)
    batch = np.asarray(state, dtype=np.float32).reshape(1, state_shape)
    # Call the model directly rather than through predict(): this function runs
    # once per environment step, and predict() adds per-call overhead and
    # prints a progress line for every single-sample batch.
    probabilities = policy_model(batch, training=False).numpy()[0].astype(np.float64)
    # The network output is not float64, so its sum can drift from 1 by more
    # than np.random.choice accepts; renormalise in float64 before sampling.
    probabilities /= probabilities.sum()
    return np.random.choice(num_actions, p=probabilities)
    
# function to calculate returns (discounted rewards)
def discount_rewards(rewards, discount_factor=None):
    """Compute mean-centred discounted returns for one episode.

    For each step ``i`` the return is
    ``rewards[i] + d * rewards[i+1] + d**2 * rewards[i+2] + ...`` where ``d``
    is the discount factor. The mean of all returns is then subtracted, so the
    result is centred on zero (it is not scaled by the standard deviation).

    Args:
        rewards: One-dimensional sequence of per-step rewards (list or array,
            integer or float).
        discount_factor: Discount applied per step. When omitted, the
            module-level ``gamma`` is used.

    Returns:
        A float64 NumPy array with one centred return per input reward; an
        empty array when ``rewards`` is empty.
    """
    if discount_factor is None:
        discount_factor = gamma
    # Work in float64 regardless of the input dtype: allocating the output with
    # the input's dtype would truncate fractional returns for integer rewards.
    rewards = np.asarray(rewards, dtype=np.float64)
    discounted = np.zeros_like(rewards)
    if rewards.size == 0:
        # Avoid taking the mean of an empty array.
        return discounted
    cumulative = 0.0
    # Walk backwards so each return reuses the already-discounted tail.
    for i in reversed(range(len(rewards))):
        cumulative = cumulative * discount_factor + rewards[i]
        discounted[i] = cumulative
    return discounted - np.mean(discounted)  # subtract the mean as a baseline

# training function
def train_on_episode(states, actions, rewards):
    """Apply one policy-gradient (REINFORCE) update from a finished episode.

    The loss is the negative mean of ``log pi(a_t | s_t)`` weighted by the
    centred discounted return of step ``t``; its gradient is applied to the
    policy network with the optimizer attached to ``policy_model``.

    Args:
        states: Batch of visited states, one row per step (assumed to be
            ``(steps, state_shape)`` as produced by the caller's ``np.vstack``).
        actions: Integer action index taken at each step.
        rewards: Reward received at each step.

    Returns:
        None. The weights of ``policy_model`` are updated in place.
    """
    if len(actions) == 0:
        # The mean over zero steps is NaN; applying its gradient would
        # corrupt the weights, so an empty episode is skipped.
        return
    # Cast everything explicitly so the tensors combined below share dtypes
    # (NumPy defaults to int64/float64, the network works in int32/float32).
    states_t = tf.cast(states, tf.float32)
    actions_t = tf.cast(actions, tf.int32)
    returns_t = tf.cast(discount_rewards(rewards), tf.float32)
    with tf.GradientTape() as tape:
        action_probs = policy_model(states_t, training=True)
        # Pair each row number with the action taken on that row so that
        # gather_nd picks the probability of the chosen action per step.
        step_numbers = tf.range(tf.shape(actions_t)[0])
        action_indices = tf.stack([step_numbers, actions_t], axis=1)
        selected_action_probs = tf.gather_nd(action_probs, action_indices)
        # Small epsilon keeps the log finite if a probability underflows to 0.
        log_probs = tf.math.log(selected_action_probs + 1e-8)
        loss = -tf.reduce_mean(log_probs * returns_t)
    gradients = tape.gradient(loss, policy_model.trainable_variables)
    policy_model.optimizer.apply_gradients(zip(gradients, policy_model.trainable_variables))

# main training loop
num_episodes = 100
for episode in range(num_episodes):
    # fresh trajectory buffers for this episode
    episodes_states, episodes_actions, episodes_rewards = [], [], []
    state, _ = env.reset()
    done = False
    while not done:
        action = choose_action(state)
        next_state, reward, terminated, truncated, _ = env.step(action)
        # store the transition taken from the current state
        episodes_states.append(state)
        episodes_actions.append(action)
        episodes_rewards.append(reward)
        state = next_state
        # a truncated episode ends the rollout just like a terminated one
        done = terminated or truncated

    # episode finished: run one policy update on the whole trajectory
    episodes_states = np.vstack(episodes_states)
    train_on_episode(
        episodes_states,
        np.array(episodes_actions),
        np.array(episodes_rewards),
    )
    print(f"Episode: {episode+1}/{num_episodes}, Reward: {np.sum(episodes_rewards)}")


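# Note: the run below fails because of a version conflict between NumPy and the other libraries: something in the environment stack still looks up `np.bool8`, which the installed NumPy does not provide (see the AttributeError at the end of the output).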

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


AttributeError: module 'numpy' has no attribute 'bool8'