# Policy Gradient Methods:

# Deep Deterministic Policy Gradient (DDPG):
* Combines deep Q-learning with deterministic policy gradients to handle continuous action spaces.
* Maintains a deterministic policy (the actor) and learns a Q-function (the critic), both represented by neural networks.

In [None]:
import tensorflow as tf
import numpy as np
import gym

def create_actor_network(state_dim, action_dim):
    """Build the actor: a Keras model mapping a state vector to an action vector.

    Args:
        state_dim: Length of the state vector fed to the input layer.
        action_dim: Number of units in the output layer.

    Returns:
        A `tf.keras.Model` with two ReLU hidden layers (400 then 300 units)
        and a tanh output layer, so every output component lies in [-1, 1].
    """
    layers = tf.keras.layers

    state_in = layers.Input(shape=(state_dim,))

    # Stack the hidden layers in order: 400 units first, then 300.
    hidden = state_in
    for width in (400, 300):
        hidden = layers.Dense(width, activation='relu')(hidden)

    # tanh keeps the raw action bounded; callers rescale it as needed.
    action_out = layers.Dense(action_dim, activation='tanh')(hidden)

    return tf.keras.Model(inputs=state_in, outputs=action_out)

def create_critic_network(state_dim, action_dim):
    """Build the critic: a Keras model mapping (state, action) to one Q-value.

    The state goes through a 400-unit ReLU layer and a linear 300-unit layer;
    the action goes through a single linear 300-unit layer. The two 300-wide
    projections are summed, passed through ReLU, and reduced to one linear
    output unit.

    Args:
        state_dim: Length of the state vector.
        action_dim: Length of the action vector.

    Returns:
        A `tf.keras.Model` taking `[state, action]` and producing a single
        unbounded value per sample.
    """
    layers = tf.keras.layers

    state_in = layers.Input(shape=(state_dim,))
    action_in = layers.Input(shape=(action_dim,))

    # State branch: non-linear features, then a linear projection to 300.
    state_features = layers.Dense(400, activation='relu')(state_in)
    state_projection = layers.Dense(300, activation=None)(state_features)

    # Action branch: linear projection to the same width so the two can be added.
    action_projection = layers.Dense(300, activation=None)(action_in)

    # Merge the branches; the non-linearity is applied only after the sum.
    merged = layers.Add()([state_projection, action_projection])
    merged = layers.Activation('relu')(merged)

    # Linear head: Q-values are not range-limited.
    q_out = layers.Dense(1, activation=None)(merged)

    return tf.keras.Model(inputs=[state_in, action_in], outputs=q_out)

class DDPGAgent:
    """DDPG agent that learns from one transition per `train` call.

    Holds an actor and a critic network plus a target copy of each. The
    targets start as exact copies and are moved towards the online networks
    by a soft (Polyak) update at the end of every `train` call.

    Note: there is no replay buffer here; each update uses only the single
    transition passed to `train`.
    """

    def __init__(self, state_dim, action_dim, action_high, *,
                 gamma=0.99, tau=0.005, actor_lr=0.001, critic_lr=0.002):
        """Create the networks, their target copies and the optimizers.

        Args:
            state_dim: Length of the state vector.
            action_dim: Length of the action vector.
            action_high: Scale applied to the actor's tanh output in
                `get_action`; actions therefore lie in
                [-action_high, action_high].
            gamma: Discount factor used in the bootstrapped critic target.
            tau: Soft-update rate; fraction of the online weights mixed into
                the target weights on every update.
            actor_lr: Adam learning rate for the actor.
            critic_lr: Adam learning rate for the critic.
        """
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_high = action_high

        # Online networks
        self.actor_network = create_actor_network(state_dim, action_dim)
        self.critic_network = create_critic_network(state_dim, action_dim)

        # Target networks for stability; they begin as exact copies.
        self.target_actor_network = create_actor_network(state_dim, action_dim)
        self.target_critic_network = create_critic_network(state_dim, action_dim)
        self.target_actor_network.set_weights(self.actor_network.get_weights())
        self.target_critic_network.set_weights(self.critic_network.get_weights())

        # Optimizers
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)
        self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=critic_lr)

        # Hyperparameters
        self.gamma = gamma  # Discount factor
        self.tau = tau  # Soft update parameter

    def get_action(self, state, noise_std=0.0):
        """Return the actor's action for `state`, scaled by `action_high`.

        Args:
            state: Any array-like that can be reshaped to (1, state_dim).
            noise_std: Standard deviation of optional zero-mean Gaussian
                exploration noise. With the default 0.0 the action is the
                deterministic actor output.

        Returns:
            A 1-D NumPy array of length `action_dim`. When noise is added the
            result is clipped back to [-action_high, action_high].
        """
        state = np.reshape(state, [1, self.state_dim])
        action = self.action_high * self.actor_network(state).numpy()[0]

        if noise_std > 0.0:
            noise = np.random.normal(0.0, noise_std, size=self.action_dim)
            action = action + noise.astype(action.dtype)
            action = np.clip(action, -self.action_high, self.action_high)

        return action

    def _blend_into_target(self, target_network, online_network):
        """Set target <- tau * online + (1 - tau) * target, weight by weight."""
        blended = [
            self.tau * online + (1 - self.tau) * target
            for online, target in zip(online_network.get_weights(),
                                      target_network.get_weights())
        ]
        target_network.set_weights(blended)

    def update_target_networks(self):
        """Soft-update both target networks towards their online counterparts."""
        self._blend_into_target(self.target_actor_network, self.actor_network)
        self._blend_into_target(self.target_critic_network, self.critic_network)

    def train(self, state, action, reward, next_state, done):
        """Run one critic step, one actor step and a soft target update.

        Args:
            state: State before the action; reshaped to (1, state_dim).
            action: Action taken; reshaped to (1, action_dim).
            reward: Scalar reward for the transition.
            next_state: State after the action; reshaped to (1, state_dim).
            done: Truthy if the transition ended the episode; a truthy value
                removes the bootstrapped term from the critic target.
        """
        # Give every input an explicit batch dimension of 1 so all terms in the
        # target line up as (1, 1). `done` and `reward` are cast through NumPy
        # because they may arrive as Python/NumPy bools or floats.
        states = tf.convert_to_tensor(
            np.reshape(state, [1, self.state_dim]), dtype=tf.float32)
        next_states = tf.convert_to_tensor(
            np.reshape(next_state, [1, self.state_dim]), dtype=tf.float32)
        actions = tf.convert_to_tensor(
            np.reshape(action, [1, self.action_dim]), dtype=tf.float32)
        rewards = tf.convert_to_tensor(
            np.asarray(reward, dtype=np.float32).reshape(1, 1))
        terminals = tf.convert_to_tensor(
            np.asarray(done, dtype=np.float32).reshape(1, 1))

        # The bootstrapped target depends only on the target networks, so it is
        # computed outside the tape: no gradient is needed through it.
        target_actions = self.target_actor_network(next_states)
        target_q_values = self.target_critic_network([next_states, target_actions])
        target_values = rewards + (1. - terminals) * self.gamma * target_q_values

        # Critic step: regress Q(s, a) onto the target.
        with tf.GradientTape() as tape:
            q_values = self.critic_network([states, actions])
            critic_loss = tf.math.reduce_mean(tf.math.square(target_values - q_values))

        critic_grads = tape.gradient(critic_loss, self.critic_network.trainable_variables)
        self.critic_optimizer.apply_gradients(
            zip(critic_grads, self.critic_network.trainable_variables))

        # Actor step: ascend the critic's value of the actor's own action
        # (implemented as descent on the negated value).
        with tf.GradientTape() as tape:
            actions_pred = self.actor_network(states)
            critic_value = self.critic_network([states, actions_pred])
            actor_loss = -tf.math.reduce_mean(critic_value)

        actor_grads = tape.gradient(actor_loss, self.actor_network.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grads, self.actor_network.trainable_variables))

        # Update target networks
        self.update_target_networks()

# Train a DDPG agent on Pendulum for a few episodes and print each episode's
# total reward. Pendulum-v1 is used in place of Pendulum-v0.
env = gym.make('Pendulum-v1')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
action_high = env.action_space.high[0]

agent = DDPGAgent(state_dim, action_dim, action_high)

num_episodes = 10
batch_size = 64  # NOTE: not used by the loop below; the agent trains on single transitions
max_steps = 200  # Maximum of 200 steps per episode

try:
    for episode in range(num_episodes):
        # Older Gym returns just the observation from reset(); newer
        # Gym/Gymnasium returns an (observation, info) tuple.
        reset_result = env.reset()
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        episode_reward = 0

        for t in range(max_steps):
            action = agent.get_action(state)

            # Older Gym: (obs, reward, done, info).
            # Newer Gym/Gymnasium: (obs, reward, terminated, truncated, info).
            step_result = env.step(action)
            if len(step_result) == 5:
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_result
                terminated = done

            # Only a true termination should cut off bootstrapping in the
            # critic target; a time-limit truncation just ends the episode.
            agent.train(state, action, reward, next_state, terminated)
            episode_reward += reward
            state = next_state

            if done:
                break

        print(f"Episode: {episode + 1}, Reward: {episode_reward:.2f}")
finally:
    # Release the environment even if training raises.
    env.close()


Episode: 1, Reward: -1482.75
Episode: 2, Reward: -1483.88
Episode: 3, Reward: -1574.26
Episode: 4, Reward: -1175.47
Episode: 5, Reward: -1622.30
Episode: 6, Reward: -1678.79
Episode: 7, Reward: -1502.47
Episode: 8, Reward: -1522.43
Episode: 9, Reward: -1445.88
Episode: 10, Reward: -1595.49
