# In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import gym
import numpy as np
import matplotlib.pyplot as plt
from collections import deque
from torch.distributions import MultivariateNormal

# Policy Network Definition
class PolicyNetwork(nn.Module):
    """Gaussian policy for continuous actions.

    A two-hidden-layer ReLU MLP produces the action mean; the standard
    deviation comes from a learned ``log_std`` parameter that does not
    depend on the input state.
    """

    def __init__(self, state_dim, action_dim, hidden_size=128):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_dim)
        # One log-std per action dimension; zeros means the std starts at 1.
        self.log_std = nn.Parameter(torch.zeros(action_dim))

    def forward(self, x):
        """Return ``(mean, std)``; ``std`` has one entry per action dimension."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden), self.log_std.exp()

# Value Network Definition
class ValueNetwork(nn.Module):
    """State-value estimator: a two-hidden-layer ReLU MLP with a single output."""

    def __init__(self, state_dim, hidden_size=128):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return the value estimate with a trailing dimension of size 1."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden)

# TRPO Step (Using Conjugate Gradient and Line Search)
def _flat_grad(output, params, **grad_kwargs):
    """Return d(output)/d(params) flattened into one 1-D tensor (unused params give zeros)."""
    grads = torch.autograd.grad(output, params, allow_unused=True, **grad_kwargs)
    return torch.cat([
        (torch.zeros_like(p) if g is None else g).reshape(-1)
        for g, p in zip(grads, params)
    ])


def _get_flat_params(params):
    """Return a detached 1-D copy of every value in ``params``."""
    return torch.cat([p.detach().reshape(-1) for p in params])


def _set_flat_params(params, flat):
    """Copy consecutive slices of the 1-D tensor ``flat`` into ``params`` in place."""
    offset = 0
    with torch.no_grad():
        for p in params:
            count = p.numel()
            p.copy_(flat[offset:offset + count].view_as(p))
            offset += count


def _conjugate_gradient(matvec, b, n_iters=10, residual_tol=1e-10):
    """Approximately solve ``A x = b`` given only the product ``matvec(v) = A v``."""
    x = torch.zeros_like(b)
    residual = b.clone()
    direction = b.clone()
    rs_old = residual @ residual
    for _ in range(n_iters):
        if rs_old <= residual_tol:
            break
        a_dir = matvec(direction)
        alpha = rs_old / (direction @ a_dir)
        x = x + alpha * direction
        residual = residual - alpha * a_dir
        rs_new = residual @ residual
        direction = residual + (rs_new / rs_old) * direction
        rs_old = rs_new
    return x


def trpo_step(policy, states, actions, advantages, old_log_probs, max_kl=0.01,
              cg_iters=10, damping=0.1, max_backtracks=10, backtrack_coeff=0.5):
    """Apply one trust-region update to ``policy`` in place.

    The natural-gradient direction is found with conjugate gradient on
    Fisher-vector products, scaled so the quadratic KL estimate equals
    ``max_kl``, and then shrunk by a backtracking line search until the
    surrogate improves and the measured KL stays within ``max_kl``.

    Args:
        policy: Module whose call returns ``(mean, std)`` for a batch of
            states, with ``std`` a 1-D tensor (one entry per action dim).
        states: Batch of states passed to ``policy``.
        actions: Actions taken in ``states``.
        advantages: Advantage estimate per sample; treated as a constant.
        old_log_probs: Log-probabilities of ``actions`` under the policy
            that collected the data; treated as a constant.
        max_kl: Trust-region radius on the mean KL divergence.
        cg_iters: Maximum number of conjugate-gradient iterations.
        damping: Multiple of the identity added to the Fisher matrix.
        max_backtracks: Maximum number of line-search step sizes to try.
        backtrack_coeff: Shrink factor applied between line-search tries.

    Returns:
        True if a step was accepted and the parameters changed; False if
        the parameters were left at (or restored to) their old values.
    """
    params = list(policy.parameters())
    # Both inputs may arrive attached to rollout/value-network graphs; they are
    # constants of the optimisation problem, so cut them loose.
    advantages = advantages.detach()
    old_log_probs = old_log_probs.detach()

    # Snapshot of the data-collecting policy, used as the KL reference point.
    with torch.no_grad():
        old_mean, old_std = policy(states)

    def surrogate():
        mean, std = policy(states)
        dist = MultivariateNormal(mean, torch.diag(std ** 2))
        ratio = torch.exp(dist.log_prob(actions) - old_log_probs)
        return (ratio * advantages).mean()

    def mean_kl():
        # Closed-form KL(old || current) for diagonal Gaussians.
        mean, std = policy(states)
        per_dim = (
            torch.log(std / old_std)
            + (old_std ** 2 + (old_mean - mean) ** 2) / (2.0 * std ** 2)
            - 0.5
        )
        return per_dim.sum(dim=-1).mean()

    old_surrogate = surrogate()
    grad = _flat_grad(old_surrogate, params)
    old_surrogate = old_surrogate.detach()
    if not torch.isfinite(grad).all() or grad.norm() == 0:
        return False  # nothing to follow; avoid 0/0 inside conjugate gradient

    # Keep the graph of the KL gradient so it can be differentiated again.
    kl_grad = _flat_grad(mean_kl(), params, create_graph=True)

    def fisher_vector_product(vector):
        # The Hessian of the KL at the current parameters is the Fisher matrix.
        hvp = _flat_grad(kl_grad @ vector, params, retain_graph=True)
        return hvp + damping * vector

    # The surrogate is maximised, so the step follows +grad.
    step_dir = _conjugate_gradient(fisher_vector_product, grad, n_iters=cg_iters)
    curvature = step_dir @ fisher_vector_product(step_dir)
    if not torch.isfinite(curvature) or curvature <= 0:
        return False
    full_step = torch.sqrt(2.0 * max_kl / curvature) * step_dir

    # Backtracking line search: accept the largest step that really improves
    # the surrogate while respecting the KL constraint.
    old_params = _get_flat_params(params)
    for attempt in range(max_backtracks):
        _set_flat_params(params, old_params + (backtrack_coeff ** attempt) * full_step)
        with torch.no_grad():
            new_surrogate = surrogate()
            kl = mean_kl()
        if torch.isfinite(new_surrogate) and kl <= max_kl and new_surrogate > old_surrogate:
            return True

    _set_flat_params(params, old_params)
    return False

# Train TRPO on a continuous-control Gym environment (the default, LunarLanderContinuous, is a Box2D task)
def _discounted_returns(rewards, gamma=0.99):
    """Return the discounted reward-to-go of every step as a float32 tensor."""
    returns = []
    running = 0.0
    for reward in reversed(rewards):
        running = float(reward) + gamma * running
        returns.append(running)
    returns.reverse()
    return torch.tensor(returns, dtype=torch.float32)


def _reset_env(env):
    """Reset ``env`` and return only the observation.

    Accepts both reset conventions: a bare observation, or an
    ``(observation, info)`` pair.
    """
    out = env.reset()
    if isinstance(out, tuple) and len(out) == 2 and isinstance(out[1], dict):
        return out[0]
    return out


def _step_env(env, action):
    """Step ``env`` and return ``(next_state, reward, done)``.

    Accepts both the 4-tuple step result and the 5-tuple
    ``(obs, reward, terminated, truncated, info)`` result.
    """
    out = env.step(action)
    if len(out) == 5:
        next_state, reward, terminated, truncated, _ = out
        return next_state, reward, bool(terminated or truncated)
    next_state, reward, done, _ = out
    return next_state, reward, bool(done)


def train_trpo(env_name='LunarLanderContinuous-v2', episodes=1000, max_steps=200, gamma=0.99):
    """Train a Gaussian policy with TRPO, one policy update per episode.

    Args:
        env_name: Gym environment id; it must have Box observation and
            action spaces because their ``shape[0]`` sizes the networks.
        episodes: Number of episodes to collect and train on.
        max_steps: Cap on the number of steps collected per episode.
        gamma: Discount factor used for the Monte-Carlo returns.

    Returns:
        List with the undiscounted total reward of every episode.
    """
    env = gym.make(env_name)
    reward_history = []
    try:
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        policy = PolicyNetwork(state_dim, action_dim)
        value_net = ValueNetwork(state_dim)
        optimizer = optim.Adam(value_net.parameters(), lr=0.01)

        for episode in range(episodes):
            state = _reset_env(env)
            episode_rewards = []
            log_probs = []
            states = []
            actions = []

            for _ in range(max_steps):
                state_tensor = torch.as_tensor(state, dtype=torch.float32)
                # Rollout statistics are constants for the later update, so no
                # autograd graph is recorded while acting.
                with torch.no_grad():
                    mean, std = policy(state_tensor)
                    dist = MultivariateNormal(mean, torch.diag(std ** 2))
                    action = dist.sample()
                    log_prob = dist.log_prob(action)
                next_state, reward, done = _step_env(env, action.numpy())

                states.append(state_tensor)
                actions.append(action)
                log_probs.append(log_prob)
                episode_rewards.append(reward)
                state = next_state

                if done:
                    break

            state_batch = torch.stack(states)
            returns = _discounted_returns(episode_rewards, gamma)
            # squeeze(-1) keeps the batch dimension even for one-step episodes.
            values = value_net(state_batch).squeeze(-1)
            # Advantages are only weights for the policy update: detach them from
            # the value network's graph.
            advantages = (returns - values).detach()

            # Update value function
            value_loss = nn.functional.mse_loss(values, returns)
            optimizer.zero_grad()
            value_loss.backward()
            optimizer.step()

            # Perform TRPO step
            trpo_step(policy, state_batch, torch.stack(actions), advantages, torch.stack(log_probs))
            reward_history.append(sum(episode_rewards))

            if episode % 50 == 0:
                print(f"Episode {episode}, Reward: {sum(episode_rewards)}")
    finally:
        # Release the environment even if training is interrupted.
        env.close()
    return reward_history

# Launch TRPO training with its default settings, then chart the per-episode totals.
reward_history = train_trpo()
plt.plot(reward_history)
for _label_fn, _label_text in (
    (plt.xlabel, "Episodes"),
    (plt.ylabel, "Total Reward"),
    (plt.title, "TRPO Training Performance"),
):
    _label_fn(_label_text)
plt.show()


# In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import gym
import numpy as np
import matplotlib.pyplot as plt
from torch.distributions import MultivariateNormal

# Policy Network Definition
class PolicyNetwork(nn.Module):
    """Diagonal-Gaussian policy: MLP mean and a state-independent learned std."""

    def __init__(self, state_dim, action_dim, hidden_size=128):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_dim)
        # Stored in log space so the exponentiated std is always positive.
        self.log_std = nn.Parameter(torch.zeros(action_dim))

    def forward(self, x):
        """Map states to ``(mean, std)`` of the action distribution."""
        activations = nn.functional.relu(self.fc1(x))
        activations = nn.functional.relu(self.fc2(activations))
        action_mean = self.fc3(activations)
        action_std = self.log_std.exp()
        return action_mean, action_std

# Value Network Definition
class ValueNetwork(nn.Module):
    """Critic network: maps a state to one scalar value estimate."""

    def __init__(self, state_dim, hidden_size=128):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return the value estimate; the last dimension has size 1."""
        activations = nn.functional.relu(self.fc1(x))
        activations = nn.functional.relu(self.fc2(activations))
        return self.fc3(activations)

# PPO Training Loop
def _discounted_returns(rewards, gamma=0.99):
    """Return the discounted reward-to-go of every step as a float32 tensor."""
    returns = []
    running = 0.0
    for reward in reversed(rewards):
        running = float(reward) + gamma * running
        returns.append(running)
    returns.reverse()
    return torch.tensor(returns, dtype=torch.float32)


def _reset_env(env):
    """Reset ``env`` and return only the observation.

    Accepts both reset conventions: a bare observation, or an
    ``(observation, info)`` pair.
    """
    out = env.reset()
    if isinstance(out, tuple) and len(out) == 2 and isinstance(out[1], dict):
        return out[0]
    return out


def _step_env(env, action):
    """Step ``env`` and return ``(next_state, reward, done)``.

    Accepts both the 4-tuple step result and the 5-tuple
    ``(obs, reward, terminated, truncated, info)`` result.
    """
    out = env.step(action)
    if len(out) == 5:
        next_state, reward, terminated, truncated, _ = out
        return next_state, reward, bool(terminated or truncated)
    next_state, reward, done, _ = out
    return next_state, reward, bool(done)


def train_ppo(env_name='LunarLanderContinuous-v2', episodes=1000, clip_epsilon=0.2, gamma=0.99,
              max_steps=200, update_epochs=1):
    """Train a Gaussian policy with the clipped PPO objective, updating once per episode.

    Args:
        env_name: Gym environment id; it must have Box observation and
            action spaces because their ``shape[0]`` sizes the networks.
        episodes: Number of episodes to collect and train on.
        clip_epsilon: Half-width of the probability-ratio clipping interval.
        gamma: Discount factor used for the Monte-Carlo returns.
        max_steps: Cap on the number of steps collected per episode.
        update_epochs: Gradient passes over each episode's data. With the
            default of 1 the ratio is exactly 1 when the loss is evaluated,
            so the clip only becomes active for values above 1.

    Returns:
        List with the undiscounted total reward of every episode.
    """
    env = gym.make(env_name)
    reward_history = []
    try:
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        policy = PolicyNetwork(state_dim, action_dim)
        value_net = ValueNetwork(state_dim)
        policy_optimizer = optim.Adam(policy.parameters(), lr=0.0003)
        value_optimizer = optim.Adam(value_net.parameters(), lr=0.001)

        for episode in range(episodes):
            state = _reset_env(env)
            episode_rewards = []
            log_probs = []
            states = []
            actions = []

            for _ in range(max_steps):
                state_tensor = torch.as_tensor(state, dtype=torch.float32)
                # The behaviour policy's log-probs must be constants: if they kept
                # their graph, the gradient of (new - old) would cancel to zero.
                with torch.no_grad():
                    mean, std = policy(state_tensor)
                    dist = MultivariateNormal(mean, torch.diag(std ** 2))
                    action = dist.sample()
                    log_prob = dist.log_prob(action)

                next_state, reward, done = _step_env(env, action.numpy())

                states.append(state_tensor)
                actions.append(action)
                log_probs.append(log_prob)
                episode_rewards.append(reward)

                state = next_state

                if done:
                    break

            state_batch = torch.stack(states)
            action_batch = torch.stack(actions)
            old_log_probs = torch.stack(log_probs)

            # Compute advantages from the pre-update critic, without a graph, so the
            # policy loss and the value loss never share (and double-free) one graph.
            returns = _discounted_returns(episode_rewards, gamma)
            with torch.no_grad():
                # squeeze(-1) keeps the batch dimension even for one-step episodes.
                advantages = returns - value_net(state_batch).squeeze(-1)

            for _ in range(update_epochs):
                # Compute policy loss with PPO clipping
                mean, std = policy(state_batch)
                new_dist = MultivariateNormal(mean, torch.diag(std ** 2))
                new_log_probs = new_dist.log_prob(action_batch)
                ratio = torch.exp(new_log_probs - old_log_probs)

                surrogate1 = ratio * advantages
                surrogate2 = torch.clamp(ratio, 1 - clip_epsilon, 1 + clip_epsilon) * advantages
                policy_loss = -torch.min(surrogate1, surrogate2).mean()

                policy_optimizer.zero_grad()
                policy_loss.backward()
                policy_optimizer.step()

                # Update value function on a fresh forward pass
                value_loss = nn.functional.mse_loss(value_net(state_batch).squeeze(-1), returns)
                value_optimizer.zero_grad()
                value_loss.backward()
                value_optimizer.step()

            reward_history.append(sum(episode_rewards))

            if episode % 50 == 0:
                print(f"Episode {episode}, Reward: {sum(episode_rewards)}")
    finally:
        # Release the environment even if training is interrupted.
        env.close()
    return reward_history

# Run PPO training with the default settings and plot the total reward per episode.
reward_history = train_ppo()
plt.plot(reward_history)
plt.xlabel("Episodes")
plt.ylabel("Total Reward")
plt.title("PPO Training Performance")
plt.show()

# In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchrl
import gym
import numpy as np
import matplotlib.pyplot as plt
from torchrl.envs import GymEnv
from torchrl.data import TensorDictReplayBuffer
from torchrl.modules import ProbabilisticActor, ValueOperator
from torchrl.objectives import ClipPPOLoss
from torchrl.trainers import make_trainer

# Define environment
env_name = "LunarLanderContinuous-v2"
env = GymEnv(env_name)
# A TorchRL observation_spec is a composite (dict-like) spec whose own shape is
# the environment batch size, so the feature size is read from its
# "observation" entry. The last dimension is used for both sizes.
state_dim = env.observation_spec["observation"].shape[-1]
action_dim = env.action_spec.shape[-1]

# Define policy network
class PolicyNetwork(nn.Module):
    """MLP that parameterises a Gaussian over actions (mean from the net, learned global std)."""

    def __init__(self, state_dim, action_dim, hidden_size=128):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_dim)
        # Log of the per-dimension standard deviation, initialised to log(1) = 0.
        self.log_std = nn.Parameter(torch.zeros(action_dim))

    def forward(self, x):
        """Return the tuple ``(mean, std)`` for the given states."""
        features = x
        # Both hidden layers share the same ReLU activation.
        for hidden_layer in (self.fc1, self.fc2):
            features = torch.relu(hidden_layer(features))
        return self.fc3(features), torch.exp(self.log_std)

# Define value network
class ValueNetwork(nn.Module):
    """MLP critic producing one value estimate per input state."""

    def __init__(self, state_dim, hidden_size=128):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return the value estimate with a trailing dimension of size 1."""
        features = x
        # Both hidden layers share the same ReLU activation.
        for hidden_layer in (self.fc1, self.fc2):
            features = torch.relu(hidden_layer(features))
        return self.fc3(features)

# Initialize networks
policy = PolicyNetwork(state_dim, action_dim)
value_net = ValueNetwork(state_dim)

# Create actor and critic operators
# NOTE(review): `policy.forward` returns a plain (mean, std) tuple and no
# in/out keys are given here; ProbabilisticActor presumably expects a
# TensorDict-aware module with keys naming the distribution parameters --
# confirm against the TorchRL documentation.
actor = ProbabilisticActor(policy, action_spec=env.action_spec, distribution_class=torch.distributions.Normal)
# NOTE(review): no in_keys are passed, so this relies on ValueOperator's
# defaults -- TODO confirm they match the environment's observation key.
critic = ValueOperator(value_net)

# Define PPO Loss (clipped surrogate objective, clip range 0.2, entropy bonus enabled)
ppo_loss = ClipPPOLoss(actor, critic, clip_epsilon=0.2, entropy_bonus=True)

# Set up optimizers: separate Adam instances (and learning rates) for the
# policy and the value network
policy_optimizer = optim.Adam(policy.parameters(), lr=3e-4)
value_optimizer = optim.Adam(value_net.parameters(), lr=1e-3)

# Set up replay buffer
# NOTE(review): verify that TensorDictReplayBuffer accepts a `size` keyword in
# the installed TorchRL version -- TODO confirm.
buffer = TensorDictReplayBuffer(size=100000)

# Define trainer
# NOTE(review): verify that `torchrl.trainers` exports `make_trainer` with
# these keyword arguments (including a list of optimizers) -- TODO confirm.
trainer = make_trainer(
    loss_module=ppo_loss,
    env=env,
    buffer=buffer,
    optimizer=[policy_optimizer, value_optimizer],
    batch_size=64,
    max_epochs=1000,
)

# Train PPO model
trainer.train()

# Plot results
# NOTE(review): assumes the trainer exposes an `episode_rewards` sequence --
# TODO confirm.
plt.plot(trainer.episode_rewards)
plt.xlabel("Episodes")
plt.ylabel("Total Reward")
plt.title("PPO Training Performance with TorchRL")
plt.show()