# In [None]:  (Jupyter cell marker left over from the notebook export)
import torch
import torch.nn as nn
import torch.optim as optim
import torchrl
import gym
import numpy as np
import matplotlib.pyplot as plt
from torchrl.envs import GymEnv
from torchrl.modules import MLP, ProbabilisticActor, ValueOperator
from torchrl.data import TensorDictReplayBuffer
from torchrl.objectives import DQNLoss, ClipPPOLoss
from torchrl.trainers import make_trainer

# Introduction to TorchRL
print("TorchRL Version:", torchrl.__version__)

# Environment setup
env_name = "CartPole-v1"
env = GymEnv(env_name)

# The environment-level observation spec is a composite container whose own
# shape is the (empty) batch size, so the feature width has to be read from
# the "observation" entry rather than from the container itself.
state_dim = env.observation_spec["observation"].shape[-1]

# A shaped action spec (e.g. one-hot) carries its width in the last dimension.
# A scalar categorical spec has an empty shape and exposes its number of
# choices through `space.n`, which keeps `action_dim` an int in both cases.
action_dim = (
    env.action_spec.shape[-1]
    if len(env.action_spec.shape) > 0
    else env.action_spec.space.n
)

# Exploring the environment
def explore_env(env, episodes=5):
    """Run random-action episodes and print each episode's total reward.

    Args:
        env: A single (unbatched) TorchRL environment. It is driven through
            ``reset`` / ``rand_step`` and read through the TensorDict entries
            ``("next", "reward")`` and ``("next", "done")``.
        episodes: Number of episodes to run.
    """
    # Imported locally so the function does not depend on extra file-level imports.
    from torchrl.envs.utils import step_mdp

    for episode in range(episodes):
        # TorchRL environments exchange TensorDicts, not Gym-style tuples.
        tensordict = env.reset()
        done = False
        total_reward = 0.0
        while not done:
            # rand_step writes a random action into the TensorDict and steps with it.
            tensordict = env.rand_step(tensordict)
            total_reward += tensordict["next", "reward"].item()
            done = tensordict["next", "done"].item()
            # Move the "next" entries to the root so they seed the following step.
            tensordict = step_mdp(tensordict)
        print(f"Episode {episode+1}: Total Reward: {total_reward}")

# Run the default five random-action episodes and print their total rewards.
explore_env(env)

# Define Policy Network
class PolicyNetwork(nn.Module):
    """Network that outputs a mean and a standard deviation per observation.

    Args:
        state_dim: Number of input features fed to the MLP.
        action_dim: Number of outputs of the MLP and length of ``log_std``.
        hidden_size: Width of each of the two hidden layers.
    """

    def __init__(self, state_dim, action_dim, hidden_size=128):
        super().__init__()
        # TorchRL's MLP takes the hidden-layer widths through `num_cells`;
        # it has no `hidden_sizes` argument.
        self.fc = MLP(
            in_features=state_dim,
            out_features=action_dim,
            num_cells=[hidden_size, hidden_size],
        )
        # Input-independent log standard deviation, initialised to 0 (std = 1).
        self.log_std = nn.Parameter(torch.zeros(action_dim))

    def forward(self, x):
        """Return ``(mean, std)`` for input ``x``; ``std`` has ``mean``'s shape."""
        mean = self.fc(x)
        # Expand the shared std over the mean's leading dimensions so both
        # outputs carry the same batch shape.
        std = torch.exp(self.log_std).expand_as(mean)
        return mean, std

# Define Value Network
class ValueNetwork(nn.Module):
    """Network that maps an input of ``state_dim`` features to a single output.

    Args:
        state_dim: Number of input features fed to the MLP.
        hidden_size: Width of each of the two hidden layers.
    """

    def __init__(self, state_dim, hidden_size=128):
        super().__init__()
        # TorchRL's MLP takes the hidden-layer widths through `num_cells`;
        # it has no `hidden_sizes` argument.
        self.fc = MLP(
            in_features=state_dim,
            out_features=1,
            num_cells=[hidden_size, hidden_size],
        )

    def forward(self, x):
        """Return the MLP output for ``x`` (last dimension of size 1)."""
        return self.fc(x)

# Initialize Networks
policy = PolicyNetwork(state_dim, action_dim)
value_net = ValueNetwork(state_dim)

# Create actor and critic operators
# NOTE(review): TorchRL documents ProbabilisticActor as wrapping a
# TensorDictModule and taking `in_keys` plus `spec=`; a bare nn.Module with
# `action_spec=` may be rejected — confirm against the installed version.
# NOTE(review): a Normal distribution produces continuous actions, while
# CartPole-v1 is normally a discrete-action task — confirm the intended pairing.
actor = ProbabilisticActor(policy, action_spec=env.action_spec, distribution_class=torch.distributions.Normal)
critic = ValueOperator(value_net)

# Define Loss Functions
# Clipped PPO objective over the actor/critic pair, with the entropy bonus enabled.
ppo_loss = ClipPPOLoss(actor, critic, clip_epsilon=0.2, entropy_bonus=True)
# `dqn_loss` is built here but not referenced again in this section.
# NOTE(review): DQNLoss is documented as taking a Q-value network; confirm that
# passing the state-value critic and a `gamma=` keyword is supported.
dqn_loss = DQNLoss(critic, gamma=0.99)

# Set up Optimizers
# Separate Adam optimizers: lr 3e-4 for the policy, 1e-3 for the value network.
policy_optimizer = optim.Adam(policy.parameters(), lr=3e-4)
value_optimizer = optim.Adam(value_net.parameters(), lr=1e-3)

# Set up Replay Buffer
# NOTE(review): TensorDictReplayBuffer is documented with a `storage=` argument
# (e.g. a LazyTensorStorage of the desired capacity); confirm `size=` is accepted.
buffer = TensorDictReplayBuffer(size=100000)

# Define Trainer for PPO
# NOTE(review): the documented `make_trainer` helper lives in
# `torchrl.trainers.helpers` and is built from a collector and config rather
# than these keywords — confirm this call signature exists in the installed version.
ppo_trainer = make_trainer(
    loss_module=ppo_loss,
    env=env,
    buffer=buffer,
    optimizer=[policy_optimizer, value_optimizer],
    batch_size=64,
    max_epochs=500,
)

# Train PPO Model
ppo_trainer.train()

# Evaluating the trained model
def evaluate_agent(env, policy, episodes=10):
    """Run the policy's mean action for several episodes and print the rewards.

    Args:
        env: A single (unbatched) TorchRL environment. It is driven through
            ``reset`` / ``step`` and read through the TensorDict entries
            ``"observation"``, ``("next", "reward")`` and ``("next", "done")``.
        policy: Callable mapping an observation tensor to ``(mean, std)``;
            only the mean is used as the action.
        episodes: Number of evaluation episodes.
    """
    # Imported locally so the function does not depend on extra file-level imports.
    from torchrl.envs.utils import step_mdp

    total_rewards = []
    for episode in range(episodes):
        # TorchRL environments exchange TensorDicts, not Gym-style tuples.
        tensordict = env.reset()
        done = False
        episode_reward = 0.0
        while not done:
            observation = tensordict["observation"].to(torch.float32)
            # No gradients are needed while evaluating.
            with torch.no_grad():
                mean, _ = policy(observation)
            # NOTE(review): using the mean directly as the action presumes a
            # continuous action spec — confirm for discrete environments.
            tensordict["action"] = mean
            tensordict = env.step(tensordict)
            episode_reward += tensordict["next", "reward"].item()
            done = tensordict["next", "done"].item()
            # Move the "next" entries to the root so they seed the following step.
            tensordict = step_mdp(tensordict)
        total_rewards.append(episode_reward)
        print(f"Evaluation Episode {episode+1}: Reward = {episode_reward}")
    print(f"Average Reward: {np.mean(total_rewards)}")

# Score the policy over the default number of evaluation episodes.
evaluate_agent(env, policy)

# Plot Training Results
# NOTE(review): assumes the trainer object exposes per-episode returns as
# `episode_rewards` — confirm against the trainer actually in use.
plt.plot(ppo_trainer.episode_rewards)
# Label both axes and set the title on the current axes in a single call.
plt.gca().set(
    xlabel="Episodes",
    ylabel="Total Reward",
    title="PPO Training Performance with TorchRL",
)
plt.show()

# Summary
print("TorchRL provides a modular and efficient way to implement RL algorithms, integrating seamlessly with PyTorch.")