In [11]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# Define the Actor network
class Actor(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Actor, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Tanh(),
            nn.Linear(64, output_dim),
            nn.Softmax(dim=-1),
        )

    def forward(self, state):
        return self.network(state)

# Define the Critic network
class Critic(nn.Module):
    def __init__(self, input_dim):
        super(Critic, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 64),
            nn.Tanh(),
            nn.Linear(64, 1)
        )

    def forward(self, state):
        return self.network(state)

# Set up the environment
env = gym.make('CartPole-v1')


# Get the number of action space and observation space from the environment
n_actions = env.action_space.n
n_states = env.observation_space.shape[0]

# Initialize Actor and Critic networks
actor = Actor(input_dim=n_states, output_dim=n_actions)
critic = Critic(input_dim=n_states)

# Set up optimizers for both Actor and Critic
actor_optimizer = optim.Adam(actor.parameters(), lr=1e-5)
critic_optimizer = optim.Adam(critic.parameters(), lr=1e-3)

# Hyperparameters
gamma = 0.99  # Discount factor

# Placeholder for the training loop
def train():
    state = env.reset()
    done = False
    while not done:
        # Convert state to a tensor
        print(state[1])
        state = torch.FloatTensor(state[0])

        # Get action probabilities from the actor network
        action_probs = actor(state)

        # Sample action from the probability distribution
        action = np.random.choice(np.arange(n_actions), p=action_probs.detach().numpy())

        # Take action in the environment
        next_state, reward, done, _ = env.step(action)

        # Compute the value of the current state
        state_value = critic(state)

        # Compute the value of the next state
        next_state = torch.FloatTensor(next_state)
        next_state_value = critic(next_state)

        # Compute the advantage
        advantage = reward + (1 - done) * gamma * next_state_value - state_value

        # Update Critic
        critic_loss = advantage.pow(2).mean()
        critic_optimizer.zero_grad()
        critic_loss.backward()
        critic_optimizer.step()

        # Update Actor
        actor_loss = -torch.log(action_probs[action]) * advantage.detach()
        actor_optimizer.zero_grad()
        actor_loss.backward()
        actor_optimizer.step()

        # Move to the next state
        state = next_state.numpy()

# Run the training loop
train()

{}


ValueError: too many values to unpack (expected 4)