In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import gym
import numpy as np

# Initialize the GridWorld environment
env = gym.make('MiniGrid-Empty-5x5-v0')

# Hyperparameters
gamma = 0.99
lr = 3e-4
clip_epsilon = 0.2
epochs = 10
rollout_len = 2048
batch_size = 64
n_updates = 1000

# Actor-Critic Network with correct input shape handling
class ActorCritic(nn.Module):
    def __init__(self, obs_shape, n_actions):
        super(ActorCritic, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(obs_shape[0], 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU()
        )
        self.fc = nn.Linear(32 * obs_shape[1] * obs_shape[2], 64)
        self.actor = nn.Linear(64, n_actions)
        self.critic = nn.Linear(64, 1)

    def forward(self, x):
        x = self.conv(x)  # (batch_size, 32, height, width)
        x = x.view(x.size(0), -1)  # Flatten
        x = torch.relu(self.fc(x))
        return self.actor(x), self.critic(x)

    def get_action(self, state):
        logits, _ = self.forward(state)
        probs = torch.softmax(logits, dim=-1)
        action = torch.multinomial(probs, 1).item()
        return action, probs

# Initialize the network and optimizer
obs_shape = (env.observation_space['image'].shape[2],  # Channels
             env.observation_space['image'].shape[0],  # Height
             env.observation_space['image'].shape[1])  # Width
n_actions = env.action_space.n
model = ActorCritic(obs_shape, n_actions)
optimizer = optim.Adam(model.parameters(), lr=lr)

# Training Loop with Correct Input Handling
for update in range(n_updates):
    states, actions, rewards, values, dones, log_probs = [], [], [], [], [], []
    state = env.reset()[0]['image']  # Get the image observation

    for _ in range(rollout_len):
        # Preprocess state: add batch dimension and convert to tensor
        state_tensor = torch.tensor(state, dtype=torch.float32).permute(2, 0, 1).unsqueeze(0)

        action, probs = model.get_action(state_tensor)
        value = model(state_tensor)[1]

        next_state, reward, done, _, _ = env.step(action)
        next_state = next_state['image']

        states.append(state_tensor)
        actions.append(torch.tensor(action))
        rewards.append(reward)
        values.append(value)
        dones.append(done)
        log_probs.append(torch.log(probs.squeeze(0)[action]))

        state = next_state
        if done:
            state = env.reset()[0]['image']

    # Convert collected data into tensors
    states = torch.cat(states)
    actions = torch.tensor(actions)
    log_probs_old = torch.cat(log_probs)
    returns = torch.tensor(rewards, dtype=torch.float32)
    advantages = returns - torch.tensor(values, dtype=torch.float32).squeeze(1)

    # PPO update step (omitted here for brevity, same as before)

    print(f"Update {update + 1} / {n_updates}")

print("Training complete!")


  if not isinstance(terminated, (bool, np.bool8)):


RuntimeError: zero-dimensional tensor (at position 0) cannot be concatenated

In [6]:

env = gym.make('MiniGrid-Empty-5x5-v0')  # A 5x5 grid with simple navigation


NameNotFound: Environment `MiniGrid-Empty-5x5` doesn't exist.

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

# Simple GridWorld environment
class GridWorld:
    def __init__(self, size=5):
        self.size = size
        self.reset()

    def reset(self):
        self.position = [0, 0]
        self.goal = [self.size-1, self.size-1]
        return self._get_state()

    def step(self, action):
        # 0: up, 1: right, 2: down, 3: left
        directions = [(-1, 0), (0, 1), (1, 0), (0, -1)]
        new_pos = [
            max(0, min(self.size-1, self.position[0] + directions[action][0])),
            max(0, min(self.size-1, self.position[1] + directions[action][1]))
        ]
        self.position = new_pos
        done = (self.position == self.goal)
        reward = 10 if done else -1
        return self._get_state(), reward, done

    def _get_state(self):
        return self.position[0] * self.size + self.position[1]

# Actor-Critic Network
class ActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(ActorCritic, self).__init__()
        self.actor = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, action_dim),
            nn.Softmax(dim=-1)
        )
        self.critic = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, state):
        return self.actor(state), self.critic(state)

# PPO Algorithm
def ppo_update(ppo_epochs, mini_batch_size, states, actions, old_log_probs, returns, advantages, clip_param, actor_critic, optimizer):
    for _ in range(ppo_epochs):
        for _ in range(len(states) // mini_batch_size):
            rand_ids = np.random.randint(0, len(states), mini_batch_size)
            
            batch_states = states[rand_ids]
            batch_actions = actions[rand_ids]
            batch_old_log_probs = old_log_probs[rand_ids]
            batch_returns = returns[rand_ids]
            batch_advantages = advantages[rand_ids]

            probs, state_values = actor_critic(batch_states)
            dist = Categorical(probs)
            entropy = dist.entropy().mean()
            new_log_probs = dist.log_prob(batch_actions)

            ratio = (new_log_probs - batch_old_log_probs).exp()
            surr1 = ratio * batch_advantages
            surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * batch_advantages
            actor_loss = -torch.min(surr1, surr2).mean()

            critic_loss = nn.MSELoss()(state_values, batch_returns.unsqueeze(1))
            
            loss = actor_loss + 0.5 * critic_loss - 0.01 * entropy

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

# Main training loop
def train_ppo(env, actor_critic, optimizer, num_episodes, update_interval):
    for episode in range(num_episodes):
        state = env.reset()
        done = False
        episode_reward = 0

        states, actions, rewards, log_probs, values = [], [], [], [], []

        while not done:
            state_tensor = torch.FloatTensor([state])
            probs, value = actor_critic(state_tensor)
            dist = Categorical(probs)
            action = dist.sample()
            
            next_state, reward, done = env.step(action.item())
            
            states.append(state_tensor)
            actions.append(action)
            rewards.append(reward)
            log_probs.append(dist.log_prob(action))
            values.append(value)

            state = next_state
            episode_reward += reward

        # Compute returns and advantages
        returns = []
        advantages = []
        running_return = 0
        running_advantage = 0
        for t in reversed(range(len(rewards))):
            running_return = rewards[t] + 0.99 * running_return
            running_advantage = running_return - values[t].item()
            returns.insert(0, running_return)
            advantages.insert(0, running_advantage)

        states = torch.cat(states)
        actions = torch.cat(actions)
        old_log_probs = torch.cat(log_probs)
        returns = torch.tensor(returns)
        advantages = torch.tensor(advantages)

        # PPO update
        if episode % update_interval == 0:
            ppo_update(4, 32, states, actions, old_log_probs, returns, advantages, 0.2, actor_critic, optimizer)

        if episode % 10 == 0:
            print(f"Episode {episode}, Reward: {episode_reward}")

# Initialize environment and model
env = GridWorld(size=5)
state_dim = env.size ** 2
action_dim = 4
actor_critic = ActorCritic(state_dim, action_dim)
optimizer = optim.Adam(actor_critic.parameters(), lr=3e-4)

# Train the model
train_ppo(env, actor_critic, optimizer, num_episodes=1000, update_interval=20)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x1 and 25x64)

In [5]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

class GridWorld:
    def __init__(self, size=5):
        self.size = size
        self.reset()

    def reset(self):
        self.position = [0, 0]
        self.goal = [self.size-1, self.size-1]
        return self._get_state()

    def step(self, action):
        directions = [(-1, 0), (0, 1), (1, 0), (0, -1)]
        new_pos = [
            max(0, min(self.size-1, self.position[0] + directions[action][0])),
            max(0, min(self.size-1, self.position[1] + directions[action][1]))
        ]
        self.position = new_pos
        done = (self.position == self.goal)
        reward = 10 if done else -1
        return self._get_state(), reward, done

    def _get_state(self):
        return self.position[0] * self.size + self.position[1]

class ActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(ActorCritic, self).__init__()
        self.actor = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, action_dim),
            nn.Softmax(dim=-1)
        )
        self.critic = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, state):
        print(f"Forward input shape: {state.shape}")  # Debug print
        return self.actor(state), self.critic(state)

def ppo_update(ppo_epochs, mini_batch_size, states, actions, old_log_probs, returns, advantages, clip_param, actor_critic, optimizer):
    for _ in range(ppo_epochs):
        for _ in range(len(states) // mini_batch_size):
            rand_ids = np.random.randint(0, len(states), mini_batch_size)
            
            batch_states = states[rand_ids]
            batch_actions = actions[rand_ids]
            batch_old_log_probs = old_log_probs[rand_ids]
            batch_returns = returns[rand_ids]
            batch_advantages = advantages[rand_ids]

            probs, state_values = actor_critic(batch_states)
            dist = Categorical(probs)
            entropy = dist.entropy().mean()
            new_log_probs = dist.log_prob(batch_actions)

            ratio = (new_log_probs - batch_old_log_probs).exp()
            surr1 = ratio * batch_advantages
            surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * batch_advantages
            actor_loss = -torch.min(surr1, surr2).mean()

            critic_loss = nn.MSELoss()(state_values, batch_returns.unsqueeze(1))
            
            loss = actor_loss + 0.5 * critic_loss - 0.01 * entropy

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

def train_ppo(env, actor_critic, optimizer, num_episodes, update_interval):
    for episode in range(num_episodes):
        state = env.reset()
        done = False
        episode_reward = 0

        states, actions, rewards, log_probs, values = [], [], [], [], []

        while not done:
            state_tensor = torch.FloatTensor([state])  # Changed this line
            print(f"State tensor shape: {state_tensor.shape}")  # Debug print
            probs, value = actor_critic(state_tensor)
            dist = Categorical(probs)
            action = dist.sample()
            
            next_state, reward, done = env.step(action.item())
            
            states.append(state_tensor)
            actions.append(action)
            rewards.append(reward)
            log_probs.append(dist.log_prob(action))
            values.append(value)

            state = next_state
            episode_reward += reward

        # Compute returns and advantages
        returns = []
        advantages = []
        running_return = 0
        running_advantage = 0
        for t in reversed(range(len(rewards))):
            running_return = rewards[t] + 0.99 * running_return
            running_advantage = running_return - values[t].item()
            returns.insert(0, running_return)
            advantages.insert(0, running_advantage)

        states = torch.cat(states)
        actions = torch.cat(actions)
        old_log_probs = torch.cat(log_probs)
        returns = torch.tensor(returns)
        advantages = torch.tensor(advantages)

        # PPO update
        if episode % update_interval == 0:
            ppo_update(4, 32, states, actions, old_log_probs, returns, advantages, 0.2, actor_critic, optimizer)

        if episode % 10 == 0:
            print(f"Episode {episode}, Reward: {episode_reward}")

# Initialize environment and model
env = GridWorld(size=5)
state_dim = 1  # Changed this line
action_dim = 4
actor_critic = ActorCritic(state_dim, action_dim)
optimizer = optim.Adam(actor_critic.parameters(), lr=3e-4)

# Train the model
train_ppo(env, actor_critic, optimizer, num_episodes=1000, update_interval=20)

State tensor shape: torch.Size([1])
Forward input shape: torch.Size([1])
State tensor shape: torch.Size([1])
Forward input shape: torch.Size([1])
State tensor shape: torch.Size([1])
Forward input shape: torch.Size([1])
State tensor shape: torch.Size([1])
Forward input shape: torch.Size([1])
State tensor shape: torch.Size([1])
Forward input shape: torch.Size([1])
State tensor shape: torch.Size([1])
Forward input shape: torch.Size([1])
State tensor shape: torch.Size([1])
Forward input shape: torch.Size([1])
State tensor shape: torch.Size([1])
Forward input shape: torch.Size([1])
State tensor shape: torch.Size([1])
Forward input shape: torch.Size([1])
State tensor shape: torch.Size([1])
Forward input shape: torch.Size([1])
State tensor shape: torch.Size([1])
Forward input shape: torch.Size([1])
State tensor shape: torch.Size([1])
Forward input shape: torch.Size([1])
State tensor shape: torch.Size([1])
Forward input shape: torch.Size([1])
State tensor shape: torch.Size([1])
Forward input s

RuntimeError: zero-dimensional tensor (at position 0) cannot be concatenated

In [16]:
class PPO(PPOBase):
    def _get_torch_obs(self, obs):
        # This method processes observations and could introduce shape changes
        obs = self.policy.obs_to_tensor(obs)
        print(f"Processed observation shape: {obs.shape}")  # Check the shape here
        return obs


NameError: name 'PPOBase' is not defined

In [7]:
float(rewards2

TypeError: only length-1 arrays can be converted to Python scalars