In [4]:
import gymnasium as gym
import gymnasium_robotics

# Register the Gymnasium-Robotics environments so "FetchPickAndPlace-v4" can be created.
gym.register_envs(gymnasium_robotics)
env = gym.make("FetchPickAndPlace-v4", render_mode="human")
try:
    # Fixed seed so the printed initial observation is the same on every run.
    observation, info = env.reset(seed=42)
    # The observation is a dict holding 'observation', 'achieved_goal' and
    # 'desired_goal' arrays.
    print(observation)
finally:
    # Always release the human-mode viewer; later cells create their own env,
    # so leaving this one open would leak its window.
    env.close()

{'observation': array([ 1.34193475e+00,  7.49101049e-01,  5.34725189e-01,  1.44951439e+00,
        8.08311461e-01,  4.24702090e-01,  1.07579639e-01,  5.92104125e-02,
       -1.10023099e-01,  3.83587563e-06,  6.44539257e-08,  1.36444225e-17,
       -4.38776463e-17,  9.33831897e-16,  5.26138825e-06,  7.50031136e-08,
        2.22065813e-05, -2.87319743e-18,  1.47598746e-17,  1.66271754e-18,
       -5.26138825e-06, -7.50031136e-08,  2.46214240e-05, -9.88156877e-07,
        7.65319420e-08]), 'achieved_goal': array([1.44951439, 0.80831146, 0.42470209]), 'desired_goal': array([1.22018822, 0.89178776, 0.42469975])}


In [None]:
import gymnasium as gym

import time  # 👈 import time module
# Random-policy rollout: drive the arm with uniformly sampled actions for 100
# steps so the environment can be watched in the human-mode viewer.
# NOTE(review): this cell uses `gymnasium_robotics` but does not import it
# itself; it relies on the import executed in the first cell.
gym.register_envs(gymnasium_robotics)
env = gym.make("FetchPickAndPlace-v4", render_mode="human")
try:
    obs, info = env.reset(seed=42)

    for step in range(100):
        action = env.action_space.sample()
        # With render_mode="human" the frame is drawn as part of step(), so no
        # separate env.render() call is needed.
        obs, reward, terminated, truncated, info = env.step(action)
        time.sleep(0.1)  # pause 100 ms between frames; increase to 0.5 if too fast
        if terminated or truncated:
            # An ended episode must be reset before stepping again.
            obs, info = env.reset()
finally:
    # Close the viewer even if the loop raises or the kernel is interrupted.
    env.close()


In [None]:
import gymnasium as gym
import gymnasium_robotics
import torch
import torch.nn.functional as F
import numpy as np
import random

from src.agents import Actor, Critic
gym.register_envs(gymnasium_robotics)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed every source of randomness used below (replay sampling, exploration
# noise, network initialisation) so that a training run can be reproduced.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

env = gym.make("FetchPickAndPlace-v4")
# Seed the environment's own RNG once, right after creation; later reset()
# calls without a seed continue from this generator.
env.reset(seed=SEED)
env.action_space.seed(SEED)

# The policy input is the raw observation concatenated with the desired goal.
obs_dim = env.observation_space['observation'].shape[0] + env.observation_space['desired_goal'].shape[0]  # 25 + 3 = 28
act_dim = env.action_space.shape[0]  # 4

# Initialize actor-critic networks and their target copies.
# NOTE(review): Actor()/Critic() are built without obs_dim/act_dim, so the
# layer sizes presumably live inside src.agents -- confirm they match the
# dimensions computed above.
actor = Actor().to(device)
critic = Critic().to(device)
actor_target = Actor().to(device)
critic_target = Critic().to(device)

# Targets start as exact copies of the online networks.
actor_target.load_state_dict(actor.state_dict())
critic_target.load_state_dict(critic.state_dict())

actor_optimizer = torch.optim.Adam(actor.parameters(), lr=1e-4)
critic_optimizer = torch.optim.Adam(critic.parameters(), lr=1e-3)

gamma = 0.98  # discount factor used in the TD target
tau = 0.005  # soft update coefficient

# Simple replay buffer: a list of
# (state, action, reward, done, next_state) tuples.
replay_buffer = []

def soft_update(target, source, tau):
    """Blend ``source`` parameters into ``target`` in place (Polyak averaging).

    Each target parameter becomes ``tau * source + (1 - tau) * target``.

    Args:
        target: Module whose parameters are overwritten.
        source: Module whose parameters are read; it is not modified.
        tau: Interpolation weight given to ``source``.

    Parameters are paired positionally via ``parameters()``, so both modules
    are expected to have the same architecture.
    """
    # no_grad() keeps the in-place write out of the autograd graph without
    # going through the discouraged ``.data`` attribute.
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.copy_(tau * s_param + (1.0 - tau) * t_param)

# Training loop: off-policy actor-critic with target networks and a replay
# buffer. Each environment step stores one transition and, once enough data
# is available, performs one critic update, one actor update and one soft
# target update.
num_episodes = 500
num_time_steps = 500  # upper bound per episode; the loop stops early when the env ends the episode
batch_size = 64
replay_capacity = 1_000_000  # maximum number of stored transitions

# Write index for overwriting the oldest transition in O(1) once the buffer
# is full (instead of shifting the whole list with pop(0)).
replay_pos = len(replay_buffer) % replay_capacity

for ep in range(num_episodes):
    obs, _ = env.reset()
    obs_input = np.concatenate([obs['observation'], obs['desired_goal']])

    for t in range(num_time_steps):
        # Convert to a (1, obs_dim) tensor for the actor
        obs_tensor = torch.tensor(obs_input, dtype=torch.float32).unsqueeze(0).to(device)

        # Actor selects action
        with torch.no_grad():
            action = actor(obs_tensor).squeeze(0).cpu().numpy()

        # Add Gaussian exploration noise and keep the action inside [-1, 1]
        action += np.random.normal(0, 0.1, size=act_dim)
        action = np.clip(action, -1.0, 1.0)

        # Step in environment
        obs_tp1, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated

        # Flatten next observation
        obs_tp1_input = np.concatenate([obs_tp1['observation'], obs_tp1['desired_goal']])

        # Save to buffer. Only a true termination is stored as the done flag:
        # a truncation (e.g. a time limit) is not a terminal state, so the TD
        # target must still bootstrap from the next state in that case.
        transition = (obs_input, action, reward, float(terminated), obs_tp1_input)
        if len(replay_buffer) < replay_capacity:
            replay_buffer.append(transition)
        else:
            replay_buffer[replay_pos] = transition
        replay_pos = (replay_pos + 1) % replay_capacity

        # Prepare for next step
        obs_input = obs_tp1_input

        # Train only once the buffer can fill a batch
        if len(replay_buffer) >= batch_size:
            # Sample batch
            batch = random.sample(replay_buffer, batch_size)
            s, a, r, d, s_next = zip(*batch)

            # Stack into one ndarray first; building a tensor directly from a
            # tuple of ndarrays is very slow.
            s = torch.tensor(np.asarray(s), dtype=torch.float32).to(device)
            a = torch.tensor(np.asarray(a), dtype=torch.float32).to(device)
            r = torch.tensor(np.asarray(r), dtype=torch.float32).unsqueeze(1).to(device)
            d = torch.tensor(np.asarray(d), dtype=torch.float32).unsqueeze(1).to(device)
            s_next = torch.tensor(np.asarray(s_next), dtype=torch.float32).to(device)

            # ----- Critic update -----
            # TD target from the target networks; (1 - d) removes the
            # bootstrap term for terminal transitions.
            with torch.no_grad():
                a_next = actor_target(s_next)
                q_next = critic_target(s_next, a_next)
                y = r + gamma * (1.0 - d) * q_next

            q = critic(s, a)
            critic_loss = F.mse_loss(q, y)
            critic_optimizer.zero_grad()
            critic_loss.backward()
            critic_optimizer.step()

            # ----- Actor update -----
            # Maximise the critic's value of the actor's own actions. The
            # gradients this leaves on the critic are discarded by the
            # critic_optimizer.zero_grad() of the next update.
            a_pred = actor(s)
            q_pred = critic(s, a_pred)
            actor_loss = -q_pred.mean()
            actor_optimizer.zero_grad()
            actor_loss.backward()
            actor_optimizer.step()

            # ----- Soft update -----
            soft_update(actor_target, actor, tau)
            soft_update(critic_target, critic, tau)

        # Once the env reports the episode as over it must be reset before
        # being stepped again, so leave the inner loop here.
        if done:
            break

    # `reward` is the reward of the final step of the episode.
    print(f"Episode {ep+1} done, reward {reward}")

env.close()


Episode 1 done.
Episode 2 done.
Episode 3 done.
Episode 4 done.
Episode 5 done.
Episode 6 done.
Episode 7 done.
Episode 8 done.
Episode 9 done.
Episode 10 done.
Episode 11 done.
Episode 12 done.
Episode 13 done.
Episode 14 done.
Episode 15 done.
Episode 16 done.
Episode 17 done.
Episode 18 done.
Episode 19 done.
Episode 20 done.
Episode 21 done.
Episode 22 done.
Episode 23 done.
Episode 24 done.
Episode 25 done.
Episode 26 done.
Episode 27 done.
Episode 28 done.
Episode 29 done.
Episode 30 done.
Episode 31 done.
Episode 32 done.
Episode 33 done.
Episode 34 done.
Episode 35 done.
Episode 36 done.
Episode 37 done.
Episode 38 done.
Episode 39 done.
Episode 40 done.
Episode 41 done.
Episode 42 done.
Episode 43 done.
Episode 44 done.
Episode 45 done.
Episode 46 done.
Episode 47 done.
Episode 48 done.
Episode 49 done.
Episode 50 done.
Episode 51 done.
Episode 52 done.
Episode 53 done.
Episode 54 done.
Episode 55 done.
Episode 56 done.
Episode 57 done.
Episode 58 done.
Episode 59 done.
Episod

In [9]:
import gymnasium as gym
import gymnasium_robotics
import torch
import numpy as np
import time
# Visualise the trained policy: run the actor greedily (no exploration noise)
# for 100 steps in a human-mode viewer.
# NOTE: `actor` and `device` are not defined in this cell; they come from the
# training cell, which must have been executed first.
gym.register_envs(gymnasium_robotics)
env = gym.make("FetchPickAndPlace-v4", render_mode="human")
try:
    obs, info = env.reset(seed=42)

    for step in range(100):
        # Same input layout as during training: observation followed by goal.
        obs_input = np.concatenate([obs['observation'], obs['desired_goal']])
        obs_tensor = torch.tensor(obs_input, dtype=torch.float32).unsqueeze(0).to(device)

        with torch.no_grad():
            action = actor(obs_tensor).squeeze(0).cpu().numpy()

        # With render_mode="human" the frame is drawn as part of step(), so no
        # separate env.render() call is needed.
        obs, reward, terminated, truncated, info = env.step(action)
        time.sleep(0.1)  # slow playback down to roughly 10 frames per second

        if terminated or truncated:
            # An ended episode must be reset before stepping again.
            obs, info = env.reset()
finally:
    # Close the viewer even if the loop raises or the kernel is interrupted.
    env.close()
