<a href="https://colab.research.google.com/github/EthanCui2008/Kaggle/blob/Main/Robot_Gym.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gymnasium
import numpy as np
from gymnasium import spaces
class ContinuousXYRobotEnv(gymnasium.Env):
    """Pick-and-place task for a point robot on a continuous 2-D plane.

    The robot starts at (5, 5). An object and a goal are each placed uniformly
    at random in the ``[0, 10] x [0, 10]`` square. The robot has to walk to the
    object, pick it up, carry it to the goal and drop it there.

    Action (``Box(3,)``, float32): ``[move_x, move_y, grip]`` with both moves in
    ``[-1, 1]`` and ``grip`` in ``[0, 1]``. ``grip > 0.5`` requests a pick-up
    (when empty-handed and within reach of the object) or a drop (when carrying
    and within reach of the goal).

    Observation (``Box(7,)``, float32):
    ``[robot_x, robot_y, object_x, object_y, goal_x, goal_y, holding_object]``.

    Reward: a dense shaping term ``1 / (distance_to_target + 1)`` where the
    target is the object while empty-handed and the goal while carrying, plus
    ``+10`` on pick-up and ``+50`` on delivery.

    Episode end: ``terminated`` is True only on delivery; ``truncated`` is True
    once ``max_episode_steps`` steps have elapsed without a delivery.

    Args:
        max_episode_steps: Step budget per episode, or ``None`` for no limit.
        render_mode: Stored on the instance for Gymnasium compatibility;
            ``render()`` always prints to stdout.

    Raises:
        ValueError: If ``max_episode_steps`` is not ``None`` and is < 1.
    """

    metadata = {"render_modes": ["human"]}

    def __init__(self, max_episode_steps=500, render_mode=None):
        super().__init__()

        if max_episode_steps is not None and max_episode_steps < 1:
            raise ValueError("max_episode_steps must be a positive integer or None")

        # Kept for backward compatibility; none of these three values is used
        # by the dynamics below (the playable area is ARENA_MAX x ARENA_MAX).
        self.GYM_SIZE = (27, 10)  # Size of the gym (width, height)
        self.ROBOT_SIZE = (0.5, 0.5)  # Size of the robot (width, height)
        self.ROBOT_MASS = 1.0  # Mass of the robot

        self.ARENA_MAX = 10.0  # Upper bound of both coordinates
        self.REACH_RADIUS = 0.5  # Max distance for pick-up / drop to succeed
        self.max_episode_steps = max_episode_steps
        self.render_mode = render_mode

        # Actions: [move_x, move_y, pick_up/drop]
        self.action_space = spaces.Box(
            low=np.array([-1.0, -1.0, 0.0], dtype=np.float32),
            high=np.array([1.0, 1.0, 1.0], dtype=np.float32),
            dtype=np.float32,
        )

        # Observations: [robot_x, robot_y, object_x, object_y, goal_x, goal_y, holding_object]
        # The holding flag only takes the values 0/1, which lie inside [0, ARENA_MAX].
        self.observation_space = spaces.Box(
            low=0.0, high=self.ARENA_MAX, shape=(7,), dtype=np.float32
        )

        # Populate the state so render()/_get_obs() work before an explicit reset().
        self.reset()

    def seed(self, seed=None):
        """Re-seed the environment's random generator (legacy Gym-style hook)."""
        # The module is imported as ``gymnasium``; the former ``gym.utils...``
        # lookup raised NameError on a fresh kernel.
        self.np_random, _ = gymnasium.utils.seeding.np_random(seed)

    def reset(self, seed=None, options=None):
        """Start a new episode.

        Args:
            seed: Optional seed; when given, the generator is re-seeded first.
            options: Accepted for API compatibility and ignored.

        Returns:
            ``(observation, info)`` with an empty ``info`` dict.
        """
        # Let the base class own the seeding so ``self.np_random`` follows the
        # Gymnasium contract (re-seeded only when ``seed`` is not None).
        super().reset(seed=seed)

        self._elapsed_steps = 0
        self.robot_pos = np.array([5.0, 5.0], dtype=np.float32)
        self.object_pos = self.np_random.uniform(0.0, self.ARENA_MAX, size=2).astype(np.float32)
        self.goal_pos = self.np_random.uniform(0.0, self.ARENA_MAX, size=2).astype(np.float32)
        self.holding_object = 0
        return self._get_obs(), {}

    def _get_obs(self):
        """Return the 7-element float32 observation vector."""
        return np.concatenate(
            [self.robot_pos, self.object_pos, self.goal_pos, [self.holding_object]]
        ).astype(np.float32)

    def _within_reach(self, point):
        """Return True when the robot is closer than REACH_RADIUS to ``point``."""
        return bool(np.linalg.norm(self.robot_pos - point) < self.REACH_RADIUS)

    def step(self, action):
        """Advance the simulation by one step.

        Args:
            action: Sequence ``[move_x, move_y, grip]``; values outside the
                action-space bounds are clipped.

        Returns:
            ``(observation, reward, terminated, truncated, info)`` where
            ``info["is_success"]`` tells whether the object was delivered on
            this step.
        """
        action = np.clip(
            np.asarray(action, dtype=np.float32),
            self.action_space.low,
            self.action_space.high,
        )
        move_x, move_y, pick_up = action

        # Update robot position with continuous values, staying inside the arena.
        self.robot_pos = np.clip(
            self.robot_pos + np.array([move_x, move_y], dtype=np.float32),
            0.0,
            self.ARENA_MAX,
        ).astype(np.float32)

        # A carried object travels with the robot; the shaping term then pulls
        # towards the goal instead of back to where the object was picked up.
        if self.holding_object:
            self.object_pos[:] = self.robot_pos
            target = self.goal_pos
        else:
            target = self.object_pos
        reward = 1.0 / (float(np.linalg.norm(self.robot_pos - target)) + 1.0)

        # Picking up or dropping the object
        wants_grip = bool(pick_up > 0.5)
        delivered = False
        if wants_grip and not self.holding_object and self._within_reach(self.object_pos):
            self.holding_object = 1
            self.object_pos[:] = self.robot_pos
            reward += 10  # Reward for picking up the object
        elif wants_grip and self.holding_object and self._within_reach(self.goal_pos):
            self.holding_object = 0
            delivered = True
            reward += 50  # Reward for delivering the object to the goal

        # The episode only succeeds on an actual delivery; merely standing on
        # the goal empty-handed no longer ends it.
        terminated = delivered

        self._elapsed_steps += 1
        truncated = (
            not terminated
            and self.max_episode_steps is not None
            and self._elapsed_steps >= self.max_episode_steps
        )

        # Cast to a plain Python float: NumPy float32 scalars are not ``float``
        # instances and are rejected by SB3's ``check_env``.
        return self._get_obs(), float(reward), terminated, truncated, {"is_success": delivered}

    def render(self, mode='human'):
        """Print the current state to stdout (``mode`` is accepted but unused)."""
        print(f"Robot Position: {self.robot_pos}")
        print(f"Object Position: {self.object_pos}")
        print(f"Goal Position: {self.goal_pos}")
        print(f"Holding Object: {self.holding_object}")


In [None]:
# Instantiate the custom environment. The constructor itself calls reset(), so
# the first random object/goal layout is already drawn once this cell has run.
env = ContinuousXYRobotEnv()

In [None]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.evaluation import evaluate_policy

# Validate the environment against the Gymnasium API before spending time on training.
check_env(env)

# Create the PPO model
model = PPO("MlpPolicy", env, verbose=1)

# Train the agent
model.learn(total_timesteps=100000)

# Save the model, then reload it to confirm the checkpoint round-trips.
model.save("ppo_custom_env")
model = PPO.load("ppo_custom_env")

# Evaluate the policy
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print(f"Mean reward: {mean_reward} +/- {std_reward}")

# Render one episode with the trained policy.
# The loop stops on termination OR truncation, and is additionally capped so a
# policy that never finishes cannot hang the notebook.
MAX_RENDER_STEPS = 1000
obs, _ = env.reset()
done = False
steps_rendered = 0
while not done and steps_rendered < MAX_RENDER_STEPS:
    # deterministic=True matches the default used by evaluate_policy above.
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, _info = env.step(action)
    done = terminated or truncated
    steps_rendered += 1
    env.render()
    print(f"Reward: {reward}")



Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 270      |
|    ep_rew_mean     | 37.3     |
| time/              |          |
|    fps             | 1251     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 265         |
|    ep_rew_mean          | 70.4        |
| time/                   |             |
|    fps                  | 351         |
|    iterations           | 2           |
|    time_elapsed         | 11          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.010501184 |
|    clip_fraction        | 0.122       |
|    clip_range           | 0.2         |
|    entropy_loss   



KeyboardInterrupt: 

In [None]:
!pip install stable_baselines3

  and should_run_async(code)


Collecting stable_baselines3
  Downloading stable_baselines3-2.3.2-py3-none-any.whl (182 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/182.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m174.1/182.3 kB[0m [31m5.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.3/182.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gymnasium<0.30,>=0.28.1 (from stable_baselines3)
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium<0.30,>=0.28.1->stable_baselines3)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13->stable_baselines3)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-