In [5]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

In [24]:
# Create Env
class MultiGoalEnvWithObstacles(gym.Env):
    def __init__(self, grid_size=10, num_goals=3, num_obstacles=5):
        super(MultiGoalEnvWithObstacles, self).__init__()
        
        self.grid_size = grid_size
        self.num_goals = num_goals
        self.num_obstacles = num_obstacles
        
        self.action_space = spaces.Discrete(4)  # 상, 하, 좌, 우
        self.observation_space = spaces.Box(low=0, high=grid_size-1, shape=(2,), dtype=np.int32)
        
        self.reset()

    def reset(self, seed=None, options=None):
        if seed is not None:
            np.random.seed(seed)

        self.robot_pos = np.array([0, 0])
        self.goals = self._generate_non_overlapping_positions(self.num_goals)
        self.obstacles = self._generate_non_overlapping_positions(self.num_obstacles, self.goals)
        self.visited_goals = set()
        self.steps = 0
        return self.robot_pos, {}

    def _generate_non_overlapping_positions(self, num_positions, exclude_positions=[]):
        positions = set(exclude_positions)
        while len(positions) < num_positions + len(exclude_positions):
            pos = tuple(np.random.randint(0, self.grid_size, size=2))
            if pos != (0, 0):  # 로봇의 초기 위치 제외
                positions.add(pos)
        return list(positions - set(exclude_positions))

    def step(self, action):
            new_pos = self.robot_pos.copy()
            if action == 0:
                new_pos[1] += 1  # 상
            elif action == 1:
                new_pos[1] -= 1  # 하
            elif action == 2:
                new_pos[0] -= 1  # 좌
            elif action == 3:
                new_pos[0] += 1  # 우

            # 그리드 경계를 벗어나지 않도록 처리
            new_pos = np.clip(new_pos, 0, self.grid_size-1)
            
            # 장애물에 부딪히지 않도록 처리
            if tuple(new_pos) not in self.obstacles:
                self.robot_pos = new_pos
            
            self.steps += 1
            
            reward = -1  # 기본 보상은 -1
            if tuple(new_pos) in self.obstacles:
                reward = -5  # 장애물에 부딪힐 경우 패널티
            for i, goal in enumerate(self.goals):
                if np.array_equal(self.robot_pos, goal):
                    reward = 20  # 목표 지점 도달 시 보상 증가
                    self.visited_goals.add(i)
            
            done = len(self.visited_goals) == self.num_goals or self.steps >= 100  # 모든 목표를 방문하거나 최대 스텝 수를 넘으면 종료
            
            return self.robot_pos, reward, done, {}, {}


In [25]:
from stable_baselines3 import PPO

# 환경 생성
env = MultiGoalEnvWithObstacles(grid_size=10, num_goals=3, num_obstacles=5)

# 모델 생성
model = PPO('MlpPolicy', env, verbose=1)

# 모델 학습
model.learn(total_timesteps=50000)  # 학습 시간을 50000으로 증가

# 모델 저장
model.save("ppo_robot_vacuum_multi_goal_obstacles")

# 모델 로드 및 평가
model = PPO.load("ppo_robot_vacuum_multi_goal_obstacles")
obs, _ = env.reset()
for _ in range(100):
    action, _states = model.predict(obs)
    obs, rewards, done, truncated, info = env.step(action)
    env.render()
    print()  # 줄 바꿈을 추가하여 각 스텝을 구분
    if done or truncated:
        obs, _ = env.reset()


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 100      |
|    ep_rew_mean     | -79.3    |
| time/              |          |
|    fps             | 2915     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 99.6        |
|    ep_rew_mean          | -81.2       |
| time/                   |             |
|    fps                  | 1301        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.010297662 |
|    clip_fraction        | 0.0917      |
|    clip_range           | 0.2         |
|    entropy_loss   

NotImplementedError: 