In [1]:
import gymnasium as gym
import numpy as np
import torch
import open3d as o3d
from gymnasium import spaces
import copy
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.callbacks import BaseCallback
from gymnasium.envs.registration import register
import torch.nn as nn
import matplotlib.pyplot as plt
import copy
import sys
import os
import cv2
from datetime import datetime
sys.path.append("/home/dir/RL_CoveragePlanning/viewpointPlaygroundEnv/viewpoint_env")
from viewpointWorld import CoverageEnv

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


2024-12-23 20:26:19.134792: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-23 20:26:19.136113: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-23 20:26:19.159498: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
class ActionSpaceExplorationCallback(BaseCallback):
    """Record which regions of a 2D action space are visited during training.

    After every environment step the callback reads the ``last_action``
    attribute of each sub-environment, bins it on a ``GRID_SIZE`` x
    ``GRID_SIZE`` grid spanning [-1, 1] in both dimensions, and periodically
    saves the visit counts as a heatmap PNG. When training ends it writes a
    final heatmap and stitches all saved heatmaps into an MP4 video.

    Args:
        verbose: Verbosity level forwarded to ``BaseCallback``.
        heatmap_interval: Number of counted actions between two heatmaps.
            A value <= 0 disables the periodic heatmaps (the final one is
            still written at the end of training).
    """

    # Number of bins per action dimension.
    GRID_SIZE = 100

    def __init__(self, verbose=0, heatmap_interval=10000):
        super().__init__(verbose)
        self.heatmap_interval = heatmap_interval
        self.action_counts = None
        self.total_timesteps = 0
        # Step count at which the next periodic heatmap is due.
        self._next_heatmap_at = heatmap_interval

        # Each callback instance writes into its own timestamped subdirectory.
        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.save_dir = os.path.join('heatmaps', self.timestamp)
        os.makedirs(self.save_dir, exist_ok=True)

    def _on_training_start(self):
        """Allocate a fresh visit-count grid for the 2D continuous action space."""
        # The action space is Box(-1, 1, (2,))
        # Integer counts so the total is reported as a whole number.
        self.action_counts = np.zeros((self.GRID_SIZE, self.GRID_SIZE), dtype=np.int64)

    def _on_step(self) -> bool:
        """Count the latest action of every sub-environment.

        Returns:
            Always True, so training is never interrupted by this callback.
        """
        # Get the actual last actions taken by the agent
        actions = self.training_env.get_attr('last_action')

        for action in actions:
            # Discretize the action and update counts
            x, y = self.discretize_action(action)
            self.action_counts[x, y] += 1

        self.total_timesteps += len(actions)

        # The counter grows by the number of sub-environments per call, so it
        # can jump over exact multiples of the interval. Trigger on crossing
        # the threshold instead of testing for an exact multiple.
        if self.heatmap_interval > 0 and self.total_timesteps >= self._next_heatmap_at:
            self.generate_heatmap()
            self._next_heatmap_at = (
                self.total_timesteps // self.heatmap_interval + 1
            ) * self.heatmap_interval

        return True

    def discretize_action(self, action):
        """Map a 2D action from [-1, 1] to grid indices in [0, GRID_SIZE - 1].

        Values outside [-1, 1] are clamped to the nearest edge bin so they can
        never produce a negative (wrapping) or out-of-range index.

        Args:
            action: Indexable with at least two numeric components.

        Returns:
            Tuple ``(x, y)`` of integer bin indices.
        """
        last_bin = self.GRID_SIZE - 1
        scale = last_bin / 2.0  # 49.5 for a 100-bin grid
        x = int((action[0] + 1) * scale)
        y = int((action[1] + 1) * scale)
        return max(0, min(x, last_bin)), max(0, min(y, last_bin))

    def generate_heatmap(self):
        """Save the current visit counts as ``action_heatmap_<steps>.png``."""
        if self.action_counts is None:
            # Training never started, so there is nothing to plot.
            return

        fig, ax = plt.subplots(figsize=(12, 10))
        try:
            # NOTE(review): rows of action_counts are indexed by action[0], which
            # imshow draws on the vertical axis — confirm the Theta/Phi labels
            # match the intended action components.
            image = ax.imshow(self.action_counts, cmap='hot', interpolation='nearest')
            fig.colorbar(image, ax=ax)
            ax.set_xlabel('Theta')
            ax.set_ylabel('Phi')

            ax.set_title(f'Action Space Exploration at {self.total_timesteps} steps')

            # Add text with total actions taken
            total_actions = int(np.sum(self.action_counts))
            ax.text(0.95, 0.95, f'Total actions: {total_actions}',
                    verticalalignment='top', horizontalalignment='right',
                    transform=ax.transAxes, fontsize=10, bbox=dict(facecolor='white', alpha=0.5))

            fig.savefig(os.path.join(self.save_dir, f'action_heatmap_{self.total_timesteps}.png'))
        finally:
            # Always release the figure; many are created over a long run.
            plt.close(fig)

    def _on_training_end(self):
        """Write a final heatmap and assemble all heatmaps into a video."""
        # Generate a final heatmap at the end of training
        self.generate_heatmap()
        # Create a video from all heatmaps
        self.create_video_from_heatmaps()

    def create_video_from_heatmaps(self):
        """Combine the saved heatmap PNGs, ordered by timestep, into an MP4 file."""
        prefix, suffix = 'action_heatmap_', '.png'
        # Only pick up files written by generate_heatmap so the numeric sort
        # key below cannot fail on an unrelated PNG.
        images = [
            name for name in os.listdir(self.save_dir)
            if name.startswith(prefix) and name.endswith(suffix)
        ]
        images.sort(key=lambda name: int(name[len(prefix):-len(suffix)]))  # Sort by timestep

        print("Generating video file...")
        if not images:
            print("No heatmap images found to create video.")
            return

        first_frame = cv2.imread(os.path.join(self.save_dir, images[0]))
        if first_frame is None:
            print(f"Could not read heatmap image {images[0]}; video not created.")
            return
        height, width = first_frame.shape[:2]

        video_name = os.path.join(self.save_dir, 'heatmap_video.mp4')
        video = cv2.VideoWriter(video_name, cv2.VideoWriter_fourcc(*'mp4v'), 10, (width, height))
        if not video.isOpened():
            video.release()
            print(f"Could not open video writer for {video_name}; video not created.")
            return

        try:
            for image in images:
                frame = cv2.imread(os.path.join(self.save_dir, image))
                if frame is None:
                    # Skip unreadable files instead of passing None to the writer.
                    continue
                video.write(frame)
        finally:
            # Release even if a write fails so the file handle is not leaked.
            video.release()

        print(f"Video created and saved as {video_name}")

In [3]:
# Register the environment under a gymnasium id. The class imported at the top
# of the notebook is passed directly as the entry point, so registration uses
# the same import path as the rest of the notebook instead of a second dotted
# module path that must separately be importable.
register(
    id="CoverageEnv-v0",
    entry_point=CoverageEnv,
)

In [4]:
# Print all environment ids known to gymnasium — presumably to confirm that
# "CoverageEnv-v0" shows up after registration.
gym.pprint_registry()

===== classic_control =====
Acrobot-v1                        CartPole-v0                       CartPole-v1
MountainCar-v0                    MountainCarContinuous-v0          Pendulum-v1
===== phys2d =====
phys2d/CartPole-v0                phys2d/CartPole-v1                phys2d/Pendulum-v0
===== box2d =====
BipedalWalker-v3                  BipedalWalkerHardcore-v3          CarRacing-v2
LunarLander-v2                    LunarLanderContinuous-v2
===== toy_text =====
Blackjack-v1                      CliffWalking-v0                   FrozenLake-v1
FrozenLake8x8-v1                  Taxi-v3
===== tabular =====
tabular/Blackjack-v0              tabular/CliffWalking-v0
===== mujoco =====
Ant-v2                            Ant-v3                            Ant-v4
HalfCheetah-v2                    HalfCheetah-v3                    HalfCheetah-v4
Hopper-v2                         Hopper-v3                         Hopper-v4
Humanoid-v2                       Humanoid-v3                       Hu

In [5]:
def make_env(env_id, rank, seed=0):
    """Build a zero-argument factory for one seeded environment instance.

    Calling this function seeds the global random generators with ``seed``
    right away; the environment itself is only created when the returned
    factory is invoked.

    Args:
        env_id: Id passed to ``gym.make``.
        rank: Index of the sub-environment; added to ``seed`` so each
            instance is reset with a different seed.
        seed: Base random seed.

    Returns:
        A callable that creates the environment, resets it with
        ``seed + rank`` and returns it.
    """
    set_random_seed(seed)

    def _build():
        instance = gym.make(env_id)
        instance.reset(seed=seed + rank)
        return instance

    return _build

In [6]:
# Environment setup
env_id = "CoverageEnv-v0"
n_envs = 32

# One zero-argument factory per sub-environment; DummyVecEnv calls each of
# them once, so every slot gets its own CoverageEnv instance.
env_factories = [CoverageEnv for _ in range(n_envs)]
env = DummyVecEnv(env_factories)


Mesh file: test_6.obj loaded for environment...
Mesh file: test_6.obj loaded for environment...
Mesh file: test_6.obj loaded for environment...
Mesh file: test_6.obj loaded for environment...
Mesh file: test_6.obj loaded for environment...
Mesh file: test_6.obj loaded for environment...
Mesh file: test_6.obj loaded for environment...
Mesh file: test_6.obj loaded for environment...
Mesh file: test_6.obj loaded for environment...
Mesh file: test_6.obj loaded for environment...
Mesh file: test_6.obj loaded for environment...
Mesh file: test_6.obj loaded for environment...
Mesh file: test_6.obj loaded for environment...
Mesh file: test_6.obj loaded for environment...
Mesh file: test_6.obj loaded for environment...
Mesh file: test_6.obj loaded for environment...
Mesh file: test_6.obj loaded for environment...
Mesh file: test_6.obj loaded for environment...
Mesh file: test_6.obj loaded for environment...
Mesh file: test_6.obj loaded for environment...
Mesh file: test_6.obj loaded for environ

In [7]:
# Model hyperparameters
# Actor (pi) and critic (vf) use separate networks with the same layer widths.
hidden_layers = [256, 512, 512, 256]
policy_kwargs = {
    "log_std_init": -1,
    "ortho_init": True,
    "activation_fn": nn.ReLU,
    "net_arch": {"pi": list(hidden_layers), "vf": list(hidden_layers)},
}

In [8]:
# Create the PPO agent
# Hyperparameters are collected in one mapping so they can be inspected or
# logged separately from the constructor call.
ppo_settings = {
    "n_steps": 4096,
    "batch_size": 512,
    "gae_lambda": 0.95,
    "gamma": 0.99,
    "n_epochs": 8,
    # NOTE(review): an entropy coefficient of 1.0 is unusually large — confirm it is intended.
    "ent_coef": 1.0,
    "learning_rate": 3e-4,
    "clip_range": 0.2,
    "policy_kwargs": policy_kwargs,
    "verbose": 1,
}
model = PPO("MlpPolicy", env, **ppo_settings)

Using cuda device




In [9]:
# Setup checkpointing
# CheckpointCallback counts calls to env.step(), and every call advances all
# sub-environments at once. Divide the target of 100000 timesteps by the
# number of sub-environments so a checkpoint is written about every 100000
# timesteps rather than every 100000 * num_envs.
checkpoint_callback = CheckpointCallback(
    save_freq=max(100000 // env.num_envs, 1),
    save_path="./logs/",
    name_prefix="ppo_coverage_model"
)

# NOTE(review): this callback is created here but is not in the callback list
# passed to model.learn() in the training cell — confirm whether that is intended.
exploration_callback = ActionSpaceExplorationCallback(heatmap_interval=10000)

In [11]:
# Train the agent
total_timesteps = 10_000_000
# Only the checkpoint callback is active during training.
training_callbacks = [checkpoint_callback]
model.learn(total_timesteps=total_timesteps, callback=training_callbacks, progress_bar=True)

Output()

In [11]:
# Test the trained agent
obs = env.reset()
for _ in range(10):
    action, _states = model.predict(obs, deterministic=True)
    # A vectorized env resets a finished sub-environment on its own and
    # returns the first observation of the new episode in `obs`. Calling
    # env.reset() here would discard that and restart every other
    # sub-environment mid-episode, so no manual reset is done.
    obs, reward, done, info = env.step(action)
    print(f"{reward=} | {done=}")
    env.render()

env.close()

reward=array([-12389., -12948., -11900., -12475., -12694., -11900., -12694.,
       -11784., -12475., -12694., -11784., -11784., -11879., -12050.,
       -11900., -12948., -12669., -12475., -12669., -11784., -12694.,
       -12948., -11784., -11900., -12389., -12694., -11900., -12389.,
       -12948., -12948., -11784., -12050., -11879., -12050., -12948.,
       -11879., -11784., -11784., -12389., -11900., -11784., -12948.,
       -11784., -12948., -11879., -12475., -12669., -12948., -12694.,
       -12475., -12475., -12669., -12475., -12948., -12389., -12669.,
       -12050., -12389., -11900., -11900., -12050., -11900., -11879.,
       -12948.], dtype=float32) | done=array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, Fa



In [12]:
# Save the final model
# model.save("ppo_coverage_final_model")