# **Stage 1**: Train PPO on PointMaze with standard rewards, collect data, train distance models


In [14]:
import gymnasium as gym
import numpy as np
import torch
import os

import sys
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))
from env_wrappers import EnvBuilder
from ppo_agent_v2 import PPOAgent
from env_wrappers import GoalObservationWrapper
from src import distance_models

# Weights & Biases credentials must come from the environment (e.g. an
# exported WANDB_API_KEY or `wandb login`), never from source control.
# NOTE(review): an API key was previously hard-coded at this spot; treat it
# as leaked and revoke/rotate it in the W&B account settings.
if not os.environ.get('WANDB_API_KEY'):
    print("WANDB_API_KEY is not set; export it before enabling W&B logging.")

# Report the compute device. The device index and name are only queried when
# CUDA is actually available, because torch.cuda.current_device() raises on
# CPU-only machines.
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)
if cuda_available:
    device_index = torch.cuda.current_device()
    print("Current device:", device_index)
    print("Device name:", torch.cuda.get_device_name(device_index))
else:
    print("Current device:", "CPU")
    print("Device name:", "CPU")

CUDA available: True
Current device: 0
Device name: Tesla V100-SXM3-32GB


In [None]:
# ---- Experiment configuration ----
env_id = 'PointMaze_UMaze-v3'
train_episodes = 10000
max_episode_steps = 512
seed = 0

# Seed the torch and NumPy global RNGs from the same value.
torch.manual_seed(seed)
np.random.seed(seed)

# U-shaped maze layout handed to EnvBuilder as `maze_map`.
# NOTE(review): presumably 1 = wall, 0 = free cell and 'c' = combined
# reset/goal cell, following the Gymnasium-Robotics maze-map convention —
# confirm against EnvBuilder.
c = 'c'
U_map = [
    [1, 1, 1, 1, 1],
    [1, c, 0, 0, 1],
    [1, 1, 1, 0, 1],
    [1, c, 0, 0, 1],
    [1, 1, 1, 1, 1],
]

# Create the environment builder and read the observation/action sizes that
# the PPO agents are constructed with.
builder = EnvBuilder(
    env_id=env_id,
    maze_map=U_map,
    max_episode_steps=max_episode_steps,
    seed=seed,
)
obs_dim, act_dim = builder.get_obs_act_dim()

  self.scope.user = {"email": email}




In [12]:
# Number of sub-environments in the vectorized training env.
N_envs = 1


def make_env(env_id, idx, capture_video, run_name, gamma):
    """Return a zero-argument factory that builds one wrapped environment.

    Args:
        env_id: Gymnasium environment id passed to ``gym.make``.
        idx: Position of this environment in the vector; only index 0 may
            record video.
        capture_video: When truthy (and ``idx == 0``), the environment is
            created with ``render_mode="rgb_array"`` and wrapped in
            ``RecordVideo``.
        run_name: Sub-directory of ``videos/`` that recordings are written to.
        gamma: Accepted for signature compatibility; not read by this
            function.

    Returns:
        A callable taking no arguments that creates the environment, applies
        ``GoalObservationWrapper`` (used here in place of
        ``gym.wrappers.FlattenObservation``), optionally ``RecordVideo``, and
        finally ``RecordEpisodeStatistics``.
    """
    def thunk():
        # Video is recorded by the first sub-environment only.
        should_record = capture_video and idx == 0

        if should_record:
            wrapped = gym.make(env_id, render_mode="rgb_array")
        else:
            wrapped = gym.make(env_id, render_mode=None)

        wrapped = GoalObservationWrapper(wrapped)
        if should_record:
            wrapped = gym.wrappers.RecordVideo(wrapped, f"videos/{run_name}")
        return gym.wrappers.RecordEpisodeStatistics(wrapped)

    return thunk
# Build the vectorized training environment.
# Arguments are passed by keyword so each one lands on the intended make_env
# parameter: previously the positional call shifted them by one slot
# (seed + i -> idx, i -> capture_video, False -> run_name, "test" -> gamma),
# which only behaved correctly for a single environment with seed 0.
# `gamma` is not read by make_env, so None is passed as a placeholder.
envs = gym.vector.SyncVectorEnv(
    [
        make_env(env_id, idx=i, capture_video=False, run_name="test", gamma=None)
        for i in range(N_envs)
    ]
)

In [16]:
# Baseline: PPO without a distance model (no shaping, no extra state input).
agent = PPOAgent(
    state_dim=obs_dim,
    action_dim=act_dim,
    total_timesteps=10000,
    log_to_wandb=False,
)
agent.train_ppo(envs)



global_step=300, episodic_return=[0.]
global_step=600, episodic_return=[137.]
global_step=900, episodic_return=[0.]
global_step=1200, episodic_return=[0.]
global_step=1500, episodic_return=[204.]
global_step=1800, episodic_return=[48.]
global_step=2100, episodic_return=[183.]
global_step=2400, episodic_return=[0.]
global_step=2700, episodic_return=[0.]
global_step=3000, episodic_return=[0.]
global_step=3300, episodic_return=[0.]
global_step=3600, episodic_return=[0.]
global_step=3900, episodic_return=[0.]
global_step=4200, episodic_return=[0.]
global_step=4500, episodic_return=[0.]
global_step=4800, episodic_return=[40.]
global_step=5100, episodic_return=[0.]
global_step=5400, episodic_return=[38.]
global_step=5700, episodic_return=[89.]
global_step=6000, episodic_return=[0.]
global_step=6300, episodic_return=[0.]
global_step=6600, episodic_return=[26.]
global_step=6900, episodic_return=[0.]
global_step=7200, episodic_return=[0.]
global_step=7500, episodic_return=[0.]
global_step=7800,

In [17]:
# Reward variant: PPO with distance-based reward shaping enabled.
# NOTE(review): the estimator is freshly constructed here and no training
# call on it is visible before it is handed to PPOAgent — confirm that
# PPOAgent trains or loads it, otherwise shaping uses an untrained model.
distance_model = distance_models.SupervisedDistanceEstimator(input_dim=4)
agent = PPOAgent(
    state_dim=obs_dim,
    action_dim=act_dim,
    total_timesteps=10000,
    distance_model=distance_model,
    log_to_wandb=False,
)
agent.train_ppo(envs, use_distance_shaping=True)



global_step=300, episodic_return=[0.]
global_step=600, episodic_return=[129.]
global_step=900, episodic_return=[0.]
global_step=1200, episodic_return=[0.]
global_step=1500, episodic_return=[204.]
global_step=1800, episodic_return=[53.]
global_step=2100, episodic_return=[182.]
global_step=2400, episodic_return=[0.]
global_step=2700, episodic_return=[0.]
global_step=3000, episodic_return=[0.]
global_step=3300, episodic_return=[0.]
global_step=3600, episodic_return=[0.]
global_step=3900, episodic_return=[0.]
global_step=4200, episodic_return=[0.]
global_step=4500, episodic_return=[0.]
global_step=4800, episodic_return=[0.]
global_step=5100, episodic_return=[0.]
global_step=5400, episodic_return=[13.]
global_step=5700, episodic_return=[0.]
global_step=6000, episodic_return=[0.]
global_step=6300, episodic_return=[0.]
global_step=6600, episodic_return=[53.]
global_step=6900, episodic_return=[0.]
global_step=7200, episodic_return=[0.]
global_step=7500, episodic_return=[0.]
global_step=7800, e

In [18]:
# State variant: the distance estimate is included in the agent's state
# (include_distance_state=True) while reward shaping stays off.
# NOTE(review): the estimator is freshly constructed here and no training
# call on it is visible before it is handed to PPOAgent — confirm that
# PPOAgent trains or loads it.
distance_model = distance_models.SupervisedDistanceEstimator(input_dim=4)
agent = PPOAgent(
    state_dim=obs_dim,
    action_dim=act_dim,
    total_timesteps=10000,
    distance_model=distance_model,
    include_distance_state=True,
    log_to_wandb=False,
)
agent.train_ppo(envs, use_distance_shaping=False)



global_step=300, episodic_return=[0.]
global_step=600, episodic_return=[133.]
global_step=900, episodic_return=[0.]
global_step=1200, episodic_return=[0.]
global_step=1500, episodic_return=[200.]
global_step=1800, episodic_return=[21.]
global_step=2100, episodic_return=[182.]
global_step=2400, episodic_return=[0.]
global_step=2700, episodic_return=[0.]
global_step=3000, episodic_return=[0.]
global_step=3300, episodic_return=[0.]
global_step=3600, episodic_return=[0.]
global_step=3900, episodic_return=[0.]
global_step=4200, episodic_return=[0.]
global_step=4500, episodic_return=[0.]
global_step=4800, episodic_return=[0.]
global_step=5100, episodic_return=[0.]
global_step=5400, episodic_return=[0.]
global_step=5700, episodic_return=[0.]
global_step=6000, episodic_return=[0.]
global_step=6300, episodic_return=[0.]
global_step=6600, episodic_return=[141.]
global_step=6900, episodic_return=[0.]
global_step=7200, episodic_return=[0.]
global_step=7500, episodic_return=[0.]
global_step=7800, e

In [20]:
# Combined variant: distance estimate in the state AND distance-based
# reward shaping.
# NOTE(review): the estimator is freshly constructed here and no training
# call on it is visible before it is handed to PPOAgent — confirm that
# PPOAgent trains or loads it.
distance_model = distance_models.SupervisedDistanceEstimator(input_dim=4)
agent = PPOAgent(
    state_dim=obs_dim,
    action_dim=act_dim,
    total_timesteps=10000,
    distance_model=distance_model,
    include_distance_state=True,
    log_to_wandb=False,
)
agent.train_ppo(envs, use_distance_shaping=True)



global_step=300, episodic_return=[0.]
global_step=600, episodic_return=[133.]
global_step=900, episodic_return=[0.]
global_step=1200, episodic_return=[0.]
global_step=1500, episodic_return=[200.]
global_step=1800, episodic_return=[21.]
global_step=2100, episodic_return=[182.]
global_step=2400, episodic_return=[0.]
global_step=2700, episodic_return=[0.]
global_step=3000, episodic_return=[0.]
global_step=3300, episodic_return=[0.]
global_step=3600, episodic_return=[0.]
global_step=3900, episodic_return=[0.]
global_step=4200, episodic_return=[0.]
global_step=4500, episodic_return=[0.]
global_step=4800, episodic_return=[0.]
global_step=5100, episodic_return=[0.]
global_step=5400, episodic_return=[0.]
global_step=5700, episodic_return=[0.]
global_step=6000, episodic_return=[0.]
global_step=6300, episodic_return=[0.]
global_step=6600, episodic_return=[120.]
global_step=6900, episodic_return=[0.]
global_step=7200, episodic_return=[0.]
global_step=7500, episodic_return=[0.]
global_step=7800, e

In [None]:
# model_path = "runs/ppo_1752098546.9388137.cleanrl_model"
# agent.agent.load_state_dict(torch.load(model_path, map_location="cuda"))

In [None]:
# agent.save_model()

In [9]:
# Create a single evaluation environment from the builder and roll out the
# current `agent` (whichever training cell ran last) to collect trajectories.
# NOTE(review): the evaluation horizon (250) differs from the configured
# max_episode_steps (512) — confirm this is intentional.
env = builder()
trajectories = agent.evaluate_ppo(
    env,
    num_episodes=10,
    max_episode_steps=250,
)

Evaluating: 100%|███████████████████████████████████████████████████| 10/10 [00:03<00:00,  3.01it/s]

Success rate: 0.00%, avg_return: 0.00





In [None]:
from src.utils import trajectories_to_dataset

# Turn the evaluation trajectories into a training dataset for the distance
# estimator, requesting 500000 samples.
dataset = trajectories_to_dataset(
    trajectories,
    samples=500000,
)

In [None]:

# Fit a supervised distance estimator on the collected dataset and keep the
# value returned by training.
# NOTE(review): batch_size=16192 is not a power of two; if 16384 was
# intended, confirm and correct.
sup_model = distance_models.SupervisedDistanceEstimator(input_dim=4)
sup_loss = sup_model.train_from_data(
    dataset,
    epochs=20,
    batch_size=16192,
)
# eval_loss = sup_model.evaluate_from_data(dataset, save_model=True)

In [None]:
# Plot the supervised estimator's distance heatmap for the evaluation env.
sup_model.plot_distance_heatmap(env=env)