_______________________________

With distance model
_______________________________

In [2]:
import gymnasium as gym
import numpy as np
import torch
import os

import sys
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.env_wrappers import EnvBuilder
from src.ppo_agent_v2 import PPOAgent
from src.env_wrappers import GoalObservationWrapper
import src.distance_models as distance_models
from src.utils import trajectories_to_dataset

os.environ['WANDB_API_KEY'] = 'd58c31e07030724bbab6b7e8edc93edacd934c87'

# Check for GPU
print("CUDA available:", torch.cuda.is_available())
print("Current device:", torch.cuda.current_device())
print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()) if torch.cuda.is_available() else "CPU")

CUDA available: True
Current device: 0
Device name: Tesla V100-SXM3-32GB


In [3]:
env_id = 'PointMaze_UMaze-v3'
train_episodes = 10000
max_episode_steps = 512
seed = 0

torch.manual_seed(seed); np.random.seed(seed)

# Initialize environment
c = 'c'; U_map = [
    [1, 1, 1, 1, 1],
    [1, c, 0, 0, 1],
    [1, 1, 1, 0, 1],
    [1, c, 0, 0, 1],
    [1, 1, 1, 1, 1]
]
builder = EnvBuilder(env_id=env_id, maze_map=U_map, max_episode_steps=max_episode_steps, seed=seed)
obs_dim, act_dim = builder.get_obs_act_dim()

In [4]:
N_envs = 1
def make_env(env_id, idx, capture_video, run_name, gamma):
    def thunk():
        # Set render_mode to 'rgb_array' if capturing video
        render_mode = "rgb_array" if capture_video and idx == 0 else None
        env = gym.make(env_id, render_mode=render_mode)
        # env = gym.wrappers.FlattenObservation(env)
        env = GoalObservationWrapper(env)
        if capture_video and idx == 0:
            env = gym.wrappers.RecordVideo(env, f"videos/{run_name}")
        env = gym.wrappers.RecordEpisodeStatistics(env)
        return env
    return thunk

In [None]:
def train_supervised_distance(
        envs,
        env,
        agent,
        distance_model,
        *,
        num_stage: int = 30,
        eval_size: int = 10,
        max_episode_steps: int = 350,
        d_samples: int = 500000,
        en_reward=False,
    ):
        sup_model = distance_model
        for episode in range(num_stage):
            agent.train_ppo(envs, use_distance_shaping = en_reward, verbose=False)
            trajectories = agent.evaluate_ppo(env, num_episodes=eval_size, max_episode_steps=max_episode_steps)

            dataset = trajectories_to_dataset(trajectories, samples=d_samples)
            # sup_model = SupervisedDistanceEstimator(input_dim=4)
            sup_loss = sup_model.train_from_data(dataset, epochs=20, batch_size=16192)          
            print(f"Finish {episode=}, dd loss = {sup_loss}\n")


In [7]:
envs = gym.vector.SyncVectorEnv([make_env(env_id, seed, 0, False, "test")])
env = builder()
distance_model = distance_models.SupervisedDistanceEstimator(input_dim=4)

In [9]:
EXPERIMENTS = {
    # tag      include_distance_state  use_distance_shaping
    "base"            : (False, False),
    "reward_only"     : (False, True ),
    "state_only"      : (True , False),
    "reward_and_state": (True , True ),
}


In [None]:
en_state, en_reward = EXPERIMENTS["state_only"]

agent = PPOAgent(state_dim=obs_dim, action_dim=act_dim, total_timesteps=10000, distance_model=distance_model, include_distance_state=en_state)
agent.train_ppo(envs, use_distance_shaping = True)
train_supervised_distance(envs, env, agent, distance_model, en_reward=en_reward)

[34m[1mwandb[0m: Currently logged in as: [33mgamershmidt-sofya[0m ([33mgamershmidt-sofya-innopolis-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
  self.scope.user = {"email": email}
  self.scope.user = {"email": email}


global_step=300, episodic_return=[0.]
global_step=600, episodic_return=[160.]
global_step=900, episodic_return=[0.]
global_step=1200, episodic_return=[0.]
global_step=1500, episodic_return=[204.]
global_step=1800, episodic_return=[51.]
global_step=2100, episodic_return=[182.]
global_step=2400, episodic_return=[0.]
global_step=2700, episodic_return=[0.]
global_step=3000, episodic_return=[0.]
global_step=3300, episodic_return=[0.]
global_step=3600, episodic_return=[0.]
global_step=3900, episodic_return=[0.]
global_step=4200, episodic_return=[0.]
global_step=4500, episodic_return=[0.]
global_step=4800, episodic_return=[0.]
global_step=5100, episodic_return=[0.]
global_step=5400, episodic_return=[0.]
global_step=5700, episodic_return=[0.]
global_step=6000, episodic_return=[0.]
global_step=6300, episodic_return=[0.]
global_step=6600, episodic_return=[101.]
global_step=6900, episodic_return=[0.]
global_step=7200, episodic_return=[0.]
global_step=7500, episodic_return=[0.]
global_step=7800, e

Evaluating: 100%|███████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.05it/s]


Success rate: 0.00%, avg_return: 0.00


Building dataset: 100%|████████████████████████████████████████████| 10/10 [00:00<00:00, 274.46it/s]


Trajectories processed : 10
Generated samples      : 451500 (train 361200 / eval 90300)


Training, t_loss=0.00: 100%|████████████████████████████████████████| 20/20 [00:01<00:00, 18.60it/s]


eval_losses[0]=0.3718803316202935 -> eval_losses[-1]=0.07946629656245674
Finish episode=0, dd loss = 0.07106221467256546


Evaluating: 100%|███████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.03it/s]


Success rate: 0.00%, avg_return: 0.00


Building dataset: 100%|████████████████████████████████████████████| 10/10 [00:00<00:00, 418.63it/s]


Trajectories processed : 10
Generated samples      : 451500 (train 361200 / eval 90300)


Training, t_loss=0.00: 100%|████████████████████████████████████████| 20/20 [00:01<00:00, 19.63it/s]


eval_losses[0]=0.0746437669249204 -> eval_losses[-1]=0.06212455301717798
Finish episode=1, dd loss = 0.05749207362532616


Evaluating: 100%|███████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.00it/s]


Success rate: 0.00%, avg_return: 0.00


Building dataset: 100%|████████████████████████████████████████████| 10/10 [00:00<00:00, 503.31it/s]


Trajectories processed : 10
Generated samples      : 451500 (train 361200 / eval 90300)


Training, t_loss=0.00: 100%|████████████████████████████████████████| 20/20 [00:00<00:00, 20.32it/s]


eval_losses[0]=0.042869137313014775 -> eval_losses[-1]=0.03761533008461379
Finish episode=2, dd loss = 0.034445133060216904


Evaluating: 100%|███████████████████████████████████████████████████| 10/10 [00:05<00:00,  1.81it/s]


Success rate: 0.00%, avg_return: 0.00


Building dataset: 100%|████████████████████████████████████████████| 10/10 [00:00<00:00, 510.43it/s]


Trajectories processed : 10
Generated samples      : 451500 (train 361200 / eval 90300)


Training, t_loss=0.00: 100%|████████████████████████████████████████| 20/20 [00:00<00:00, 20.95it/s]


eval_losses[0]=0.0383838644270617 -> eval_losses[-1]=0.03372971307663691
Finish episode=3, dd loss = 0.031578224152326584


Evaluating: 100%|███████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.03it/s]


Success rate: 0.00%, avg_return: 0.00


Building dataset: 100%|████████████████████████████████████████████| 10/10 [00:00<00:00, 500.78it/s]


Trajectories processed : 10
Generated samples      : 451500 (train 361200 / eval 90300)


Training, t_loss=0.00: 100%|████████████████████████████████████████| 20/20 [00:00<00:00, 20.12it/s]


eval_losses[0]=0.03042157093947908 -> eval_losses[-1]=0.02448841352663431
Finish episode=4, dd loss = 0.022594427689909935


Evaluating: 100%|███████████████████████████████████████████████████| 10/10 [00:05<00:00,  1.96it/s]


Success rate: 0.00%, avg_return: 0.00


Building dataset: 100%|████████████████████████████████████████████| 10/10 [00:00<00:00, 505.50it/s]


Trajectories processed : 10
Generated samples      : 451500 (train 361200 / eval 90300)


Training, t_loss=0.00: 100%|████████████████████████████████████████| 20/20 [00:01<00:00, 19.92it/s]


eval_losses[0]=0.029620630004477264 -> eval_losses[-1]=0.022169470971868943
Finish episode=5, dd loss = 0.020699800923466682


Evaluating: 100%|███████████████████████████████████████████████████| 10/10 [00:05<00:00,  1.99it/s]


Success rate: 0.00%, avg_return: 0.00


Building dataset: 100%|████████████████████████████████████████████| 10/10 [00:00<00:00, 496.23it/s]


Trajectories processed : 10
Generated samples      : 451500 (train 361200 / eval 90300)


Training, t_loss=0.00: 100%|████████████████████████████████████████| 20/20 [00:01<00:00, 19.30it/s]


eval_losses[0]=0.028929265593637527 -> eval_losses[-1]=0.022073405826606624
Finish episode=6, dd loss = 0.019993510097265244


Evaluating: 100%|███████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.01it/s]


Success rate: 0.00%, avg_return: 0.00


Building dataset: 100%|████████████████████████████████████████████| 10/10 [00:00<00:00, 360.24it/s]


Trajectories processed : 10
Generated samples      : 451500 (train 361200 / eval 90300)


Training, t_loss=0.00: 100%|████████████████████████████████████████| 20/20 [00:00<00:00, 20.63it/s]


eval_losses[0]=0.020086848194548985 -> eval_losses[-1]=0.015609236809106364
Finish episode=7, dd loss = 0.014478659257292747


Evaluating: 100%|███████████████████████████████████████████████████| 10/10 [00:05<00:00,  1.95it/s]


Success rate: 30.00%, avg_return: 105.00


Building dataset: 100%|████████████████████████████████████████████| 10/10 [00:00<00:00, 532.62it/s]


Trajectories processed : 10
Generated samples      : 409644 (train 327715 / eval 81929)


Training, t_loss=0.00: 100%|████████████████████████████████████████| 20/20 [00:00<00:00, 21.81it/s]


eval_losses[0]=0.023528402556384092 -> eval_losses[-1]=0.01073737439492436
Finish episode=8, dd loss = 0.010490969754755497


Evaluating: 100%|███████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.10it/s]


Success rate: 20.00%, avg_return: 102.50


Building dataset: 100%|████████████████████████████████████████████| 10/10 [00:00<00:00, 528.22it/s]


Trajectories processed : 10
Generated samples      : 424071 (train 339256 / eval 84815)


Training, t_loss=0.00: 100%|████████████████████████████████████████| 20/20 [00:00<00:00, 22.68it/s]


eval_losses[0]=0.007001346893301552 -> eval_losses[-1]=0.005007694844598584
Finish episode=9, dd loss = 0.004652036819607019


Evaluating: 100%|███████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.01it/s]


Success rate: 30.00%, avg_return: 65.00


Building dataset: 100%|████████████████████████████████████████████| 10/10 [00:00<00:00, 527.82it/s]


Trajectories processed : 10
Generated samples      : 439225 (train 351380 / eval 87845)


Training, t_loss=0.00: 100%|████████████████████████████████████████| 20/20 [00:00<00:00, 21.56it/s]


eval_losses[0]=0.0058212797903256005 -> eval_losses[-1]=0.004437007423117312
Finish episode=10, dd loss = 0.004307310562580824


Evaluating: 100%|███████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.04it/s]


Success rate: 10.00%, avg_return: 94.00


Building dataset: 100%|████████████████████████████████████████████| 10/10 [00:00<00:00, 524.82it/s]


Trajectories processed : 10
Generated samples      : 439503 (train 351602 / eval 87901)


Training, t_loss=0.00: 100%|████████████████████████████████████████| 20/20 [00:00<00:00, 21.51it/s]


eval_losses[0]=0.0040282199988206395 -> eval_losses[-1]=0.0033310570042404107
Finish episode=11, dd loss = 0.0032027163542807102


Evaluating: 100%|███████████████████████████████████████████████████| 10/10 [00:05<00:00,  1.99it/s]


Success rate: 40.00%, avg_return: 61.00


Building dataset: 100%|████████████████████████████████████████████| 10/10 [00:00<00:00, 483.84it/s]


Trajectories processed : 10
Generated samples      : 439743 (train 351794 / eval 87949)


Training, t_loss=0.00: 100%|████████████████████████████████████████| 20/20 [00:00<00:00, 21.43it/s]


eval_losses[0]=0.004540518446534333 -> eval_losses[-1]=0.00405625442855171
Finish episode=12, dd loss = 0.004008608870208263


Evaluating: 100%|███████████████████████████████████████████████████| 10/10 [00:05<00:00,  1.98it/s]


Success rate: 20.00%, avg_return: 117.50


Building dataset: 100%|████████████████████████████████████████████| 10/10 [00:00<00:00, 540.96it/s]


Trajectories processed : 10
Generated samples      : 416948 (train 333558 / eval 83390)


Training, t_loss=0.00: 100%|████████████████████████████████████████| 20/20 [00:00<00:00, 21.72it/s]


eval_losses[0]=0.002651153186597274 -> eval_losses[-1]=0.0016656871150188672
Finish episode=13, dd loss = 0.0015186162199825048


Evaluating: 100%|███████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.44it/s]


Success rate: 70.00%, avg_return: 124.86


Building dataset: 100%|████████████████████████████████████████████| 10/10 [00:00<00:00, 686.15it/s]


Trajectories processed : 10
Generated samples      : 316366 (train 253092 / eval 63274)


Training, t_loss=0.00: 100%|████████████████████████████████████████| 20/20 [00:00<00:00, 29.15it/s]


eval_losses[0]=0.0022719071216392136 -> eval_losses[-1]=0.001257354896575072
Finish episode=14, dd loss = 0.0012094221310690045


Evaluating: 100%|███████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.40it/s]


Success rate: 70.00%, avg_return: 126.29


Building dataset: 100%|████████████████████████████████████████████| 10/10 [00:00<00:00, 592.47it/s]


Trajectories processed : 10
Generated samples      : 314313 (train 251450 / eval 62863)


Training, t_loss=0.00: 100%|████████████████████████████████████████| 20/20 [00:00<00:00, 28.83it/s]


eval_losses[0]=0.0011931936647158383 -> eval_losses[-1]=0.0009099220058756166
Finish episode=15, dd loss = 0.0008708694949746132


Evaluating: 100%|███████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.26it/s]


Success rate: 30.00%, avg_return: 159.67


Building dataset: 100%|████████████████████████████████████████████| 10/10 [00:00<00:00, 587.64it/s]


Trajectories processed : 10
Generated samples      : 371462 (train 297169 / eval 74293)


Training, t_loss=0.00: 100%|████████████████████████████████████████| 20/20 [00:00<00:00, 23.72it/s]


eval_losses[0]=0.005850524017127366 -> eval_losses[-1]=0.0008857420524069427
Finish episode=16, dd loss = 0.0007908001425676048


Evaluating: 100%|███████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.06it/s]


Success rate: 10.00%, avg_return: 178.00


Building dataset: 100%|████████████████████████████████████████████| 10/10 [00:00<00:00, 527.70it/s]


Trajectories processed : 10
Generated samples      : 421401 (train 337120 / eval 84281)


Training, t_loss=0.00: 100%|████████████████████████████████████████| 20/20 [00:00<00:00, 20.76it/s]


eval_losses[0]=0.0007001811744799565 -> eval_losses[-1]=0.00020604152219811286
Finish episode=17, dd loss = 0.00019243631686549634


Evaluating: 100%|███████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.21it/s]


Success rate: 30.00%, avg_return: 172.00


Building dataset: 100%|████████████████████████████████████████████| 10/10 [00:00<00:00, 610.74it/s]


Trajectories processed : 10
Generated samples      : 364855 (train 291884 / eval 72971)


Training, t_loss=0.00: 100%|████████████████████████████████████████| 20/20 [00:00<00:00, 23.62it/s]


eval_losses[0]=0.0012402422939039727 -> eval_losses[-1]=0.0007407876549358341
Finish episode=18, dd loss = 0.0007227618480101228


Evaluating: 100%|███████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.35it/s]


Success rate: 40.00%, avg_return: 166.75


Building dataset: 100%|████████████████████████████████████████████| 10/10 [00:00<00:00, 653.23it/s]


Trajectories processed : 10
Generated samples      : 339427 (train 271541 / eval 67886)


Training, t_loss=0.00: 100%|████████████████████████████████████████| 20/20 [00:00<00:00, 26.52it/s]


eval_losses[0]=0.00044416101849205917 -> eval_losses[-1]=0.0002909228063004084
Finish episode=19, dd loss = 0.00026880885707214475


Evaluating: 100%|███████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.36it/s]


Success rate: 40.00%, avg_return: 158.75


Building dataset: 100%|████████████████████████████████████████████| 10/10 [00:00<00:00, 633.94it/s]


Trajectories processed : 10
Generated samples      : 345767 (train 276613 / eval 69154)


Training, t_loss=0.00: 100%|████████████████████████████████████████| 20/20 [00:00<00:00, 25.80it/s]


eval_losses[0]=0.0005806960340253247 -> eval_losses[-1]=0.0004388181484121024
Finish episode=20, dd loss = 0.00039990231744013727


Evaluating: 100%|███████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.32it/s]


Success rate: 40.00%, avg_return: 168.50


Building dataset: 100%|████████████████████████████████████████████| 10/10 [00:00<00:00, 651.01it/s]


Trajectories processed : 10
Generated samples      : 338007 (train 270405 / eval 67602)


Training, t_loss=0.00: 100%|████████████████████████████████████████| 20/20 [00:00<00:00, 25.95it/s]


eval_losses[0]=0.0005130774276305443 -> eval_losses[-1]=0.0003709976942785174
Finish episode=21, dd loss = 0.0003397638211026788


Evaluating: 100%|███████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.20it/s]


Success rate: 40.00%, avg_return: 159.25


Building dataset: 100%|████████████████████████████████████████████| 10/10 [00:00<00:00, 640.53it/s]


Trajectories processed : 10
Generated samples      : 344841 (train 275872 / eval 68969)


Training, t_loss=0.00: 100%|████████████████████████████████████████| 20/20 [00:00<00:00, 25.85it/s]


eval_losses[0]=0.0008549840552368969 -> eval_losses[-1]=0.0006325208364186166
Finish episode=22, dd loss = 0.0005826194537803531


Evaluating: 100%|███████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.20it/s]


Success rate: 20.00%, avg_return: 167.50


Building dataset: 100%|████████████████████████████████████████████| 10/10 [00:00<00:00, 476.43it/s]


Trajectories processed : 10
Generated samples      : 395068 (train 316054 / eval 79014)


Training, t_loss=0.00: 100%|████████████████████████████████████████| 20/20 [00:00<00:00, 22.89it/s]


eval_losses[0]=0.0012178453125877514 -> eval_losses[-1]=0.0002623029615394642
Finish episode=23, dd loss = 0.00025805109180510044


Evaluating: 100%|███████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.30it/s]


Success rate: 30.00%, avg_return: 176.67


Building dataset: 100%|████████████████████████████████████████████| 10/10 [00:00<00:00, 601.96it/s]


Trajectories processed : 10
Generated samples      : 362165 (train 289732 / eval 72433)


Training, t_loss=0.00: 100%|████████████████████████████████████████| 20/20 [00:00<00:00, 26.06it/s]


eval_losses[0]=0.0004496271741626728 -> eval_losses[-1]=0.00027742653864212396
Finish episode=24, dd loss = 0.00026451831217855215


Evaluating: 100%|███████████████████████████████████████████████████| 10/10 [00:03<00:00,  2.57it/s]


Success rate: 50.00%, avg_return: 178.20


Building dataset: 100%|████████████████████████████████████████████| 10/10 [00:00<00:00, 714.96it/s]


Trajectories processed : 10
Generated samples      : 300965 (train 240772 / eval 60193)


Training, t_loss=0.00: 100%|████████████████████████████████████████| 20/20 [00:00<00:00, 30.13it/s]


eval_losses[0]=0.0003684925953405888 -> eval_losses[-1]=0.0002564238339032273
Finish episode=25, dd loss = 0.00023880648950580508


Evaluating: 100%|███████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.37it/s]


Success rate: 40.00%, avg_return: 164.25


Building dataset: 100%|████████████████████████████████████████████| 10/10 [00:00<00:00, 368.16it/s]


Trajectories processed : 10
Generated samples      : 341272 (train 273017 / eval 68255)


Training, t_loss=0.00: 100%|████████████████████████████████████████| 20/20 [00:00<00:00, 26.93it/s]


eval_losses[0]=0.001771823457462623 -> eval_losses[-1]=0.0004764316702192934
Finish episode=26, dd loss = 0.00044820268522016704


In [None]:
agent.save_model()

In [None]:
distance_model.plot_distance_heatmap(env, source_point=(-1, 1))

In [None]:
model_path = "results/ppo_r/PPO_R.cleanrl_model"
agent.agent.load_state_dict(torch.load(model_path, map_location="cuda"))
_ = evaluate_ppo(agent, env, num_episodes=100, max_episode_steps=300)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import wandb
import numpy as np
import mujoco 

from tqdm import trange

def plot_distance_heatmap(model, env=None, source_point=None, xlim=None, ylim=None, grid_size=500, cmap="viridis", wall_alpha=0.35, epsilon=0.1):
    """Heat-map of model-predicted distances with MuJoCo walls overlaid."""
    source_point = model.start_point if source_point is None else source_point
    xlim = model.xlim if xlim is None else xlim
    ylim = model.ylim if ylim is None else ylim
    x_src, y_src = source_point

    # ── 1. build query grid ────────────────────────────────────────────────
    xs = np.linspace(*xlim, grid_size)
    ys = np.linspace(*ylim, grid_size)
    xx, yy = np.meshgrid(xs, ys)
    targets = np.stack([xx.ravel(), yy.ravel()], 1)
    inp = np.hstack([np.full((targets.shape[0], 2), [x_src, y_src]), targets])

    # ── 2. model prediction ───────────────────────────────────────────────
    with torch.no_grad():
        pred = model(torch.as_tensor(inp, device=model.device, dtype=torch.float32)).squeeze().cpu().numpy()
    heatmap = pred.reshape(grid_size, grid_size)

    # ── 3. plot heat-map ──────────────────────────────────────────────────
    fig, ax = plt.subplots(figsize=(7, 6))
    im = ax.imshow(heatmap, origin="lower", extent=(*xlim, *ylim),
                cmap=cmap, aspect="equal")
    fig.colorbar(im, ax=ax, label="Predicted distance")

    # ── 4. overlay walls ──────────────────────────────────────────────────
    if env:
        mjm = env.unwrapped.model

        def geom_name(i):
            if hasattr(mjm, "geom_names"):
                raw = mjm.geom_names[i]
                return raw.decode() if isinstance(raw, (bytes, bytearray)) else raw
            return mujoco.mj_id2name(mjm, mujoco.mjtObj.mjOBJ_GEOM, i) or ""

        for gid in range(mjm.ngeom):
            name = geom_name(gid).lower()
            if "block" in name:
                cx, cy = mjm.geom_pos[gid, :2]
                hx, hy = mjm.geom_size[gid, :2]
                ax.add_patch(patches.Rectangle(
                    (cx - hx, cy - hy), 2 * hx, 2 * hy,
                    facecolor="black", alpha=wall_alpha, edgecolor="none"))

    # ── 5. decorations ────────────────────────────────────────────────────
    ax.scatter(*source_point, color="red", s=60, edgecolors="black", label="Source")
    ax.set_xlabel("X")
    ax.set_ylabel("Y")
    ax.set_title("Distance heat-map with maze")
    ax.grid()
    ax.legend()

    # ── 6. axis limits and equal scaling ──────────────────────────────────
    xeps = epsilon * (xlim[1] - xlim[0])
    yeps = epsilon * (ylim[1] - ylim[0])
    ax.set_xlim(xlim[0] - xeps, xlim[1] + xeps)
    ax.set_ylim(ylim[0] - yeps, ylim[1] + yeps)
    ax.set_aspect("equal")

    plt.tight_layout()
    plt.show()
plot_distance_heatmap(distance_model, env, source_point=(-1, -1))

In [None]:
trajectories = agent.evaluate_ppo(env, num_episodes=100, max_episode_steps=350)

In [None]:
def evaluate_ppo(
    agent,
    env,
    *,
    num_episodes: int = 100,
    max_episode_steps: int | None = 170,
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    agent.agent.to(device)         
    agent.agent.eval()

    trajectories = []
    success_count = 0
    avg_return = 0

    for ep in trange(num_episodes, desc="Evaluating", ncols=100):
        obs, _ = env.reset()
        episode_traj = []

        for step in range(max_episode_steps):
            ball_x, ball_y = obs[:2]
            episode_traj.append((ball_x, ball_y))

            obs_tensor = torch.tensor(obs, dtype=torch.float32, device=device).unsqueeze(0)
            with torch.no_grad():
                action, *_ = agent.agent.get_action_and_value(obs_tensor)

            obs, _, terminated, truncated, info = env.step(action.squeeze(0).cpu().numpy())

            if info.get("success", False):
                success_count += 1
                avg_return += max_episode_steps - step
                break
            if terminated or truncated:
                break

        trajectories.append(episode_traj)

    print(f"Success rate over {num_episodes} episodes: {success_count / num_episodes:.2%}, avg_return = {avg_return / success_count if success_count else 0 :.2f}")
    return trajectories


In [None]:
print(f"Success rate over {num_episodes} episodes: {success_count / num_episodes:.2%}, {avg_return =}")
evaluate_ppo(agent, env, num_episodes=1, max_episode_steps=300)