# **Stage 1**: Train PPO on PointMaze with standard rewards, collect data, train distance models


In [1]:
import gymnasium as gym
import gymnasium_robotics
import numpy as np
import torch
import os

import sys
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.env_wrappers import EnvBuilder
from src import ppo_agent, distance_models
from src.utils import trajectories_to_dataset

# Report the compute device that PyTorch can see.
# torch.cuda.current_device() / get_device_name() raise on machines without a
# usable CUDA runtime, so they are only queried when CUDA is available; on
# CPU-only machines the cell prints a CPU fallback instead of crashing.
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)
if cuda_available:
    device_index = torch.cuda.current_device()
    print("Current device:", device_index)
    print("Device name:", torch.cuda.get_device_name(device_index))
else:
    print("Current device:", "cpu")
    print("Device name:", "CPU")

CUDA available: True
Current device: 0
Device name: NVIDIA GeForce RTX 4080 Laptop GPU


In [None]:
# --- Experiment configuration -------------------------------------------------
# NOTE(review): under Gymnasium-Robotics naming this id is the sparse-reward
# variant; the dense-reward one is 'PointMaze_UMazeDense-v3'. An earlier comment
# here said "dense" — confirm which reward the stage-1 run is meant to use.
env_id = 'PointMaze_UMaze-v3'
train_episodes = 1000
max_episode_steps = 300
seed = 0

# Seed the torch and numpy global RNGs, then make the robotics envs known to gymnasium.
torch.manual_seed(seed)
np.random.seed(seed)
gym.register_envs(gymnasium_robotics)

# --- Maze layout --------------------------------------------------------------
# U-shaped maze grid. The two 'c' cells sit at the two ends of the U
# (presumably 1 = wall, 0 = free cell, 'c' = combined goal/reset candidate,
# following the Gymnasium-Robotics maze_map convention — TODO confirm).
c = 'c'
U_map = [
    [1, 1, 1, 1, 1],
    [1, c, 0, 0, 1],
    [1, 1, 1, 0, 1],
    [1, c, 0, 0, 1],
    [1, 1, 1, 1, 1]
]

# --- Environment factory and agent ---------------------------------------------
builder = EnvBuilder(
    env_id=env_id,
    maze_map=U_map,
    max_episode_steps=max_episode_steps,
    seed=seed,
)
obs_dim, act_dim = builder.get_obs_act_dim()
agent = ppo_agent.PPOAgent(
    state_dim=obs_dim,
    action_dim=act_dim,
    wandb_name="stage1",
)

[34m[1mwandb[0m: Currently logged in as: [33mvoronov_artem_lit[0m ([33mvoronov_artem_lit-skolkovo-institute-of-science-and-tech[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




In [3]:
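# Optional: uncomment to resume from a previously saved checkpoint instead of
# training from scratch in the cells below.
# model_path = "models/ppo_agent_stage1_good.pth"
# agent.ac.load_state_dict(torch.load(model_path))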

In [5]:
# Train with vectorized rollouts (asynchronous vector env).
# The number of parallel environments is defined once and reused to derive the
# update count, so changing it cannot leave `total_updates` out of sync.
# (Presumably each update collects one horizon from every env, which is why the
# episode budget is divided by `num_envs` — TODO confirm against the trainer.)
num_envs = 32
agent.train_ppo_vectorized(
    builder,
    num_envs=num_envs,
    horizon=max_episode_steps,
    total_updates=train_episodes // num_envs,
    async_mode=True,   # set False if you prefer SyncVectorEnv
)


Training: 100%|█████████████████████████████████████████████████████| 31/31 [01:22<00:00,  2.67s/it]


In [6]:
# Train with vectorized rollouts (synchronous vector env).
# The number of parallel environments is defined once and reused to derive the
# update count, so changing it cannot leave `total_updates` out of sync.
# NOTE(review): this runs on the same `agent` object as the async cell above;
# if both cells are executed the training presumably accumulates — confirm that
# this is intended rather than a leftover timing comparison.
num_envs = 32
agent.train_ppo_vectorized(
    builder,
    num_envs=num_envs,
    horizon=max_episode_steps,
    total_updates=train_episodes // num_envs,
    async_mode=False,
)

Training: 100%|█████████████████████████████████████████████████████| 31/31 [01:25<00:00,  2.75s/it]


In [None]:
# Train on a single (non-vectorized) environment created by the builder,
# for `train_episodes` episodes capped at `max_episode_steps` steps each.
# NOTE(review): the vectorized training cells above use the same `agent`;
# running every training cell presumably stacks their updates — confirm which
# training path is the intended one for a Restart & Run All.
env = builder()
agent.train_ppo(env, num_episodes=train_episodes, max_episode_steps=max_episode_steps)

Training: 100%|█████████████████████████████████████████████████| 1000/1000 [01:30<00:00, 11.01it/s]


In [7]:
# Evaluate the trained agent and keep the rollouts for dataset generation.
# The evaluation maze is declared explicitly so it can be edited independently
# of the training layout; it relies on `c` from the configuration cell.
U_map = [
    [1, 1, 1, 1, 1],
    [1, c, 0, 0, 1],
    [1, 1, 1, 0, 1],
    [1, c, 0, 0, 1],
    [1, 1, 1, 1, 1]
]
eval_env = builder(maze_map=U_map)

# The returned trajectories are consumed later by trajectories_to_dataset().
trajectories = agent.evaluate_ppo(eval_env)

# Save the actor-critic weights. torch.save does not create missing parent
# directories, so make sure the target folder exists on a fresh checkout.
ckpt_path = "models/ppo_agent_stage1.pth"
os.makedirs(os.path.dirname(ckpt_path), exist_ok=True)
torch.save(agent.ac.state_dict(), ckpt_path)

eval_env.demonstrate(agent)

Evaluating: 100%|█████████████████████████████████████████████████| 100/100 [00:03<00:00, 27.16it/s]


Over 100 eval episodes, 0 were successful (0.0%)


___

## Generate dataset and train the supervised distance model
___

In [None]:
# Convert the trajectories returned by agent.evaluate_ppo(...) into the dataset
# that the distance estimator below is trained and evaluated on.
dataset = trajectories_to_dataset(trajectories)

In [None]:
# Fit the supervised distance estimator on the collected dataset, then evaluate
# it and save the model to models/distance_model_sup.pth.
# NOTE(review): input_dim=4 presumably matches the concatenated (x, y) of two
# points — confirm against trajectories_to_dataset.
# NOTE(review): batch_size=16192 is an unusual value (16384 is the nearby power of two) — confirm it is
# not a typo.
# NOTE(review): evaluation uses the same `dataset` as training, so `eval_loss`
# is not a held-out metric.
sup_model = distance_models.SupervisedDistanceEstimator(input_dim=4)
sup_loss = sup_model.train_from_data(dataset, epochs=50, batch_size=16192)
eval_loss = sup_model.evaluate_from_data(dataset, save_model=True, save_path="models/distance_model_sup.pth")

In [None]:
# Visualize the learned distances over the evaluation maze from one fixed
# source point (presumably given in maze world coordinates — TODO confirm).
sup_model.plot_distance_heatmap(env=eval_env, source_point=(-1, 1))