# **Stage 1**: Train PPO on PointMaze with standard rewards, collect data, train distance models


In [1]:
import gymnasium as gym
import gymnasium_robotics
import numpy as np
import torch
import os

import sys
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.env_wrappers import EnvBuilder
from src import ppo_agent, distance_models

# Check for GPU
# torch.cuda.current_device() initialises the CUDA runtime and raises on machines
# without a usable GPU, so device details are only queried when CUDA is available.
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)
if cuda_available:
    device_index = torch.cuda.current_device()
    print("Current device:", device_index)
    print("Device name:", torch.cuda.get_device_name(device_index))
else:
    device_index = None
    print("Current device: CPU")
    print("Device name: CPU")

CUDA available: True
Current device: 0
Device name: NVIDIA GeForce RTX 4080 Laptop GPU


In [2]:
# --- Run configuration -------------------------------------------------------
# NOTE(review): the original comment here said "use dense reward", but Gymnasium-Robotics
# names its dense-reward mazes with a "Dense" suffix (e.g. 'PointMaze_UMazeDense-v3');
# this id looks like the sparse variant — confirm which one is intended.
env_id = 'PointMaze_UMaze-v3'
train_episodes = 50
max_episode_steps = 1024
seed = 0

# Seed torch and numpy before anything that may draw random numbers is constructed.
torch.manual_seed(seed)
np.random.seed(seed)

# Make the gymnasium_robotics environments resolvable through gym's registry.
gym.register_envs(gymnasium_robotics)

# --- Environment factory and PPO agent ---------------------------------------
# `builder` is called later (builder()) to obtain environment instances.
builder = EnvBuilder(
    env_id=env_id,
    max_episode_steps=max_episode_steps,
    seed=seed,
)
obs_dim, act_dim = builder.get_obs_act_dim()
agent = ppo_agent.PPOAgent(
    state_dim=obs_dim,
    action_dim=act_dim,
    wandb_name="stage1",
)

[34m[1mwandb[0m: Currently logged in as: [33mvoronov_artem_lit[0m ([33mvoronov_artem_lit-skolkovo-institute-of-science-and-tech[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# Train the PPO agent, or reuse a previously saved checkpoint.
#
# With training fully commented out, the next cell evaluated and checkpointed an agent
# that had never been trained or loaded in this notebook. Training is expensive, so it
# is guarded by a cache: it runs when no checkpoint exists or when RETRAIN is True.
# NOTE(review): a checkpoint written by an earlier run of this notebook may contain
# untrained weights — set RETRAIN = True once to regenerate it.
RETRAIN = False
ppo_ckpt_path = "models/ppo_agent_stage1.pth"

if RETRAIN or not os.path.exists(ppo_ckpt_path):
    env = builder()
    agent.train_ppo(env, num_episodes=train_episodes, max_episode_steps=max_episode_steps)
    # torch.save does not create missing parent directories.
    os.makedirs(os.path.dirname(ppo_ckpt_path), exist_ok=True)
    torch.save(agent.ac.state_dict(), ppo_ckpt_path)
else:
    # map_location="cpu" lets a checkpoint saved on GPU be read on a CPU-only machine;
    # load_state_dict then copies the values into the existing parameters.
    agent.ac.load_state_dict(torch.load(ppo_ckpt_path, map_location="cpu"))

In [4]:
# Evaluate
# Build a fresh environment for evaluation and keep whatever evaluate_ppo returns
# in `trajectories`.
eval_env = builder()

trajectories = agent.evaluate_ppo(eval_env)

# torch.save fails if the parent directory is missing, so create it first.
os.makedirs("models", exist_ok=True)
torch.save(agent.ac.state_dict(), "models/ppo_agent_stage1.pth")

eval_env.demonstrate(agent)

Evaluating: 100%|█████████████████████████████████████████████████| 100/100 [00:04<00:00, 21.18it/s]


Over 100 eval episodes, 0 were successful (0.0%)


___

## Generate dataset
___

In [None]:
# Train the supervised distance estimator and save the Stage 1 artefacts.
#
# NOTE(review): `sup_states` and `sup_distances` are not defined in any cell visible in
# this notebook; a data-collection step must populate them before this cell is run,
# otherwise it fails with NameError on a fresh kernel.
sup_states = np.array(sup_states, dtype=np.float32)
sup_distances = np.array(sup_distances, dtype=np.float32)

# Train distance estimators on the collected data
sup_model = distance_models.SupervisedDistanceEstimator(input_dim=obs_dim)
sup_loss = sup_model.train_from_data(sup_states, sup_distances, epochs=100)
# td_model = distance_models.TDDistanceEstimator(input_dim=obs_dim)
# td_loss = td_model.train_from_transitions(td_transitions, epochs=100)

# Compare models on the supervised dataset
# Run inference on whichever device the network's parameters live on and bring the
# result back to the CPU before converting to numpy (.numpy() rejects CUDA tensors).
first_param = next(iter(sup_model.model.parameters()), None)
sup_device = first_param.device if first_param is not None else torch.device("cpu")
with torch.no_grad():
    sup_preds = sup_model.model(torch.tensor(sup_states).to(sup_device)).cpu().numpy().flatten()
# td_preds = td_model.model(torch.tensor(sup_states)).detach().numpy().flatten()

# Flatten the targets as well: predictions are 1-D, and subtracting an (N, 1) target
# array would silently broadcast to (N, N) and produce a meaningless MSE.
mse_sup = np.mean((sup_preds - sup_distances.reshape(-1))**2)
# mse_td = np.mean((td_preds - sup_distances)**2)

print(f"Supervised model MSE on training data: {mse_sup:.4f}")
# print(f"TD model MSE on training data: {mse_td:.4f}")

# Save models for Stage 2
# torch.save fails if the parent directory is missing, so create it first.
os.makedirs("models", exist_ok=True)
# torch.save(td_model.state_dict(), "models/distance_model_td.pth")
torch.save(sup_model.state_dict(), "models/distance_model_sup.pth")
torch.save(agent.ac.state_dict(), "models/ppo_agent_stage1.pth")
