In [None]:
import sys
from pathlib import Path
# Expose the repository's src/ directory to the import system. The repo root
# is taken to be the parent of the current working directory, and the entry is
# placed first on sys.path so it is searched before any other location.
repo_root = Path.cwd().parent
sys.path.insert(0, str(repo_root.joinpath("src")))

import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import VecMonitor, DummyVecEnv, VecNormalize
from stable_baselines3.common.callbacks import CheckpointCallback
import torch as th
import os
from GurobiParamEnv import InexactGBDEnv

In [None]:
# Directory shared by the monitor CSV files and the TensorBoard event logs.
log_dir = "tb_logs"
Path(log_dir).mkdir(parents=True, exist_ok=True)

# Factory handed to DummyVecEnv: each call yields a freshly monitored env.
def make_env():
    """Create an ``InexactGBDEnv`` wrapped in a ``Monitor``.

    The Monitor is given ``<log_dir>/monitor.csv`` as its log file.

    Returns:
        Monitor: the wrapped environment instance.
    """
    base_env = InexactGBDEnv()
    monitor_file = os.path.join(log_dir, "monitor.csv")
    return Monitor(base_env, filename=monitor_file)

# Build the vectorized training environment from a single monitored env.
#
# Wrapper order matters: VecMonitor must sit *inside* VecNormalize. VecMonitor
# accumulates the rewards it sees and writes them to info["episode"], which is
# what PPO reports as rollout/ep_rew_mean. If it wraps VecNormalize instead,
# it only ever sees normalized rewards, so both vecmonitor.csv and the
# TensorBoard curves would show normalized returns rather than true ones.
#
# NOTE(review): make_env already wraps each env in Monitor, and SB3 warns that
# VecMonitor's episode statistics override Monitor's. Both are kept here so
# both CSV files are still produced; drop one if the duplicate is not needed.
env = DummyVecEnv([make_env])
env = VecMonitor(env, filename=os.path.join(log_dir, "vecmonitor.csv"))
# Outermost wrapper: running normalization of observations and rewards. `env`
# is now the VecNormalize instance itself, so env.save(...) stores its stats.
env = VecNormalize(env, norm_obs=True, norm_reward=True)

# Network layout: ReLU activations with separate policy (pi) and value (vf)
# networks -- three hidden layers of 64 units for pi, two of 64 for vf.
policy_kwargs = {
    "activation_fn": th.nn.ReLU,
    "net_arch": {"pi": [64, 64, 64], "vf": [64, 64]},
}

# PPO agent; TensorBoard events are written under the same log_dir that the
# monitor CSV files use.
model = PPO(
    "MlpPolicy",
    env,
    # optimisation hyperparameters
    learning_rate=5e-4,
    n_steps=512,
    batch_size=128,
    gamma=0.99,
    clip_range=0.15,
    # network and logging
    policy_kwargs=policy_kwargs,
    tensorboard_log=log_dir,
    verbose=1,
)

# Train for 20,000 environment steps; TensorBoard logs use the run name "RL_iGBD".
model.learn(total_timesteps=20_000, tb_log_name="RL_iGBD")

In [None]:
# Save both the trained model and the normalization statistics. The two files
# belong together: the saved statistics are needed to normalize observations
# the same way when the model is loaded again later.
model_path = "../models/ppo_benders_model1"  # model.save() appends ".zip"
vecnorm_path = "../models/vecnormalize_benders1.pkl"

# Create the output directory up front so neither save call depends on the
# other one having created it.
os.makedirs(os.path.dirname(vecnorm_path), exist_ok=True)

model.save(model_path)
env.save(vecnorm_path)

# Report the paths that were actually written (the previous message named
# files without the "1" suffix and without the ../models/ prefix).
print(f"Training complete. Model saved to {model_path}.zip and stats to {vecnorm_path}")