In [1]:
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecMonitor, VecFrameStack, DummyVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import CheckpointCallback

def make_procgen_env(env_name, num_envs=1, start_level=0, num_levels=0):
    """Build a ``DummyVecEnv`` from ``num_envs`` identical environment factories.

    Args:
        env_name: Environment id passed to ``gym.make``.
        num_envs: Number of factories handed to ``DummyVecEnv``.
        start_level: Forwarded to ``gym.make`` as ``start_level``.
        num_levels: Forwarded to ``gym.make`` as ``num_levels``.

    Returns:
        The ``DummyVecEnv`` constructed from the factory list.
    """
    def _build_single_env():
        # Every invocation issues its own gym.make call with the same settings.
        return gym.make(env_name, start_level=start_level, num_levels=num_levels)

    env_factories = [_build_single_env for _ in range(num_envs)]
    return DummyVecEnv(env_factories)

# --- Experiment configuration ------------------------------------------------
env_name = "procgen:procgen-heist-v0"
num_envs = 4
learning_rate = 3e-4
n_steps = 2048
batch_size = 64
n_epochs = 10
gamma = 0.99
# Integer literal so no float -> int cast is needed at the call site.
total_timesteps = 1_000_000
# Desired spacing between checkpoints, expressed in environment timesteps.
checkpoint_every_timesteps = 10_000

# --- Environment -------------------------------------------------------------
env = make_procgen_env(env_name, num_envs)
try:
    # Everything after environment creation runs inside the try so the
    # environments are closed even when training or evaluation raises.
    env = VecMonitor(env)
    env = VecFrameStack(env, n_stack=4)

    # --- Model ---------------------------------------------------------------
    model = PPO(
        "CnnPolicy",
        env,
        learning_rate=learning_rate,
        n_steps=n_steps,
        batch_size=batch_size,
        n_epochs=n_epochs,
        gamma=gamma,
        verbose=1,
        tensorboard_log="./procgen_tensorboard/",
    )

    # CheckpointCallback counts calls to the vectorised env.step(), each of
    # which advances num_envs timesteps, so divide to keep the spacing in
    # timesteps (recommended in the Stable-Baselines3 callback docs).
    checkpoint_callback = CheckpointCallback(
        save_freq=max(checkpoint_every_timesteps // num_envs, 1),
        save_path='./models/',
        name_prefix='ppo_procgen',
    )

    # --- Training ------------------------------------------------------------
    model.learn(total_timesteps=total_timesteps, callback=checkpoint_callback)

    model.save("ppo_procgen_final")

    # --- Evaluation ----------------------------------------------------------
    # NOTE(review): evaluation reuses the training environment stack; confirm
    # whether a separate held-out evaluation environment is wanted.
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
    print(f"Mean reward: {mean_reward}, Std Reward: {std_reward}")
finally:
    env.close()

