# Proximal Policy Optimizator

The PPO is a RL algorithm that combine ideas from A2C (Actor-Critic) and TRPO (Trust Region Policy Optimization) in such way that learn using exploration-exploitation throught several iteration (A2C) but constraining the policy update step to be closer to the previous policy, thus, using a trust region to improve the actor. 

In [18]:
import os
from datetime import datetime
import gymnasium as gym

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
#from stable_baselines3.common.vec_env import VecNormalize, DummyVecEnv, SubprocVecEnv, VecFrameStack, VecTransposeImageç

## Create and Training model

In [19]:
# Set path to logfile and model file:
root = './PPO/'

log_path = os.path.join(root, 'LogFiles/')
log_name = datetime.now().strftime('%Y-%m-%d_%H-%M')
model_path = os.path.join(root, f"Models/model_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.zip")

print(log_path+log_name)
print(model_path)

./PPO/LogFiles/2024-05-05_22-10
./PPO/Models/model_2024-05-05_22-10-39.zip


In [20]:
# Create multiples environments in parallel:
vec_env = make_vec_env("Humanoid-v4", n_envs=10)


#vec_env = VecNormalize(VecFrameStack(VecTransposeImage(vec_env, transpose=True))) # Normalize and stack multiples observations and rewards
#vec_env.training_env.logger = gym.logger                                          # Set training logger
#vec_env = VecNormalize.load(log_path, vec_env)                                    # Load previous stasts - in case that interrupted training#      

# Initialize PPO model:
model = PPO("MlpPolicy", 
            vec_env, 
            device='cuda',
            verbose=1, 
            tensorboard_log = log_path,

            # Tunning params:
            learning_rate = 3e-4,
            n_steps = 2048,                 # Number of step for agent live
            n_epochs = 4,                   # Optimize/Learning steps
            batch_size =64,                # Number of samples (action, reward) used for optimize cycle
            gamma = 0.9,                   # Decay factor, influence of future reward against current
            ent_coef = 0.01,                # Entropy: determin randomization in pdf of action to take (low value = deterministic, high value = stocastic)
            vf_coef = 0.5,                    # Relative importance of value function in training
            use_sde = False,                # SDE = State Dependent Exploration, include stochastic noice
            sde_sample_freq = -1,           # Frequency of sample a noise matrix (-1 = only sample at the begining of the rollout)
            normalize_advantage = True      # Normalize value function
)

# Training loop:
model.learn(total_timesteps=1e7, tb_log_name=log_name)
#model.learn(total_timesteps=1e7, tb_log_name="2024-05-02_22-12",reset_num_timesteps=False )
# Save model
model.save(model_path)

del model # remove to demonstrate saving and loading

Using cuda device
Logging to ./PPO/LogFiles/2024-05-05_22-10_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 22.1     |
|    ep_rew_mean     | 108      |
| time/              |          |
|    fps             | 1883     |
|    iterations      | 1        |
|    time_elapsed    | 10       |
|    total_timesteps | 20480    |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 24.2        |
|    ep_rew_mean          | 117         |
| time/                   |             |
|    fps                  | 1496        |
|    iterations           | 2           |
|    time_elapsed         | 27          |
|    total_timesteps      | 40960       |
| train/                  |             |
|    approx_kl            | 0.016075846 |
|    clip_fraction        | 0.197       |
|    clip_range           | 0.2         |
|    entropy_loss         | -24.2       |
|    expl

## Test

In [17]:
test_path = model_path
model = PPO.load(test_path)

# Create environment
#env = gym.make("Humanoid-v4", terminate_when_unhealthy=True, forward_reward_weight= 1.75)
vec_env = make_vec_env("Humanoid-v4", n_envs=6)

obs = vec_env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = vec_env.step(action)
    vec_env.render("human")

FileNotFoundError: [Errno 2] No such file or directory: 'PPO\\Models\\model_2024-05-05_01-08-04.zip.zip'