### Imports ###

In [8]:
import optuna
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

import tensorboardX

### Aufsetzen des Cart-Pole-Environments ###

In [2]:
env = gym.make('CartPole-v1')

### Initiale Hyperparameter Definition ###

In [3]:
LOG_DIR = "logs"
VERBOSE = 1
TOTAL_TIMESTEPS = 40000
N_EVAL_EPISODES = 10
N_TRIALS = 10

### Baseline PPO-Agent ###

In [4]:
# Initialisierung des Modells
model = PPO('MlpPolicy', env, verbose=VERBOSE, tensorboard_log=LOG_DIR)

# Trainieren des Modells
model.learn(total_timesteps=TOTAL_TIMESTEPS)

# Evaluierung des Modells
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=N_EVAL_EPISODES)

print(f"Mean reward: {mean_reward}, std deviation: {std_reward}")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to logs\PPO_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 22       |
|    ep_rew_mean     | 22       |
| time/              |          |
|    fps             | 492      |
|    iterations      | 1        |
|    time_elapsed    | 4        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 26.7        |
|    ep_rew_mean          | 26.7        |
| time/                   |             |
|    fps                  | 383         |
|    iterations           | 2           |
|    time_elapsed         | 10          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009360518 |
|    clip_fraction        | 0.122       |
|    clip_range           | 0.2         

![PPO](./img/PPO.png)

### Hyperparameter Tuning ###

In [5]:
# Definition der Optimierungsfunktion
def optimize_ppo(trial):
    env = gym.make('CartPole-v1')
    env = DummyVecEnv([lambda: env])
    n_envs = 1

    # Definition der Hyperparameter inkl. Range
    n_steps = trial.suggest_int('n_steps', 5, 2048)
    gamma = trial.suggest_float('gamma', 0.8, 0.9999)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 0.05)
    ent_coef = trial.suggest_float('ent_coef', 1e-8, 0.1)

    # Erstellen des PPO-Modells
    model = PPO('MlpPolicy', env, n_steps=n_steps, gamma=gamma, learning_rate=learning_rate, ent_coef=ent_coef, batch_size=n_steps * n_envs, verbose=VERBOSE)
    
    # Trainieren des Modells
    model.learn(total_timesteps=TOTAL_TIMESTEPS)
    
    # Evaluierung des Modells
    mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=N_EVAL_EPISODES)

    return mean_reward

# Erstellen einer Optuna-Studie und Optimierung die Hyperparameter
study = optuna.create_study(direction='maximize')
study.optimize(optimize_ppo, n_trials=N_TRIALS)

# Ausgabe der besten Hyperparameter
print("Beste Hyperparameter: ", study.best_params)

[I 2024-07-26 13:11:53,114] A new study created in memory with name: no-name-bb64b5d5-04c1-411c-a236-cbf6126cfa24


Using cpu device
----------------------------
| time/              |     |
|    fps             | 423 |
|    iterations      | 1   |
|    time_elapsed    | 1   |
|    total_timesteps | 463 |
----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 392         |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 926         |
| train/                  |             |
|    approx_kl            | 0.008125198 |
|    clip_fraction        | 0.113       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.685      |
|    explained_variance   | 0.000667    |
|    learning_rate        | 0.023       |
|    loss                 | 0.432       |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0108     |
|    value_loss           | 6.34        |
-----------------------------------------
------------------------

[I 2024-07-26 13:13:08,458] Trial 0 finished with value: 175.2 and parameters: {'n_steps': 463, 'gamma': 0.840782254404652, 'learning_rate': 0.02301810458049787, 'ent_coef': 0.0553239433664652}. Best is trial 0 with value: 175.2.


Using cpu device
----------------------------
| time/              |     |
|    fps             | 411 |
|    iterations      | 1   |
|    time_elapsed    | 1   |
|    total_timesteps | 744 |
----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 318         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 1488        |
| train/                  |             |
|    approx_kl            | 0.012275593 |
|    clip_fraction        | 0.0786      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.688      |
|    explained_variance   | 0.00229     |
|    learning_rate        | 0.00193     |
|    loss                 | 18.7        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00855    |
|    value_loss           | 39.9        |
-----------------------------------------
------------------------

[I 2024-07-26 13:14:12,516] Trial 1 finished with value: 290.0 and parameters: {'n_steps': 744, 'gamma': 0.9290886074039225, 'learning_rate': 0.0019302598318344914, 'ent_coef': 0.047480672293053056}. Best is trial 1 with value: 290.0.


Using cpu device
-----------------------------
| time/              |      |
|    fps             | 669  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 1428 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 579          |
|    iterations           | 2            |
|    time_elapsed         | 4            |
|    total_timesteps      | 2856         |
| train/                  |              |
|    approx_kl            | 0.0040022316 |
|    clip_fraction        | 0.219        |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.672       |
|    explained_variance   | -0.0236      |
|    learning_rate        | 0.0498       |
|    loss                 | 1.86         |
|    n_updates            | 10           |
|    policy_gradient_loss | 0.00936      |
|    value_loss           | 11.4         |
------------------------------------------

[I 2024-07-26 13:15:12,082] Trial 2 finished with value: 127.9 and parameters: {'n_steps': 1428, 'gamma': 0.9175029390827572, 'learning_rate': 0.049795408531304194, 'ent_coef': 0.07054899135540321}. Best is trial 1 with value: 290.0.


Using cpu device
-----------------------------
| time/              |      |
|    fps             | 875  |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 1123 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 820         |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 2246        |
| train/                  |             |
|    approx_kl            | 0.013812214 |
|    clip_fraction        | 0.122       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.685      |
|    explained_variance   | -0.00074    |
|    learning_rate        | 0.0161      |
|    loss                 | 2.8         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0108     |
|    value_loss           | 12.6        |
-----------------------------------------
-----------------

[I 2024-07-26 13:16:06,114] Trial 3 finished with value: 123.5 and parameters: {'n_steps': 1123, 'gamma': 0.8929956407753132, 'learning_rate': 0.01607833959217026, 'ent_coef': 0.052442645905905416}. Best is trial 1 with value: 290.0.


Using cpu device
-----------------------------
| time/              |      |
|    fps             | 891  |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 1006 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 790         |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 2012        |
| train/                  |             |
|    approx_kl            | 0.008132423 |
|    clip_fraction        | 0.113       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.683      |
|    explained_variance   | -0.00493    |
|    learning_rate        | 0.0341      |
|    loss                 | 0.182       |
|    n_updates            | 10          |
|    policy_gradient_loss | 0.00303     |
|    value_loss           | 3.71        |
-----------------------------------------
-----------------

[I 2024-07-26 13:16:58,179] Trial 4 finished with value: 122.4 and parameters: {'n_steps': 1006, 'gamma': 0.8076157828589428, 'learning_rate': 0.03410551013876498, 'ent_coef': 0.052085576821896046}. Best is trial 1 with value: 290.0.


Using cpu device
----------------------------
| time/              |     |
|    fps             | 868 |
|    iterations      | 1   |
|    time_elapsed    | 0   |
|    total_timesteps | 716 |
----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 805          |
|    iterations           | 2            |
|    time_elapsed         | 1            |
|    total_timesteps      | 1432         |
| train/                  |              |
|    approx_kl            | 0.0057681627 |
|    clip_fraction        | 0.0899       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.685       |
|    explained_variance   | -0.244       |
|    learning_rate        | 0.0275       |
|    loss                 | 0.538        |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.00474     |
|    value_loss           | 5.1          |
------------------------------------------
------

[I 2024-07-26 13:17:53,696] Trial 5 finished with value: 254.2 and parameters: {'n_steps': 716, 'gamma': 0.8003557303296661, 'learning_rate': 0.027511636446238323, 'ent_coef': 0.036140590474082565}. Best is trial 1 with value: 290.0.


Using cpu device
-----------------------------
| time/              |      |
|    fps             | 888  |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 1167 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 842         |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 2334        |
| train/                  |             |
|    approx_kl            | 0.014846014 |
|    clip_fraction        | 0.134       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.685      |
|    explained_variance   | -0.00824    |
|    learning_rate        | 0.0147      |
|    loss                 | 2.51        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0186     |
|    value_loss           | 9.9         |
-----------------------------------------
-----------------

[I 2024-07-26 13:18:47,012] Trial 6 finished with value: 317.0 and parameters: {'n_steps': 1167, 'gamma': 0.8688669775109215, 'learning_rate': 0.014718848396787448, 'ent_coef': 0.03383386979055671}. Best is trial 6 with value: 317.0.


Using cpu device
-----------------------------
| time/              |      |
|    fps             | 1448 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 1800 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 1350         |
|    iterations           | 2            |
|    time_elapsed         | 2            |
|    total_timesteps      | 3600         |
| train/                  |              |
|    approx_kl            | 0.0023792342 |
|    clip_fraction        | 0.104        |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.685       |
|    explained_variance   | -0.092       |
|    learning_rate        | 0.0278       |
|    loss                 | 2.29         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.00437     |
|    value_loss           | 10.7         |
------------------------------------------

[I 2024-07-26 13:19:21,769] Trial 7 finished with value: 500.0 and parameters: {'n_steps': 1800, 'gamma': 0.8744836161035229, 'learning_rate': 0.027757382573945992, 'ent_coef': 0.03260042664169487}. Best is trial 7 with value: 500.0.


Using cpu device
-----------------------------
| time/              |      |
|    fps             | 1260 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 839  |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 994         |
|    iterations           | 2           |
|    time_elapsed         | 1           |
|    total_timesteps      | 1678        |
| train/                  |             |
|    approx_kl            | 0.009866332 |
|    clip_fraction        | 0.15        |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.681      |
|    explained_variance   | -0.036      |
|    learning_rate        | 0.0318      |
|    loss                 | 0.993       |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0111     |
|    value_loss           | 9.69        |
-----------------------------------------
-----------------

[I 2024-07-26 13:20:02,982] Trial 8 finished with value: 485.0 and parameters: {'n_steps': 839, 'gamma': 0.8929122709783177, 'learning_rate': 0.03181455625786926, 'ent_coef': 0.0503659999980605}. Best is trial 7 with value: 500.0.


Using cpu device
-----------------------------
| time/              |      |
|    fps             | 1386 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 1978 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 1335         |
|    iterations           | 2            |
|    time_elapsed         | 2            |
|    total_timesteps      | 3956         |
| train/                  |              |
|    approx_kl            | 0.0092349425 |
|    clip_fraction        | 0.211        |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.677       |
|    explained_variance   | -0.0158      |
|    learning_rate        | 0.0435       |
|    loss                 | 1.35         |
|    n_updates            | 10           |
|    policy_gradient_loss | 0.00196      |
|    value_loss           | 10.6         |
------------------------------------------

[I 2024-07-26 13:20:36,461] Trial 9 finished with value: 85.5 and parameters: {'n_steps': 1978, 'gamma': 0.9084301968180913, 'learning_rate': 0.04354193004063276, 'ent_coef': 0.07119297938707998}. Best is trial 7 with value: 500.0.


Beste Hyperparameter:  {'n_steps': 1800, 'gamma': 0.8744836161035229, 'learning_rate': 0.027757382573945992, 'ent_coef': 0.03260042664169487}


### Übernehmen der besten Hyperparameter ###

In [6]:
best_params = study.best_params

N_STEPS = best_params['n_steps']
GAMMA = best_params['gamma']
LEARNING_RATE = best_params['learning_rate']
ENT_COEF = best_params['ent_coef']

### Finale Ausführung ###

In [7]:
# Erstellen des PPO-Modells
model = PPO('MlpPolicy', env, n_steps=N_STEPS, gamma=GAMMA, learning_rate=LEARNING_RATE, ent_coef=ENT_COEF, batch_size=1960, verbose=VERBOSE)
    
# Trainieren des Modells
model.learn(total_timesteps=TOTAL_TIMESTEPS)

# Evaluierung des Modells
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=N_EVAL_EPISODES)

print(f"Mean reward: {mean_reward}, std deviation: {std_reward}")

# Speicherung des Modells
model.save("./final_models/ppo_model")

We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=1800 and n_envs=1)


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 21.4     |
|    ep_rew_mean     | 21.4     |
| time/              |          |
|    fps             | 1313     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 1800     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 23.2        |
|    ep_rew_mean          | 23.2        |
| time/                   |             |
|    fps                  | 1248        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 3600        |
| train/                  |             |
|    approx_kl            | 0.009360698 |
|    clip_fraction        | 0.121       |
|    clip_range           | 0.2         |
|    entropy_loss   