### Imports ###

In [8]:
import optuna
import gymnasium as gym
from stable_baselines3 import A2C
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

import tensorboardX

### Aufsetzen des Cart-Pole-Environments ###

In [2]:
env = gym.make('CartPole-v1')

### Initiale Hyperparameter Definition ###

In [3]:
LOG_DIR = "logs"
VERBOSE = 1
TOTAL_TIMESTEPS = 40000
N_EVAL_EPISODES = 10
N_TRIALS = 10

### Baseline A2C-Agent ###

In [4]:
# Initialisierung des Modells
model = A2C('MlpPolicy', env, verbose=VERBOSE, tensorboard_log=LOG_DIR)

# Trainieren des Modells
model.learn(total_timesteps=TOTAL_TIMESTEPS)

# Evaluierung des Modells
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=N_EVAL_EPISODES)

print(f"Mean reward: {mean_reward}, std deviation: {std_reward}")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to logs\A2C_1
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 35.9     |
|    ep_rew_mean        | 35.9     |
| time/                 |          |
|    fps                | 389      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.658   |
|    explained_variance | 0.0245   |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 1.7      |
|    value_loss         | 10       |
------------------------------------
------------------------------------
| rollout/              |          |
|    ep_len_mean        | 44.1     |
|    ep_rew_mean        | 44.1     |
| time/                 |          |
|    fps                | 327      |
|    iterations         | 200    

![A2C](./img/A2C.png)

### Hyperparameter Tuning ###

In [5]:
# Definition der Optimierungsfunktion
def optimize_a2c(trial):
    env = gym.make('CartPole-v1')
    env = Monitor(env)
    env = DummyVecEnv([lambda: env])

    # Definition der Hyperparameter inkl. Range
    n_steps = trial.suggest_int('n_steps', 5, 2048)
    gamma = trial.suggest_float('gamma', 0.8, 0.9999)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 0.05)
    ent_coef = trial.suggest_float('ent_coef', 1e-8, 0.1)

    # Erstellen des A2C-Modells
    model = A2C('MlpPolicy', env, n_steps=n_steps, gamma=gamma, learning_rate=learning_rate, ent_coef=ent_coef, verbose=VERBOSE)
    
    # Trainieren des Modells
    model.learn(total_timesteps=TOTAL_TIMESTEPS)
    
    # Evaluierung des Modells
    mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=N_EVAL_EPISODES)
    
    return mean_reward

# Erstellen einer Optuna-Studie und Optimierung der Hyperparameter
study = optuna.create_study(direction='maximize')
study.optimize(optimize_a2c, n_trials=N_TRIALS)

# Ausgabe der besten Hyperparameter
print("Beste Hyperparameter: ", study.best_params)

[I 2024-07-26 13:07:48,740] A new study created in memory with name: no-name-966afebf-3a58-45dc-8a00-f88be2998c10


Using cpu device


[I 2024-07-26 13:08:43,644] Trial 0 finished with value: 500.0 and parameters: {'n_steps': 466, 'gamma': 0.9196134144569083, 'learning_rate': 0.007110664620061475, 'ent_coef': 0.05340626331462296}. Best is trial 0 with value: 500.0.


Using cpu device


[I 2024-07-26 13:09:44,405] Trial 1 finished with value: 495.1 and parameters: {'n_steps': 1283, 'gamma': 0.8367873009442857, 'learning_rate': 0.01731754753170912, 'ent_coef': 0.09106994360691205}. Best is trial 0 with value: 500.0.


Using cpu device


[I 2024-07-26 13:11:00,826] Trial 2 finished with value: 9.0 and parameters: {'n_steps': 623, 'gamma': 0.9598415295935352, 'learning_rate': 0.04902310325839985, 'ent_coef': 0.082529766096586}. Best is trial 0 with value: 500.0.


Using cpu device
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 9.37      |
|    ep_rew_mean        | 9.37      |
| time/                 |           |
|    fps                | 558       |
|    iterations         | 100       |
|    time_elapsed       | 60        |
|    total_timesteps    | 33800     |
| train/                |           |
|    entropy_loss       | -2.26e-17 |
|    explained_variance | 0.722     |
|    learning_rate      | 0.0434    |
|    n_updates          | 99        |
|    policy_loss        | -0        |
|    value_loss         | 3.37      |
-------------------------------------


[I 2024-07-26 13:12:11,351] Trial 3 finished with value: 9.3 and parameters: {'n_steps': 338, 'gamma': 0.949924304026942, 'learning_rate': 0.043381845901495464, 'ent_coef': 0.06834466976249778}. Best is trial 0 with value: 500.0.


Using cpu device


[I 2024-07-26 13:13:18,013] Trial 4 finished with value: 9.0 and parameters: {'n_steps': 1065, 'gamma': 0.8014460391205953, 'learning_rate': 0.02831245882467321, 'ent_coef': 0.08512604337559897}. Best is trial 0 with value: 500.0.


Using cpu device


[I 2024-07-26 13:14:13,039] Trial 5 finished with value: 8.8 and parameters: {'n_steps': 1576, 'gamma': 0.9897003395499065, 'learning_rate': 0.01659460650470261, 'ent_coef': 0.0455209063130767}. Best is trial 0 with value: 500.0.


Using cpu device


[I 2024-07-26 13:15:09,201] Trial 6 finished with value: 76.7 and parameters: {'n_steps': 1289, 'gamma': 0.9463228003134572, 'learning_rate': 0.02074482778689341, 'ent_coef': 0.06667562395762504}. Best is trial 0 with value: 500.0.


Using cpu device


[I 2024-07-26 13:16:03,073] Trial 7 finished with value: 500.0 and parameters: {'n_steps': 436, 'gamma': 0.9146636809779404, 'learning_rate': 0.012320881602234277, 'ent_coef': 0.09929044863010757}. Best is trial 0 with value: 500.0.


Using cpu device


[I 2024-07-26 13:16:50,527] Trial 8 finished with value: 9.4 and parameters: {'n_steps': 1438, 'gamma': 0.9740315122745662, 'learning_rate': 0.03365034079871839, 'ent_coef': 0.005013454740481852}. Best is trial 0 with value: 500.0.


Using cpu device


[I 2024-07-26 13:17:40,438] Trial 9 finished with value: 9.6 and parameters: {'n_steps': 589, 'gamma': 0.9501353751476117, 'learning_rate': 0.0381402614322328, 'ent_coef': 0.07980000993815495}. Best is trial 0 with value: 500.0.


Beste Hyperparameter:  {'n_steps': 466, 'gamma': 0.9196134144569083, 'learning_rate': 0.007110664620061475, 'ent_coef': 0.05340626331462296}


### Übernehmen der besten Hyperparameter ###

In [6]:
best_params = study.best_params

N_STEPS = best_params['n_steps']
GAMMA = best_params['gamma']
LEARNING_RATE = best_params['learning_rate']
ENT_COEF = best_params['ent_coef']

### Finale Ausführung ###

In [7]:
# Erstellen des A2C-Modells
tuned_model = A2C('MlpPolicy', env, n_steps=N_STEPS, gamma=GAMMA, learning_rate=LEARNING_RATE, ent_coef=ENT_COEF, verbose=VERBOSE)
    
# Trainieren des Modells
tuned_model.learn(total_timesteps=TOTAL_TIMESTEPS)

# Evaluierung des Modells
mean_reward, std_reward = evaluate_policy(tuned_model, tuned_model.get_env(), n_eval_episodes=N_EVAL_EPISODES)

print(f"Mean reward: {mean_reward}, std deviation: {std_reward}")

# Speicherung des Modells
tuned_model.save("./final_models/a2c_model")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Mean reward: 500.0, std deviation: 0.0
