In [8]:
import os
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

In [26]:
# Gymnasium environment id; later cells reuse this name to rebuild the env.
environment_name = 'CartPole-v1'
# Create the environment instance bound to the notebook-level name `env`.
env = gym.make(environment_name)

In [25]:
# Baseline: play a few episodes with randomly sampled actions and print the
# total reward collected in each one.
episodes = 10
for episode in range(1, episodes + 1):
    observation, info = env.reset()
    # Reset both end-of-episode flags for every episode so a value left over
    # from the previous episode can never leak into this one.
    terminated = False
    truncated = False
    score = 0

    # Parenthesised on purpose: `not terminated or truncated` parses as
    # `(not terminated) or truncated`, which keeps stepping the env after it
    # has reported truncation.
    while not (terminated or truncated):
        action = env.action_space.sample()
        observation, reward, terminated, truncated, info = env.step(action)
        score += reward
    print('Episode:{} score{}'.format(episode, score))
env.close()


Episode:1 score13.0
Episode:2 score34.0
Episode:3 score24.0
Episode:4 score23.0
Episode:5 score18.0
Episode:6 score16.0
Episode:7 score22.0
Episode:8 score21.0
Episode:9 score20.0
Episode:10 score34.0


In [9]:
# Directory handed to PPO as its `tensorboard_log` location.
log_path = os.path.join('Training', 'Logs')

In [13]:
# Build the vectorised training environment. The factory creates the
# underlying env itself instead of closing over the global `env`, which is
# rebound by this very assignment; the closure form only works because the
# factory happens to be called before the rebinding takes effect.
env = DummyVecEnv([lambda: gym.make(environment_name)])
# PPO agent with an MLP policy; training metrics are written under log_path.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [16]:
# Train the agent; the call's return value is shown as the cell output.
model.learn(total_timesteps=20_000)

Logging to Training\Logs\PPO_2
-----------------------------
| time/              |      |
|    fps             | 607  |
|    iterations      | 1    |
|    time_elapsed    | 3    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 628          |
|    iterations           | 2            |
|    time_elapsed         | 6            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0012232309 |
|    clip_fraction        | 0.0196       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.56        |
|    explained_variance   | 0.834        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.813        |
|    n_updates            | 110          |
|    policy_gradient_loss | -0.00398     |
|    value_loss           | 17.5         |
----------------------------

<stable_baselines3.ppo.ppo.PPO at 0x108905818c8>

In [4]:
# Location the trained PPO model is saved to and later reloaded from.
PPO_Path = os.path.join('Training', 'Saved', 'cartpole_PPO')

In [18]:
# Persist the trained model to PPO_Path so it can be reloaded later.
model.save(PPO_Path)

In [19]:
# Drop the in-memory model; the following cell restores it from PPO_Path.
del model

In [32]:
# Restore the saved agent from PPO_Path and attach the current `env` to it.
model = PPO.load(PPO_Path, env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [35]:
# Evaluate the reloaded agent for a single episode with rendering requested;
# the tuple returned by evaluate_policy is shown as the cell output.
evaluate_policy(model, env, n_eval_episodes=1, render=True)

(500.0, 0.0)

In [21]:
# Close the environment.
# NOTE(review): the next cell calls env.reset() on this same `env` after it
# has been closed here — confirm that reuse is intended.
env.close()

In [36]:
# Roll out the trained policy for a few episodes and print each episode's
# score, stopping an episode early once its score exceeds 200.
#
# A dedicated, un-vectorised environment is used because this loop relies on
# the Gymnasium API (`reset()` -> (obs, info), `step()` -> 5-tuple). The
# notebook-level `env` is rebound to a DummyVecEnv in an earlier cell, and a
# vectorised env does not return those tuples.
episodes = 5
rollout_env = gym.make(environment_name)
try:
    for episode in range(1, episodes + 1):
        observation, info = rollout_env.reset()
        # Reset both end-of-episode flags for every episode.
        terminated = False
        truncated = False
        score = 0

        # Parenthesised on purpose: `not terminated or truncated` parses as
        # `(not terminated) or truncated` and would keep stepping after the
        # env has reported truncation.
        while not (terminated or truncated):
            action, _ = model.predict(observation)
            observation, reward, terminated, truncated, info = rollout_env.step(action)
            score += reward

            if score > 200:
                break
        print('Episode:{} score{}'.format(episode, score))
finally:
    # Always release the rollout env, even if an episode raises. The shared
    # `env` is deliberately left open because later cells still train on it.
    rollout_env.close()


Episode:1 score201.0
Episode:2 score201.0
Episode:3 score201.0
Episode:4 score201.0
Episode:5 score201.0


In [27]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
# Directory passed to EvalCallback as `best_model_save_path`.
save_path = os.path.join('Training', 'Saved')

In [28]:
# Triggered by EvalCallback whenever a new best mean reward is found; stops
# training once that reward reaches the threshold.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)

# Evaluate on a separate environment instead of the training `env`: sharing
# one instance lets each evaluation reset and step the env that the agent is
# collecting rollouts from.
eval_env = gym.make(environment_name)
eval_callback = EvalCallback(
    eval_env,
    callback_on_new_best=stop_callback,
    eval_freq=5000,                  # evaluate every 5000 callback calls
    best_model_save_path=save_path,  # best model so far is written here
    verbose=1,
)

In [29]:
# Fresh PPO agent (MLP policy) for the callback-driven training run below.
model = PPO('MlpPolicy', env, verbose=1)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [30]:
# Train with the evaluation callback attached; the call's return value is
# shown as the cell output.
model.learn(total_timesteps=20_000, callback=eval_callback)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 22.7     |
|    ep_rew_mean     | 22.7     |
| time/              |          |
|    fps             | 1518     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 26.3        |
|    ep_rew_mean          | 26.3        |
| time/                   |             |
|    fps                  | 1031        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008202034 |
|    clip_fraction        | 0.0644      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.687      |
|    explained_variance   | -0.00409    |
|    learning_rate        | 0.



Eval num_timesteps=5000, episode_reward=254.80 +/- 137.62
Episode length: 254.80 +/- 137.62
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 255          |
|    mean_reward          | 255          |
| time/                   |              |
|    total_timesteps      | 5000         |
| train/                  |              |
|    approx_kl            | 0.0095167775 |
|    clip_fraction        | 0.0737       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.671       |
|    explained_variance   | 0.0703       |
|    learning_rate        | 0.0003       |
|    loss                 | 15.2         |
|    n_updates            | 20           |
|    policy_gradient_loss | -0.0207      |
|    value_loss           | 35.7         |
------------------------------------------
New best mean reward!
Stopping training because the mean reward 254.80  is above the threshold 200


<stable_baselines3.ppo.ppo.PPO at 0x2c91aad0348>