In [1]:
# Import the necessary modules
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
import os
import gym
from stable_baselines3 import PPO

In [2]:
env_name = 'CartPole-v1'
# Generate environment (currently only one supported by PyTorch)
env = gym.make(env_name)

In [3]:
# episodes = 5  # number of episodes to test

# for episode in range(1, episodes+1):
#     state = env.reset()  # reset the environment get the initial state
#     done = False  # set the episode end flag to false
#     score = 0

#     while not done:
#         env.render()  # render the environment
#         # sample an action from the action space (e.g. a random action)
#         action = env.action_space.sample()
#         # take the action and get the next state, reward and episode end flag from the environment
#         state, reward, done, info = env.step(action)

#         score += reward  # update the score (sum of rewards)
#     print(f'Episode: {episode} Score: {score}')

# env.close()

In [4]:
# path to the log folder (where the model will be saved)
log_path = os.path.join('Training', 'Logs')
# create a vectorized environment to run the agent in parallel
env = DummyVecEnv([lambda: env])

In [5]:
model = PPO("MlpPolicy", env, verbose=1,
            tensorboard_log=log_path)  # create the agent

Using cpu device


In [6]:
# model.learn(total_timesteps=30000)  # train the agent (using the environment) for 30000 timesteps

In [7]:
PPO_path = os.path.join('Training','Saved Models', 'PPO_model_Cartpole')

In [8]:
model.learn(total_timesteps=77000)  # train the agent (using the environment) for 77000 timesteps

Logging to Training\Logs\PPO_5
-----------------------------
| time/              |      |
|    fps             | 379  |
|    iterations      | 1    |
|    time_elapsed    | 5    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 512         |
|    iterations           | 2           |
|    time_elapsed         | 7           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008411117 |
|    clip_fraction        | 0.114       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.00443    |
|    learning_rate        | 0.0003      |
|    loss                 | 8.59        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0191     |
|    value_loss           | 56          |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x219ec736820>

In [9]:
model.save(PPO_path)  # save the agent (weights, etc.) to a file (to be loaded later) 

In [10]:
evaluate_policy(model, env, n_eval_episodes=10,render=True)  # evaluate the agent in the environment using 10 episodes (randomly sampled) and render the results



(500.0, 0.0)

In [11]:
env.close() # close the environment (and free memory) 

In [12]:
episodes = 10  # number of episodes to test

for episode in range(1, episodes+1):
    obs = env.reset()  # reset the environment get the initial state
    done = False  # set the episode end flag to false
    score = 0

    while not done:
        env.render()  # render the environment

        # Model.predict() returns the action to take from the state (in this case the state is the observation)
        action, _ = model.predict(obs)

        obs, reward, done, info = env.step(action)

        score += reward  # update the score (sum of rewards)
    print(f'Episode: {episode} Score: {score}')


Episode: 1 Score: [500.]
Episode: 2 Score: [500.]
Episode: 3 Score: [500.]
Episode: 4 Score: [500.]
Episode: 5 Score: [500.]
Episode: 6 Score: [500.]
Episode: 7 Score: [500.]
Episode: 8 Score: [500.]
Episode: 9 Score: [500.]
Episode: 10 Score: [500.]


# To Open and run tensorboard run the following CMD on terminal
!tensorboard --logdir=Training/Logs/PPO_1/ --port=6006 

In [13]:
from stable_baselines3.common.callbacks import EvalCallback,StopTrainingOnRewardThreshold


In [14]:
save_path = os.path.join('Training','Saved Models') # path to the folder where the model will be saved

In [15]:
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=400, verbose=1)
eval_callback = EvalCallback(env,callback_on_new_best=stop_callback,eval_freq=10000,best_model_save_path= save_path,verbose=1) # create the callback (to be called at every 10000 timesteps)

In [16]:
model = PPO("MlpPolicy", env, verbose=1,tensorboard_log=log_path)  # create the agent
model.learn(total_timesteps=60000,callback=eval_callback)  # train the agent (using the environment) for 30000 timesteps

Using cpu device
Logging to Training\Logs\PPO_6
-----------------------------
| time/              |      |
|    fps             | 1301 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 999         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009390707 |
|    clip_fraction        | 0.098       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.000103   |
|    learning_rate        | 0.0003      |
|    loss                 | 7.12        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0159     |
|    value_loss           | 55.9        |
----------------------------

<stable_baselines3.ppo.ppo.PPO at 0x21981149e20>

# Changing Policies

In [17]:
net_arch = [dict(pi=[128,128,128,128],vf=[128,128,128,128])]

In [18]:
model = PPO("MlpPolicy", env, verbose=1,tensorboard_log=log_path, policy_kwargs={'net_arch':net_arch})  # create the agent

Using cpu device


In [19]:
model.learn(total_timesteps=30000,callback=eval_callback)  # train the agent (using the environment) for 30000 timesteps

Logging to Training\Logs\PPO_7
-----------------------------
| time/              |      |
|    fps             | 1102 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 684         |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.013566804 |
|    clip_fraction        | 0.215       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.682      |
|    explained_variance   | 0.00369     |
|    learning_rate        | 0.0003      |
|    loss                 | 3.3         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0254     |
|    value_loss           | 21.3        |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x219815591f0>

In [20]:
from stable_baselines3 import DQN

In [21]:
dqn = DQN("MlpPolicy", env, verbose=1,tensorboard_log=log_path)  # create the agent

Using cpu device


In [22]:
model.learn(total_timesteps=30000)  # train the agent (using the environment) for 30000 timesteps

Logging to Training\Logs\PPO_8
-----------------------------
| time/              |      |
|    fps             | 991  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 520          |
|    iterations           | 2            |
|    time_elapsed         | 7            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0049489737 |
|    clip_fraction        | 0.0595       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.468       |
|    explained_variance   | 0.0117       |
|    learning_rate        | 0.0003       |
|    loss                 | 4.46         |
|    n_updates            | 160          |
|    policy_gradient_loss | -0.00451     |
|    value_loss           | 86.3         |
----------------------------

<stable_baselines3.ppo.ppo.PPO at 0x219815591f0>