In [9]:
#ML Imports
import gymnasium as gym
from stable_baselines3 import A2C, PPO, DQN
from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_util import make_vec_env
import pygame
import time
import numpy as np

# Goals for this Stage of Research
* Create and render three different Gymnasium environments
* Train three different StableBaseline3 RL models on each environment
* Measure performance of Models

### 1. Taxi-V3
Here we're running the Taxi-v3 environment in Gymnasium with StableBaseline3's VecEnv (Vectorized Environment) which allows for simultaneous training on multiple instances of a Gymnasium environment.

In [4]:
#Add in render_mode = "human" to have PyGame display the environment as it progresses
vec_env = make_vec_env("Taxi-v3",n_envs=4)
model1 = PPO(MlpPolicy, vec_env, verbose=0)
mean_reward,std_reward = evaluate_policy(model1, vec_env, n_eval_episodes=100)

print(f"mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")


mean reward: -1097.48 +/- 897.50


Here we're training a StableBaselines3 PPO Model to solve the Taxi environment. We can see significant improvement in the model.

In [3]:
model1.learn(total_timesteps=10000)
mean_reward, std_reward = evaluate_policy(model1, vec_env, n_eval_episodes=100)
print(f"Mean Reward: {mean_reward:.2f} +/- {std_reward:.2f}")

obs = vec_env.reset()
for i in range(1000):
    action, _states = model1.predict(obs)
    obs, reward, dones, info = vec_env.step(action)
    vec_env.render("human")
vec_env.close()
model.save("PPO_Taxi")

Mean Reward: -200.00 +/- 0.00


In [5]:
model2 = A2C("MlpPolicy", vec_env, verbose=0)
mean_reward,std_reward = evaluate_policy(model1, vec_env, n_eval_episodes=100)

print(f"mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

mean reward: -1205.30 +/- 891.11


In [8]:
model2.learn(total_timesteps=10000)
mean_reward,std_reward = evaluate_policy(model2, vec_env, n_eval_episodes=100)
print(f"mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

model2.save("A2C_Taxi")

mean reward: -200.00 +/- 0.00


In [11]:
model3 = DQN("MlpPolicy", vec_env, verbose=0)
mean_reward,std_reward = evaluate_policy(model3, vec_env, n_eval_episodes=100)

print(f"mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

mean reward: -1963.91 +/- 251.99


In [13]:
model3.learn(total_timesteps=10000)
mean_reward,std_reward = evaluate_policy(model3, vec_env, n_eval_episodes=100)

print(f"mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

model3.save("DQN_Taxi")

mean reward: -200.00 +/- 0.00


We trained all three types of models and all got a mean reward of -200, which appears to be the maximum. All three seem to be viable algorithms for this game. Let's move on to another environment.

### Cartpole-v1

Let's now test three models (A2C, PPO, and DQN) in Cartpole-v1 to see how they perform.

In [15]:
vec_env = make_vec_env("CartPole-v1", n_envs=4)
model1 = DQN("MlpPolicy", vec_env, verbose=0)

mean_reward,std_reward = evaluate_policy(model1, vec_env, n_eval_episodes=100)
print(f"mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

model1.learn(total_timesteps=10000)

mean_reward,std_reward = evaluate_policy(model1, vec_env, n_eval_episodes=100)
print(f"mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

model1.save("DQN_CartPole")

mean reward: 9.33 +/- 0.79
mean reward: 9.38 +/- 0.72


In [16]:
model2 = A2C("MlpPolicy", vec_env, verbose=0)
mean_reward,std_reward = evaluate_policy(model2, vec_env, n_eval_episodes=100)
print(f"mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

model2.learn(total_timesteps=10000)

mean_reward,std_reward = evaluate_policy(model2, vec_env, n_eval_episodes=100)
print(f"mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

model2.save("A2C_CartPole")

mean reward: 81.94 +/- 35.55
mean reward: 249.13 +/- 58.29


In [17]:
model3 = PPO("MlpPolicy", vec_env, verbose=0)

mean_reward,std_reward = evaluate_policy(model3, vec_env, n_eval_episodes=100)
print(f"mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

model3.learn(total_timesteps=10000)

mean_reward,std_reward = evaluate_policy(model1, vec_env, n_eval_episodes=100)
print(f"mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

model3.save("PPO_CartPole")

mean reward: 8.87 +/- 0.64
mean reward: 9.43 +/- 0.70


Interestingly, the A2C model out-performed PPO and DQN in the CartPole environment. Finally, we have a third environment...

### FrozenLake-v1
This is another simple environment, which includes an element of randomness in how the model interacts with its' environment. In this environment, the agent attempts to reach a finish goal by moving across a slippery ice-field, trying to avoid holes in the ice. Due to the 'slipperiness' of the ice, the agent has a small chance of randomly moving in a direction other than the one chosen by the policy.

In [20]:
vec_env = make_vec_env('FrozenLake-v1', n_envs=4)
vec_env.reset()

In [21]:
model1 = PPO('MlpPolicy',vec_env,verbose=0)

mean_reward,std_reward = evaluate_policy(model1, vec_env, n_eval_episodes=100)
print(f"mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

model1.learn(total_timesteps=10000)

mean_reward,std_reward = evaluate_policy(model1, vec_env, n_eval_episodes=100)
print(f"mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

model1.save("PPO_FrozenLake")

mean reward: 0.00 +/- 0.00
mean reward: 0.04 +/- 0.20


In [22]:
model2 = A2C('MlpPolicy',vec_env,verbose=0)

mean_reward,std_reward = evaluate_policy(model2, vec_env, n_eval_episodes=100)
print(f"mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

model2.learn(total_timesteps=10000)

mean_reward,std_reward = evaluate_policy(model2, vec_env, n_eval_episodes=100)
print(f"mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

model2.save("A2C_FrozenLake")

mean reward: 0.00 +/- 0.00
mean reward: 0.14 +/- 0.35


In [23]:
model3 = DQN('MlpPolicy',vec_env,verbose=0)

mean_reward,std_reward = evaluate_policy(model3, vec_env, n_eval_episodes=100)
print(f"mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

model3.learn(total_timesteps=10000)

mean_reward,std_reward = evaluate_policy(model3, vec_env, n_eval_episodes=100)
print(f"mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

model3.save("DQN_FrozenLake")

mean reward: 0.05 +/- 0.22
mean reward: 0.07 +/- 0.26


Again, it seems the A2C model out-performed the other two models. This could be due to the A2C model having better training performance on a Vectorized Environment.

In [None]:
env = gym.make('FrozenLake-v1')

In [25]:
model1 = PPO('MlpPolicy',env,verbose=0)

mean_reward,std_reward = evaluate_policy(model1, env, n_eval_episodes=100)
print(f"mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

model1.learn(total_timesteps=10000)

mean_reward,std_reward = evaluate_policy(model1, env, n_eval_episodes=100)
print(f"mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

model1.save("PPO_FrozenLake")

mean reward: 0.00 +/- 0.00
mean reward: 0.18 +/- 0.38


In [24]:
model3 = DQN('MlpPolicy',env,verbose=0)

mean_reward,std_reward = evaluate_policy(model3, env, n_eval_episodes=100)
print(f"mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

model3.learn(total_timesteps=10000)

mean_reward,std_reward = evaluate_policy(model3, env, n_eval_episodes=100)
print(f"mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

model3.save("DQN_FrozenLake")



mean reward: 0.02 +/- 0.14
mean reward: 0.27 +/- 0.44


Yup. it appears that the DQN and PPO models are not designed to be trained on Vectorized environments, at least not for some gymnasium envs. So, vectorizing environments is only helpful in some circumstances.