In [1]:
import gymnasium as gym
import torch
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv

In [2]:
env_name = "LunarLander-v3"


def _make_monitored_env():
    """Create one headless LunarLander environment wrapped in ``Monitor``.

    Passed to ``DummyVecEnv`` as a zero-argument factory. Building the
    environment inside the factory avoids closing over the module-level
    ``env`` name, which is rebound to the vectorized wrapper below.
    """
    return Monitor(gym.make(env_name, render_mode=None))


# Training environment: a single-env vectorized wrapper around the monitored env.
env = DummyVecEnv([_make_monitored_env])

env.reset()

# Note: the second print shows a random draw from the observation *space*,
# not an observation actually returned by the environment.
print("Observation Space Shape : ", env.observation_space.shape)
print("Sample Observation : ", env.observation_space.sample())

Observation Space Shape :  (8,)
Sample Observation :  [-2.1709743   0.06801224 -2.3974411  -3.3341873   2.7114687   5.796174
  0.03589563  0.9623087 ]


  from pkg_resources import resource_stream, resource_exists


In [3]:
# Inspect the action space of the training env.
# `.n` is the number of discrete actions (the label says "Shape", but the
# value printed is that count).
action_space = env.action_space
print("Action Space Shape : ", action_space.n)
print("Action Space Sample : ", action_space.sample())

Action Space Shape :  4
Action Space Sample :  3


In [4]:
# PPO hyperparameters, grouped so they can be read and tuned in one place.
ppo_hyperparams = {
    "n_steps": 1024,      # rollout length collected per update
    "batch_size": 64,     # minibatch size for each gradient step
    "n_epochs": 4,        # optimisation passes over each rollout
    "gamma": 0.999,       # discount factor
    "gae_lambda": 0.98,   # GAE bias/variance trade-off
    "ent_coef": 0.01,     # entropy bonus coefficient
}

# MLP actor-critic policy trained on the vectorized env, on CPU,
# with per-iteration training logs enabled (verbose=1).
model = PPO("MlpPolicy", env, verbose=1, device="cpu", **ppo_hyperparams)

Using cpu device


In [5]:
# Train the agent, then persist it to disk under `model_name`
# (the evaluation cell loads it back as "ppo-LunarLander-v3.zip").
model_name = "ppo-LunarLander-v3"
total_timesteps = 500_000

model.learn(total_timesteps=total_timesteps)
model.save(model_name)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 98.1     |
|    ep_rew_mean     | -134     |
| time/              |          |
|    fps             | 872      |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 1024     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 88.7         |
|    ep_rew_mean          | -129         |
| time/                   |              |
|    fps                  | 429          |
|    iterations           | 2            |
|    time_elapsed         | 4            |
|    total_timesteps      | 2048         |
| train/                  |              |
|    approx_kl            | 0.0019183599 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.39        |
|    explained_variance   | 0.00256      |
|    learning_r

In [6]:
from stable_baselines3.common.evaluation import evaluate_policy

In [7]:
# Evaluation: reload the saved policy from disk and score it on a fresh,
# Monitor-wrapped environment (separate from the training env).

model = PPO.load("ppo-LunarLander-v3.zip")

eval_env = Monitor(gym.make("LunarLander-v3"))

try:
    print("Evaluating model quality")
    # Deterministic actions over 10 episodes; returns the mean and standard
    # deviation of the per-episode reward.
    mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10, deterministic=True)
finally:
    # Always release the evaluation environment, even if evaluation fails.
    eval_env.close()

print(f"Mean Reward: {mean_reward:.2f} +/- {std_reward:.2f}")



Evaluating model quality
Mean Reward: 32.52 +/- 116.69


In [8]:
# Baseline demo: render 300 steps of uniformly random actions in a window.
# Note: this rebinds `env`, replacing the vectorized training env created earlier.
env = gym.make("LunarLander-v3", render_mode = "human")

print("Random Agent : Before Training")

try:
    obs, _ = env.reset()
    for _step in range(300):
        action = env.action_space.sample()
        obs, reward, terminated, truncated, info = env.step(action)

        # Start a new episode as soon as the current one ends.
        if terminated or truncated:
            obs, _ = env.reset()
finally:
    # Close the render window even if the loop raises or is interrupted.
    env.close()

Random Agent : Before Training


In [10]:
import pygame

# Demo of the trained policy: render episodes until the user presses Escape,
# closes the window, or interrupts the kernel.
# Requires `model` from the training/evaluation cells above.
env = gym.make("LunarLander-v3", render_mode = "human")

# NOTE(review): the label says "Random Agent", but the actions below come from
# the trained model. Text kept as-is to match the recorded output.
print("Random Agent : After Training")

try:
    obs, _ = env.reset()

    keep_running = True
    while keep_running:

        # Drain pending window events and look for a stop request.
        for event in pygame.event.get():
            window_closed = event.type == pygame.QUIT
            escape_pressed = (
                event.type == pygame.KEYDOWN and event.key == pygame.K_ESCAPE
            )
            if window_closed or escape_pressed:
                keep_running = False

        if not keep_running:
            break

        action, _ = model.predict(obs, deterministic=True)

        obs, reward, terminated, truncated, info = env.step(action)

        # The episode ended (either terminated or truncated); start a new one.
        if terminated or truncated:
            print("Landed! Resetting")
            obs, _ = env.reset()

except KeyboardInterrupt:
    # A kernel interrupt is treated as a normal stop request.
    pass
finally:
    # Always release the render window, whatever ended the loop.
    print("\nClosing Environment")
    env.close()

Random Agent : After Training

Closing Environment
