# 1. Import Dependencies

In [1]:
import os

import gymnasium as gym  # Build envs and work with pre-existing ones.
import numpy as np
import pygame
from stable_baselines3 import PPO  # SB3 -> high-level RL algorithms (e.g. PPO).
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.evaluation import evaluate_policy  # Score a policy over several episodes.
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv  # Vectorized wrapper around one or more envs.
from torch.utils.tensorboard import SummaryWriter


# 2. Load Environment

In [2]:
# Build the CartPole environment with an on-screen window, wrapped in Monitor.
environment_name = "CartPole-v1"
env = Monitor(gym.make(environment_name, render_mode='human'))

In [3]:
# Play a few episodes with uniformly random actions as a baseline.
episodes = 5

try:
    for episode in range(1, episodes + 1):
        env.reset()  # Start a fresh episode; the initial observation is not needed for random play
        done = False
        score = 0

        while not done:
            # No explicit env.render() here: the env was created with
            # render_mode='human', so a frame is already drawn inside step().
            action = env.action_space.sample()  # Generate a random action

            # Only the reward and the two end-of-episode flags are used.
            _, reward, terminated, truncated, _ = env.step(action)

            done = terminated or truncated
            score += reward

        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # Release the window even if the loop is interrupted part-way.
    env.close()
    pygame.quit()

Episode:1 Score:37.0
Episode:2 Score:26.0
Episode:3 Score:31.0
Episode:4 Score:25.0
Episode:5 Score:23.0


# 3. Understanding the Environment

In [4]:
# Show the action space of the environment (two discrete actions for CartPole).
env.action_space

Discrete(2)

In [5]:
# Draw one random action from the action space.
env.action_space.sample()


np.int64(1)

In [6]:
# Show the observation space: a 4-element float32 Box with per-component bounds.
env.observation_space

Box([-4.8               -inf -0.41887903        -inf], [4.8               inf 0.41887903        inf], (4,), float32)

In [7]:
# Draw one random observation from the observation space (not an actual env state).
env.observation_space.sample()

array([0.26281783, 0.9452759 , 0.13019696, 0.521015  ], dtype=float32)

# 4. Train an RL Model

In [8]:
# Directory for training logs, built with os.path.join so the separator matches the OS.
log_path = os.path.join('Training', 'Logs')
log_path

'Training/Logs'

In [9]:
def make_cartpole_env():
    """Create one fresh, non-rendering CartPole env wrapped in Monitor.

    Monitor tracks per-episode reward/length so episode statistics are
    available to SB3 during training and evaluation.
    """
    return Monitor(gym.make(environment_name))


# Wrap the single env in a vectorized env, which is the interface SB3 expects.
# A named factory is used instead of `lambda: env` so the factory never
# depends on a variable that is being rebound on the same line.
env = DummyVecEnv([make_cartpole_env])

# TensorBoard log directory (also read by TensorboardCallback below).
log_dir = os.path.join('Training', 'Logs')
os.makedirs(log_dir, exist_ok=True)

# PPO with a simple MLP policy network; verbose=1 prints training progress.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_dir)

class TensorboardCallback(BaseCallback):
    """Log per-step rewards to TensorBoard during training.

    Two values are written on every step:

    * ``reward`` through the model's own logger: the mean reward across the
      vectorized envs for that step.
    * ``Reward`` through a separate ``SummaryWriter`` rooted at the
      module-level ``log_dir``: the reward of the first env, indexed by
      ``self.num_timesteps``.

    Args:
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, verbose=0):
        super().__init__(verbose)
        # Independent writer; relies on the module-level `log_dir` being defined.
        self.writer = SummaryWriter(log_dir)

    def _on_step(self) -> bool:
        """Record the current step's reward; always returns True to keep training."""
        # The training loop exposes its local variables via self.locals; the
        # key is 'rewards' (plural). Checking for 'reward' never matched, so
        # the custom scalar was silently never written.
        rewards = self.locals.get('rewards')
        if rewards is None:
            return True

        # Record a real scalar rather than the raw per-env array.
        # Assumes `rewards` is a non-empty per-env sequence — TODO confirm for custom envs.
        mean_reward = float(sum(rewards) / len(rewards))
        self.logger.record('reward', mean_reward)

        # Custom scalar: reward of the first env at the current timestep.
        self.writer.add_scalar('Reward', float(rewards[0]), self.num_timesteps)
        return True

    def _on_training_end(self) -> None:
        """Close the extra writer once training finishes."""
        self.writer.close()

Using cpu device


In [None]:
# Train for 20,000 timesteps with the reward-logging callback (passed as an instance).
# NOTE(review): the TypeError in the recorded output looks like it came from an
# earlier run that passed the class itself instead of an instance — re-run to refresh.
model.learn(total_timesteps=20000, callback=TensorboardCallback())

Logging to Training/Logs/PPO_5


TypeError: TensorboardCallback.__init__() takes from 1 to 2 positional arguments but 3 were given

# 4. Save and Reload Model

In [None]:
# Location of the saved PPO model, assembled from its directory and file name.
saved_models_dir = os.path.join('Training', 'Saved Models')
PPO_Path = os.path.join(saved_models_dir, 'PPO_Model_Cartpole')

In [None]:
# Persist the trained model to PPO_Path.
model.save(PPO_Path)

In [None]:
# Drop the in-memory model so the following cells demonstrate reloading from disk.
del model

In [None]:
# Display the path the model was saved to.
PPO_Path

'Training/Saved Models/PPO_Model_Cartpole'

In [None]:
# Reload the saved model and attach it to the current vectorized env.
model = PPO.load(PPO_Path, env=env)

In [None]:
# Continue training the reloaded model (no custom callback this time).
# The recorded output shows 2048 total timesteps for a requested 1000, so the
# run can overshoot the requested budget.
model.learn(total_timesteps=1000)

Logging to Training/Logs/PPO_4
-----------------------------
| time/              |      |
|    fps             | 8944 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------


<stable_baselines3.ppo.ppo.PPO at 0x2d853d460>

# 5. Evaluation

In [None]:
# evaluate_policy is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.
# Returns the mean and standard deviation of the episode reward over 10 episodes.
# NOTE(review): `env` was built without a render_mode, so render=True may not
# open a window — confirm against the installed SB3 version.
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10, render=True)
print(f"Mean reward: {mean_reward}, Std reward: {std_reward}")



Mean reward: 500.0, Std reward: 0.0


In [None]:
# Release the environment's resources.
# NOTE(review): later cells still call env.reset()/env.step() on this same `env`;
# confirm that closing it here is intended.
env.close()

# 6. Test Model

In [None]:
# Watch the trained agent play a few episodes.
episodes = 5

# Use a dedicated env with an on-screen window: the training `env` was created
# without a render mode (so it cannot display anything) and has already been
# closed after evaluation. With render_mode='human' a frame is drawn on every
# step, so no explicit render() call is needed.
render_env = DummyVecEnv([lambda: gym.make(environment_name, render_mode='human')])

try:
    for episode in range(1, episodes + 1):
        obs = render_env.reset()  # Batched observation for the single env
        done = False
        score = 0.0

        while not done:
            # Predict the action based on the current observation
            action, _states = model.predict(obs, deterministic=True)

            # The vectorized env returns one entry per env for each value
            obs, rewards, dones, infos = render_env.step(action)

            # Only one env is wrapped, so take index 0 to keep plain scalars
            score += float(rewards[0])
            done = bool(dones[0])

        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # Release the window even if the loop is interrupted part-way.
    render_env.close()
    pygame.quit()

Episode:1 Score:[500.]
Episode:2 Score:[500.]
Episode:3 Score:[500.]
Episode:4 Score:[500.]
Episode:5 Score:[500.]


In [None]:
# Reset the vectorized env; it returns just the batched observation.
obs = env.reset()

In [None]:
# Inspect the observation: one row per wrapped env, four values per row.
obs

array([[0.0257719 , 0.00298568, 0.02925526, 0.01775082]], dtype=float32)

In [None]:
# Ask the model for an action for this observation; the second return value is discarded.
# Unlike the test loop, deterministic=True is not passed here.
action, _ = model.predict(obs)

In [None]:
# For comparison: a random action drawn from the action space.
env.action_space.sample()

np.int64(0)

In [None]:
# Apply the predicted action; the result is (observations, rewards, dones, infos),
# each holding one entry per wrapped env.
env.step(action)

(array([[ 0.02583161, -0.19254334,  0.02961028,  0.3195187 ]],
       dtype=float32),
 array([1.], dtype=float32),
 array([False]),
 [{'TimeLimit.truncated': False}])

# 7. Viewing Logs in Tensorboard

In [None]:
# Path of one specific training run inside the log directory.
# NOTE(review): the recorded outputs mention runs PPO_4 and PPO_5 — confirm
# that PPO_2 is the run you want to inspect.
training_log_path = os.path.join(log_path, 'PPO_2')

In [None]:
# Display the run directory to point TensorBoard at.
training_log_path

'Training/Logs/PPO_2'