## Setup

### Imports

In [None]:
# Import Gymnasium
import gymnasium as gym

# Import Wrappers
from gymnasium.wrappers import GrayScaleObservation # Wrapper to convert RGB image to grayscale
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv # VecFrameStack is a wrapper that stacks the last n frames

# Import Algorithms for Training
from stable_baselines3 import PPO # Proximal Policy Optimization (PPO) algorithm

# Additional Imports
import os # File management
from stable_baselines3.common.callbacks import BaseCallback # Callbacks for saving models
from matplotlib import pyplot as plt




### Functions and Definitions

In [None]:
# Output locations (relative paths)
LOG_DIR = "logs"                # handed to PPO as its tensorboard_log directory
CHECKPOINT_DIR = "checkpoints"  # where the callback saves models and load_model reads them

In [None]:
# Custom class to make saving models easier
class TrainAndLoggingCallback(BaseCallback):
  """Callback that saves the model every `save_freq` callback calls.

  Each checkpoint is written to `<save_path>/<save_prefix>_<n_calls>`.

  Args:
    save_freq: Positive number of callback calls between two saves.
    save_path: Directory for the checkpoints. It is created when the callback
      is initialised. If None, no checkpoint is written.
    save_prefix: File-name prefix of each checkpoint; 'model' is used when None.
    verbose: Verbosity level forwarded to BaseCallback.

  Raises:
    ValueError: If `save_freq` is None, zero or negative.
  """

  def __init__(self, save_freq, save_path, save_prefix, verbose=1):
    super().__init__(verbose)
    # Fail fast here instead of hitting a ZeroDivisionError/TypeError in the
    # modulo inside _on_step once training is already running.
    if save_freq is None or save_freq <= 0:
      raise ValueError('save_freq must be a positive number of steps, got {!r}'.format(save_freq))
    self.save_freq = save_freq
    self.save_path = save_path
    self.save_prefix = save_prefix if save_prefix is not None else 'model'

  def _init_callback(self):
    """Create the checkpoint directory if a save path was given."""
    if self.save_path is not None:
      os.makedirs(self.save_path, exist_ok=True)

  def _on_step(self):
    """Save a checkpoint when the call counter reaches a multiple of `save_freq`.

    Returns:
      Always True, so this callback never requests that training stop.
    """
    # Saving is skipped when no directory was configured; this mirrors the
    # `save_path is not None` check in _init_callback and avoids joining None.
    if self.save_path is not None and self.n_calls % self.save_freq == 0:
      model_path = os.path.join(self.save_path, '{}_{}'.format(self.save_prefix, self.n_calls))
      self.model.save(model_path)

    return True

In [None]:
def initialize_env(game, frame_memory, render_mode = None):
  """Create, wrap and reset an environment for `game`.

  Wrapping order: grayscale observation -> single-env DummyVecEnv ->
  VecFrameStack with `frame_memory` stacked frames.

  Args:
    game: Environment id passed to `gym.make` (e.g. "ALE/Tetris-v5").
    frame_memory: Number of frames handed to VecFrameStack.
    render_mode: Optional render mode. It is only forwarded to `gym.make`
      when it is not None, so the environment's own default applies otherwise.

  Returns:
    The wrapped, already-reset vectorized environment.
  """
  # Forward render_mode only when one was requested (identity check, not `!= None`)
  make_kwargs = {} if render_mode is None else {'render_mode': render_mode}
  base_env = gym.make(game, **make_kwargs)

  # Grayscale wrapper to reduce the number of features
  base_env = GrayScaleObservation(base_env, keep_dim=True)

  # The factory closes over `base_env`, a name that is never rebound afterwards.
  # The previous `env = DummyVecEnv([lambda: env])` only worked because the
  # factory is invoked before `env` gets reassigned.
  vec_env = DummyVecEnv([lambda: base_env])

  # Frame-stack wrapper to remember the last `frame_memory` frames
  vec_env = VecFrameStack(vec_env, frame_memory, channels_order='last')

  # Initializes the environment
  vec_env.reset()
  return vec_env

def initialize_model(env, policy, learning_rate, n_steps):
  """Build a new PPO model for `env`.

  Args:
    env: Environment the model is created for.
    policy: Policy identifier passed to PPO (e.g. "CnnPolicy").
    learning_rate: Forwarded to PPO as `learning_rate`.
    n_steps: Forwarded to PPO as `n_steps`.

  Returns:
    The PPO instance, with verbose=1 and TensorBoard logging into LOG_DIR.
  """
  ppo_settings = dict(
    verbose=1,
    learning_rate=learning_rate,
    tensorboard_log=LOG_DIR,
    n_steps=n_steps,
  )
  return PPO(policy, env, **ppo_settings)

In [None]:
def train_model(model_game,
                model_policy,
                model_learning_rate,
                model_n_steps,
                model_frame_memory=1,
                training_length=1,
                save_freq=1,
                model_name=None,
                pre_existing_model=None):
  """Train a PPO model, saving checkpoints into CHECKPOINT_DIR along the way.

  Args:
    model_game: Environment id used to build the training environment.
    model_policy: Policy identifier for a new model (unused when
      `pre_existing_model` is given).
    model_learning_rate: Learning rate for a new model (unused when
      `pre_existing_model` is given).
    model_n_steps: `n_steps` for a new model (unused when
      `pre_existing_model` is given).
    model_frame_memory: Number of stacked frames for the environment.
    training_length: Passed to `model.learn` as `total_timesteps`.
    save_freq: Number of callback calls between checkpoints.
    model_name: Checkpoint file-name prefix (the callback uses 'model' if None).
    pre_existing_model: Optional already-built model to keep training instead
      of creating a new one.

  Returns:
    The trained model, so the caller can keep using or saving it.
  """
  if pre_existing_model is None:
    # Fresh run: build the environment and a new model bound to it
    env = initialize_env(model_game, model_frame_memory)
    model = initialize_model(env, model_policy, model_learning_rate, model_n_steps)
  else:
    model = pre_existing_model
    # A model loaded without an environment (as load_model does) cannot learn
    # until one is attached. A model that already has one keeps it, and no
    # throw-away environment is created.
    if model.get_env() is None:
      model.set_env(initialize_env(model_game, model_frame_memory))

  # Initialize "callback" to save the model
  callback = TrainAndLoggingCallback(save_freq=save_freq, save_prefix=model_name, save_path=CHECKPOINT_DIR)

  # Train the model for the specified number of timesteps
  model.learn(total_timesteps=training_length, callback=callback)
  return model


def load_model(game, frame_memory, full_model_name, render_mode = 'human'):
  """Load a saved PPO checkpoint and build an environment to run it in.

  Args:
    game: Environment id passed on to initialize_env.
    frame_memory: Number of stacked frames passed on to initialize_env.
    full_model_name: Checkpoint file name inside CHECKPOINT_DIR.
    render_mode: Render mode for the environment; defaults to 'human'.

  Returns:
    A `(model, env)` tuple.
  """
  checkpoint_path = os.path.join(CHECKPOINT_DIR, full_model_name)
  playback_env = initialize_env(game, frame_memory, render_mode=render_mode)
  # The environment is built first, then the checkpoint is loaded
  return PPO.load(checkpoint_path), playback_env

def watch_model(model, env, num_actions):
  """Let `model` act in `env` for `num_actions` steps, rendering after each one.

  Args:
    model: Object whose `predict(observation)` returns an `(action, extra)` pair.
    env: Environment providing `reset()`, `step(action)` (a 4-tuple) and `render()`.
    num_actions: Number of steps to play.
  """
  observation = env.reset()
  for _step_index in range(num_actions):
    prediction = model.predict(observation)
    chosen_action, _extra = prediction
    # Only the new observation is carried forward; reward/done/info are ignored
    observation, _reward, _done, _info = env.step(chosen_action)
    env.render()

## Load, Watch, and Train Models

In [None]:
# Checkpoint file name inside CHECKPOINT_DIR, in the "<model_name>_<n_calls>"
# form written by TrainAndLoggingCallback.
# NOTE: this name matches what the "TetrisPPO_mk1" training cell further down
# saves at step 1000000, so that checkpoint must already exist on disk.
full_model_name = "TetrisPPO_mk1_1000000"

# Ideally these two are saved along with the model, and can be read in
# (presumably they must match the values used when the model was trained -- TODO confirm)
game = "ALE/Tetris-v5"
frame_memory = 1

# Also ideally all the rest of the model parameters are saved with the model

# load_model defaults to render_mode='human' for the environment it creates
model, env = load_model(game, frame_memory, full_model_name)

In [None]:
# Let the loaded model act in the environment for 2048 steps, rendering after each step
watch_model(model, env, 2048)

KeyboardInterrupt: 

In [None]:
# Train a new Space Invaders model
game = "ALE/SpaceInvaders-v5"
model_name = "SpaceInvadersPPO_mk2"  # prefix of the checkpoint file names
policy = "CnnPolicy"
frame_memory = 4                     # number of stacked frames
learning_rate = 1e-6
n_steps = 2048                       # forwarded to PPO as n_steps

train_model(
  model_game=game,
  model_policy=policy,
  model_learning_rate=learning_rate,
  model_n_steps=n_steps,
  model_frame_memory=frame_memory,
  training_length=2_000_000,
  save_freq=500_000,
  model_name=model_name,
)

Using cuda device
Wrapping the env in a VecTransposeImage.
Logging to logs\PPO_7
-----------------------------
| time/              |      |
|    fps             | 17   |
|    iterations      | 1    |
|    time_elapsed    | 117  |
|    total_timesteps | 2048 |
-----------------------------
-------------------------------------------
| time/                   |               |
|    fps                  | 28            |
|    iterations           | 2             |
|    time_elapsed         | 145           |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 4.8005895e-06 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.79         |
|    explained_variance   | -0.00133      |
|    learning_rate        | 1e-06         |
|    loss                 | 12.9          |
|    n_updates            | 10            |
|    policy_gradient_loss | -4.86e-05     |
|    

In [None]:
# Train a new Tetris model
game = "ALE/Tetris-v5"
model_name = "TetrisPPO_mk1"  # prefix of the checkpoint file names
policy = "CnnPolicy"
frame_memory = 1              # number of stacked frames
learning_rate = 1e-6
n_steps = 2048                # forwarded to PPO as n_steps

train_model(
  model_game=game,
  model_policy=policy,
  model_learning_rate=learning_rate,
  model_n_steps=n_steps,
  model_frame_memory=frame_memory,
  training_length=1_000_000,
  save_freq=250_000,
  model_name=model_name,
)

Using cuda device
Wrapping the env in a VecTransposeImage.
Logging to logs\PPO_9
-----------------------------
| time/              |      |
|    fps             | 135  |
|    iterations      | 1    |
|    time_elapsed    | 15   |
|    total_timesteps | 2048 |
-----------------------------
-------------------------------------------
| time/                   |               |
|    fps                  | 136           |
|    iterations           | 2             |
|    time_elapsed         | 30            |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 5.7595607e-06 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.61         |
|    explained_variance   | -0.088        |
|    learning_rate        | 1e-06         |
|    loss                 | 0.000865      |
|    n_updates            | 10            |
|    policy_gradient_loss | -4.65e-05     |
|    

# Recording Parameters for each model
SpaceInvadersPPO_mk1
- game = "ALE/SpaceInvaders-v5"
- frame_memory = 4
- policy = "CnnPolicy"
- learning_rate = 0.000001
- n_steps = 512



SpaceInvadersPPO_mk2
- game = "ALE/SpaceInvaders-v5"
- frame_memory = 4
- policy = "CnnPolicy"
- learning_rate = 0.000001
- n_steps = 2048


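SpaceInvadersPPO_mk2 (second entry under this name, with a different learning_rate — TODO: confirm whether this run should be recorded under a new name, e.g. mk3)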
- game = "ALE/SpaceInvaders-v5"
- frame_memory = 4
- policy = "CnnPolicy"
- learning_rate = 0.00001
- n_steps = 2048

TetrisPPO_mk1
- game = "ALE/Tetris-v5"
- frame_memory = 1
- policy = "CnnPolicy"
- learning_rate = 0.000001
- n_steps = 2048




tensorboard --logdir=.