# Imports

In [3]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from collections import deque
import random
import time

In [None]:
# One-time setup: uncomment and run the line below if stable-baselines3 is not yet
# installed in this kernel's environment, then restart the kernel before importing it.
# %pip install stable-baselines3[extra]

In [4]:
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback

# Settings

In [42]:
# Run-wide settings:
#   device           - CUDA when torch reports it as available, otherwise CPU.
#   total_time_steps - training budget handed to `model.learn`.
parameters = dict(
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    total_time_steps=1_000_000,
)

In [3]:
# Display the selected compute device as this cell's output.
parameters["device"]

device(type='cuda')

# Training

In [43]:
# Gym id of the Atari game to train on.
env_id = 'AssaultNoFrameskip-v4'

# Build a single, seeded, vectorized Atari environment for training.
env = make_atari_env(
    env_id,
    n_envs=1,
    seed=0,
)

In [44]:
# Wrap the environment so each observation is a stack of the 4 most recent frames.
# (This cell only stacks frames; it does not normalize observations.)
#
# The isinstance guard keeps the cell idempotent: `env` is rebound to its own wrapper,
# so without the guard a second run of this cell would stack the already-stacked env
# again and silently change the observation shape seen by the model.
if not isinstance(env, VecFrameStack):
    env = VecFrameStack(env, n_stack=4)

In [45]:
# Directory handed to PPO as its `tensorboard_log` location.
tensorboard_log_dir: str = "./ppo_assault_tensorboard/"

In [46]:
# Create the PPO model with the CNN policy on the frame-stacked environment.
#
# `device` is passed explicitly so that the device chosen in `parameters` is the one
# PPO actually uses; previously that setting was defined and displayed but never
# handed to the model.
#
# verbose: 0 = silent, 1 = info messages, 2 = debug messages.
model = PPO(
    'CnnPolicy',
    env,
    verbose=0,
    tensorboard_log=tensorboard_log_dir,
    device=parameters['device'],
)

In [47]:
# Callbacks for saving checkpoints and for periodic evaluation during training.

# Save the training state under ./logs/ every 10k time steps.
# NOTE (original author): might need to remove.
checkpoint_callback = CheckpointCallback(save_freq=10000, save_path='./logs/', name_prefix='ppo_assault')

# Evaluate on a dedicated environment rather than the training `env`: evaluation
# resets and steps whatever env it is given, so sharing the training env would
# disturb the rollout PPO is collecting. The eval env is built the same way as the
# training env (Atari wrappers + 4-frame stack), with a different seed.
eval_env = VecFrameStack(make_atari_env(env_id, n_envs=1, seed=1), n_stack=4)

# Every 10k steps run an evaluation; the best-scoring model is written to
# ./logs/best_model/ and evaluation results to ./logs/results.
eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/best_model/',
                             log_path='./logs/results', eval_freq=10000)

In [48]:
# Train for the configured number of time steps with both callbacks attached.
# The call stays the last expression so the returned model is shown as cell output.
training_callbacks = [checkpoint_callback, eval_callback]
model.learn(
    total_timesteps=parameters['total_time_steps'],
    callback=training_callbacks,
)

  return F.conv2d(input, weight, bias, self.stride,


Eval num_timesteps=10000, episode_reward=231.00 +/- 35.14
Episode length: 3477.60 +/- 403.89
New best mean reward!
Eval num_timesteps=20000, episode_reward=344.40 +/- 156.92
Episode length: 3351.20 +/- 1317.51
New best mean reward!
Eval num_timesteps=30000, episode_reward=432.60 +/- 54.11
Episode length: 3887.20 +/- 536.20
New best mean reward!
Eval num_timesteps=40000, episode_reward=449.40 +/- 89.69
Episode length: 3509.40 +/- 681.68
New best mean reward!
Eval num_timesteps=50000, episode_reward=483.00 +/- 176.20
Episode length: 3147.00 +/- 811.19
New best mean reward!
Eval num_timesteps=60000, episode_reward=491.40 +/- 87.70
Episode length: 3119.00 +/- 582.37
New best mean reward!
Eval num_timesteps=70000, episode_reward=424.20 +/- 135.05
Episode length: 2546.00 +/- 671.59
Eval num_timesteps=80000, episode_reward=386.40 +/- 94.48
Episode length: 2836.00 +/- 685.39
Eval num_timesteps=90000, episode_reward=319.20 +/- 97.96
Episode length: 2580.20 +/- 620.48
Eval num_timesteps=100000, 

<stable_baselines3.ppo.ppo.PPO at 0x21c29632650>

* Time elapsed for 1M time steps: 120 minutes

In [49]:
# Persist the trained model from this first 1M-step run.
model.save(path="ppo_assault_1m_first_run")

# Loading and evaluating the model

* TODO: Load policy model and run for evaluation