# Imports

In [1]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from collections import deque
import random
import time

In [2]:
# %pip install stable-baselines3[extra]

In [3]:
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback
from stable_baselines3.common.utils import get_linear_fn
from stable_baselines3.common.policies import ActorCriticCnnPolicy

# Settings

In [4]:
# Central place for every tunable used by the notebook: compute device,
# training budget, callback frequencies and PPO hyperparameters.
parameters = dict(
    # Prefer the GPU when one is visible to torch, otherwise fall back to CPU.
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    # Training budget and callback frequencies.
    total_time_steps=2_000_000,
    checkpoint_freq=200_000,
    eval_freq=50_000,
    # PPO hyperparameters.
    n_steps=2048,
    batch_size=64,
    gae_lambda=0.95,
    ent_coef=0.01,
    gamma=0.99,
    verbose=0,
    clip_range=0.2,
)

In [5]:
# Display the device selected in the settings cell (cuda when available, else cpu).
parameters["device"]

device(type='cuda')

# Initial Model

In [6]:
# Build one seeded copy of the Assault environment through SB3's
# make_atari_env helper; the result is wrapped further in the next cell.
env_id = "AssaultNoFrameskip-v4"
env = make_atari_env(
    env_id,
    n_envs=1,
    seed=0,
)

In [7]:
# Stack the 4 most recent frames into each observation.
# Note: VecFrameStack only stacks frames; no observation normalization is applied here.
# The isinstance guard keeps this cell idempotent: without it, re-running the cell
# would wrap the already-stacked environment a second time.
if not isinstance(env, VecFrameStack):
    env = VecFrameStack(env, n_stack=4)

In [8]:
# Directory handed to PPO as `tensorboard_log` when the model is created below.
tensorboard_log_dir = "./ppo_assault_tensorboard/"

In [9]:
# Initial (untuned) PPO model: stock 'CnnPolicy' with default hyperparameters.
# Kept commented out for reference; set verbose=1 for info messages or verbose=2 for debug messages.
#model = PPO('CnnPolicy', env, verbose=0, tensorboard_log=tensorboard_log_dir)

# Fine tuned model with custom actor-critic policy

In [10]:
class CustomCnnPolicy(ActorCriticCnnPolicy):
    """Actor-critic CNN policy whose default ``net_arch`` uses two 256-unit
    layers for the policy branch (``pi``) and two for the value branch (``vf``).

    All positional and keyword arguments are forwarded unchanged to
    ``ActorCriticCnnPolicy``. A caller may override the architecture by
    passing its own ``net_arch`` keyword (e.g. through PPO's ``policy_kwargs``).
    """

    def __init__(self, *args, **kwargs):
        # setdefault instead of a hard-coded keyword: combining **kwargs with an
        # explicit net_arch= raised "got multiple values for keyword argument
        # 'net_arch'" whenever the caller supplied one.
        # NOTE(review): SB3 >= 1.8 prefers a plain dict over the one-element
        # list form; the list is kept here so older releases keep working —
        # confirm the installed version before changing it.
        kwargs.setdefault("net_arch", [dict(pi=[256, 256], vf=[256, 256])])
        super().__init__(*args, **kwargs)


In [11]:
# Learning-rate schedule: linear decay from 3e-4 down to 1e-6, with the end
# value reached after 90% of training (end_fraction=0.9).
learning_rate_schedule = get_linear_fn(start=3e-4, end=1e-6, end_fraction=0.9)

# Tuned PPO model. Every hyperparameter comes from the `parameters` settings dict.
model = PPO(
    CustomCnnPolicy,
    env,
    learning_rate=learning_rate_schedule,
    n_steps=parameters['n_steps'],
    batch_size=parameters['batch_size'],
    clip_range=parameters['clip_range'],
    gae_lambda=parameters['gae_lambda'],
    ent_coef=parameters['ent_coef'],
    gamma=parameters['gamma'],
    verbose=parameters['verbose'],
    tensorboard_log=tensorboard_log_dir,
    # Use the device chosen in the settings cell; it was previously computed
    # and displayed but never handed to the model.
    device=parameters['device'],
    # Seed the model as well as the environment (make_atari_env above uses
    # seed=0) so that runs are repeatable.
    seed=0,
)



# Training

In [12]:
# Callbacks for evaluation and saving models
# Checkpointing is currently disabled. When enabled, it saves the training state
# every parameters['checkpoint_freq'] callback steps (200000 in the settings above).
#checkpoint_callback = CheckpointCallback(save_freq=parameters['checkpoint_freq'], save_path='./logs/', name_prefix='ppo_assault_2m')

# Evaluate on a dedicated environment rather than the training one: EvalCallback
# resets and steps the env it is given, which would otherwise interrupt the
# episode that the training rollout is in the middle of collecting.
# The eval env gets the same wrappers as the training env (frame stack of 4)
# and a different seed.
eval_env = VecFrameStack(make_atari_env(env_id, n_envs=1, seed=1), n_stack=4)
eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/best_model/assault_2m_steps_tuned',
                             log_path='./logs/results', eval_freq=parameters['eval_freq'])

In [13]:
# Record the wall-clock start; the elapsed-time cell further down subtracts it.
start_time = time.time()

# Run training for the configured number of timesteps with periodic evaluation.
# Only eval_callback is active; add checkpoint_callback to the list to re-enable
# checkpointing: [checkpoint_callback, eval_callback]
model.learn(
    total_timesteps=parameters["total_time_steps"],
    callback=[eval_callback],
)

  return F.conv2d(input, weight, bias, self.stride,


Eval num_timesteps=50000, episode_reward=352.80 +/- 75.60
Episode length: 2488.60 +/- 332.13
New best mean reward!
Eval num_timesteps=100000, episode_reward=378.00 +/- 53.13
Episode length: 2362.20 +/- 237.13
New best mean reward!
Eval num_timesteps=150000, episode_reward=369.60 +/- 84.63
Episode length: 2336.40 +/- 290.72
Eval num_timesteps=200000, episode_reward=386.40 +/- 74.66
Episode length: 2613.40 +/- 399.31
New best mean reward!
Eval num_timesteps=250000, episode_reward=247.80 +/- 27.86
Episode length: 2081.40 +/- 291.08
Eval num_timesteps=300000, episode_reward=281.40 +/- 76.99
Episode length: 2337.00 +/- 297.53
Eval num_timesteps=350000, episode_reward=264.60 +/- 67.20
Episode length: 2168.60 +/- 556.37
Eval num_timesteps=400000, episode_reward=331.80 +/- 97.06
Episode length: 2401.60 +/- 515.05
Eval num_timesteps=450000, episode_reward=344.40 +/- 31.43
Episode length: 2345.80 +/- 391.02
Eval num_timesteps=500000, episode_reward=289.80 +/- 38.49
Episode length: 2701.40 +/- 25

<stable_baselines3.ppo.ppo.PPO at 0x1cd9f38db90>

* Time elapsed, 1M steps: 84 min
* Time elapsed, 1M steps (tuned): 76 min
* Time elapsed, 2M steps: 140 min
* Time elapsed, 2M steps (tuned): 153 min

In [14]:
# Persist the trained model under the name "ppo_assault_2m_tuned"
# (the path is relative to the current working directory).
model.save("ppo_assault_2m_tuned")

In [15]:
# Minutes of wall-clock time since start_time was recorded just before training.
# This also counts anything that ran between the training cell and this one.
elapsed_minutes = (time.time() - start_time) / 60
print("Time Elapsed: ", elapsed_minutes)

Time Elapsed:  153.5739587386449


# Loading and evaluating the model

* TODO: Load policy model and run for evaluation.