Author: Christos Christidis

# Imports

In [1]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
from collections import deque
import random
import time

In [2]:
# %pip install stable-baselines3[extra]

In [3]:
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3.common.env_util import make_atari_env
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback
from stable_baselines3.common.utils import get_linear_fn
from stable_baselines3.common.policies import ActorCriticCnnPolicy

# Settings

In [16]:
# Shared run settings and PPO hyper-parameters read by the cells below.
parameters = dict(
    # Use the GPU when torch can see one, otherwise fall back to the CPU.
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    total_time_steps=1_000_000,  # alternative value noted in the original: 5000000
    checkpoint_freq=200_000,
    eval_freq=50_000,
    n_steps=2048,
    batch_size=64,
    gae_lambda=0.95,
    ent_coef=0.01,
    gamma=0.99,
    verbose=0,
    clip_range=0.2,
    features_dim=512,
)

In [17]:
# Display the torch device chosen in the settings cell.
parameters['device']

device(type='cuda')

# Initial Model

In [23]:
# Build the Assault training environment: a single environment, seeded with 0.
env_id = 'AssaultNoFrameskip-v4'
env = make_atari_env(env_id, n_envs=1, seed=0)

In [24]:
# Wrap the environment so that observations are stacks of 4 frames.
# This call performs frame stacking only; it does not normalize observations.
# NOTE: the cell rebinds `env` to a wrapped version of itself, so re-running it
# without first re-creating `env` wraps the environment a second time.
env = VecFrameStack(env, n_stack=4)

In [25]:
# Directory handed to PPO as `tensorboard_log` for the CNN-policy run.
tensorboard_log_dir = "./ppo_assault_tensorboard/"

In [9]:
# Create the PPO model
#model = PPO('CnnPolicy', env, verbose=0, tensorboard_log=tensorboard_log_dir) # Change verbose to 1 for info messages and 2 for debug messages

# Fine tuned model with custom actor-critic policy

In [26]:
class CustomCnnPolicy(ActorCriticCnnPolicy):
    """Actor-critic CNN policy with 256-256 hidden layers for both heads.

    All positional and keyword arguments are forwarded to
    ``ActorCriticCnnPolicy``. Unless the caller provides its own ``net_arch``,
    the policy (``pi``) and value (``vf``) networks each get two hidden layers
    of 256 units.
    """

    def __init__(self, *args, **kwargs):
        # Use setdefault rather than a hard-coded keyword so that a caller who
        # passes `net_arch` (e.g. through `policy_kwargs`) overrides the default
        # instead of triggering "got multiple values for keyword argument".
        # NOTE(review): newer stable-baselines3 releases prefer a plain dict
        # here instead of a one-element list; the list form is kept so the
        # value matches what this notebook was trained with.
        kwargs.setdefault("net_arch", [dict(pi=[256, 256], vf=[256, 256])])
        super().__init__(*args, **kwargs)


In [27]:
# Linear learning-rate schedule from 3e-4 down to 1e-6; `end_fraction=0.9`
# controls the fraction of training over which the decay happens.
learning_rate_schedule = get_linear_fn(start=3e-4, end=1e-6, end_fraction=0.9)

# PPO with the custom actor-critic policy; every tunable value comes from the
# `parameters` table so the settings cell stays the single source of truth.
model = PPO(
    CustomCnnPolicy,
    env,
    learning_rate=learning_rate_schedule,
    n_steps=parameters['n_steps'],
    batch_size=parameters['batch_size'],
    gamma=parameters['gamma'],
    gae_lambda=parameters['gae_lambda'],
    clip_range=parameters['clip_range'],
    ent_coef=parameters['ent_coef'],
    tensorboard_log=tensorboard_log_dir,
    verbose=parameters['verbose'],
    # Pass the configured device explicitly; previously it was defined in
    # `parameters` but never used when building the model.
    device=parameters['device'],
)

# Training

In [28]:
# Callbacks for evaluation and saving models
# Optional periodic checkpointing (every `parameters['checkpoint_freq']` steps), currently disabled:
#checkpoint_callback = CheckpointCallback(save_freq=parameters['checkpoint_freq'], save_path='./logs/', name_prefix='ppo_assault_2m')

# Evaluate on a dedicated environment built the same way as the training one.
# Reusing the training `env` here would make every evaluation reset the very
# environment PPO is collecting rollouts from, interrupting its episodes.
eval_env = VecFrameStack(make_atari_env(env_id, n_envs=1, seed=1), n_stack=4)
eval_callback = EvalCallback(eval_env, best_model_save_path='./logs/best_model/assault_5m_steps_tuned',
                             log_path='./logs/results', eval_freq=parameters['eval_freq'])

In [29]:
# Remember when training started, then run PPO for the configured step budget.
start_time = time.time()
model.learn(
    total_timesteps=parameters['total_time_steps'],
    callback=[eval_callback],  # prepend checkpoint_callback here to re-enable checkpoints
)

  return F.conv2d(input, weight, bias, self.stride,


Eval num_timesteps=50000, episode_reward=323.40 +/- 54.11
Episode length: 2200.80 +/- 388.03
New best mean reward!
Eval num_timesteps=100000, episode_reward=323.40 +/- 36.61
Episode length: 2332.00 +/- 265.96
Eval num_timesteps=150000, episode_reward=369.60 +/- 97.24
Episode length: 2495.00 +/- 446.51
New best mean reward!
Eval num_timesteps=200000, episode_reward=399.00 +/- 116.55
Episode length: 2645.80 +/- 450.42
New best mean reward!
Eval num_timesteps=250000, episode_reward=373.80 +/- 165.03
Episode length: 2789.40 +/- 660.19
Eval num_timesteps=300000, episode_reward=424.20 +/- 62.86
Episode length: 3077.80 +/- 295.09
New best mean reward!
Eval num_timesteps=350000, episode_reward=428.40 +/- 75.83
Episode length: 2983.60 +/- 575.20
New best mean reward!
Eval num_timesteps=400000, episode_reward=336.00 +/- 18.78
Episode length: 2357.20 +/- 221.34
Eval num_timesteps=450000, episode_reward=453.60 +/- 110.00
Episode length: 3189.40 +/- 529.22
New best mean reward!
Eval num_timesteps=5

<stable_baselines3.ppo.ppo.PPO at 0x18c70480bd0>

* Time Elapsed 1m steps: 84m
* Time Elapsed 1m steps tuned: 76m
* Time Elapsed 2m steps: 140m
* Time Elapsed 2m steps tuned: 153m
* Time Elapsed 5m steps: 434m

In [30]:
# Persist the trained model under the name "ppo_assault_5m_tuned".
model.save("ppo_assault_5m_tuned")

In [31]:
# Time since `start_time` was set, converted from seconds to minutes.
print("Time Elapsed: ", (time.time() - start_time)/60)

Time Elapsed:  434.2075705130895


# ResNet and PPO

In [40]:
from torchvision.models import resnet18
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.vec_env import DummyVecEnv

In [41]:
from gym import spaces
class GrayToRGBWrapper(gym.ObservationWrapper):
    """Expand grayscale image observations to three identical channels.

    Observations shaped ``(H, W)`` or ``(H, W, 1)`` are returned as
    ``(H, W, 3)`` arrays in which the gray value is copied into every channel.
    The advertised observation space is a ``uint8`` box in ``[0, 255]`` with
    that ``(H, W, 3)`` shape.
    """

    def __init__(self, env):
        super().__init__(env)
        old_shape = self.observation_space.shape
        # Keep only the two spatial dimensions; any trailing singleton channel
        # axis is replaced by the three output channels.
        new_shape = (old_shape[0], old_shape[1], 3)
        self.observation_space = spaces.Box(low=0, high=255, shape=new_shape, dtype=np.uint8)

    def observation(self, obs):
        """Return ``obs`` with its gray channel repeated three times."""
        obs = np.asarray(obs)
        if obs.ndim == 2:
            # (H, W) -> (H, W, 1) so both accepted layouts share one code path.
            obs = obs[..., np.newaxis]
        # Repeating along the existing channel axis yields (H, W, 3). The old
        # code always appended a new axis first, which turned (H, W, 1) frames
        # into (H, W, 1, 3) and contradicted the declared observation space.
        return np.repeat(obs, 3, axis=-1)


In [42]:
class CustomResNetFeatureExtractor(BaseFeaturesExtractor):
    """Features extractor that runs observations through a ResNet-18 backbone.

    The backbone's first convolution is replaced so it accepts the number of
    channels of the observation space, and its final fully connected layer is
    removed so the pooled feature vector is returned instead of class logits.

    :param observation_space: image observation space; its first dimension is
        read as the channel count (assumes the channel-first layout that SB3
        hands to image features extractors -- confirm if used elsewhere).
    :param features_dim: size of the returned feature vector. When it differs
        from the backbone's pooled width, a linear projection is appended.
    """

    def __init__(self, observation_space, features_dim=parameters['features_dim']):
        super().__init__(observation_space, features_dim)
        # Load pre-trained ResNet18.
        # NOTE(review): newer torchvision releases prefer the `weights=`
        # argument over `pretrained=True`; kept as-is for compatibility.
        backbone = resnet18(pretrained=True)
        # Width of the pooled features, read before the fc layer is dropped.
        backbone_out_dim = backbone.fc.in_features
        # Replace the first convolutional layer so it matches the observation's
        # channel count instead of assuming exactly one channel.
        n_input_channels = observation_space.shape[0]
        backbone.conv1 = nn.Conv2d(n_input_channels, 64, kernel_size=7, stride=2, padding=3, bias=False)
        # Remove the fully connected layer of ResNet (same sub-module layout
        # and attribute name as before, so parameter names are unchanged).
        self.resnet = nn.Sequential(*list(backbone.children())[:-1])
        # Make the output width agree with the advertised `features_dim`.
        if features_dim == backbone_out_dim:
            self.projection = nn.Identity()
        else:
            self.projection = nn.Linear(backbone_out_dim, features_dim)
        self._features_dim = features_dim

    def forward(self, observations):
        """Return a ``(batch, features_dim)`` tensor of features."""
        # Flatten everything except the batch axis. A bare `.squeeze()` would
        # also remove the batch axis whenever the batch holds a single sample.
        features = torch.flatten(self.resnet(observations), start_dim=1)
        return self.projection(features)

In [43]:
from stable_baselines3.common.callbacks import BaseCallback
from torch.utils.tensorboard import SummaryWriter

class CustomTensorboardCallback(BaseCallback):
    """Write loss and episode statistics to a dedicated ``SummaryWriter``.

    The writer is created when training starts (with ``SummaryWriter``'s
    default log directory) and closed when training ends. On every step the
    callback logs, when available, the policy/value loss and the mean reward
    and length over the episode-info buffer.
    """

    def __init__(self, verbose=0):
        super().__init__(verbose)
        # Created lazily in `_on_training_start`.
        self.writer = None

    def _on_training_start(self) -> None:
        self.writer = SummaryWriter()

    def _on_step(self) -> bool:
        step = self.num_timesteps

        # Log losses only if the training loop exposes them as a local.
        # NOTE(review): SB3's rollout loop does not appear to expose a `losses`
        # local, so this branch may never fire -- confirm for the SB3 version in use.
        losses = self.locals.get("losses")
        if losses is not None:
            self.writer.add_scalar("Loss/Policy Loss", losses.policy_loss.item(), step)
            self.writer.add_scalar("Loss/Value Loss", losses.value_loss.item(), step)

        # The episode-info buffer is an attribute of the algorithm rather than
        # a rollout local, so fall back to the model when the local is absent.
        # Looking only at `self.locals` meant these scalars were never written.
        ep_infos = self.locals.get("ep_info_buffer")
        if ep_infos is None:
            ep_infos = getattr(self.model, "ep_info_buffer", None)
        if ep_infos is not None and len(ep_infos) > 0:
            mean_reward = np.mean([ep_info['r'] for ep_info in ep_infos])
            mean_length = np.mean([ep_info['l'] for ep_info in ep_infos])
            self.writer.add_scalar("Reward/Mean Reward", mean_reward, step)
            self.writer.add_scalar("Episode/Length", mean_length, step)
        return True

    def _on_training_end(self) -> None:
        # Guard against the writer never having been created.
        if self.writer is not None:
            self.writer.close()
            self.writer = None


In [44]:
# Rebind the TensorBoard directory for the ResNet run; the PPO model built
# below receives it as `tensorboard_log`.
tensorboard_log_dir = "./resnetppo/"

In [46]:
# Environment for the ResNet experiment: a single Assault environment, seed 0.
env_id = 'AssaultNoFrameskip-v4'
env = make_atari_env(env_id, n_envs=1, seed=0)
# Disabled experiment, kept for reference:
#env = DummyVecEnv([lambda: GrayToRGBWrapper(env)])

# Tell the policy to use the ResNet-based features extractor.
policy_kwargs = {
    "features_extractor_class": CustomResNetFeatureExtractor,
    "features_extractor_kwargs": {"features_dim": parameters['features_dim']},
}

model = PPO(
    'CnnPolicy',
    env,
    policy_kwargs=policy_kwargs,
    tensorboard_log=tensorboard_log_dir,
    verbose=parameters['verbose'],
)

# Train with the custom TensorBoard callback attached.
custom_callback = CustomTensorboardCallback()
model.learn(total_timesteps=parameters['total_time_steps'], callback=custom_callback)


<stable_baselines3.ppo.ppo.PPO at 0x206d89f0d50>

* PPO ResNet Policy Time Elapsed: 321m

In [47]:
# Persist the ResNet-policy model under the name "ppo_resnet_policy".
model.save("ppo_resnet_policy")

# Visualizing the environment

In [48]:
# Reload the model saved as "ppo_resnet_policy", replacing the in-memory `model`.
model = PPO.load("ppo_resnet_policy")




In [None]:
# Roll the loaded model out for 1000 steps in a freshly created environment,
# rendering every step to a window.
env_id = 'AssaultNoFrameskip-v4'
env = make_atari_env(env_id, n_envs=1, seed=0)
try:
    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render(mode='human')
finally:
    # Always release the environment (and its render window), even when the
    # loop is interrupted or a step raises.
    env.close()