In [None]:
# !apt-get install -y swig
# !pip install gymnasium[atari] torch torchvision stable-baselines3
# !pip install gymnasium[accept-rom-license]

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  swig4.0
Suggested packages:
  swig-doc swig-examples swig4.0-examples swig4.0-doc
The following NEW packages will be installed:
  swig swig4.0
0 upgraded, 2 newly installed, 0 to remove and 45 not upgraded.
Need to get 1,116 kB of archives.
After this operation, 5,542 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig4.0 amd64 4.0.2-1ubuntu1 [1,110 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig all 4.0.2-1ubuntu1 [5,632 B]
Fetched 1,116 kB in 3s (432 kB/s)
Selecting previously unselected package swig4.0.
(Reading database ... 121925 files and directories currently installed.)
Preparing to unpack .../swig4.0_4.0.2-1ubuntu1_amd64.deb ...
Unpacking swig4.0 (4.0.2-1ubuntu1) ...
Selecting previously unselected package swig.
Preparing to unpack .../swig_4.0.2-1ubunt

## Import Dependencies ##

In [1]:
## Import Dependencies ##
import os
import gymnasium as gym
import torch as th
import torch.nn as nn
from stable_baselines3 import DQN  # Import DQN
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import CheckpointCallback, EvalCallback
from stable_baselines3.common.logger import configure
from stable_baselines3.common.atari_wrappers import ClipRewardEnv
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
import matplotlib.pyplot as plt

import torch

# Enable CuDNN benchmark mode
torch.backends.cudnn.benchmark = True


# Device configuration
device = th.device("cuda" if th.cuda.is_available() else "cpu")

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

  and should_run_async(code)


Mounted at /content/drive


# Custom CNN

In [3]:
# Custom CNN feature extractor
class CustomCNN(BaseFeaturesExtractor):
    def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 256):
        super(CustomCNN, self).__init__(observation_space, features_dim)
        n_input_channels = observation_space.shape[0]
        self.cnn = nn.Sequential(
            nn.Conv2d(n_input_channels, 32, kernel_size=8, stride=4, padding=0),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2, padding=0),
            nn.ReLU(),
            nn.Flatten(),
        )

        with th.no_grad():
            n_flatten = self.cnn(th.as_tensor(observation_space.sample()[None]).float()).shape[1]

        self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim), nn.ReLU())

    def forward(self, observations: th.Tensor) -> th.Tensor:
        return self.linear(self.cnn(observations))

class CustomCNNDeep(BaseFeaturesExtractor):
    def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 256):
        super(CustomCNNDeep, self).__init__(observation_space, features_dim)
        n_input_channels = observation_space.shape[0]
        self.cnn = nn.Sequential(
            nn.Conv2d(n_input_channels, 32, kernel_size=8, stride=4, padding=0),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2, padding=0),
            nn.ReLU(),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=0),
            nn.ReLU(),
            nn.Flatten(),
        )

        with th.no_grad():
            n_flatten = self.cnn(th.as_tensor(observation_space.sample()[None]).float()).shape[1]

        self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim), nn.ReLU())

    def forward(self, observations: th.Tensor) -> th.Tensor:
        return self.linear(self.cnn(observations))


class CustomCNNWide(BaseFeaturesExtractor):
    def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 256):
        super(CustomCNNWide, self).__init__(observation_space, features_dim)
        n_input_channels = observation_space.shape[0]
        self.cnn = nn.Sequential(
            nn.Conv2d(n_input_channels, 64, kernel_size=8, stride=4, padding=0),
            nn.ReLU(),
            nn.Conv2d(64, 128, kernel_size=4, stride=2, padding=0),
            nn.ReLU(),
            nn.Flatten(),
        )

        with th.no_grad():
            n_flatten = self.cnn(th.as_tensor(observation_space.sample()[None]).float()).shape[1]

        self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim), nn.ReLU())

    def forward(self, observations: th.Tensor) -> th.Tensor:
        return self.linear(self.cnn(observations))

class CustomCNNShallow(BaseFeaturesExtractor):
    def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 256):
        super(CustomCNNShallow, self).__init__(observation_space, features_dim)
        n_input_channels = observation_space.shape[0]
        self.cnn = nn.Sequential(
            nn.Conv2d(n_input_channels, 32, kernel_size=8, stride=4, padding=0),
            nn.ReLU(),
            nn.Flatten(),
        )

        with th.no_grad():
            n_flatten = self.cnn(th.as_tensor(observation_space.sample()[None]).float()).shape[1]

        self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim), nn.ReLU())

    def forward(self, observations: th.Tensor) -> th.Tensor:
        return self.linear(self.cnn(observations))


# Configuration of DQN model

In [4]:
## Configuration dictionary ##
config = {
    'learning_rate': 1e-4,
    'buffer_size': 1000000, #replay memory size
    'learning_starts': 100,
    'batch_size': 32,
    'tau': 1.0, #soft update coefficient 1 is hard update
    'gamma': 0.99, #discount factor
    'train_freq': 4,
    'gradient_steps': 1,
    'target_update_interval': 10000,
    'exploration_fraction': 0.1, #decay
    'exploration_initial_eps': 1.0,
    'exploration_final_eps': 0.05,
    'max_grad_norm': 10,
    'replay_buffer_class': None,
    'replay_buffer_kwargs': None,
    'optimize_memory_usage': False,
    'stats_window_size': 100,
    'tensorboard_log': None,
    'policy_kwargs': None, #neural network size
    'verbose': 0,
    'seed': None,
    'device': 'auto',
    '_init_setup_model': True
}

# Define the log path using os.path.join for cross-platform compatibility
log_path = os.path.join('Training', 'Target_update_Logs')

# Update parameters based on your preferred values
config.update({
    'learning_rate': 1e-4,              #1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-8
    'buffer_size': 75000,               #25000, 50000, 75000, 100000
    'batch_size': 64,                   #16, 32, 64, 128
    'gamma': 0.99,                      #0.999, 0.99, 0.95
    'target_update_interval': 20000,    #5000, 10000, 20000, 30000
    'exploration_fraction': 0.1,        #0.1, 0.2, 0.5, 0.99
    'exploration_initial_eps': 1.0,     #1.0, 0.9
    'exploration_final_eps': 0.05,      #0.1, 0.05, 0.01

    'policy_kwargs': {
         'features_extractor_class': CustomCNN,
         'features_extractor_kwargs': {'features_dim': 128},
         'net_arch': [256, 256],
         'activation_fn': th.nn.ReLU
     },
    'verbose': 1,
    # 'tensorboard_log': '/content/drive/MyDrive/Colab Notebooks/Asterix/Training/Logs_Asterix/200k_Logs', ## change for different logs different parameters
    'tensorboard_log': log_path, ## change for different logs different parameters
})

## Make Environment ##

In [5]:
## Preprocessing and Wrappers ##
def make_env(env_name):
    env = gym.make(env_name, render_mode="rgb_array", obs_type="grayscale")
    env.metadata['render_fps'] = 10000 # Disable FPS limit for faster rendering
    env = Monitor(env)  # Monitor to keep track of rewards and episode lengths
    env = gym.wrappers.AtariPreprocessing(env, noop_max=30, frame_skip=4, screen_size=84, terminal_on_life_loss=False, grayscale_obs=True, grayscale_newaxis=False, scale_obs=False)
    # env = ClipRewardEnv(env)  # Add reward clipping
    env = gym.wrappers.FrameStack(env, 4)  # Stack frames to provide temporal context
    return env

## Test Environment ##
environment_name = 'AsterixNoFrameskip-v4'
env = make_env(environment_name)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, n_stack=4)  # Ensure the frames are stacked
obs = env.reset()
# env.action_space
# env.observation_space

## Test Environment ##

In [6]:
episodes = 5

for episode in range(1, episodes+1):
    obs = env.reset()
    done = [False]
    score = 0

    while not done[0]:
        env.render()
        action = env.action_space.sample()
        obs, rewards, dones, infos = env.step([action])  # Wrap action in list for DummyVecEnv
        score += rewards[0]

        done = dones  # done is already a list
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:150.0
Episode:2 Score:300.0
Episode:3 Score:250.0
Episode:4 Score:100.0
Episode:5 Score:50.0


## Train Model ##

In [7]:
# First Training Session
# log_path = os.path.join('Training', 'Target_update_Logs')
# model = DQN('CnnPolicy', env, verbose=1, tensorboard_log=log_path, buffer_size=10000, batch_size=32)
model = DQN('CnnPolicy', env, **config)  # Enable Double DQN -- double_q=True


# Recreate callbacks to ensure they are properly initialized
# checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./drive/MyDrive/Colab Notebooks/Asterix/logs/', name_prefix='dqn_model')
# eval_callback = EvalCallback(env, best_model_save_path='./drive/MyDrive/Colab Notebooks/Asterix/logs/best_model', log_path='./drive/MyDrive/Colab Notebooks/Asterix/logs/results', eval_freq=500, deterministic=True, render=False)
checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/', name_prefix='dqn_model')
eval_callback = EvalCallback(env, best_model_save_path='./logs/best_model', log_path='./logs/results', eval_freq=500, deterministic=True, render=False)

# CNN tuning
# tb_log_name = "CNN_Base"
# tb_log_name = "CNN_Custom"
# tb_log_name = "CNN_Deep"
# tb_log_name = "CNN_Wide"
# tb_log_name = "CNN_Shalow"

# Lr tuning
# tb_log_name = "lr_e8"
# tb_log_name = "lr_e6"
# tb_log_name = "lr_e5"
# tb_log_name = "lr_e4"
# tb_log_name = "lr_e3"
# tb_log_name = "lr_e2"

# exploration frac tuning
# tb_log_name = "expl_Frac_0.1"
# tb_log_name = "expl_Frac_0.2"
# tb_log_name = "expl_Frac_0.5"
# tb_log_name = "expl_Frac_0.99"

# exploration final tuning
# tb_log_name = "expl_final_0.01"
# tb_log_name = "expl_final_0.05"
# tb_log_name = "expl_final_0.1"

# target update tuning
# tb_log_name = "target_up_30k"
# tb_log_name = "target_up_10k"


# --> used for trianing final model

tb_log_name = "target_up_20k"

# exploration init tuning
# tb_log_name = "expl_init_0.9"
# tb_log_name = "expl_init_0.95"
# tb_log_name = "expl_init_1.0"

# gamma tuning
# tb_log_name = "gamma_0.95"
# tb_log_name = "gamma_0.99"
# tb_log_name = "gamma_0.999"

# tb_log_name = "200k"
# tb_log_name = "Assault_200k_2"
# tb_log_name = "Assault_Lr_e2"
# tb_log_name = "SpaceInvaders_200k"
# tb_log_name = "SpaceInvaders_basic"
# tb_log_name = "SpaceInvaders_Lr_e2"
# tb_log_name = "SpaceInvaders_Lr_e2_frac_0.3"
# tb_log_name = "Breakout_200k"
# tb_log_name = "Asterix_Test"
# tb_log_name = "Asterix_Test2"

# saved_model = "Model_Assualt_200k_2"
# saved_model = "Model_Assualt_Lr_e2"
# saved_model = "Model_SpaceInvaders_200k"
# saved_model = "Model_SpaceInvaders_basic"
# saved_model = "Model_SpaceInvaders_Lr_e2"
# saved_model = "Model_SpaceInvaders_Lr_e2_frac_0.3"
# saved_model = "Model_Breakout_200k"
# saved_model = "Model_Asterix_Test"
# saved_model = "Model_Asterix_Test2"
# saved_model = "Model_200k"

# gamma tuning
# saved_model = "Model_gamma_0.95"
# saved_model = "Model_gamma_0.99"
# saved_model = "Model_gamma_0.999"

# exploration init tuning
# saved_model = "Model_expl_init_0.9"
# saved_model = "Model_expl_init_0.95"
# saved_model = "Model_expl_init_1.0"

# target update tuning
# saved_model = "Model_target_up_30k"
# saved_model = "Model_target_up_10k"


# --> used for training final model

saved_model = "Model_target_up_20k"

# exploration final tuning
# saved_model = "Model_expl_final_0.01"
# saved_model = "Model_expl_final_0.05"
# saved_model = "Model_expl_final_0.1"

# exploration frac tuning
# saved_model = "Model_expl_frac_0.1"
# saved_model = "Model_expl_frac_0.2"
# saved_model = "Model_expl_frac_0.5"
# saved_model = "Model_expl_frac_0.99"


# Buffer Tuning
# tb_log_name = "Buffer_25k"
# tb_log_name = "Buffer_50k"
# tb_log_name = "Buffer_100k"
# tb_log_name = "Buffer_75k"
# tb_log_name = "Buffer_500k"

# saved_model = "Model_buffer_25k"
# saved_model = "Model_buffer_50k"
# saved_model = "Model_buffer_100k"
# saved_model = "Model_buffer_75k"
# saved_model = "Model_buffer_500k"

# Batch tuning
# tb_log_name = "Batch_16"
# tb_log_name = "Batch_32"
# tb_log_name = "Batch_64"
# tb_log_name = "Batch_128"

# Batch tuning
# saved_model = "Model_Batch_16"
# saved_model = "Model_Batch_32"
# saved_model = "Model_Batch_64"
# saved_model = "Model_Batch_128"

# CNN Tuning
#saved_model = "Model_CNN_Base"
# saved_model = "Model_CNN_Custom"
# saved_model = "Model_CNN_Deep"
# saved_model = "Model_CNN_Wide"
# saved_model = "Model_CNN_Shalow"

# Lr tuning
# saved_model = "Model_lr_e8"
# saved_model = "Model_lr_e6"
# saved_model = "Model_lr_2_e5"
# saved_model = "Model_lr_2_e4"
# saved_model = "Model_lr_e3"
# saved_model = "Model_lr_e2"

# Enable CuDNN benchmark mode

# Train for initial steps
# model.learn(total_timesteps=2000, callback=[checkpoint_callback, eval_callback], reset_num_timesteps=False)
#
# model.learn(total_timesteps=50000, tb_log_name=tb_log_name, callback=[checkpoint_callback, eval_callback])
# Pass reset_num_timesteps=False to continue the training curve in tensorboard
# By default, it will create a new curve
# model.learn(total_timesteps=2000, tb_log_name="second_run", callback=[checkpoint_callback, eval_callback], reset_num_timesteps=False)
# model.learn(total_timesteps=2000, tb_log_name="third_run", callback=[checkpoint_callback, eval_callback], reset_num_timesteps=False)
# Assault: 2m 18.2 s / 2m 44 s / 3.45 s
# Asterix: 2m 37.3 /1.47.2 /1.39  10000: 14.45.7 m

Using cpu device




## Save and Reload Model ##

In [8]:
## Save and Reload Model ##
dqn_path = os.path.join('Training', 'Saved_Models', saved_model)

# dqn_path = os.path.join('/content/drive/MyDrive/Colab Notebooks/Asterix/Training/Saved_Models', saved_model)
# model.save(dqn_path)


In [None]:
del model
dqn_path = os.path.join('Training', 'Saved_Models', saved_model)
# dqn_path = os.path.join('/content/drive/MyDrive/Colab Notebooks/Asterix/Training/Saved_Models', saved_model)
model = DQN.load(dqn_path, env)

# Recreate callbacks to ensure they are properly initialized
# checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./drive/MyDrive/Colab Notebooks/Asterix/logs/', name_prefix='dqn_model')
# eval_callback = EvalCallback(env, best_model_save_path='./drive/MyDrive/Colab Notebooks/Asterix/logs/best_model', log_path='./drive/MyDrive/Colab Notebooks/Asterix/logs/results', eval_freq=500, deterministic=True, render=False)
checkpoint_callback = CheckpointCallback(save_freq=1000, save_path='./logs/', name_prefix='dqn_model')
eval_callback = EvalCallback(env, best_model_save_path='./logs/best_model', log_path='./logs/results', eval_freq=500, deterministic=True, render=False)

model.set_env(env)
# Continue training for more steps
model.learn(total_timesteps=50000, tb_log_name=tb_log_name, callback=[checkpoint_callback, eval_callback], reset_num_timesteps=False)

In [9]:
#save final model
# dqn_path_1 = os.path.join('Training', 'Saved Models', 'DQN_BeamRider_Model_1')
# model.save(dqn_path)
# del model
model = DQN.load(dqn_path, env)
model.set_env(env)

FileNotFoundError: [Errno 2] No such file or directory: 'Training\\Saved_Models\\Model_target_up_20k.zip'

## Evaluate and Test ##

In [None]:
## Evaluate and Test ##
# evaluate_policy(model, env, n_eval_episodes=10, render=False)
# env.close()

# ## Evaluate and Test ##
episodes = 5
for episode in range(1, episodes + 1):
    obs = env.reset()
    done = False
    score = 0

    while not done:
        # env.render()
        action = model.predict(obs, deterministic=True)[0]  # Use model to predict action
        obs, reward, done, info = env.step(action)  # Wrap action in list for DummyVecEnv
        score += reward[0]

        # done = terminated[0] or truncated[0]  # Correctly handle the vectorized done flag
    print('Evaluation Episode:{} Score:{}'.format(episode, score))

env.close()

## Visualization using TensorBoard ##
# In your terminal, run: tensorboard --logdir=./logs/


Evaluation Episode:1 Score:1450.0
Evaluation Episode:2 Score:1100.0
Evaluation Episode:3 Score:1500.0
Evaluation Episode:4 Score:1500.0
Evaluation Episode:5 Score:1100.0


In [None]:
# !kill 14898
# %load_ext tensorboard
# pathoflogsdir = "/content/drive/MyDrive/Colab Notebooks/Asterix/Training/Logs_Asterix/CNN_Logs/"
# %tensorboard --logdir $pathoflogsdir

/bin/bash: line 1: kill: (14898) - No such process


ERROR: Failed to launch TensorBoard (exited with 2).
Contents of stderr:
2024-06-15 18:36:32.018864: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-15 18:36:32.018938: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-15 18:36:32.020318: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
usage: tensorboard [-h] [--helpfull] [--logdir PATH] [--logdir_spec PATH_SPEC] [--host ADDR]
                   [--bind_all] [--port PORT] [--reuse_port BOOL] [--load_fast {false,auto,true}]
                   [--extra_data_server_flags EXTRA_DATA_SERVER_FLAGS]
                   [--grpc_cre