In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
import os
import random
import time

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Imports all our hyperparameters from the other file
#from hyperparams_A import Hyperparameters as params
#from hyperparams_B import Hyperparameters as params
#from hyperparams_C import Hyperparameters as params
#from hyperparams_D import Hyperparameters as params
#from hyperparams_E import Hyperparameters as params
from hyperparams_F import Hyperparameters as params

In [3]:
for key, value in vars(params).items():
    print(f"{key}: {value}")

__module__: hyperparams_F
env_id: BreakoutNoFrameskip-v4
exp_name: DQN_Breakout
seed: 1
torch_deterministic: True
capture_video: True
save_model: True
total_timesteps: 10000000
learning_rate: 0.0001
buffer_size: 400000
gamma: 0.95
tau: 1
target_network_frequency: 1000
batch_size: 128
start_e: 1
end_e: 0.01
exploration_fraction: 0.05
learning_starts: 80000
train_frequency: 4
__dict__: <attribute '__dict__' of 'Hyperparameters' objects>
__weakref__: <attribute '__weakref__' of 'Hyperparameters' objects>
__doc__: None


In [4]:
# stable_baselines3 have wrappers that simplifies 
# the preprocessing a lot, read more about them here:
# https://stable-baselines3.readthedocs.io/en/master/common/atari_wrappers.html
from stable_baselines3.common.atari_wrappers import (
    ClipRewardEnv,
    EpisodicLifeEnv,
    FireResetEnv,
    MaxAndSkipEnv,
    NoopResetEnv,
)
from stable_baselines3.common.buffers import ReplayBuffer

In [5]:
# Creates our gym environment and with all our wrappers.
def make_env(env_id, seed, idx, capture_video, run_name):
    def thunk():
        env = gym.make(env_id)
        env = gym.wrappers.RecordEpisodeStatistics(env)
        if capture_video:
            if idx == 0:
                env = gym.wrappers.RecordVideo(env, f"videos/{run_name}")
        env = NoopResetEnv(env, noop_max=30)
        env = MaxAndSkipEnv(env, skip=4)
        env = EpisodicLifeEnv(env)
        if "FIRE" in env.unwrapped.get_action_meanings():
            env = FireResetEnv(env)
        env = ClipRewardEnv(env)
        env = gym.wrappers.ResizeObservation(env, (84, 84))
        env = gym.wrappers.GrayScaleObservation(env)
        env = gym.wrappers.FrameStack(env, 4)
        env.seed(seed)
        env.action_space.seed(seed)
        env.observation_space.seed(seed)
        return env

    return thunk

def linear_schedule(start_e: float, end_e: float, duration: int, t: int):
    slope = (end_e - start_e) / duration
    return max(slope * t + start_e, end_e)

In [6]:
class QNetwork(nn.Module):
    def __init__(self, env,in_channels):
        super().__init__()
        # TODO: #Done Define your network (agent)
        # Look at Section 4.1 in the paper for help: https://arxiv.org/pdf/1312.5602v1.pdf
        
        self.network = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=16, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=256, kernel_size=4, stride=2),
            nn.ReLU(),
        )
    
        conv_output_size = self.get_conv_output_size(in_channels)
        self.output_layer = nn.Linear(conv_output_size, env.single_action_space.n)

    def forward(self, x):
        #return self.network(x / 255.0)
        x = self.network(x)
        x = x.view(x.size(0), -1)  # Flatten the output from convolutional layers
        x = self.output_layer(x)
        return x
    
    def get_conv_output_size(self, input_channels):
        input_tensor = torch.rand(1, input_channels, 84, 84)
        conv_output = self.network(input_tensor)
        conv_output_size = conv_output.view(conv_output.size(0), -1).size(1)
        return conv_output_size

In [7]:
run_name = f"{params.env_id}__{params.exp_name}__{params.seed}__{int(time.time())}"

random.seed(params.seed)
np.random.seed(params.seed)
torch.manual_seed(params.seed)
torch.backends.cudnn.deterministic = params.torch_deterministic

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  # env setup
envs = gym.vector.SyncVectorEnv([make_env(params.env_id, params.seed, 0, params.capture_video, run_name)])
assert isinstance(envs.single_action_space, gym.spaces.Discrete), "only discrete action space is supported"

q_network = QNetwork(envs, 4).to(device)
optimizer = optim.Adam(q_network.parameters(), lr=params.learning_rate)
target_network = QNetwork(envs,4).to(device)
target_network.load_state_dict(q_network.state_dict())

# We’ll be using experience replay memory for training our DQN. 
# It stores the transitions that the agent observes, allowing us to reuse this data later. 
# By sampling from it randomly, the transitions that build up a batch are decorrelated. 
# It has been shown that this greatly stabilizes and improves the DQN training procedure.
rb = ReplayBuffer(
    params.buffer_size,
    envs.single_observation_space,
    envs.single_action_space,
    device,
    optimize_memory_usage=False,
    handle_timeout_termination=True,
)

obs = envs.reset()

  deprecation(
  deprecation(
  logger.deprecation(
  from pkg_resources import resource_filename
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages
  declare_namespace(pkg)
  if distutils.version.LooseVersion(
  deprecation(


In [8]:
episode_num=0

for global_step in range(params.total_timesteps):
#for global_step in range(100000):
    # Here we get epsilon for our epislon greedy.
    epsilon = linear_schedule(params.start_e, params.end_e, params.exploration_fraction * params.total_timesteps, global_step)

    if random.random() < epsilon:
        actions = envs.action_space.sample()                                # TODO: #DONE: sample a random action from the environment 
    else:
        q_values = q_network(torch.tensor(obs, dtype=torch.float32))        # TODO: #DONE : get q_values from the network you defined, what should the network receive as input?
        actions = torch.argmax(q_values, dim=1).cpu().numpy()

    # Take a step in the environment
    next_obs, rewards, dones, infos = envs.step(actions)

    # Here we print our reward.
    for info in infos:
        if "episode" in info.keys():
            print(f"global_step={global_step}, episodic_return={info['episode']['r']}")
            break

    # Save data to replay buffer
    real_next_obs = next_obs.copy()
    for idx, d in enumerate(dones):
        if d:
            real_next_obs[idx] = infos[idx]["terminal_observation"]

    # Here we store the transitions in D
    rb.add(obs, real_next_obs, actions, rewards, dones, infos)

    obs = next_obs
    # Training 
    if global_step > params.learning_starts:
        if global_step % params.train_frequency == 0:
            # Sample random minibatch of transitions from D
            data = rb.sample(params.batch_size)
            # You can get data with:
            # data.observation, data.rewards, data.dones, data.actions

            with torch.no_grad():
                # Now we calculate the y_j for non-terminal phi.
                next_observations_tensor = data.next_observations.to(torch.float32)
                next_q_values = target_network(next_observations_tensor)
                
                target_max, _ = torch.max(next_q_values, dim=1)
                td_target = data.rewards + params.gamma * (1 - data.dones) * target_max.unsqueeze(1)            # DONE: Calculate the td_target (y_j)

                #target_max, _ =    # TODO: Calculate max Q
                #td_target =        # TODO: Calculate the td_target (y_j)

            old_val = q_network(data.observations.to(torch.float32)).gather(1, data.actions).squeeze()          #DONE
            loss = F.mse_loss(old_val, td_target.squeeze())                                                     #DONE
            
            #old_val = q_network(?).gather(1, data.actions).squeeze()   # TODO:
            #loss = F.mse_loss(?, ?)                                    # TODO:

            # perform our gradient decent step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        
        # update target network
        if global_step % params.target_network_frequency == 0:
            for target_network_param, q_network_param in zip(target_network.parameters(), q_network.parameters()):
                target_network_param.data.copy_(
                    params.tau * q_network_param.data + (1.0 - params.tau) * target_network_param.data
                )

  logger.deprecation(


global_step=110, episodic_return=0.0
global_step=223, episodic_return=0.0
global_step=364, episodic_return=1.0
global_step=650, episodic_return=4.0
global_step=851, episodic_return=3.0
global_step=962, episodic_return=0.0
global_step=1125, episodic_return=1.0
global_step=1343, episodic_return=3.0
global_step=1528, episodic_return=2.0
global_step=1643, episodic_return=0.0
global_step=1758, episodic_return=0.0
global_step=2014, episodic_return=3.0
global_step=2172, episodic_return=1.0
global_step=2359, episodic_return=2.0
global_step=2470, episodic_return=0.0
global_step=2583, episodic_return=0.0
global_step=2724, episodic_return=1.0
global_step=2837, episodic_return=0.0
global_step=3006, episodic_return=2.0
global_step=3272, episodic_return=4.0
global_step=3460, episodic_return=2.0
global_step=3621, episodic_return=1.0
global_step=3736, episodic_return=0.0
global_step=3849, episodic_return=0.0
global_step=4006, episodic_return=1.0
global_step=4230, episodic_return=3.0
global_step=4341, 

KeyboardInterrupt: 

## Comment on Runs

### <font color="red"> BreakoutNoFrameskip-v4__DQN_Breakout__A</font>

<font color="cyan">
Default hyperparameters<br>
Decreased buffer size to 100.000 due to  memory restrictions<br>
We stopped the loop early as the agent seems to have converged to a sub optimal policy<br>
</font>

Best-performing videos are

rl-video-episode-3000.mp4
and
rl-video-episode-5000.mp4

which achieve a score of 59

after that the agent becomes unstable until in after global-step ~700.000 it seems to be locked into a very low performance



### <font color="red"> BreakoutNoFrameskip-v4__DQN_Breakout__B</font>

<font color="cyan">
Compared to Run A <br>
Set buffer size to 200.000 vs 100.000<br>
Set end_e = 0.05 from 0.01
set exploration factor to 0.2 from 0.1
</font>

If we look into the glob-step vs episodic return we see than initially the performance seems to be very similar to run A (even a bit better at around steps 100.000-150.000) but after that ther is a sudden catastrophic forgetting happens around step 175.000.
the performance seems to incease afterwards but slowly so we stopped due to time restrictions

### <font color="red"> BreakoutNoFrameskip-v4__DQN_Breakout__C</font>

<font color="cyan">
Same hyper parameters with Run A <br>
Set buffer size to 200.000 from 100.000<br>
</font>

following the same initial learning as in in run A the performance seems to reach higher levels at approximately step 1.000.000 and after but the first instability also appears sooner at arrpoximately step 1.650.000.

### <font color="red"> BreakoutNoFrameskip-v4__DQN_Breakout__D</font>

<font color="cyan">
Same hyper parameters with Run C <br>
but set exploration fraction to 0.333 from 0.1<br>
</font>

The increase of average episodic return is too slow<br>
at step ~1.300.000 100-rolling average episodic return is around 5 while in run A it was approximately 45<br>
stopped due to time constraints