In [7]:
import os
import re
import time
from datetime import datetime
from typing import Dict
from typing import List
from typing import Optional

import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from gymnasium.wrappers import AtariPreprocessing, FrameStack, RecordEpisodeStatistics, RecordVideo
from stable_baselines3.common.buffers import ReplayBuffer
from stable_baselines3.common.save_util import load_from_pkl, save_to_pkl
from torch.utils.tensorboard import SummaryWriter

# Device shared by the networks and the replay buffer: the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
def step_trigger(step: int):
    """
    Decide whether a video recording should start at the given step.

    Passed to ``RecordVideo`` in ``make_env`` as its ``step_trigger`` callback.

    :param step: the step counter to test
    :return: ``True`` when ``step`` is a multiple of 400,000 (0 included), ``False`` otherwise
    """
    steps_into_interval = step % 400_000
    return steps_into_interval == 0

def make_env(env_name="ALE/Pong-v5", seed=42):
    """
    Build the wrapped environment used for training.

    Wrapper order (innermost first): ``AtariPreprocessing`` -> ``FrameStack`` (4 frames)
    -> ``RecordEpisodeStatistics`` -> ``RecordVideo``.

    :param env_name: environment id given to ``gym.make``
    :param seed: seed applied to the observation space and the action space
    :return: the fully wrapped environment
    """
    # NOTE(review): frameskip=1 on the base env is presumably there because AtariPreprocessing
    # applies its own frame skipping -- confirm against the gymnasium documentation.
    base_env = gym.make(env_name, render_mode="rgb_array", full_action_space=False, frameskip=1)
    stacked_env = FrameStack(AtariPreprocessing(base_env), 4)
    monitored_env = RecordEpisodeStatistics(stacked_env)

    # A video is recorded every 400,000 steps (see ``step_trigger``).
    # A named function is used instead of a lambda because lambdas are not supported by cloud pickle.
    recorded_env = RecordVideo(monitored_env, "runs/videos/", step_trigger=step_trigger, video_length=1000)

    for space in (recorded_env.observation_space, recorded_env.action_space):
        space.seed(seed)

    return recorded_env

In [9]:
class DQN(nn.Module):
    """
    Convolutional Q-network: three conv layers, one hidden linear layer and a linear head
    that outputs one Q-value per action.
    """

    def __init__(self, num_channels, num_actions):
        """
        :param num_channels: number of input channels of the first convolution
        :param num_actions: number of outputs of the head layer (one Q-value per action)
        """
        super().__init__()

        self.conv1 = nn.Conv2d(in_channels=num_channels, out_channels=32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1)
        self.flatten = nn.Flatten()

        # For an 84x84 input the spatial size shrinks 84 -> 20 -> 9 -> 7 through the three
        # convolutions, so the flattened feature vector has 64 * 7 * 7 entries.
        self.linear = nn.Linear(64 * 7 * 7, 512)
        self.head = nn.Linear(512, num_actions)  # Head layer

    def forward(self, x):
        """
        :param x: batch of observations with pixel values in [0, 255]
        :return: tensor of Q-values, one row per observation and one column per action
        """
        features = x.float() / 255  # Rescale input from [0, 255] to [0, 1]
        for conv in (self.conv1, self.conv2, self.conv3):
            features = F.relu(conv(features))
        hidden = F.relu(self.linear(self.flatten(features)))
        return self.head(hidden)

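- **policy network**: $Q^{A}_{\theta}$ — the network being trained; it also selects the greedy action for the next state
- **target network**: $Q^{B}_{\theta^{-}}$ — evaluates the action selected by the policy network; its weights $\theta^{-}$ are a copy of $\theta$ that is synchronised periodically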

In [10]:

def linear_schedule(start_epsilon: float, end_epsilon: float, duration: int, timestep: int):
    """
    Linearly interpolate from ``start_epsilon`` to ``end_epsilon`` over ``duration`` steps.

    Once ``timestep`` reaches ``duration`` the value stays at ``end_epsilon``. Both decreasing
    (``start_epsilon > end_epsilon``) and increasing schedules are supported.

    :param start_epsilon: value at ``timestep == 0``
    :param end_epsilon: value reached at ``timestep == duration`` and kept afterwards
    :param duration: number of steps the interpolation takes; if it is zero or negative
        there is nothing to interpolate and ``end_epsilon`` is returned directly
    :param timestep: current step of the schedule
    :return: the scheduled value for ``timestep``
    """
    if duration <= 0:
        # Avoids a ZeroDivisionError when the schedule has no length
        return end_epsilon

    slope = (end_epsilon - start_epsilon) / duration
    value = slope * timestep + start_epsilon

    # Clamp on the side of the end value: from below when decaying, from above when growing
    if slope <= 0:
        return max(value, end_epsilon)
    return min(value, end_epsilon)

In [11]:
def get_latest_checkpoint_file(files: List[str]) -> Optional[str]:
    """
    Return the most recent checkpoint file from the passed list of files.

    If multiple files with the same datetime are passed, only the first is returned.
    Files whose name contains no datetime, or a datetime that is not a real calendar
    date/time (e.g. ``99-99-2023_10:00:00``), are ignored.

    :param files: list of file names containing a formatted datetime (=> %d-%m-%Y_%H:%M:%S)
    :return: the file with the most recent datetime, or ``None`` if no file name contains a
        correctly formatted datetime
    """
    datetime_format = "%d-%m-%Y_%H:%M:%S"
    datetime_regex = re.compile(r"\d{2}-\d{2}-\d{4}_\d{2}:\d{2}:\d{2}")

    latest_file = None
    latest_datetime = None
    for file in files:
        match = datetime_regex.search(file)
        if match is None:
            continue  # No datetime in this file name

        try:
            file_datetime = datetime.strptime(match.group(), datetime_format)
        except ValueError:
            # The regex only checks digits; out-of-range values are rejected here
            continue

        # Strict comparison so that the first of several equally dated files wins
        if latest_datetime is None or file_datetime > latest_datetime:
            latest_datetime = file_datetime
            latest_file = file

    return latest_file


class DQNAgent:
    def __init__(self, name="CNN_DDQN_Pong-v5", env_name="ALE/Pong-v5"):
        """
        Create the environment, replay buffer, networks and optimiser used for training.

        :param name: run name; prefixes the checkpoint, log and video folder names
        :param env_name: environment id forwarded to ``make_env``
        """
        self.name = name
        self.env = make_env(env_name)

        # Both are set by ``run`` for a fresh run, or by ``load_last_checkpoint`` when resuming
        self.start_datetime = None
        self.start_time = None

        # Hyperparameters, same as this model: https://huggingface.co/sb3/dqn-PongNoFrameskip-v4
        self.MAX_TIMESTEPS = 10_000_000  # ``run`` starts no new episode once this many steps were taken
        self.TARGET_UPDATE_INTERVAL = 1000  # Steps between two copies of the policy weights into the target network
        self.LEARNING_STARTS = 100_000  # Steps of purely random play that fill the replay buffer before training begins
        self.TRAIN_FREQUENCY = 4  # One optimisation step every 4 environment steps
        self.CHECKPOINT_INTERVAL_EPISODE = 1000  # A checkpoint is saved every X episodes

        self.REPLAY_SIZE = 10_000
        self.BATCH_SIZE = 32
        self.GAMMA = 0.99  # Discount rate

        # Fraction of 'MAX_TIMESTEPS' over which epsilon goes from 'EPSILON_INITIAL' to 'EPSILON_FINAL'
        self.EXPLORATION_FRACTION = 0.1
        self.EPSILON_INITIAL = 1.0
        self.EPSILON_FINAL = 0.01
        self.epsilon = self.EPSILON_INITIAL  # Current exploration probability

        buffer_options = dict(
            buffer_size=self.REPLAY_SIZE,
            observation_space=self.env.observation_space,
            action_space=self.env.action_space,
            device=device,
            optimize_memory_usage=True,
            handle_timeout_termination=False,
        )
        self.memory = ReplayBuffer(**buffer_options)

        self.timesteps = 0

        # 4 input channels: one per frame stacked by ``FrameStack(env, 4)`` in ``make_env``
        num_actions = self.env.action_space.n
        self.policy_network = DQN(4, num_actions).to(device)
        self.target_network = DQN(4, num_actions).to(device)
        # Both networks start from identical weights
        self.target_network.load_state_dict(self.policy_network.state_dict())

        self.optimizer = torch.optim.Adam(self.policy_network.parameters(), lr=0.0001)
        self.loss_fn = nn.SmoothL1Loss()

        # Output locations for metrics, checkpoints and videos
        self.PATH = "runs"
        if not os.path.exists(self.PATH):
            os.makedirs(self.PATH)

        self.CHECKPOINTS_PATH, self.LOGS_PATH, self.VIDEO_PATH = (
            f"{self.PATH}/{sub_folder}" for sub_folder in ("checkpoints", "logs", "videos")
        )

        self.is_loaded_from_checkpoint = False
        self.writer = None  # SummaryWriter, created in ``run``

    def remember(self, observation, next_observation, action, reward, done, infos):
        self.memory.add(observation, next_observation, action, reward, done, infos)

    def act(self, state):
        # Reduce epsilon when learning started
        if self.timesteps >= self.LEARNING_STARTS:
            # Minus LEARNING_STARTS to takes into account that learning only started after LEARNING_STARTS,
            # and so we want to start reducing epsilon only when learning start
            self.epsilon = linear_schedule(
                self.EPSILON_INITIAL,
                self.EPSILON_FINAL,
                int(self.EXPLORATION_FRACTION * self.MAX_TIMESTEPS),
                self.timesteps - self.LEARNING_STARTS
            )

        if self.timesteps < self.LEARNING_STARTS or np.random.rand() < self.epsilon:
            # Random action
            return np.array(self.env.action_space.sample())
        else:
            with torch.no_grad():
                state_tensor = torch.tensor(np.array(state), device=device).unsqueeze(0)
                q_values = self.policy_network(state_tensor)
                return q_values.argmax(dim=1)[0].cpu().numpy()

    def update_target_network(self):
        self.target_network.load_state_dict(self.policy_network.state_dict())

    def optimize_model(self):
        minibatch = self.memory.sample(self.BATCH_SIZE)

        # Calculate Q values for current states
        # For each q_values, get the action according to the minibatch
        q_values = self.policy_network(minibatch.observations).gather(1, minibatch.actions)

        # Then, calculate the best actions for the next states, and return its indices
        with torch.no_grad():
            best_next_actions = self.policy_network(minibatch.next_observations).argmax(1).unsqueeze(1)

        # Calculate the Q values for the next states using the target network, and return the action according to the best next action returned by the q network
        target_next_q_values = self.target_network(minibatch.next_observations).gather(1, best_next_actions)

        # Calculate the target Q values using Double DQN
        target_q_values = minibatch.rewards + (1 - minibatch.dones) * self.GAMMA * target_next_q_values

        # Compute the loss
        loss = self.loss_fn(q_values, target_q_values)

        # Compute metrics for loss
        if self.timesteps % 100 == 0:
            self.writer.add_scalar("train/loss", loss, self.timesteps)
            self.writer.add_scalar("train/q_values", q_values.squeeze().mean().item(), self.timesteps)
            steps_per_second = int(self.timesteps / (time.time() - self.start_time))
            #print("Steps per second: ", steps_per_second)
            self.writer.add_scalar("train/steps_per_second", steps_per_second, self.timesteps)


        # Optimise Q network
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def save_checkpoint(self):
        if self.start_datetime is None:
            print("SAVE_CHECKPOINT_ERROR: Training need to have started to save a checkpoint.")
            return

        print("Saving checkpoint...")
        current_datetime_str = datetime.now().strftime("%d-%m-%Y_%H:%M:%S")
        start_datetime_str = self.start_datetime.strftime("%d-%m-%Y_%H:%M:%S")

        save_parent_directory = f"{self.CHECKPOINTS_PATH}/{self.name}_{start_datetime_str}"
        save_path = save_parent_directory + "/chkpt_" + current_datetime_str + ".tar"
        replay_buffer_path = save_parent_directory + "/replay_buffer_" + current_datetime_str

        if not os.path.exists(save_parent_directory):
            os.makedirs(save_parent_directory)

        checkpoint = {
            "env": self.env,
            "timesteps": self.timesteps,
            "start_datetime": self.start_datetime,
            "epsilon": self.epsilon,
            "policy_network": self.policy_network.state_dict(),
            "target_network": self.target_network.state_dict(),
            "optimizer": self.optimizer.state_dict(),
        }

        torch.save(checkpoint, save_path)
        # Saving the replay buffer will takes time! But it is needed to properly resume training
        save_to_pkl(replay_buffer_path, self.memory, verbose=1)

        print(f"Checkpoint saved into {save_parent_directory}")

    def load_last_checkpoint(self, path):
        """
        Load the last saved checkpoint found in the given ``path``

        :param path: the path to the directory containing the checkpoint(s)
        """
        print(f"Loading most recent checkpoint from {path}")
        self.is_loaded_from_checkpoint = True

        # Using list comprehension to filter directories and only get the files
        files = [file for file in os.listdir(path) if os.path.isfile(os.path.join(path, file))]

        checkpoint_files = [chkpt_file for chkpt_file in files if "chkpt" in chkpt_file]
        replay_buffer_files = [chkpt_file for chkpt_file in files if "replay_buffer" in chkpt_file]

        checkpoint_file = get_latest_checkpoint_file(checkpoint_files)
        replay_buffer_file = get_latest_checkpoint_file(replay_buffer_files)

        checkpoint: Dict[str, any] = torch.load(path + "/" + checkpoint_file)

        self.env = checkpoint["env"]
        self.timesteps = checkpoint["timesteps"]
        self.start_datetime: datetime = checkpoint["start_datetime"]
        self.start_time = self.start_datetime.timestamp()

        self.epsilon = checkpoint["epsilon"]

        self.policy_network.load_state_dict(checkpoint["policy_network"])
        self.target_network.load_state_dict(checkpoint["target_network"])
        self.optimizer.load_state_dict(checkpoint["optimizer"])

        self.memory: ReplayBuffer = load_from_pkl(path + "/" + replay_buffer_file)
        print("Checkpoint successfully loaded, you can resume the training now.")

    def run(self):
        """
        Train the agent until ``MAX_TIMESTEPS`` is reached.

        A fresh run records the current datetime as its start; a run resumed through
        ``load_last_checkpoint`` keeps the start datetime of the checkpoint, so logs and videos
        go into the same folders as before.

        Per episode the rollout metrics are logged and printed, and a checkpoint is saved every
        ``CHECKPOINT_INTERVAL_EPISODE`` episodes. A final checkpoint is saved when training
        completes normally. The summary writer is flushed and closed in every case, also when
        training is interrupted or fails.
        """
        # Either start a new run or resume from the previous one
        if not self.is_loaded_from_checkpoint:
            current_datetime = datetime.now()
            self.start_datetime = current_datetime
            self.start_time = current_datetime.timestamp()

        start_datetime_str = self.start_datetime.strftime("%d-%m-%Y_%H:%M:%S")

        video_folder_path = f"{self.VIDEO_PATH}/{self.name}_{start_datetime_str}"
        os.makedirs(video_folder_path, exist_ok=True)
        self.env.video_folder = video_folder_path

        self.writer = SummaryWriter(f"{self.LOGS_PATH}/{self.name}_{start_datetime_str}")
        try:
            # The limit is only checked between episodes, so the last episode may overshoot it
            while self.timesteps < self.MAX_TIMESTEPS:
                state, _ = self.env.reset()
                done = False

                while not done:
                    self.timesteps += 1

                    action = self.act(state)
                    next_state, reward, terminated, truncated, info = self.env.step(action)
                    done = terminated or truncated

                    # Only ``terminated`` is stored as the done flag of the transition.
                    # NOTE(review): the buffer is created with optimize_memory_usage=True; check in
                    # the stable-baselines3 documentation how truncated episodes are handled there.
                    self.remember(state, next_state, action, reward, terminated, info)

                    if self.timesteps >= self.LEARNING_STARTS and self.timesteps % self.TRAIN_FREQUENCY == 0:
                        self.optimize_model()

                    state = next_state

                    if done:
                        mean_reward = np.mean(self.env.return_queue)
                        mean_length = np.mean(self.env.length_queue)

                        # Get episode statistics from info ("episode" key only exists when the episode is done)
                        episode_reward = info["episode"]["r"]
                        self.writer.add_scalar("rollout/episodic_return", episode_reward, self.timesteps)
                        self.writer.add_scalar("rollout/episodic_length", info["episode"]["l"], self.timesteps)

                        self.writer.add_scalar("rollout/ep_rew_mean", mean_reward, self.timesteps)
                        self.writer.add_scalar("rollout/ep_len_mean", mean_length, self.timesteps)

                        self.writer.add_scalar("rollout/exploration_rate", self.epsilon, self.timesteps)

                        print(f"Episode {self.env.episode_count} finished (timesteps: {self.timesteps}/{self.MAX_TIMESTEPS})\n"
                              f"Epsilon: {self.epsilon:.2f}, Episode reward: {episode_reward.item()}, Mean reward: {mean_reward:.2f}")

                        if self.env.episode_count % self.CHECKPOINT_INTERVAL_EPISODE == 0:
                            self.save_checkpoint()
                        print("***************************")

                    if self.timesteps >= self.LEARNING_STARTS and self.timesteps % self.TARGET_UPDATE_INTERVAL == 0:
                        self.update_target_network()

            self.save_checkpoint()  # Save last checkpoint at the end of training
        finally:
            # Make sure buffered metrics reach the disk even if training stops early
            self.writer.flush()
            self.writer.close()

In [12]:
# Create a fresh agent (builds the environment, the replay buffer and both networks).
agent = DQNAgent()
# To resume a previous run instead of starting from scratch, uncomment the next line and
# point it at the checkpoint directory of that run.
#agent.load_last_checkpoint("runs/checkpoints/cnn_ddqn_ALE-Pong-v5_08-04-2023_20:00:00")
# Train until MAX_TIMESTEPS is reached; progress is printed after every episode.
agent.run()

  logger.warn(


Episode 1 finished (timesteps: 974/10000000)
Epsilon: 1.00, Episode reward: -20.0, Mean reward: -20.00
***************************
Moviepy - Building video runs/videos/CNN_DDQN_Pong-v5_11-04-2023_19:37:01/rl-video-step-0.mp4.
Moviepy - Writing video runs/videos/CNN_DDQN_Pong-v5_11-04-2023_19:37:01/rl-video-step-0.mp4



                                                                 

Moviepy - Done !
Moviepy - video ready runs/videos/CNN_DDQN_Pong-v5_11-04-2023_19:37:01/rl-video-step-0.mp4
Episode 2 finished (timesteps: 1765/10000000)
Epsilon: 1.00, Episode reward: -21.0, Mean reward: -20.50
***************************
Episode 3 finished (timesteps: 2709/10000000)
Epsilon: 1.00, Episode reward: -20.0, Mean reward: -20.33
***************************
Episode 4 finished (timesteps: 3745/10000000)
Epsilon: 1.00, Episode reward: -19.0, Mean reward: -20.00
***************************
Episode 5 finished (timesteps: 4921/10000000)
Epsilon: 1.00, Episode reward: -18.0, Mean reward: -19.60
***************************
Episode 6 finished (timesteps: 5904/10000000)
Epsilon: 1.00, Episode reward: -20.0, Mean reward: -19.67
***************************
Episode 7 finished (timesteps: 6761/10000000)
Epsilon: 1.00, Episode reward: -20.0, Mean reward: -19.71
***************************
Episode 8 finished (timesteps: 7884/10000000)
Epsilon: 1.00, Episode reward: -19.0, Mean reward: -19

                                                                 

Moviepy - Done !
Moviepy - video ready runs/videos/CNN_DDQN_Pong-v5_11-04-2023_19:37:01/rl-video-step-400000.mp4
Episode 418 finished (timesteps: 401688/10000000)
Epsilon: 0.70, Episode reward: -18.0, Mean reward: -19.70
***************************
Episode 419 finished (timesteps: 403103/10000000)
Epsilon: 0.70, Episode reward: -19.0, Mean reward: -19.69
***************************
Episode 420 finished (timesteps: 404483/10000000)
Epsilon: 0.70, Episode reward: -18.0, Mean reward: -19.67
***************************
Episode 421 finished (timesteps: 405514/10000000)
Epsilon: 0.70, Episode reward: -20.0, Mean reward: -19.66
***************************
Episode 422 finished (timesteps: 406508/10000000)
Epsilon: 0.70, Episode reward: -21.0, Mean reward: -19.67
***************************
Episode 423 finished (timesteps: 407710/10000000)
Epsilon: 0.70, Episode reward: -19.0, Mean reward: -19.66
***************************
Episode 424 finished (timesteps: 409105/10000000)
Epsilon: 0.69, Episod

                                                                 

Moviepy - Done !
Moviepy - video ready runs/videos/CNN_DDQN_Pong-v5_11-04-2023_19:37:01/rl-video-step-800000.mp4
Episode 697 finished (timesteps: 801629/10000000)
Epsilon: 0.31, Episode reward: -16.0, Mean reward: -17.33
***************************
Episode 698 finished (timesteps: 803436/10000000)
Epsilon: 0.30, Episode reward: -14.0, Mean reward: -17.29
***************************
Episode 699 finished (timesteps: 805644/10000000)
Epsilon: 0.30, Episode reward: -12.0, Mean reward: -17.22
***************************
Episode 700 finished (timesteps: 807464/10000000)
Epsilon: 0.30, Episode reward: -17.0, Mean reward: -17.21
***************************
Episode 701 finished (timesteps: 809128/10000000)
Epsilon: 0.30, Episode reward: -17.0, Mean reward: -17.21
***************************
Episode 702 finished (timesteps: 810903/10000000)
Epsilon: 0.30, Episode reward: -13.0, Mean reward: -17.17
***************************
Episode 703 finished (timesteps: 812756/10000000)
Epsilon: 0.29, Episod

                                                                 

Moviepy - Done !
Moviepy - video ready runs/videos/CNN_DDQN_Pong-v5_11-04-2023_19:37:01/rl-video-step-1200000.mp4
Episode 876 finished (timesteps: 1201938/10000000)
Epsilon: 0.01, Episode reward: -7.0, Mean reward: -12.99
***************************
Episode 877 finished (timesteps: 1205317/10000000)
Epsilon: 0.01, Episode reward: -12.0, Mean reward: -12.95
***************************
Episode 878 finished (timesteps: 1208380/10000000)
Epsilon: 0.01, Episode reward: -8.0, Mean reward: -12.87
***************************
Episode 879 finished (timesteps: 1211853/10000000)
Epsilon: 0.01, Episode reward: -8.0, Mean reward: -12.81
***************************
Episode 880 finished (timesteps: 1214007/10000000)
Epsilon: 0.01, Episode reward: -17.0, Mean reward: -12.82
***************************
Episode 881 finished (timesteps: 1216243/10000000)
Epsilon: 0.01, Episode reward: -16.0, Mean reward: -12.81
***************************
Episode 882 finished (timesteps: 1219652/10000000)
Epsilon: 0.01, E

                                                                 

Moviepy - Done !
Moviepy - video ready runs/videos/CNN_DDQN_Pong-v5_11-04-2023_19:37:01/rl-video-step-1600000.mp4
Episode 989 finished (timesteps: 1605276/10000000)
Epsilon: 0.01, Episode reward: -8.0, Mean reward: -8.80
***************************
Episode 990 finished (timesteps: 1608552/10000000)
Epsilon: 0.01, Episode reward: -10.0, Mean reward: -8.80
***************************
Episode 991 finished (timesteps: 1611259/10000000)
Epsilon: 0.01, Episode reward: -14.0, Mean reward: -8.81
***************************
Episode 992 finished (timesteps: 1614883/10000000)
Epsilon: 0.01, Episode reward: -8.0, Mean reward: -8.78
***************************
Episode 993 finished (timesteps: 1618362/10000000)
Epsilon: 0.01, Episode reward: 5.0, Mean reward: -8.76
***************************
Episode 994 finished (timesteps: 1621619/10000000)
Epsilon: 0.01, Episode reward: -6.0, Mean reward: -8.71
***************************
Episode 995 finished (timesteps: 1625400/10000000)
Epsilon: 0.01, Episode r

                                                                 

Moviepy - Done !
Moviepy - video ready runs/videos/CNN_DDQN_Pong-v5_11-04-2023_19:37:01/rl-video-step-2000000.mp4
Episode 1106 finished (timesteps: 2002349/10000000)
Epsilon: 0.01, Episode reward: -6.0, Mean reward: -4.91
***************************
Episode 1107 finished (timesteps: 2006988/10000000)
Epsilon: 0.01, Episode reward: -2.0, Mean reward: -4.84
***************************
Episode 1108 finished (timesteps: 2011518/10000000)
Epsilon: 0.01, Episode reward: -5.0, Mean reward: -4.79
***************************
Episode 1109 finished (timesteps: 2015519/10000000)
Epsilon: 0.01, Episode reward: -5.0, Mean reward: -4.71
***************************
Episode 1110 finished (timesteps: 2019655/10000000)
Epsilon: 0.01, Episode reward: -6.0, Mean reward: -4.71
***************************
Episode 1111 finished (timesteps: 2023499/10000000)
Epsilon: 0.01, Episode reward: 8.0, Mean reward: -4.59
***************************
Episode 1112 finished (timesteps: 2027077/10000000)
Epsilon: 0.01, Epis

                                                                 

Moviepy - Done !
Moviepy - video ready runs/videos/CNN_DDQN_Pong-v5_11-04-2023_19:37:01/rl-video-step-2400000.mp4
Episode 1213 finished (timesteps: 2402631/10000000)
Epsilon: 0.01, Episode reward: 4.0, Mean reward: 0.14
***************************
Episode 1214 finished (timesteps: 2406112/10000000)
Epsilon: 0.01, Episode reward: 5.0, Mean reward: 0.16
***************************
Episode 1215 finished (timesteps: 2408710/10000000)
Epsilon: 0.01, Episode reward: 9.0, Mean reward: 0.19
***************************
Episode 1216 finished (timesteps: 2411942/10000000)
Epsilon: 0.01, Episode reward: 9.0, Mean reward: 0.21
***************************
Episode 1217 finished (timesteps: 2415356/10000000)
Epsilon: 0.01, Episode reward: 12.0, Mean reward: 0.37
***************************
Episode 1218 finished (timesteps: 2418416/10000000)
Epsilon: 0.01, Episode reward: 7.0, Mean reward: 0.51
***************************
Episode 1219 finished (timesteps: 2421908/10000000)
Epsilon: 0.01, Episode reward

                                                                 

Moviepy - Done !
Moviepy - video ready runs/videos/CNN_DDQN_Pong-v5_11-04-2023_19:37:01/rl-video-step-2800000.mp4
Episode 1329 finished (timesteps: 2803091/10000000)
Epsilon: 0.01, Episode reward: 9.0, Mean reward: 4.16
***************************
Episode 1330 finished (timesteps: 2806370/10000000)
Epsilon: 0.01, Episode reward: 6.0, Mean reward: 4.21
***************************
Episode 1331 finished (timesteps: 2808999/10000000)
Epsilon: 0.01, Episode reward: 11.0, Mean reward: 4.40
***************************
Episode 1332 finished (timesteps: 2811521/10000000)
Epsilon: 0.01, Episode reward: 10.0, Mean reward: 4.52
***************************
Episode 1333 finished (timesteps: 2814220/10000000)
Epsilon: 0.01, Episode reward: 9.0, Mean reward: 4.68
***************************
Episode 1334 finished (timesteps: 2817440/10000000)
Epsilon: 0.01, Episode reward: 11.0, Mean reward: 4.77
***************************
Episode 1335 finished (timesteps: 2820539/10000000)
Epsilon: 0.01, Episode rewa

                                                                 

Moviepy - Done !
Moviepy - video ready runs/videos/CNN_DDQN_Pong-v5_11-04-2023_19:37:01/rl-video-step-3200000.mp4
Episode 1462 finished (timesteps: 3202861/10000000)
Epsilon: 0.01, Episode reward: 11.0, Mean reward: 9.30
***************************
Episode 1463 finished (timesteps: 3204845/10000000)
Epsilon: 0.01, Episode reward: 16.0, Mean reward: 9.36
***************************
Episode 1464 finished (timesteps: 3207298/10000000)
Epsilon: 0.01, Episode reward: 10.0, Mean reward: 9.30
***************************
Episode 1465 finished (timesteps: 3210004/10000000)
Epsilon: 0.01, Episode reward: 12.0, Mean reward: 9.34
***************************
Episode 1466 finished (timesteps: 3212572/10000000)
Epsilon: 0.01, Episode reward: 11.0, Mean reward: 9.27
***************************
Episode 1467 finished (timesteps: 3215663/10000000)
Epsilon: 0.01, Episode reward: 6.0, Mean reward: 9.21
***************************
Episode 1468 finished (timesteps: 3217913/10000000)
Epsilon: 0.01, Episode re

                                                                 

Moviepy - Done !
Moviepy - video ready runs/videos/CNN_DDQN_Pong-v5_11-04-2023_19:37:01/rl-video-step-3600000.mp4
Episode 1615 finished (timesteps: 3601303/10000000)
Epsilon: 0.01, Episode reward: 18.0, Mean reward: 12.61
***************************
Episode 1616 finished (timesteps: 3603974/10000000)
Epsilon: 0.01, Episode reward: 12.0, Mean reward: 12.60
***************************
Episode 1617 finished (timesteps: 3606761/10000000)
Epsilon: 0.01, Episode reward: 11.0, Mean reward: 12.52
***************************
Episode 1618 finished (timesteps: 3609250/10000000)
Epsilon: 0.01, Episode reward: 15.0, Mean reward: 12.51
***************************
Episode 1619 finished (timesteps: 3611107/10000000)
Epsilon: 0.01, Episode reward: 20.0, Mean reward: 12.51
***************************
Episode 1620 finished (timesteps: 3614033/10000000)
Epsilon: 0.01, Episode reward: 11.0, Mean reward: 12.46
***************************
Episode 1621 finished (timesteps: 3617274/10000000)
Epsilon: 0.01, Epi

                                                                 

Moviepy - Done !
Moviepy - video ready runs/videos/CNN_DDQN_Pong-v5_11-04-2023_19:37:01/rl-video-step-4000000.mp4
Episode 1764 finished (timesteps: 4002720/10000000)
Epsilon: 0.01, Episode reward: 15.0, Mean reward: 12.44
***************************
Episode 1765 finished (timesteps: 4004902/10000000)
Epsilon: 0.01, Episode reward: 18.0, Mean reward: 12.51
***************************
Episode 1766 finished (timesteps: 4007526/10000000)
Epsilon: 0.01, Episode reward: 15.0, Mean reward: 12.51
***************************
Episode 1767 finished (timesteps: 4010378/10000000)
Epsilon: 0.01, Episode reward: 13.0, Mean reward: 12.50
***************************
Episode 1768 finished (timesteps: 4013028/10000000)
Epsilon: 0.01, Episode reward: 12.0, Mean reward: 12.47
***************************
Episode 1769 finished (timesteps: 4016376/10000000)
Epsilon: 0.01, Episode reward: 6.0, Mean reward: 12.37
***************************
Episode 1770 finished (timesteps: 4018627/10000000)
Epsilon: 0.01, Epis

                                                                 

Moviepy - Done !
Moviepy - video ready runs/videos/CNN_DDQN_Pong-v5_11-04-2023_19:37:01/rl-video-step-4400000.mp4
Episode 1922 finished (timesteps: 4402756/10000000)
Epsilon: 0.01, Episode reward: 4.0, Mean reward: 12.49
***************************
Episode 1923 finished (timesteps: 4405073/10000000)
Epsilon: 0.01, Episode reward: 17.0, Mean reward: 12.56
***************************
Episode 1924 finished (timesteps: 4408389/10000000)
Epsilon: 0.01, Episode reward: 7.0, Mean reward: 12.50
***************************
Episode 1925 finished (timesteps: 4410878/10000000)
Epsilon: 0.01, Episode reward: 10.0, Mean reward: 12.47
***************************
Episode 1926 finished (timesteps: 4413081/10000000)
Epsilon: 0.01, Episode reward: 17.0, Mean reward: 12.48
***************************
Episode 1927 finished (timesteps: 4415290/10000000)
Epsilon: 0.01, Episode reward: 16.0, Mean reward: 12.48
***************************
Episode 1928 finished (timesteps: 4417454/10000000)
Epsilon: 0.01, Episo

                                                                 

Moviepy - Done !
Moviepy - video ready runs/videos/CNN_DDQN_Pong-v5_11-04-2023_19:37:01/rl-video-step-4800000.mp4
Episode 2083 finished (timesteps: 4803184/10000000)
Epsilon: 0.01, Episode reward: 11.0, Mean reward: 14.25
***************************
Episode 2084 finished (timesteps: 4806123/10000000)
Epsilon: 0.01, Episode reward: 12.0, Mean reward: 14.28
***************************
Episode 2085 finished (timesteps: 4808905/10000000)
Epsilon: 0.01, Episode reward: 14.0, Mean reward: 14.39
***************************
Episode 2086 finished (timesteps: 4811318/10000000)
Epsilon: 0.01, Episode reward: 14.0, Mean reward: 14.37
***************************
Episode 2087 finished (timesteps: 4814302/10000000)
Epsilon: 0.01, Episode reward: 11.0, Mean reward: 14.30
***************************
Episode 2088 finished (timesteps: 4816701/10000000)
Epsilon: 0.01, Episode reward: 18.0, Mean reward: 14.30
***************************
Episode 2089 finished (timesteps: 4819256/10000000)
Epsilon: 0.01, Epi

                                                                 

Moviepy - Done !
Moviepy - video ready runs/videos/CNN_DDQN_Pong-v5_11-04-2023_19:37:01/rl-video-step-5200000.mp4
Episode 2241 finished (timesteps: 5203036/10000000)
Epsilon: 0.01, Episode reward: 15.0, Mean reward: 13.59
***************************
Episode 2242 finished (timesteps: 5205660/10000000)
Epsilon: 0.01, Episode reward: 11.0, Mean reward: 13.53
***************************
Episode 2243 finished (timesteps: 5208268/10000000)
Epsilon: 0.01, Episode reward: 11.0, Mean reward: 13.56
***************************
Episode 2244 finished (timesteps: 5210238/10000000)
Epsilon: 0.01, Episode reward: 16.0, Mean reward: 13.61
***************************
Episode 2245 finished (timesteps: 5212361/10000000)
Epsilon: 0.01, Episode reward: 17.0, Mean reward: 13.65
***************************
Episode 2246 finished (timesteps: 5214798/10000000)
Epsilon: 0.01, Episode reward: 12.0, Mean reward: 13.67
***************************
Episode 2247 finished (timesteps: 5216994/10000000)
Epsilon: 0.01, Epi

                                                                 

Moviepy - Done !
Moviepy - video ready runs/videos/CNN_DDQN_Pong-v5_11-04-2023_19:37:01/rl-video-step-5600000.mp4
Episode 2403 finished (timesteps: 5601867/10000000)
Epsilon: 0.01, Episode reward: 15.0, Mean reward: 13.36
***************************
Episode 2404 finished (timesteps: 5603725/10000000)
Epsilon: 0.01, Episode reward: 20.0, Mean reward: 13.38
***************************
Episode 2405 finished (timesteps: 5605910/10000000)
Epsilon: 0.01, Episode reward: 15.0, Mean reward: 13.39
***************************
Episode 2406 finished (timesteps: 5607730/10000000)
Epsilon: 0.01, Episode reward: 18.0, Mean reward: 13.39
***************************
Episode 2407 finished (timesteps: 5609861/10000000)
Epsilon: 0.01, Episode reward: 14.0, Mean reward: 13.36
***************************
Episode 2408 finished (timesteps: 5611852/10000000)
Epsilon: 0.01, Episode reward: 17.0, Mean reward: 13.39
***************************
Episode 2409 finished (timesteps: 5613994/10000000)
Epsilon: 0.01, Epi

                                                                 

Moviepy - Done !
Moviepy - video ready runs/videos/CNN_DDQN_Pong-v5_11-04-2023_19:37:01/rl-video-step-6000000.mp4
Episode 2562 finished (timesteps: 6002775/10000000)
Epsilon: 0.01, Episode reward: 15.0, Mean reward: 13.07
***************************
Episode 2563 finished (timesteps: 6005209/10000000)
Epsilon: 0.01, Episode reward: 13.0, Mean reward: 13.10
***************************
Episode 2564 finished (timesteps: 6007377/10000000)
Epsilon: 0.01, Episode reward: 16.0, Mean reward: 13.08
***************************
Episode 2565 finished (timesteps: 6009709/10000000)
Epsilon: 0.01, Episode reward: 15.0, Mean reward: 13.10
***************************
Episode 2566 finished (timesteps: 6011696/10000000)
Epsilon: 0.01, Episode reward: 19.0, Mean reward: 13.19
***************************
Episode 2567 finished (timesteps: 6013856/10000000)
Epsilon: 0.01, Episode reward: 18.0, Mean reward: 13.23
***************************
Episode 2568 finished (timesteps: 6016289/10000000)
Epsilon: 0.01, Epi

                                                                 

Moviepy - Done !
Moviepy - video ready runs/videos/CNN_DDQN_Pong-v5_11-04-2023_19:37:01/rl-video-step-6400000.mp4
Episode 2726 finished (timesteps: 6401161/10000000)
Epsilon: 0.01, Episode reward: 10.0, Mean reward: 13.27
***************************
Episode 2727 finished (timesteps: 6403924/10000000)
Epsilon: 0.01, Episode reward: 14.0, Mean reward: 13.28
***************************
Episode 2728 finished (timesteps: 6406216/10000000)
Epsilon: 0.01, Episode reward: 13.0, Mean reward: 13.30
***************************
Episode 2729 finished (timesteps: 6408289/10000000)
Epsilon: 0.01, Episode reward: 17.0, Mean reward: 13.34
***************************
Episode 2730 finished (timesteps: 6410459/10000000)
Epsilon: 0.01, Episode reward: 15.0, Mean reward: 13.46
***************************
Episode 2731 finished (timesteps: 6412598/10000000)
Epsilon: 0.01, Episode reward: 17.0, Mean reward: 13.45
***************************
Episode 2732 finished (timesteps: 6414945/10000000)
Epsilon: 0.01, Epi

                                                                 

Moviepy - Done !
Moviepy - video ready runs/videos/CNN_DDQN_Pong-v5_11-04-2023_19:37:01/rl-video-step-6800000.mp4
Episode 2893 finished (timesteps: 6802272/10000000)
Epsilon: 0.01, Episode reward: 12.0, Mean reward: 13.73
***************************
Episode 2894 finished (timesteps: 6804912/10000000)
Epsilon: 0.01, Episode reward: 11.0, Mean reward: 13.76
***************************
Episode 2895 finished (timesteps: 6807173/10000000)
Epsilon: 0.01, Episode reward: 15.0, Mean reward: 13.79
***************************
Episode 2896 finished (timesteps: 6809365/10000000)
Epsilon: 0.01, Episode reward: 15.0, Mean reward: 13.88
***************************
Episode 2897 finished (timesteps: 6812035/10000000)
Epsilon: 0.01, Episode reward: 13.0, Mean reward: 13.90
***************************
Episode 2898 finished (timesteps: 6814896/10000000)
Epsilon: 0.01, Episode reward: 8.0, Mean reward: 13.81
***************************
Episode 2899 finished (timesteps: 6817280/10000000)
Epsilon: 0.01, Epis

                                                                 

Moviepy - Done !
Moviepy - video ready runs/videos/CNN_DDQN_Pong-v5_11-04-2023_19:37:01/rl-video-step-7200000.mp4
Episode 3061 finished (timesteps: 7201668/10000000)
Epsilon: 0.01, Episode reward: 11.0, Mean reward: 14.66
***************************
Episode 3062 finished (timesteps: 7203576/10000000)
Epsilon: 0.01, Episode reward: 19.0, Mean reward: 14.73
***************************
Episode 3063 finished (timesteps: 7205864/10000000)
Epsilon: 0.01, Episode reward: 13.0, Mean reward: 14.71
***************************
Episode 3064 finished (timesteps: 7208238/10000000)
Epsilon: 0.01, Episode reward: 15.0, Mean reward: 14.68
***************************
Episode 3065 finished (timesteps: 7210598/10000000)
Epsilon: 0.01, Episode reward: 14.0, Mean reward: 14.64
***************************
Episode 3066 finished (timesteps: 7213173/10000000)
Epsilon: 0.01, Episode reward: 13.0, Mean reward: 14.61
***************************
Episode 3067 finished (timesteps: 7215278/10000000)
Epsilon: 0.01, Epi

                                                                 

Moviepy - Done !
Moviepy - video ready runs/videos/CNN_DDQN_Pong-v5_11-04-2023_19:37:01/rl-video-step-7600000.mp4
Episode 3237 finished (timesteps: 7601558/10000000)
Epsilon: 0.01, Episode reward: 14.0, Mean reward: 14.67
***************************
Episode 3238 finished (timesteps: 7603867/10000000)
Epsilon: 0.01, Episode reward: 15.0, Mean reward: 14.75
***************************
Episode 3239 finished (timesteps: 7606035/10000000)
Epsilon: 0.01, Episode reward: 16.0, Mean reward: 14.75
***************************
Episode 3240 finished (timesteps: 7608182/10000000)
Epsilon: 0.01, Episode reward: 18.0, Mean reward: 14.76
***************************
Episode 3241 finished (timesteps: 7610532/10000000)
Epsilon: 0.01, Episode reward: 15.0, Mean reward: 14.75
***************************
Episode 3242 finished (timesteps: 7612537/10000000)
Epsilon: 0.01, Episode reward: 17.0, Mean reward: 14.75
***************************
Episode 3243 finished (timesteps: 7615036/10000000)
Epsilon: 0.01, Epi

                                                                 

Moviepy - Done !
Moviepy - video ready runs/videos/CNN_DDQN_Pong-v5_11-04-2023_19:37:01/rl-video-step-8000000.mp4
Episode 3414 finished (timesteps: 8003047/10000000)
Epsilon: 0.01, Episode reward: 16.0, Mean reward: 14.58
***************************
Episode 3415 finished (timesteps: 8005261/10000000)
Epsilon: 0.01, Episode reward: 15.0, Mean reward: 14.59
***************************
Episode 3416 finished (timesteps: 8007289/10000000)
Epsilon: 0.01, Episode reward: 15.0, Mean reward: 14.59
***************************
Episode 3417 finished (timesteps: 8009294/10000000)
Epsilon: 0.01, Episode reward: 18.0, Mean reward: 14.67
***************************
Episode 3418 finished (timesteps: 8011463/10000000)
Epsilon: 0.01, Episode reward: 14.0, Mean reward: 14.69
***************************
Episode 3419 finished (timesteps: 8013302/10000000)
Epsilon: 0.01, Episode reward: 19.0, Mean reward: 14.72
***************************
Episode 3420 finished (timesteps: 8015262/10000000)
Epsilon: 0.01, Epi

                                                                 

Moviepy - Done !
Moviepy - video ready runs/videos/CNN_DDQN_Pong-v5_11-04-2023_19:37:01/rl-video-step-8400000.mp4
Episode 3591 finished (timesteps: 8401759/10000000)
Epsilon: 0.01, Episode reward: 20.0, Mean reward: 14.60
***************************
Episode 3592 finished (timesteps: 8404275/10000000)
Epsilon: 0.01, Episode reward: 16.0, Mean reward: 14.57
***************************
Episode 3593 finished (timesteps: 8406958/10000000)
Epsilon: 0.01, Episode reward: 10.0, Mean reward: 14.49
***************************
Episode 3594 finished (timesteps: 8409428/10000000)
Epsilon: 0.01, Episode reward: 11.0, Mean reward: 14.48
***************************
Episode 3595 finished (timesteps: 8411433/10000000)
Epsilon: 0.01, Episode reward: 18.0, Mean reward: 14.59
***************************
Episode 3596 finished (timesteps: 8413123/10000000)
Epsilon: 0.01, Episode reward: 20.0, Mean reward: 14.72
***************************
Episode 3597 finished (timesteps: 8415212/10000000)
Epsilon: 0.01, Epi

                                                                 

Moviepy - Done !
Moviepy - video ready runs/videos/CNN_DDQN_Pong-v5_11-04-2023_19:37:01/rl-video-step-8800000.mp4
Episode 3778 finished (timesteps: 8801418/10000000)
Epsilon: 0.01, Episode reward: 17.0, Mean reward: 16.38
***************************
Episode 3779 finished (timesteps: 8803382/10000000)
Epsilon: 0.01, Episode reward: 18.0, Mean reward: 16.45
***************************
Episode 3780 finished (timesteps: 8805322/10000000)
Epsilon: 0.01, Episode reward: 18.0, Mean reward: 16.51
***************************
Episode 3781 finished (timesteps: 8807910/10000000)
Epsilon: 0.01, Episode reward: 14.0, Mean reward: 16.50
***************************
Episode 3782 finished (timesteps: 8810640/10000000)
Epsilon: 0.01, Episode reward: 13.0, Mean reward: 16.45
***************************
Episode 3783 finished (timesteps: 8812779/10000000)
Epsilon: 0.01, Episode reward: 16.0, Mean reward: 16.46
***************************
Episode 3784 finished (timesteps: 8814563/10000000)
Epsilon: 0.01, Epi

                                                                 

Moviepy - Done !
Moviepy - video ready runs/videos/CNN_DDQN_Pong-v5_11-04-2023_19:37:01/rl-video-step-9200000.mp4
Episode 3969 finished (timesteps: 9201416/10000000)
Epsilon: 0.01, Episode reward: 19.0, Mean reward: 16.39
***************************
Episode 3970 finished (timesteps: 9203577/10000000)
Epsilon: 0.01, Episode reward: 13.0, Mean reward: 16.34
***************************
Episode 3971 finished (timesteps: 9205336/10000000)
Epsilon: 0.01, Episode reward: 21.0, Mean reward: 16.40
***************************
Episode 3972 finished (timesteps: 9207419/10000000)
Epsilon: 0.01, Episode reward: 16.0, Mean reward: 16.44
***************************
Episode 3973 finished (timesteps: 9209112/10000000)
Epsilon: 0.01, Episode reward: 20.0, Mean reward: 16.51
***************************
Episode 3974 finished (timesteps: 9210913/10000000)
Epsilon: 0.01, Episode reward: 20.0, Mean reward: 16.52
***************************
Episode 3975 finished (timesteps: 9213134/10000000)
Epsilon: 0.01, Epi

                                                                 

Moviepy - Done !
Moviepy - video ready runs/videos/CNN_DDQN_Pong-v5_11-04-2023_19:37:01/rl-video-step-9600000.mp4
Episode 4157 finished (timesteps: 9602703/10000000)
Epsilon: 0.01, Episode reward: 20.0, Mean reward: 16.66
***************************
Episode 4158 finished (timesteps: 9605062/10000000)
Epsilon: 0.01, Episode reward: 15.0, Mean reward: 16.62
***************************
Episode 4159 finished (timesteps: 9606768/10000000)
Epsilon: 0.01, Episode reward: 20.0, Mean reward: 16.66
***************************
Episode 4160 finished (timesteps: 9609005/10000000)
Epsilon: 0.01, Episode reward: 17.0, Mean reward: 16.65
***************************
Episode 4161 finished (timesteps: 9611364/10000000)
Epsilon: 0.01, Episode reward: 18.0, Mean reward: 16.65
***************************
Episode 4162 finished (timesteps: 9613570/10000000)
Epsilon: 0.01, Episode reward: 18.0, Mean reward: 16.64
***************************
Episode 4163 finished (timesteps: 9615976/10000000)
Epsilon: 0.01, Epi

                                                                 

Moviepy - Done !
Moviepy - video ready runs/videos/CNN_DDQN_Pong-v5_11-04-2023_19:37:01/rl-video-step-10000000.mp4
Episode 4348 finished (timesteps: 10001160/10000000)
Epsilon: 0.01, Episode reward: 16.0, Mean reward: 16.33
***************************
Saving checkpoint...
Checkpoint saved into runs/checkpoints/CNN_DDQN_Pong-v5_11-04-2023_19:37:01
