In [None]:
# Replay Buffer class for storing and retrieving sampled experiences
class ReplayBuffer:
    """Fixed-capacity experience store backed by pre-allocated NumPy arrays.

    Transitions are written sequentially until the buffer is full. After
    that, new transitions are written only to slots at or above
    ``MEM_RETAIN * mem_size``, so the slots below that index keep the
    earliest experiences instead of being overwritten.

    Args:
        env: Environment whose ``observation_space.shape`` gives the shape
            of a single stored state.
        mem_size: Maximum number of transitions held at once.
    """

    def __init__(self, env, mem_size=MEM_SIZE):
        # Honour the requested capacity instead of always using the global.
        self.mem_size = mem_size
        # Total number of transitions ever added (keeps growing when full).
        self.mem_count = 0

        state_shape = (mem_size, *env.observation_space.shape)
        self.states = np.zeros(state_shape, dtype=np.float32)
        self.actions = np.zeros(mem_size, dtype=np.int64)
        self.rewards = np.zeros(mem_size, dtype=np.float32)
        self.states_ = np.zeros(state_shape, dtype=np.float32)
        # np.bool_ rather than the np.bool alias, which is missing from
        # NumPy 1.24-1.26.
        self.dones = np.zeros(mem_size, dtype=np.bool_)

    def add(self, state, action, reward, state_, done):
        """Store one transition, overwriting an old slot once the buffer is full.

        Args:
            state: Observation before the action.
            action: Action taken.
            reward: Reward received.
            state_: Observation after the action.
            done: Whether the transition ended the episode.
        """
        if self.mem_count < self.mem_size:
            mem_index = self.mem_count
        else:
            # Avoiding catastrophic forgetting - retain the initial
            # MEM_RETAIN fraction of the buffer and cycle writes through
            # the remaining slots only.
            mem_index = int(
                self.mem_count % ((1 - MEM_RETAIN) * self.mem_size)
                + (MEM_RETAIN * self.mem_size)
            )

        self.states[mem_index] = state
        self.actions[mem_index] = action
        self.rewards[mem_index] = reward
        self.states_[mem_index] = state_
        # Stored inverted: the entry is True for NON-terminal transitions.
        self.dones[mem_index] = 1 - done
        self.mem_count += 1

    def sample(self):
        """Draw ``BATCH_SIZE`` transitions uniformly at random, with replacement.

        Only slots that have been written are eligible.

        Returns:
            Tuple ``(states, actions, rewards, states_, dones)`` of arrays
            indexed by the same random batch. ``dones`` holds the inverted
            flag written by ``add``.

        Raises:
            ValueError: If nothing has been added yet.
        """
        filled = min(self.mem_count, self.mem_size)
        batch_indices = np.random.choice(filled, BATCH_SIZE, replace=True)

        return (
            self.states[batch_indices],
            self.actions[batch_indices],
            self.rewards[batch_indices],
            self.states_[batch_indices],
            self.dones[batch_indices],
        )

In [None]:
# Replay Buffer class for storing and retrieving sampled experiences
class ReplayBuffer:
    """Bounded FIFO experience store backed by a ``collections.deque``.

    Once ``mem_size`` transitions are held, each new one evicts the oldest.

    Args:
        env: Not used by this implementation; kept in the signature.
        mem_size: Maximum number of transitions held at once.
    """

    def __init__(self, env, mem_size=MEM_SIZE):
        self.memory = deque(maxlen=mem_size)
        # Total number of transitions ever added; this keeps growing after
        # the deque is full, so it can exceed len(self.memory).
        self.mem_count = 0

    def add(self, state, action, reward, state_, done):
        """Append one transition as a ``(state, action, reward, state_, done)`` tuple."""
        self.memory.append((state, action, reward, state_, done))
        self.mem_count += 1

    def sample(self):
        """Draw up to ``BATCH_SIZE`` distinct transitions uniformly at random.

        Returns:
            Tuple ``(states, actions, rewards, states_, dones)`` of NumPy
            arrays, one entry per sampled transition.

        Raises:
            ValueError: If the buffer is empty (nothing to unpack).
        """
        # Cap by what is actually stored: mem_count keeps counting past the
        # deque's maxlen, and random.sample raises if asked for more items
        # than the population holds.
        batch_size = min(BATCH_SIZE, len(self.memory))
        batch = random.sample(self.memory, batch_size)

        states, actions, rewards, states_, dones = zip(*batch)
        return (
            np.array(states),
            np.array(actions),
            np.array(rewards),
            np.array(states_),
            np.array(dones),
        )

    def __len__(self):
        """Return the total number of transitions added so far.

        This is the running count, not the number currently stored; it can
        exceed the deque's capacity.
        """
        return self.mem_count

In [None]:
# Convolutional stack for image inputs; the number of input channels is
# read from the first entry of the input shape.
conv_stack = [
    nn.Conv2d(self.input_shape[0], 32, kernel_size=8, stride=4),
    nn.ReLU(),
    nn.Conv2d(32, 64, kernel_size=4, stride=2),
    nn.ReLU(),
    nn.Conv2d(64, 64, kernel_size=3, stride=1),
    nn.ReLU(),
]
self.conv_layers = torch.nn.Sequential(*conv_stack)

# Input width of the first linear layer, obtained from the conv stack
# (presumably the flattened conv output size - confirm in _get_conv_out).
conv_out_size = self._get_conv_out(self.input_shape)

# Full network: conv stack, flatten, then three linear layers ending in
# one output per action.
head = [
    nn.Flatten(),
    nn.Linear(conv_out_size, FC1_DIMS),
    nn.ReLU(),
    nn.Linear(FC1_DIMS, FC2_DIMS),
    nn.ReLU(),
    nn.Linear(FC2_DIMS, self.action_space),
]
self.layers = torch.nn.Sequential(self.conv_layers, *head)