In [32]:
class SumTree:
    """Binary sum tree stored in a flat array, with a ring buffer of payloads.

    ``tree`` has ``2 * capacity - 1`` slots: the last ``capacity`` slots are
    leaves holding one priority each, and every internal slot holds the sum of
    its two children, so ``tree[0]`` is the sum of all priorities.
    ``data[k]`` is the payload belonging to leaf ``k + capacity - 1``.
    """

    def __init__(self, capacity):
        """Allocate an all-zero tree able to hold ``capacity`` payloads."""
        self.capacity = capacity
        self.tree = np.zeros(2 * capacity - 1)
        self.data = np.zeros(capacity, dtype=object)
        self.n_entries = 0      # number of slots written so far, capped at capacity
        self.data_pointer = 0   # next slot to (over)write

    def add(self, priority, data):
        """Store ``data`` in the next slot with the given ``priority``.

        Once every slot has been written the pointer wraps and the oldest
        slot is overwritten.

        Raises:
            ValueError: if ``data`` is not a tuple of exactly five items.
        """
        is_experience = isinstance(data, tuple) and len(data) == 5
        if not is_experience:
            raise ValueError(f"Invalid data inserted into SumTree: {data}")

        slot = self.data_pointer
        self.data[slot] = data
        self.update(slot + self.capacity - 1, priority)
        self.n_entries = min(self.n_entries + 1, self.capacity)

        # Advance the write position, wrapping at the end of the buffer.
        self.data_pointer = (slot + 1) % self.capacity

    def update(self, tree_idx, priority):
        """Set the priority at ``tree_idx`` and add the difference to every ancestor."""
        delta = priority - self.tree[tree_idx]
        self.tree[tree_idx] = priority

        node = tree_idx
        while node != 0:
            node = (node - 1) // 2
            self.tree[node] += delta

    def get_leaf(self, value):
        """Walk down from the root to the leaf selected by ``value``.

        At each internal node the walk goes left when ``value`` does not
        exceed the left child's sum; otherwise that sum is subtracted from
        ``value`` and the walk goes right.

        Returns:
            ``(leaf_index, leaf_priority, payload)``.
        """
        n_nodes = len(self.tree)
        node = 0
        left = 1
        while left < n_nodes:  # a node whose left child is out of range is a leaf
            left_sum = self.tree[left]
            if value <= left_sum:
                node = left
            else:
                value -= left_sum
                node = left + 1
            left = 2 * node + 1

        return node, self.tree[node], self.data[node - self.capacity + 1]

    def total_priority(self):
        """Return the sum of all leaf priorities (the root value)."""
        return self.tree[0]


class ReplayBuffer:
    """Uniform experience replay backed by a bounded ``deque``."""

    def __init__(self, buffer_size, batch_size):
        """Create a buffer keeping at most ``buffer_size`` experiences."""
        self.buffer = deque(maxlen=buffer_size)
        self.batch_size = batch_size

    def __len__(self):
        """Number of experiences currently held."""
        return len(self.buffer)

    def add(self, state, action, reward, next_state, done):
        """Append one transition; both states are stored as float32 arrays."""
        experience = (
            np.array(state, dtype=np.float32),
            action,
            reward,
            np.array(next_state, dtype=np.float32),
            done,
        )
        self.buffer.append(experience)

    def sample(self):
        """Draw up to ``batch_size`` distinct experiences uniformly at random.

        Returns:
            ``(states, actions, rewards, next_states, dones)`` as NumPy arrays;
            actions are int32, rewards float32 and dones uint8.
        """
        n_draw = min(self.batch_size, len(self.buffer))
        drawn = random.sample(self.buffer, n_draw)

        # Transpose the list of experiences into one sequence per field.
        states, actions, rewards, next_states, dones = zip(*drawn)

        return (
            np.stack(states),
            np.array(actions, dtype=np.int32),
            np.array(rewards, dtype=np.float32),
            np.stack(next_states),
            np.array(dones, dtype=np.uint8),
        )


In [None]:
import gym
import tensorflow as tf
from tensorflow.keras import layers, models,regularizers
import random
import numpy as np
from collections import deque
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Lambda, Input,Add, Reshape, Activation, Softmax,Multiply, BatchNormalization, Dropout, MaxPooling2D
from tensorflow.keras.activations import gelu
from keras.optimizers import Adam,SGD,RMSprop
import matplotlib.pyplot as plt
import cv2
import os
import matplotlib.pyplot as plt
from IPython.display import clear_output
import tensorflow as tf
import numpy as np
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.specs import tensor_spec

ModuleNotFoundError: No module named 'cloudpickle'

In [2]:
def set_seed(seed=42):
    """Seed Python's ``random``, NumPy and TensorFlow, and set ``PYTHONHASHSEED``.

    Args:
        seed: integer seed applied to every generator (default 42).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)

In [3]:
class DemonAttackEnviroment:
    """Wrapper around the Gym Demon Attack environment that returns stacked frames.

    Every raw RGB frame is converted to grayscale, resized to 84x84 and divided
    by 255. The four most recent processed frames are stacked along the last
    axis, so each observation has shape ``(84, 84, 4)``.
    """

    STACK_SIZE = 4           # number of consecutive frames per observation
    FRAME_SIZE = (84, 84)    # (width, height) passed to cv2.resize

    def __init__(self, render_mode):
        """Create the underlying Gym environment and take the first observation.

        Args:
            render_mode: Gym render mode (for example ``'human'``); any falsy
                value is normalised to ``None``.
        """
        self.render_mode = render_mode if render_mode else None
        self.frame_stack = deque(maxlen=self.STACK_SIZE)
        # Pass the normalised value so the attribute and the real env agree.
        self.env = gym.make("DemonAttackDeterministic-v4", render_mode=self.render_mode,
                            full_action_space=True, frameskip=2)
        # Frames are stacked on axis 2, so observations are channels-last.
        self.state_dim = self.FRAME_SIZE + (self.STACK_SIZE,)
        self.action_dim = self.env.action_space.n
        self.state = self.reset()

    def preprocess_state(self, state):
        """Convert one RGB frame to a float32 grayscale 84x84 image.

        Raises:
            ValueError: if ``state`` is not a 3-D array with three channels.
        """
        if isinstance(state, np.ndarray) and state.ndim == 3 and state.shape[2] == 3:
            gray = cv2.cvtColor(state, cv2.COLOR_RGB2GRAY)
            resized = cv2.resize(gray, self.FRAME_SIZE) / 255.0
            return resized.astype(np.float32)

        raise ValueError(f"Unsupported state type: {type(state)} with shape {getattr(state, 'shape', None)}")

    def _stacked(self):
        """Return the current frame stack as one ``(84, 84, 4)`` array."""
        return np.stack(self.frame_stack, axis=2)

    def reset(self):
        """Reset the game and return an observation made of the first frame repeated."""
        raw_frame, _ = self.env.reset()
        first = self.preprocess_state(raw_frame)
        self.frame_stack.clear()
        for _ in range(self.STACK_SIZE):
            self.frame_stack.append(first)
        # Keep self.state as the stacked observation at all times so that
        # get_state() returns the same kind of value after reset() and step().
        self.state = self._stacked()
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(observation, reward, done, lives)``.

        ``done`` is true when the episode is terminated or truncated; ``lives``
        is read from the ``'lives'`` entry of the environment's info dict.
        """
        raw_frame, reward, terminated, truncated, info = self.env.step(action)
        done = terminated or truncated
        self.frame_stack.append(self.preprocess_state(raw_frame))
        self.state = self._stacked()
        return self.state, reward, done, info['lives']

    def render(self):
        """Ask the underlying environment to render."""
        self.env.render()

    def get_state(self):
        """Return the most recent stacked observation."""
        return self.state

    def env_id(self):
        """Return the Gym spec id of the wrapped environment."""
        return self.env.spec.id

    def close(self):
        """Release the underlying environment's resources."""
        self.env.close()

In [31]:
class Q_Network:
    """Convolutional network mapping a stack of frames to one Q-value per action.

    Architecture: two Conv2D -> BatchNormalization -> GELU -> MaxPooling stages
    (with Dropout after the first), then Flatten -> Dense(256) -> GELU ->
    Dense(action_size). The head is a plain Q-value layer; there is no
    separate value/advantage (dueling) split.
    """

    def __init__(self, input_shape, action_size, optimizer='adam'):
        """Build and compile the model.

        Args:
            input_shape: shape of one observation, without the batch axis.
            action_size: number of discrete actions (width of the output layer).
            optimizer: optimizer name or instance handed to ``model.compile``.
        """
        # Note: this resets TensorFlow's global seed on every construction.
        tf.random.set_seed(42)  # For reproducibility
        self.input_shape = input_shape
        self.action_size = action_size
        self.optimizer = optimizer
        self.model = self.build_network()

    def build_network(self):
        """Create and compile the Keras model described in the class docstring."""
        inputs = Input(shape=self.input_shape)

        # Convolutional stage 1
        x = Conv2D(16, (8, 8),
                   kernel_initializer='he_normal',
                   kernel_regularizer=regularizers.l2(1e-4))(inputs)
        x = BatchNormalization()(x)
        x = Activation(gelu)(x)
        x = MaxPooling2D((2,2),strides=2)(x)
        x = Dropout(0.2)(x)

        # Convolutional stage 2
        x = Conv2D(32, (4, 4),
                   kernel_initializer='he_normal',
                   kernel_regularizer=regularizers.l2(1e-4))(x)
        x = BatchNormalization()(x)
        x = Activation(gelu)(x)
        x = MaxPooling2D((2,2))(x)

        # Fully connected Q-value head
        x = Flatten()(x)

        x = Dense(256,
                  kernel_initializer='he_normal',
                  kernel_regularizer=regularizers.l2(1e-4))(x)
        x = Activation(gelu)(x)

        x = Dense(self.action_size, activation='linear')(x)

        model = models.Model(inputs=inputs, outputs=x)
        # The compiled loss only matters if the model is trained through
        # Keras' own fit/train_on_batch; a custom training loop bypasses it.
        model.compile(optimizer=self.optimizer, loss='mae')

        return model

    def predict(self, state_batch):
        """Return Q-values for ``state_batch`` as a NumPy array, in inference mode."""
        return self.model(state_batch, training=False).numpy()

In [None]:
class PrioritizedReplayBuffer:
    """Proportional prioritized experience replay built on ``SumTree``.

    The tree stores ``priority ** alpha`` for every transition. ``sample``
    splits the total priority mass into ``batch_size`` equal segments and draws
    one transition from each, returning importance-sampling weights that
    correct for the non-uniform sampling.
    """

    # Raw priority given to a transition when it is first stored.
    INITIAL_PRIORITY = 0.6

    def __init__(self, buffer_size, batch_size, alpha=0.6, beta=0.4, beta_increment=1e-4, epsilon=1e-6):
        """
        Args:
            buffer_size: maximum number of stored transitions.
            batch_size: number of transitions returned by ``sample``.
            alpha: prioritization exponent (0 = uniform, 1 = fully proportional).
            beta: initial importance-sampling exponent.
            beta_increment: amount added to ``beta`` on every ``sample`` call (capped at 1).
            epsilon: small constant added to priorities so none is exactly zero.
        """
        self.tree = SumTree(buffer_size)
        self.batch_size = batch_size
        self.alpha = alpha
        self.beta = beta
        self.beta_increment = beta_increment
        self.epsilon = epsilon

    def __len__(self):
        """Number of transitions actually stored (not the allocated capacity)."""
        return self.tree.n_entries

    def add(self, state, action, reward, next_state, done):
        """Store one transition with the initial priority raised to ``alpha``."""
        data = (state, action, reward, next_state, done)
        self.tree.add(self.INITIAL_PRIORITY ** self.alpha, data)

    def sample(self):
        """Draw a stratified, priority-proportional batch.

        Returns:
            A 7-tuple, in this exact order:
            ``(states, actions, rewards, next_states, dones, tree_indices, weights)``.
            ``tree_indices`` are the SumTree leaf indices to hand back to
            ``update_priorities``; ``weights`` are float32 importance-sampling
            weights scaled so that the largest equals 1.

        Raises:
            ValueError: if the tree's total priority is zero.
            RuntimeError: if a full batch could not be collected within
                ``5 * batch_size`` draws.
        """
        batch = []
        idxs = []
        priorities = []

        total_priority = self.tree.total_priority()
        if total_priority == 0:
            raise ValueError("Total priority is zero. SumTree might be empty or improperly updated.")

        segment = total_priority / self.batch_size
        self.beta = min(1.0, self.beta + self.beta_increment)

        attempts = 0
        max_attempts = self.batch_size * 5  # avoid infinite loops
        while len(batch) < self.batch_size and attempts < max_attempts:
            attempts += 1
            # Draw from the segment belonging to the next batch position; a
            # rejected draw retries the same segment.
            i = len(batch)
            s = np.random.uniform(segment * i, segment * (i + 1))
            idx, priority, data = self.tree.get_leaf(s)

            # Skip unfilled leaves (zero priority / placeholder payload).
            if priority == 0 or not isinstance(data, tuple) or len(data) != 5:
                continue

            batch.append(data)
            idxs.append(idx)
            priorities.append(priority)

        if len(batch) < self.batch_size:
            raise RuntimeError(f"Failed to sample a full batch. Collected {len(batch)} out of {self.batch_size}.")

        # Importance-sampling weights: (N * P(i)) ** -beta, clipped, then
        # normalised by the largest weight in the batch.
        priorities = np.array(priorities, dtype=np.float32)
        sampling_probs = priorities / (total_priority + 1e-8)
        sampling_probs = np.maximum(sampling_probs, 1e-6)

        weights = (self.tree.n_entries * sampling_probs) ** (-self.beta)
        weights = np.clip(weights, a_min=1e-3, a_max=10.0)
        weights = weights / weights.max()

        states, actions, rewards, next_states, dones = zip(*batch)
        return (np.stack(states), np.array(actions), np.array(rewards),
                np.stack(next_states), np.array(dones), np.array(idxs), np.array(weights, dtype=np.float32))

    def update_priorities(self, indices, priorities):
        """Overwrite the priorities of previously sampled transitions.

        Args:
            indices: SumTree leaf indices as returned by ``sample``.
            priorities: new raw priorities (typically absolute TD errors).

        Each value is floored at 1e-6, offset by ``epsilon`` and raised to
        ``alpha`` so that updated leaves use the same scale as leaves written
        by ``add``.
        """
        min_priority = 1e-6
        for idx, priority in zip(indices, priorities):
            base = max(abs(float(priority)), min_priority) + self.epsilon
            self.tree.update(int(idx), base ** self.alpha)


AttributeError: module 'keras._tf_keras.keras' has no attribute '__internal__'

In [34]:
class DQNAgent():
    """Deep Q-learning agent with prioritized replay and a soft-updated target network.

    The agent owns an online Q-network, a target Q-network, a
    ``PrioritizedReplayBuffer`` and an environment wrapper. The wrapper must
    provide ``reset() -> observation``,
    ``step(action) -> (observation, reward, done, lives)`` and ``env_id()``,
    and expose the raw Gym environment as ``.env``.
    """

    # Reward-shaping constants shared by train() and test().
    REWARD_SCALE = 100.0      # positive game rewards are divided by this
    DEATH_PENALTY = -0.5      # reward for a step on which a life was lost
    SURVIVAL_BONUS = 0.005    # reward for any other step

    def __init__(self, buffer_size, batch_size, episodes, input_shape = (84,84,4), action_size=2,
                 gamma = 0.9, epsilon = 0.9, epsilon_min = 0.1,epsilon_decay = 0.999, learning_rate = 0.0001, 
                 tau = 0.001, optimiser = 'Adam', environment = None, update_frequency=100, mode = 'Train'):
        """
        Args:
            buffer_size: capacity of the prioritized replay buffer.
            batch_size: number of transitions per gradient step.
            episodes: number of episodes run by ``train`` / ``test``.
            input_shape: observation shape fed to the networks.
            action_size: number of discrete actions.
            gamma: discount factor.
            epsilon: initial exploration rate.
            epsilon_min: lower bound for the exploration rate.
            epsilon_decay: factor applied to epsilon after every training episode.
            learning_rate: initial learning rate of the decay schedule.
            tau: interpolation rate of the soft target-network update.
            optimiser: ``'Adam'``, ``'SGD'``, ``'RMSprop'`` or an optimizer instance.
            environment: environment wrapper (required, see class docstring).
            update_frequency: run a gradient step every this many environment steps.
            mode: stored on the instance; not otherwise used by this class.

        Raises:
            ValueError: if ``environment`` is None or the optimizer name is unknown.
        """
        # Fail fast: a raw gym environment lacks env_id() and the 4-tuple
        # step() this class relies on, so there is no workable default.
        if environment is None:
            raise ValueError(
                "DQNAgent requires an environment wrapper exposing reset(), step() and env_id() "
                "(e.g. DemonAttackEnviroment); a raw gym environment is not supported."
            )

        set_seed(42) # Seeding for result reproducibility
        self.fixed_states = []
        self.mode = mode
        self.input_shape = input_shape
        self.action_size = action_size
        self.batch_size = batch_size
        self.episodes = episodes

        self.episode_rewards = []
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.update_frequency = update_frequency

        self.learning_rate = learning_rate
        self.tau = tau
        self.step = 0

        # NOTE(review): decay_steps counts optimizer steps, yet it is set to the
        # episode count - confirm this is the intended decay horizon.
        self.lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=self.learning_rate,
            decay_steps=self.episodes,
            decay_rate=0.999,
            staircase=True
        )

        # Optimiser value handling
        if isinstance(optimiser, str):
            if optimiser == 'Adam':
                self.optimizer = Adam(learning_rate=self.lr_schedule, clipnorm=1.0)
            elif optimiser == 'SGD':
                self.optimizer = SGD(learning_rate=self.lr_schedule)
            elif optimiser == 'RMSprop':
                self.optimizer = RMSprop(learning_rate=self.lr_schedule)
            else:
                raise ValueError("Unsupported optimizer string.")
        else:
            self.optimizer = optimiser  # Assume it is already an optimizer instance

        self.qnet = Q_Network(self.input_shape, self.action_size, self.optimizer)
        self.target_net = Q_Network(self.input_shape, self.action_size, self.optimizer)
        self.target_net.model.set_weights(self.qnet.model.get_weights())

        # Sanity check: both networks must start from identical weights.
        for qw, tw in zip(self.qnet.model.get_weights(), self.target_net.model.get_weights()):
            assert np.array_equal(qw, tw), "Weights differ!"

        self.replay_buffer = PrioritizedReplayBuffer(buffer_size=buffer_size, batch_size=self.batch_size)

        self.env = environment
        self.env_name = self.env.env_id() # Used to determine env type
        self.collect_fixed_states()

    ########################################################################################################################
    # Epsilon-greedy action selection (greedy vs random choice) ############################################################
    def select_action(self, state, epsilon):
        """Return a uniformly random action with probability ``epsilon``, else the greedy one."""
        if random.random() < epsilon:
            return random.randint(0, self.action_size - 1)
        else:
            state = np.expand_dims(np.array(state, dtype=np.float32), axis=0)
            return int(np.argmax(self.qnet.predict(state)))

    ########################################################################################################################
    # Moves the target network towards the Q-network weights at a rate of tau ##############################################
    def soft_update_target_network(self):
        """Set target weights to ``tau * online + (1 - tau) * target``."""
        qnet_weights = self.qnet.model.get_weights()
        target_net_weights = self.target_net.model.get_weights()

        updated_weights = [
            self.tau * q_w + (1 - self.tau) * t_w
            for q_w, t_w in zip(qnet_weights, target_net_weights)
        ]

        self.target_net.model.set_weights(updated_weights)

    def collect_fixed_states(self, num_states=1000):
        """Play ``num_states`` random steps and append the visited observations to ``fixed_states``.

        The stored observations are used as a fixed probe set by
        ``evaluate_q_on_fixed_states``.
        """
        obs = self.env.reset()
        for _ in range(num_states):
            action = self.env.env.action_space.sample()
            next_obs, _, done, _ = self.env.step(action)
            self.fixed_states.append(obs)
            obs = next_obs
            if done:
                obs = self.env.reset()

    def evaluate_q_on_fixed_states(self, chunk_size=256):
        """Return the mean over ``fixed_states`` of the maximum predicted Q-value.

        States are evaluated in batches of ``chunk_size`` rather than one
        at a time. Returns NaN when no fixed states have been collected.
        """
        if not self.fixed_states:
            return float("nan")

        max_qs = []
        for start in range(0, len(self.fixed_states), chunk_size):
            chunk = np.stack(self.fixed_states[start:start + chunk_size]).astype(np.float32)
            q = self.qnet.predict(chunk)  # shape: (len(chunk), action_size)
            max_qs.append(np.max(q, axis=1))
        return float(np.mean(np.concatenate(max_qs)))

    @staticmethod
    def _shape_reward(raw_reward, life_lost):
        """Map a raw game reward to the shaped reward used for learning."""
        if raw_reward > 0:
            return raw_reward / DQNAgent.REWARD_SCALE
        if life_lost:
            return DQNAgent.DEATH_PENALTY
        return DQNAgent.SURVIVAL_BONUS

    @staticmethod
    def _plot_curves(curves, figsize):
        """Redraw one line plot per ``(title, label, values, color)`` entry, side by side."""
        clear_output(wait=True)
        fig, axs = plt.subplots(1, len(curves), figsize=figsize)
        for ax, (title, label, values, color) in zip(axs, curves):
            line_style = {"color": color} if color else {}
            ax.plot(values, label=label, **line_style)
            ax.set_title(title)
            ax.set_xlabel("Episode")
            ax.grid(True)
            ax.legend()
        plt.suptitle("Training Metrics")
        plt.tight_layout()
        plt.show()
        # A new figure is created every episode; close it so they do not pile up.
        plt.close(fig)

    def training_step(self):
        """Run one gradient step on a prioritized batch.

        Steps: sample a batch, build one-step TD targets from the target
        network, minimise an importance-weighted loss on the online network,
        write the new absolute TD errors back as priorities, and soft-update
        the target network.

        Returns:
            ``(loss, mean_q)`` - the scalar loss and the mean Q-value of the
            actions taken in the batch.
        """
        # sample() returns the tree indices BEFORE the importance weights.
        states, actions, rewards, next_states, dones, indices, weights = self.replay_buffer.sample()

        # Make dtypes explicit so NumPy and TensorFlow values combine predictably.
        states = tf.constant(np.asarray(states, dtype=np.float32))
        next_states = tf.constant(np.asarray(next_states, dtype=np.float32))
        actions = tf.constant(np.asarray(actions, dtype=np.int32))
        rewards = tf.constant(np.asarray(rewards, dtype=np.float32))
        not_done = tf.constant(1.0 - np.asarray(dones, dtype=np.float32))
        weights = tf.constant(np.asarray(weights, dtype=np.float32))

        # Targets are computed outside the tape, so no gradient reaches the target net.
        next_q_values = self.target_net.model(next_states, training=False)
        max_next_q_values = tf.reduce_max(next_q_values, axis=1)
        targets = rewards + self.gamma * max_next_q_values * not_done

        # (row, action) index pairs; rows follow the sampled batch length.
        batch_len = int(actions.shape[0])
        action_indices = tf.stack([tf.range(batch_len, dtype=tf.int32), actions], axis=1)

        with tf.GradientTape() as tape:
            q_values = self.qnet.model(states)
            chosen_q_values = tf.gather_nd(q_values, action_indices)

            td_errors = targets - chosen_q_values

            # Quadratic for |td| <= 1 (same value as the squared error) and
            # linear beyond it. Squaring a clip_by_value'd error instead gives
            # a zero gradient for every sample with |td| > 1.
            abs_td = tf.abs(td_errors)
            per_sample_loss = tf.where(abs_td <= 1.0, tf.square(td_errors), 2.0 * abs_td - 1.0)

            loss = tf.reduce_mean(per_sample_loss * weights)

        grads = tape.gradient(loss, self.qnet.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.qnet.model.trainable_variables))

        # Update priorities in the replay buffer
        indices = np.array(indices, dtype=np.int32)
        new_priorities = np.array(np.abs(td_errors.numpy()), dtype=np.float32)
        clipped_priorities = np.clip(new_priorities, a_min=1e-6, a_max=10.0)
        self.replay_buffer.update_priorities(indices, clipped_priorities)

        # Without this the target network would keep its initial weights forever.
        self.soft_update_target_network()

        mean_q_value = tf.reduce_mean(chosen_q_values).numpy()
        return loss.numpy(), mean_q_value

    def train(self):
        """Train for ``self.episodes`` episodes with epsilon-greedy exploration.

        After every episode the metric plots are redrawn, a summary line is
        printed and epsilon is decayed; the online network's weights are saved
        to ``./saved_model.weights.h5`` every tenth episode.
        """
        self.episode_rewards = []
        self.mean_losses = []
        self.mean_exp_return = []
        self.q_eval_track = []
        print("Using device:", tf.test.gpu_device_name())

        for episode in range(self.episodes):
            state = self.env.reset()
            done = False
            old_lives = None   # unknown until the first step reports it
            i = 0
            episode_losses = []
            episode_q_values = []
            total_raw_reward = 0
            total_reward = 0
            lives_lost = 0

            while not done:
                action = self.select_action(state, self.epsilon)
                next_state, raw_reward, done, new_lives = self.env.step(action)

                # Lives are only compared once a previous count is known.
                life_lost = old_lives is not None and new_lives < old_lives
                reward = self._shape_reward(raw_reward, life_lost)

                total_raw_reward += raw_reward
                total_reward += reward
                lives_lost += int(life_lost)
                old_lives = new_lives

                self.replay_buffer.add(state, action, reward, next_state, done)
                state = next_state
                i += 1

                # Learning only starts once the buffer has been filled completely.
                if (
                    i % self.update_frequency == 0
                    and self.replay_buffer.tree.n_entries == self.replay_buffer.tree.capacity
                    and len(self.replay_buffer) >= self.batch_size
                ):
                    loss, mean_q = self.training_step()
                    episode_losses.append(loss)
                    episode_q_values.append(mean_q)

            avg_q_eval = self.evaluate_q_on_fixed_states()
            self.q_eval_track.append(avg_q_eval)

            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

            # Episodes without any gradient step report 0 instead of NaN.
            mean_loss = np.mean(episode_losses) if episode_losses else 0
            mean_q_taken = np.mean(episode_q_values) if episode_q_values else 0
            self.episode_rewards.append(total_reward)
            self.mean_losses.append(mean_loss)
            self.mean_exp_return.append(mean_q_taken)

            self._plot_curves(
                [
                    ("Episode Reward", "Episode Reward", self.episode_rewards, None),
                    ("Mean Loss", "Mean Loss", self.mean_losses, 'orange'),
                    ("Mean Exp return", "Mean Q-Value", self.mean_exp_return, 'green'),
                    ("Diagnostic Q-Metric", "Avg Max Q (Fixed States)", self.q_eval_track, 'purple'),
                ],
                figsize=(24, 4),
            )

            print(f"Episode: {episode}/{self.episodes}, Steps: {i}, "
                f"Total Reward: {total_reward:.2f}, "
                f"Mean Loss: {mean_loss:.4f}, "
                f"Mean Exp-Q: {mean_q_taken:.4f}, "
                f"Average State (Q): {avg_q_eval:.4f}, "
                f"Epsilon: {self.epsilon:.2f}")
            print(f"Raw: {total_raw_reward}, Scaled: {total_reward}, Lives Lost: {lives_lost}")

            if episode % 10 == 0:
                self.qnet.model.save_weights('./saved_model.weights.h5')

    def test(self, weights_path = ''):
        """Load weights from ``weights_path`` and play ``self.episodes`` greedy episodes.

        No learning takes place and the exploration rate is left untouched;
        after every episode the reward and fixed-state Q plots are redrawn and
        a summary line is printed.
        """
        self.episode_rewards = []
        self.qnet.model.load_weights(weights_path)
        self.q_eval_track = []
        print("Using device:", tf.test.gpu_device_name())

        for episode in range(self.episodes):
            state = self.env.reset()
            done = False
            old_lives = None   # unknown until the first step reports it
            i = 0
            total_raw_reward = 0
            total_reward = 0
            lives_lost = 0

            while not done:
                action = self.select_action(state, 0)  # epsilon 0: always greedy
                next_state, raw_reward, done, new_lives = self.env.step(action)

                life_lost = old_lives is not None and new_lives < old_lives
                reward = self._shape_reward(raw_reward, life_lost)

                total_raw_reward += raw_reward
                total_reward += reward
                lives_lost += int(life_lost)
                old_lives = new_lives

                state = next_state
                i += 1

            avg_q_eval = self.evaluate_q_on_fixed_states()
            self.q_eval_track.append(avg_q_eval)
            self.episode_rewards.append(total_reward)

            self._plot_curves(
                [
                    ("Episode Reward", "Episode Reward", self.episode_rewards, None),
                    ("Diagnostic Q-Metric", "Avg Max Q (Fixed States)", self.q_eval_track, 'purple'),
                ],
                figsize=(12, 2),
            )

            print(f"Episode: {episode}/{self.episodes}, Steps: {i}, "
                f"Total Reward: {total_reward:.2f}, "
                f"Average State (Q): {avg_q_eval:.4f}, ")
            print(f"Raw: {total_raw_reward}, Scaled: {total_reward}, Lives Lost: {lives_lost}")


In [35]:
# Training run: headless environment, small replay buffer, one gradient step
# per environment step once the buffer is full.
env = DemonAttackEnviroment(None)
Agent1 = DQNAgent(
    buffer_size=100,
    batch_size=64,
    episodes=1000,
    input_shape=(84,84,4),
    action_size=18,
    gamma=0.95,
    epsilon=1,
    epsilon_min=0.1,
    epsilon_decay=0.90,
    learning_rate=0.001,
    tau=0.0005,
    optimiser='Adam',
    environment=env,
    update_frequency=1,
)
Agent1.train()

Using device: /device:GPU:0


2025-06-24 20:50:12.948286: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-06-24 20:50:12.948303: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


KeyboardInterrupt: 

In [None]:
# Evaluation run: rendered environment, greedy play with the saved weights.
Test_env = DemonAttackEnviroment('human')
Test_Agent = DQNAgent(
    buffer_size=1000000,
    batch_size=32,
    episodes=1000,
    input_shape=(84,84,4),
    action_size=18,
    gamma=0.9,
    epsilon=1,
    epsilon_min=0.1,
    epsilon_decay=0.99,
    learning_rate=0.01,
    tau=0.001,
    optimiser=RMSprop(learning_rate=0.0001),
    environment=Test_env,
    update_frequency=1,
)
Test_Agent.test('./saved_model.weights.h5')


KeyboardInterrupt: 

: 