In [1]:


# DQN Agent
# -----------------------------

# -----------------------------
# DDQN Agent
# -----------------------------

    # Use the same update_epsilon() method as in DQNAgent.

# -----------------------------
# PPO Agent
# -----------------------------


# -----------------------------
# RCPSP Environment with Makespan Calculation and Critical Path KPI
# -----------------------------


# -----------------------------
# Training Loop Helper Function
# -----------------------------

# -----------------------------
# Main: Compare DQN, DDQN, PPO, and GPHH Optimization
# -----------------------------


Importing Libraries

In [2]:
!pip install torch-geometric
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import random
from torch_geometric.nn import GATConv
from torch_geometric.data import Data
from collections import deque
import math
import pandas as pd
import networkx as nx


Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.6.1


Defining Graph Attention Network

In [3]:
# -----------------------------
# Graph Attention Network (GAT)
# -----------------------------
class GATQNetwork(nn.Module):
    def __init__(self, in_features, hidden_dim, out_features):
        super(GATQNetwork, self).__init__()
        self.gat1 = GATConv(in_features, hidden_dim, heads=4, concat=True)
        self.gat2 = GATConv(hidden_dim * 4, hidden_dim, heads=4, concat=True)
        self.fc = nn.Linear(hidden_dim * 4, out_features)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.gat1(x, edge_index))
        x = F.relu(self.gat2(x, edge_index))
        q_values = self.fc(x).mean(dim=0)
        return q_values


DQN Agent

In [4]:
class DQNAgent:
    def __init__(self, state_dim, action_dim, hidden_dim=64, lr=0.001, gamma=0.95):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.action_dim = action_dim
        self.gamma = gamma
        self.epsilon = 1.0
        # Change decay parameters so that epsilon decays only once per episode
        self.epsilon_decay = 0.995  # decay factor per episode
        self.epsilon_min = 0.1      # keep a higher minimum epsilon for more exploration
        self.batch_size = 32
        self.memory = deque(maxlen=5000)

        self.q_network = GATQNetwork(state_dim, hidden_dim, action_dim).to(self.device)
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)

    def select_action(self, state):
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.action_dim)
        state = state.to(self.device)
        with torch.no_grad():
            q_values = self.q_network(state)
        return torch.argmax(q_values).item()

    def store_experience(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def train(self):
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*batch)
        
        state_batch = [s.to(self.device) for s in state_batch]
        next_state_batch = [s.to(self.device) for s in next_state_batch]
        action_batch = torch.tensor(action_batch, dtype=torch.long, device=self.device)
        reward_batch = torch.tensor(reward_batch, dtype=torch.float32, device=self.device)
        done_batch = torch.tensor(done_batch, dtype=torch.float32, device=self.device)
        
        q_values = torch.stack([self.q_network(s) for s in state_batch])
        next_q_values = torch.stack([self.q_network(s) for s in next_state_batch]).max(dim=1)[0]
        
        target_q_values = reward_batch + self.gamma * next_q_values * (1 - done_batch)
        q_values = q_values.gather(1, action_batch.unsqueeze(1)).squeeze(1)
        
        loss = F.mse_loss(q_values, target_q_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_epsilon(self):
        # Update epsilon once per episode
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)


DDQN Agent

In [5]:
class DDQNAgent(DQNAgent):
    def __init__(self, state_dim, action_dim, hidden_dim=64, lr=0.001, gamma=0.95):
        super(DDQNAgent, self).__init__(state_dim, action_dim, hidden_dim, lr, gamma)
        self.target_network = GATQNetwork(state_dim, hidden_dim, action_dim).to(self.device)
        self.update_target_network()

    def update_target_network(self):
        self.target_network.load_state_dict(self.q_network.state_dict())

    def train(self):
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*batch)
        
        state_batch = [s.to(self.device) for s in state_batch]
        next_state_batch = [s.to(self.device) for s in next_state_batch]
        action_batch = torch.tensor(action_batch, dtype=torch.long, device=self.device)
        reward_batch = torch.tensor(reward_batch, dtype=torch.float32, device=self.device)
        done_batch = torch.tensor(done_batch, dtype=torch.float32, device=self.device)
        
        q_values = torch.stack([self.q_network(s) for s in state_batch])
        next_actions = torch.stack([self.q_network(s) for s in next_state_batch]).argmax(dim=1)
        next_q_values_target = torch.stack([self.target_network(s) for s in next_state_batch])
        next_q_values = next_q_values_target.gather(1, next_actions.unsqueeze(1)).squeeze(1)
        
        target_q_values = reward_batch + self.gamma * next_q_values * (1 - done_batch)
        q_values = q_values.gather(1, action_batch.unsqueeze(1)).squeeze(1)
        
        loss = F.mse_loss(q_values, target_q_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()


PPO Agent

In [6]:
class PPOAgent:
    def __init__(self, state_dim, action_dim, hidden_dim=64, lr=0.001, gamma=0.95,
                 clip_epsilon=0.2, update_epochs=10):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.gamma = gamma
        self.clip_epsilon = clip_epsilon
        self.update_epochs = update_epochs
        
        self.policy = GATQNetwork(state_dim, hidden_dim, action_dim).to(self.device)
        self.old_policy = GATQNetwork(state_dim, hidden_dim, action_dim).to(self.device)
        self.old_policy.load_state_dict(self.policy.state_dict())
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)
        
        self.memory = []

    def select_action(self, state):
        state = state.to(self.device)
        with torch.no_grad():
            logits = self.policy(state)
            probabilities = F.softmax(logits, dim=-1)
            action = torch.multinomial(probabilities, 1).item()
        return action

    def store_experience(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def train(self):
        if not self.memory:
            return
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = zip(*self.memory)
        state_batch = [s.to(self.device) for s in state_batch]
        action_batch = torch.tensor(action_batch, dtype=torch.long, device=self.device)
        reward_batch = torch.tensor(reward_batch, dtype=torch.float32, device=self.device)
        done_batch = torch.tensor(done_batch, dtype=torch.float32, device=self.device)
        
        for _ in range(self.update_epochs):
            old_logits = torch.stack([self.old_policy(s) for s in state_batch])
            old_probs = F.softmax(old_logits, dim=-1).gather(1, action_batch.unsqueeze(1)).squeeze(1)
            
            new_logits = torch.stack([self.policy(s) for s in state_batch])
            new_probs = F.softmax(new_logits, dim=-1).gather(1, action_batch.unsqueeze(1)).squeeze(1)
            
            ratios = new_probs / (old_probs + 1e-8)
            advantages = reward_batch + self.gamma * (1 - done_batch) - reward_batch
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1 - self.clip_epsilon, 1 + self.clip_epsilon) * advantages
            loss = -torch.min(surr1, surr2).mean()
            
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
        
        self.old_policy.load_state_dict(self.policy.state_dict())
        self.memory = []

RCPSP Enviorment

In [7]:
class RCPSPEnv:
    def __init__(self, instance_file=None, num_tasks=20, resource_limits=[10, 10, 10]):
        self.num_tasks = num_tasks
        self.resource_limits = resource_limits
        self.graph = self.generate_graph(num_tasks)
        self.resource_limits_array = np.array(self.resource_limits, dtype=int)
        self.horizon = 10000
        self.resource_usage = pd.DataFrame(
            np.zeros((self.horizon, len(self.resource_limits)), dtype=int),
            columns=range(len(self.resource_limits))
        )
        self.task_ids = sorted(list(self.graph.nodes))
        self.reset()
    
    def reset(self):
        self.completed_tasks = set()
        self.schedule = {}  # Task start times
        self.time_elapsed = 0
        self.resource_usage.iloc[:, :] = 0
        return self.get_state()

    def generate_graph(self, num_tasks):
        G = nx.DiGraph()
        for i in range(num_tasks):
            G.add_node(
                i,
                duration=np.random.randint(5, 20),
                resources=[np.random.randint(1, 5) for _ in self.resource_limits]
            )
        for _ in range(num_tasks // 2):
            a, b = np.random.choice(num_tasks, 2, replace=False)
            if a != b and not nx.has_path(G, b, a):
                G.add_edge(a, b)
        return G

    def get_state(self):
        node_features = []
        for i in self.graph.nodes:
            duration = self.graph.nodes[i]['duration']
            resources = self.graph.nodes[i]['resources']
            is_scheduled = 1 if i in self.completed_tasks else 0
            node_features.append([duration] + resources + [is_scheduled])
        if self.graph.number_of_edges() > 0:
            edge_index = torch.tensor(list(self.graph.edges), dtype=torch.long).t().contiguous()
        else:
            edge_index = torch.empty((2, 0), dtype=torch.long)
        node_features = torch.tensor(node_features, dtype=torch.float)
        return Data(x=node_features, edge_index=edge_index)

    def compute_makespan(self):
        if not self.completed_tasks:
            return 0
        finish_times = [self.schedule[task] + self.graph.nodes[task]['duration'] for task in self.completed_tasks]
        return max(finish_times) if finish_times else 0

    def compute_critical_path_length(self):
        cp = {node: 0 for node in self.graph.nodes}
        for node in nx.topological_sort(self.graph):
            duration = self.graph.nodes[node]['duration']
            preds = list(self.graph.predecessors(node))
            if preds:
                cp[node] = max(cp[pred] for pred in preds) + duration
            else:
                cp[node] = duration
        return max(cp.values())

    def step(self, action):
        if hasattr(self, 'task_ids'):
            if action < 0 or action >= len(self.task_ids):
                print(f"Warning: action index {action} out of range; clipping to maximum index {len(self.task_ids)-1}")
                action = len(self.task_ids) - 1
            actual_action = self.task_ids[action]
        else:
            actual_action = action

        # Instead of terminating the episode on an invalid action,
        # we penalize it and continue.
        if actual_action in self.completed_tasks:
            return self.get_state(), -10, False

        try:
            predecessors = list(self.graph.predecessors(actual_action))
        except Exception as e:
            print(f"Error getting predecessors for task {actual_action}: {e}")
            predecessors = []
        start_time = max([self.schedule.get(p, 0) + self.graph.nodes[p]['duration'] for p in predecessors], default=0)
        task_resources = np.array(self.graph.nodes[actual_action]['resources'], dtype=int)
        duration = self.graph.nodes[actual_action]['duration']
        found_feasible = False
        candidate = start_time
        limits = self.resource_limits_array
        while candidate <= self.horizon - duration:
            usage_slice = self.resource_usage.iloc[candidate:candidate+duration, :]
            if np.all(usage_slice.values + task_resources <= limits):
                found_feasible = True
                break
            candidate += 1
        if not found_feasible:
            candidate = self.horizon - duration
        start_time = candidate
        self.resource_usage.iloc[start_time:start_time+duration, :] += task_resources
        self.schedule[actual_action] = start_time
        self.completed_tasks.add(actual_action)
        self.time_elapsed = max(self.time_elapsed, start_time + duration)
        done = len(self.completed_tasks) == self.num_tasks
        makespan = self.compute_makespan()
        reward = -makespan if done else 0
        return self.get_state(), reward, done

    def schedule_from_ordering(self, ordering):
        self.completed_tasks = set()
        self.schedule = {}
        self.time_elapsed = 0
        self.resource_usage = pd.DataFrame(
            np.zeros((self.horizon, len(self.resource_limits)), dtype=int),
            columns=range(len(self.resource_limits))
        )
        for task in ordering:
            predecessors = list(self.graph.predecessors(task))
            start_time = max([self.schedule.get(p, 0) + self.graph.nodes[p]['duration'] for p in predecessors], default=0)
            task_resources = np.array(self.graph.nodes[task]['resources'], dtype=int)
            duration = self.graph.nodes[task]['duration']
            found_feasible = False
            candidate = start_time
            limits = self.resource_limits_array
            while candidate <= self.horizon - duration:
                usage_slice = self.resource_usage.iloc[candidate:candidate+duration, :]
                if np.all(usage_slice.values + task_resources <= limits):
                    found_feasible = True
                    break
                candidate += 1
            if not found_feasible:
                candidate = self.horizon - duration
            start_time = candidate
            self.resource_usage.iloc[start_time:start_time+duration, :] += task_resources
            self.schedule[task] = start_time
            self.completed_tasks.add(task)
            self.time_elapsed = max(self.time_elapsed, start_time + duration)
        return self.compute_makespan()

    # -----------------------------
    # Genetic Programming Hyper-Heuristic (GPHH) Method
    # -----------------------------
    def schedule_from_weights(self, weights):
        self.completed_tasks = set()
        self.schedule = {}
        self.time_elapsed = 0
        ordering = []
        unscheduled = set(self.graph.nodes)
        feasible = [task for task in self.graph.nodes if self.graph.in_degree(task) == 0]
        while unscheduled:
            if not feasible:
                break
            scores = {}
            for task in feasible:
                duration = self.graph.nodes[task]['duration']
                resources = self.graph.nodes[task]['resources']
                score = weights[0]*duration + weights[1]*resources[0] + weights[2]*resources[1] + weights[3]*resources[2]
                scores[task] = score
            chosen = max(scores, key=scores.get)
            ordering.append(chosen)
            unscheduled.remove(chosen)
            feasible.remove(chosen)
            for succ in self.graph.successors(chosen):
                if succ in unscheduled and all(pred in ordering for pred in self.graph.predecessors(succ)):
                    feasible.append(succ)
        return self.schedule_from_ordering(ordering)

    def optimize_schedule_gphh(self, population_size=50, generations=100, mutation_prob=0.3):
        pop = [np.random.uniform(-10, 10, 4).tolist() for _ in range(population_size)]
        def fitness(weights):
            return self.schedule_from_weights(weights)
        best_weights = None
        best_fit = float('inf')
        for g in range(generations):
            fit_values = [fitness(ind) for ind in pop]
            min_idx = np.argmin(fit_values)
            if fit_values[min_idx] < best_fit:
                best_fit = fit_values[min_idx]
                best_weights = pop[min_idx]
            sorted_pop = [ind for _, ind in sorted(zip(fit_values, pop), key=lambda pair: pair[0])]
            survivors = sorted_pop[:population_size//2]
            new_pop = survivors.copy()
            while len(new_pop) < population_size:
                parent = random.choice(survivors)
                child = parent.copy()
                for i in range(len(child)):
                    if random.random() < mutation_prob:
                        child[i] += np.random.normal(0, 1)
                        child[i] = max(min(child[i], 10), -10)
                new_pop.append(child)
            pop = new_pop
        return best_weights, best_fit

Training Agents

In [8]:
def run_agent(agent, env, num_episodes=15, agent_name='Agent'):
    episode_rewards = []
    episode_makespans = []
    episode_deviations = []
    best_makespan = float('inf')
    best_policy = None
    best_episode = -1
    cp_length = env.compute_critical_path_length()  # Compute the critical path length once
    for episode in range(num_episodes):
        state = env.reset()
        total_reward = 0
        done = False
        while not done:
            action = agent.select_action(state)
            next_state, reward, done = env.step(action)
            agent.store_experience(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward
            if not isinstance(agent, PPOAgent):
                agent.train()
        if isinstance(agent, PPOAgent):
            agent.train()
        if hasattr(agent, 'update_target_network'):
            agent.update_target_network()
        makespan = env.compute_makespan()
        deviation = makespan - cp_length
        episode_rewards.append(total_reward)
        episode_makespans.append(makespan)
        episode_deviations.append(deviation)
        eps_str = f"{agent.epsilon:.4f}" if hasattr(agent, 'epsilon') else "N/A"
        print(f"{agent_name} Episode {episode+1}/{num_episodes} - Makespan: {makespan}, Deviation: {deviation}, Total Reward: {total_reward}, Epsilon: {eps_str}")
        if makespan < best_makespan:
            best_makespan = makespan
            best_episode = episode + 1
            if hasattr(agent, 'q_network'):
                best_policy = agent.q_network.state_dict().copy()
            elif hasattr(agent, 'policy'):
                best_policy = agent.policy.state_dict().copy()
        # Update epsilon only once per episode
        if hasattr(agent, 'update_epsilon'):
            agent.update_epsilon()
    avg_deviation = np.mean(episode_deviations)
    print(f"Best {agent_name} policy found in Episode {best_episode} with Makespan: {best_makespan}")
    print(f"Average deviation from critical path ({cp_length}): {avg_deviation}")
    return episode_rewards, episode_makespans, best_policy, avg_deviation


First Try With 20 Activities and 3 Resources

In [9]:
if __name__ == '__main__':
    num_episodes = 15
    state_dim = 5  # Each node: [duration, resource1, resource2, resource3, is_scheduled]
    num_tasks = 20
    resource_limits = [10, 10, 10]
    action_dim = num_tasks

    # DQN Agent
    print("=== Training DQN Agent ===")
    env_dqn = RCPSPEnv(num_tasks=num_tasks, resource_limits=resource_limits)
    dqn_agent = DQNAgent(state_dim, action_dim)
    dqn_rewards, dqn_makespans, best_dqn_policy, avg_deviation_dqn = run_agent(dqn_agent, env_dqn, num_episodes, agent_name='DQN')

    # DDQN Agent
    print("\n=== Training DDQN Agent ===")
    env_ddqn = RCPSPEnv(num_tasks=num_tasks, resource_limits=resource_limits)
    ddqn_agent = DDQNAgent(state_dim, action_dim)
    ddqn_rewards, ddqn_makespans, best_ddqn_policy, avg_deviation_ddqn = run_agent(ddqn_agent, env_ddqn, num_episodes, agent_name='DDQN')

    # PPO Agent
    print("\n=== Training PPO Agent ===")
    env_ppo = RCPSPEnv(num_tasks=num_tasks, resource_limits=resource_limits)
    ppo_agent = PPOAgent(state_dim, action_dim)
    ppo_rewards, ppo_makespans, best_ppo_policy, avg_deviation_ppo = run_agent(ppo_agent, env_ppo, num_episodes, agent_name='PPO')

    print("\n=== RL Agents Summary ===")
    print(f"DQN Average Makespan: {np.mean(dqn_makespans):.2f}, Average Deviation: {avg_deviation_dqn:.2f}")
    print(f"DDQN Average Makespan: {np.mean(ddqn_makespans):.2f}, Average Deviation: {avg_deviation_ddqn:.2f}")
    print(f"PPO Average Makespan: {np.mean(ppo_makespans):.2f}, Average Deviation: {avg_deviation_ppo:.2f}")

    # Genetic Programming Hyper-Heuristic Optimization
    print("\n=== GPHH Optimization ===")
    env_gphh = RCPSPEnv(num_tasks=num_tasks, resource_limits=resource_limits)
    best_weights, best_makespan = env_gphh.optimize_schedule_gphh(population_size=50, generations=100, mutation_prob=0.3)
    cp_length = env_gphh.compute_critical_path_length()
    deviation_gphh = best_makespan - cp_length
    print(f"Best makespan from GPHH: {best_makespan}")
    print(f"Best weight vector: {best_weights}")
    print(f"Deviation from critical path ({cp_length}): {deviation_gphh}")

=== Training DQN Agent ===
DQN Episode 1/15 - Makespan: 82, Deviation: 38, Total Reward: -292, Epsilon: 1.0000
DQN Episode 2/15 - Makespan: 84, Deviation: 40, Total Reward: -464, Epsilon: 0.9950
DQN Episode 3/15 - Makespan: 78, Deviation: 34, Total Reward: -538, Epsilon: 0.9900
DQN Episode 4/15 - Makespan: 86, Deviation: 42, Total Reward: -1386, Epsilon: 0.9851
DQN Episode 5/15 - Makespan: 83, Deviation: 39, Total Reward: -463, Epsilon: 0.9801
DQN Episode 6/15 - Makespan: 74, Deviation: 30, Total Reward: -594, Epsilon: 0.9752
DQN Episode 7/15 - Makespan: 79, Deviation: 35, Total Reward: -519, Epsilon: 0.9704
DQN Episode 8/15 - Makespan: 80, Deviation: 36, Total Reward: -350, Epsilon: 0.9655
DQN Episode 9/15 - Makespan: 80, Deviation: 36, Total Reward: -1560, Epsilon: 0.9607
DQN Episode 10/15 - Makespan: 76, Deviation: 32, Total Reward: -466, Epsilon: 0.9559
DQN Episode 11/15 - Makespan: 89, Deviation: 45, Total Reward: -779, Epsilon: 0.9511
DQN Episode 12/15 - Makespan: 76, Deviation: 

Second Try With 30 Activities and 4 Resources

In [None]:
if __name__ == '__main__':
    num_episodes = 15
    state_dim = 6  # Each node: [duration, resource1, resource2, resource3, is_scheduled]
    num_tasks = 30
    resource_limits = [12, 6, 10, 8]
    action_dim = num_tasks

    # DQN Agent
    print("=== Training DQN Agent ===")
    env_dqn = RCPSPEnv(num_tasks=num_tasks, resource_limits=resource_limits)
    dqn_agent = DQNAgent(state_dim, action_dim)
    dqn_rewards, dqn_makespans, best_dqn_policy, avg_deviation_dqn = run_agent(dqn_agent, env_dqn, num_episodes, agent_name='DQN')

    # DDQN Agent
    print("\n=== Training DDQN Agent ===")
    env_ddqn = RCPSPEnv(num_tasks=num_tasks, resource_limits=resource_limits)
    ddqn_agent = DDQNAgent(state_dim, action_dim)
    ddqn_rewards, ddqn_makespans, best_ddqn_policy, avg_deviation_ddqn = run_agent(ddqn_agent, env_ddqn, num_episodes, agent_name='DDQN')

    # PPO Agent
    print("\n=== Training PPO Agent ===")
    env_ppo = RCPSPEnv(num_tasks=num_tasks, resource_limits=resource_limits)
    ppo_agent = PPOAgent(state_dim, action_dim)
    ppo_rewards, ppo_makespans, best_ppo_policy, avg_deviation_ppo = run_agent(ppo_agent, env_ppo, num_episodes, agent_name='PPO')

    print("\n=== RL Agents Summary ===")
    print(f"DQN Average Makespan: {np.mean(dqn_makespans):.2f}, Average Deviation: {avg_deviation_dqn:.2f}")
    print(f"DDQN Average Makespan: {np.mean(ddqn_makespans):.2f}, Average Deviation: {avg_deviation_ddqn:.2f}")
    print(f"PPO Average Makespan: {np.mean(ppo_makespans):.2f}, Average Deviation: {avg_deviation_ppo:.2f}")

    # Genetic Programming Hyper-Heuristic Optimization
    print("\n=== GPHH Optimization ===")
    env_gphh = RCPSPEnv(num_tasks=num_tasks, resource_limits=resource_limits)
    best_weights, best_makespan = env_gphh.optimize_schedule_gphh(population_size=50, generations=100, mutation_prob=0.3)
    cp_length = env_gphh.compute_critical_path_length()
    deviation_gphh = best_makespan - cp_length
    print(f"Best makespan from GPHH: {best_makespan}")
    print(f"Best weight vector: {best_weights}")
    print(f"Deviation from critical path ({cp_length}): {deviation_gphh}")

=== Training DQN Agent ===
DQN Episode 1/15 - Makespan: 163, Deviation: 98, Total Reward: -1063, Epsilon: 1.0000
DQN Episode 2/15 - Makespan: 166, Deviation: 101, Total Reward: -806, Epsilon: 0.9950
DQN Episode 3/15 - Makespan: 164, Deviation: 99, Total Reward: -894, Epsilon: 0.9900
DQN Episode 4/15 - Makespan: 163, Deviation: 98, Total Reward: -1383, Epsilon: 0.9851
DQN Episode 5/15 - Makespan: 175, Deviation: 110, Total Reward: -795, Epsilon: 0.9801
DQN Episode 6/15 - Makespan: 162, Deviation: 97, Total Reward: -1742, Epsilon: 0.9752
DQN Episode 7/15 - Makespan: 168, Deviation: 103, Total Reward: -978, Epsilon: 0.9704
DQN Episode 8/15 - Makespan: 170, Deviation: 105, Total Reward: -1130, Epsilon: 0.9655
DQN Episode 9/15 - Makespan: 176, Deviation: 111, Total Reward: -1006, Epsilon: 0.9607
DQN Episode 10/15 - Makespan: 171, Deviation: 106, Total Reward: -861, Epsilon: 0.9559
DQN Episode 11/15 - Makespan: 165, Deviation: 100, Total Reward: -1055, Epsilon: 0.9511
DQN Episode 12/15 - Mak