In [12]:
from linearApprox import *

class QuantumNetworkQLearning:
    """Q-learning agent whose action values come from a ``LinearQApproximator``.

    A state is a pair: its first element is the entanglement state (built in
    ``train`` as a list of ``(edge, -1)`` tuples), and the pair itself is
    produced by ``get_augmented_state`` from that entanglement state plus a
    per-goal EDR mapping.  Actions are unpacked in ``train`` as
    ``(consumed_edges, goal)`` pairs.

    Exploration is softmax sampling over Q-values when ``softmax`` is truthy,
    and epsilon-greedy otherwise.
    """

    def __init__(self, edges, goalEdges, pSwap, pGen, maxAge, alpha, gamma, epsilon, softmax, temperature, temperature_decay):
        """Store the network description and the learning hyperparameters.

        Args:
            edges: Edges of the network; every episode starts with each of
                them paired with age ``-1``.
            goalEdges: Goal edges; also used as the goal ordering when states
                are augmented and featurized.
            pSwap: Swap probability; an action consuming ``k`` edges succeeds
                with probability ``pSwap ** (k - 1)``.
            pGen: Forwarded to ``generateEntanglement`` on every step.
            maxAge: Forwarded to ``ageEntanglements`` on every step.
            alpha: Learning rate forwarded to ``Q.update``.
            gamma: Discount factor applied to the best next Q-value.
            epsilon: Probability of a uniformly random action under
                epsilon-greedy exploration.
            softmax: When truthy, training actions are sampled from a softmax
                over Q-values instead of epsilon-greedy.
            temperature: Softmax temperature.
            temperature_decay: Per-episode multiplicative decay applied to the
                temperature (floored at 0.01); a falsy value disables decay.
        """
        # Network description.
        self.edges = edges
        self.goalEdges = goalEdges
        self.pSwap = pSwap
        self.pGen = pGen
        self.maxAge = maxAge

        # Learning and exploration hyperparameters.
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.softmax = softmax
        self.temperature = temperature
        self.temperature_decay = temperature_decay

        # One feature slot per edge plus one per goal edge.
        self.Q = LinearQApproximator(feature_size=len(edges) + len(goalEdges))

        self.goal_success_queues = self._fresh_success_queues()

    def _fresh_success_queues(self):
        """Return a new mapping from every goal edge to its own empty list."""
        return {goal: [] for goal in self.goalEdges}

    def _sample_success(self, action):
        """Draw whether ``action`` succeeds; actions without a goal or edges never do."""
        consumed_edges, goal = action
        if goal is None or not consumed_edges:
            return False
        # One swap is needed for each consumed edge beyond the first.
        return random.random() < self.pSwap ** (len(consumed_edges) - 1)

    def _edr_snapshot(self):
        """Return the mean of each goal's success queue (0.0 for an empty queue)."""
        snapshot = {}
        for goal in self.goalEdges:
            outcomes = self.goal_success_queues[goal]
            snapshot[goal] = sum(outcomes) / max(1, len(outcomes))
        return snapshot

    def choose_action(self, state, training=True):
        """Pick an action for ``state``.

        With ``training`` true the agent explores (softmax sampling if
        ``self.softmax`` is truthy, epsilon-greedy otherwise); with
        ``training`` false it always returns the highest-scoring action.

        TODO: consider dropping the ``training`` parameter; ``train`` always
        passes ``True``.
        """
        ent_state, _ = state
        candidates = getPossibleActions(ent_state, self.goalEdges)
        state_features = featurize_state(state, self.goalEdges)

        # Every candidate is scored up front, whichever branch is taken below.
        scored = [
            (candidate, self.Q.get_q_value(state_features, candidate))
            for candidate in candidates
        ]

        if training and self.softmax:
            values = [value for _, value in scored]
            weights = softmax_probs(values, self.temperature)
            actions = [candidate for candidate, _ in scored]
            return random.choices(actions, weights=weights)[0]

        if training and random.random() < self.epsilon:
            return random.choice(candidates)

        best_action, _ = max(scored, key=lambda pair: pair[1])
        return best_action

    def train(self, num_episodes=10, max_steps=1000, plot=False):
        """Run ``num_episodes`` episodes of ``max_steps`` Q-learning steps each.

        Every episode restarts from all edges at age ``-1``, zero EDRs and
        empty success queues.  ``plot`` is accepted but not used here.
        """
        for episode_number in range(1, num_episodes + 1):
            initial_ent_state = [(edge, -1) for edge in self.edges]
            initial_edrs = {goal: 0.0 for goal in self.goalEdges}
            state = get_augmented_state(initial_ent_state, initial_edrs, goal_order=self.goalEdges)

            self.goal_success_queues = self._fresh_success_queues()

            # The timestep handed to getReward is 1-based.
            for total_timesteps in range(1, max_steps + 1):
                action = self.choose_action(state, training=True)

                # Environment transition: act, then age, then regenerate.
                next_state = generateEntanglement(
                    ageEntanglements(performAction(action, state), self.maxAge),
                    self.pGen,
                )

                # Success is drawn before the reward; the success queues are
                # not touched here (presumably getReward maintains them -
                # confirm against its implementation).
                success = self._sample_success(action)

                reward = getReward(
                    action=action,
                    goal_success_queues=self.goal_success_queues,
                    total_timesteps=total_timesteps,
                    pSwap=self.pSwap,
                    success=success,
                )

                # Re-augment the new entanglement state with the current EDRs.
                next_state = get_augmented_state(
                    next_state[0], self._edr_snapshot(), goal_order=self.goalEdges
                )

                # Q-learning update towards reward + gamma * max_a' Q(s', a').
                features = featurize_state(state, self.goalEdges)
                next_features = featurize_state(next_state, self.goalEdges)
                next_actions = getPossibleActions(next_state[0], self.goalEdges)
                best_next_q = max(
                    (self.Q.get_q_value(next_features, candidate) for candidate in next_actions),
                    default=0.0,
                )
                self.Q.update(features, action, reward + self.gamma * best_next_q, self.alpha)

                state = next_state

            # Optional per-episode temperature decay, floored at 0.01.
            if self.softmax and self.temperature_decay:
                decayed = self.temperature * self.temperature_decay
                self.temperature = max(0.01, decayed)

            # Progress report every tenth episode.
            if episode_number % 10 == 0:
                print(f"Episode {episode_number}")

# === Q-Learning Wrapper for Experiment Framework ===
def train_q_learning_policy(edges, goal_edges, p_swap, p_gen, max_age, seed=None, **kwargs):
    """Train a ``QuantumNetworkQLearning`` agent and return its Q approximator.

    Both ``random`` and ``numpy.random`` are seeded with ``seed`` first.

    Recognised ``kwargs`` and their defaults: ``alpha`` (0.1), ``gamma``
    (0.99), ``epsilon`` (0.001), ``softmax`` (False), ``temperature`` (1.0),
    ``temperature_decay`` (0.9), ``num_episodes`` (5), ``max_steps`` (1000).
    Any other keyword is ignored.

    Returns:
        The trained agent's ``Q`` attribute.
    """
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)

    # Agent hyperparameters: caller-supplied value, else the default.
    agent_defaults = {
        "alpha": 0.1,
        "gamma": 0.99,
        "epsilon": 0.001,
        "softmax": False,
        "temperature": 1.0,
        "temperature_decay": 0.9,
    }
    agent_options = {
        option: kwargs.get(option, fallback)
        for option, fallback in agent_defaults.items()
    }

    learner = QuantumNetworkQLearning(
        edges=edges,
        goalEdges=goal_edges,
        pSwap=p_swap,
        pGen=p_gen,
        maxAge=max_age,
        **agent_options,
    )

    episode_count = kwargs.get("num_episodes", 5)
    step_limit = kwargs.get("max_steps", 1000)
    learner.train(num_episodes=episode_count, max_steps=step_limit, plot=False)

    return learner.Q


In [None]:
# === Setup parameters ===
# Network: a five-node chain with two goal edges.
edges = [(0, 1), (1, 2), (2, 3), (3, 4)]
goal_edges = [(0, 2), (1, 4)]
pSwap = 0.6
pGen = 0.6
maxAge = 2

# Training hyperparameters.
num_episodes = 30
max_steps = 100000
epsilon = 0.05
temperature = 3
temperature_decay = 0.99

# Network arguments shared by the training and simulation calls.
network_args = dict(
    edges=edges,
    goal_edges=goal_edges,
    p_swap=pSwap,
    p_gen=pGen,
    max_age=maxAge,
)

# === Train Q-Learning agent ===
# softmax is off, so the agent explores epsilon-greedily.
training_args = dict(
    alpha=0.1,
    gamma=0.99,
    epsilon=epsilon,
    num_episodes=num_episodes,
    max_steps=max_steps,
    softmax=False,
    temperature=temperature,
    temperature_decay=temperature_decay,
)
Q = train_q_learning_policy(seed=42, **network_args, **training_args)

# === Simulate policy (with EDR tracking) ===
simulate_policy(
    Q_table=Q,
    num_steps=100000,
    edr_window_size=1000,
    plot=True,
    **network_args,
)
print('Done!')

Episode 10


In [None]:
# === Setup parameters ===
# Sweep the generation probability for the Q-learning policy on a five-node
# chain, training and evaluating several times per value.
edges = [(0, 1), (1,2), (2,3), (3,4)]
# NOTE(review): the second goal is written (4, 1) here but (1, 4) in the
# previous cell - confirm whether the endpoint order matters downstream.
goal_edges = [(0, 2), (4, 1)]
pSwap = 0.6
pGen = 0.6
maxAge = 2

compare_policies_across_param(
    policy_name="Q-Learning",
    policy_train_fn=train_q_learning_policy,
    param_name="pGen",
    # 0.1 to 1.0. Dividing by 10 gives the correctly rounded decimals; the
    # product 0.1 * i yields artifacts such as 0.30000000000000004, which
    # then show up in the sweep values and their printed labels.
    param_values=[i / 10 for i in range(1, 11)],
    edges=edges,
    goal_edges=goal_edges,
    p_gen=pGen,  # gets overridden
    p_swap=pSwap,
    max_age=maxAge,
    train_kwargs={
        "alpha": 0.1,
        "gamma": 0.99,
        "epsilon": 0.1,
        "num_episodes": 10,
        "max_steps": 10000,
        "temperature": 1.0,
        "temperature_decay": 0.98,
        "softmax": False
    },
    validate_kwargs={},  # optional
    plot=True,
    num_runs=5,
    num_steps=7000,
    num_simulations=5
)
print('Done')


=== Evaluating Q-Learning for pGen = 0.1 ===

=== Q-Learning (pGen=0.1) Policy Training Run 1 ===
Episode 10

=== Q-Learning (pGen=0.1) Policy Training Run 2 ===
Episode 10

=== Q-Learning (pGen=0.1) Policy Training Run 3 ===
Episode 10

=== Q-Learning (pGen=0.1) Policy Training Run 4 ===
Episode 10

=== Q-Learning (pGen=0.1) Policy Training Run 5 ===
Episode 10

=== Evaluating Q-Learning for pGen = 0.2 ===

=== Q-Learning (pGen=0.2) Policy Training Run 1 ===
Episode 10

=== Q-Learning (pGen=0.2) Policy Training Run 2 ===
Episode 10

=== Q-Learning (pGen=0.2) Policy Training Run 3 ===
Episode 10

=== Q-Learning (pGen=0.2) Policy Training Run 4 ===
Episode 10

=== Q-Learning (pGen=0.2) Policy Training Run 5 ===
Episode 10

=== Evaluating Q-Learning for pGen = 0.30000000000000004 ===

=== Q-Learning (pGen=0.30000000000000004) Policy Training Run 1 ===
Episode 10

=== Q-Learning (pGen=0.30000000000000004) Policy Training Run 2 ===
Episode 10

=== Q-Learning (pGen=0.30000000000000004) Pol

KeyboardInterrupt: 