In [None]:
from linearApprox import *
import math
import random
import numpy as np

class QuantumNetworkQLearning:
    """Q-learning agent backed by a linear Q-function approximator.

    The environment dynamics (``performAction``, ``ageEntanglements``,
    ``generateEntanglement``), the featurisation and the reward function are
    all provided by the ``linearApprox`` helpers; this class only wires them
    into an action-selection rule and a one-step TD training loop.

    Args:
        edges: Physical edges of the network; each starts every episode
            paired with age ``-1``.
        goalEdges: End-to-end goal pairs the agent may serve.
        pSwap: Per-swap success probability. An action consuming ``k`` edges
            succeeds with probability ``pSwap ** (k - 1)``.
        pGen: Passed to ``generateEntanglement`` on every step.
        maxAge: Passed to ``ageEntanglements`` on every step.
        alpha: Step size handed to ``Q.update``.
        gamma: Discount applied to the best next-state Q-value.
        epsilon: Probability of picking a uniformly random action when
            softmax selection is disabled.
        softmax: If truthy, sample actions from ``softmax_probs`` instead of
            using epsilon-greedy selection.
        temperature: Softmax temperature.
        temperature_decay: Multiplicative per-episode temperature decay
            (only applied when ``softmax`` is truthy; floor of 0.01).
        reward_mode, reward_alpha, reward_epsilon: Forwarded to
            ``compute_reward`` as ``mode``, ``alpha`` and ``epsilon``.
    """

    def __init__(
        self,
        edges,
        goalEdges,
        pSwap,
        pGen,
        maxAge,
        alpha,
        gamma,
        epsilon,
        softmax,
        temperature,
        temperature_decay,
        reward_mode="basic",
        reward_alpha=0.5,
        reward_epsilon=0.001
    ):
        # Environment description.
        self.edges = edges
        self.goalEdges = goalEdges
        self.pSwap = pSwap
        self.pGen = pGen
        self.maxAge = maxAge

        # Learning and exploration hyperparameters.
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.softmax = softmax
        self.temperature = temperature
        self.temperature_decay = temperature_decay

        # Reward shaping options forwarded verbatim to compute_reward.
        self.reward_mode = reward_mode
        self.reward_alpha = reward_alpha
        self.reward_epsilon = reward_epsilon

        # One feature slot per physical edge plus one per goal.
        self.Q = LinearQApproximator(feature_size=len(edges) + len(goalEdges))

        # Per-goal history lists; reset at the start of every episode.
        self.goal_success_queues = {goal: [] for goal in self.goalEdges}

    def choose_action(self, state):
        """Pick an action for ``state`` (an ``(ent_state, extra)`` pair).

        Uses softmax sampling over Q-values when ``self.softmax`` is truthy,
        otherwise epsilon-greedy selection with ties resolved in favour of
        the earliest candidate.
        """
        entanglements, _ = state
        candidates = getPossibleActions(entanglements, self.goalEdges)
        phi = featurize_state(state, self.goalEdges)

        actions, values = [], []
        for candidate in candidates:
            actions.append(candidate)
            values.append(self.Q.get_q_value(phi, candidate))

        if self.softmax:
            weights = softmax_probs(values, self.temperature)
            return random.choices(actions, weights=weights)[0]

        explore = random.random() < self.epsilon
        if explore:
            return random.choice(candidates)

        best_index = max(range(len(actions)), key=lambda i: values[i])
        return actions[best_index]

    def train(self, num_episodes=10, max_steps=1000, plot=False):
        """Run one-step Q-learning and record the absolute TD errors.

        Args:
            num_episodes: Number of episodes; each restarts from a network
                where every edge has age ``-1`` and every goal EDR is 0.0.
            max_steps: Environment steps per episode.
            plot: Accepted for interface compatibility; not used here.

        Returns:
            ``(q_value_diffs, q_value_diffs_per_goal)`` where the first is a
            flat list of ``|target - Q(s, a)|`` for every step, and the second
            maps each goal to a list of the same length holding that error on
            steps whose action targeted the goal (with edges consumed) and
            ``np.nan`` otherwise.
        """
        all_td_errors = []
        td_errors_by_goal = {goal: [] for goal in self.goalEdges}

        for episode_idx in range(num_episodes):
            fresh_links = [(edge, -1) for edge in self.edges]
            zero_rates = dict.fromkeys(self.goalEdges, 0.0)
            state = get_augmented_state(fresh_links, zero_rates, goal_order=self.goalEdges)

            self.goal_success_queues = {goal: [] for goal in self.goalEdges}

            # total_timesteps is 1-based, matching what compute_reward receives.
            for total_timesteps in range(1, max_steps + 1):
                action = self.choose_action(state)

                # Environment transition: act, then age, then regenerate.
                successor = generateEntanglement(
                    ageEntanglements(performAction(action, state), self.maxAge),
                    self.pGen,
                )

                consumed_edges, goal = action
                if goal is None or not consumed_edges:
                    success = False
                else:
                    # k consumed edges require k - 1 swaps to all succeed.
                    threshold = self.pSwap ** (len(consumed_edges) - 1)
                    success = random.random() < threshold

                reward = compute_reward(
                    action=action,
                    goal_success_queues=self.goal_success_queues,
                    total_timesteps=total_timesteps,
                    pSwap=self.pSwap,
                    mode=self.reward_mode,
                    alpha=self.reward_alpha,
                    epsilon=self.reward_epsilon,
                    success=success
                )

                # Mean of each goal's history (0.0 while the history is empty).
                edr_snapshot = {}
                for g in self.goalEdges:
                    history = self.goal_success_queues[g]
                    edr_snapshot[g] = sum(history) / max(1, len(history))
                successor = get_augmented_state(successor[0], edr_snapshot, goal_order=self.goalEdges)

                phi = featurize_state(state, self.goalEdges)
                phi_next = featurize_state(successor, self.goalEdges)
                next_candidates = getPossibleActions(successor[0], self.goalEdges)

                best_next_q = max(
                    (self.Q.get_q_value(phi_next, a) for a in next_candidates),
                    default=0.0,
                )
                target = reward + self.gamma * best_next_q
                td_error = abs(target - self.Q.get_q_value(phi, action))
                all_td_errors.append(td_error)

                # Keep every per-goal series aligned with the global one by
                # padding steps that did not serve the goal with NaN.
                for g in self.goalEdges:
                    served = g == goal and consumed_edges
                    td_errors_by_goal[g].append(td_error if served else np.nan)

                self.Q.update(phi, action, target, self.alpha)

                state = successor

            if self.softmax and self.temperature_decay:
                self.temperature = max(0.01, self.temperature * self.temperature_decay)

            if (episode_idx + 1) % 10 == 0:
                print(f"Episode {episode_idx + 1}")

        return all_td_errors, td_errors_by_goal





# === Q-Learning Wrapper for Experiment Framework ===
def train_q_learning_policy(edges, goal_edges, p_swap, p_gen, max_age, seed=None, plot_q_convergence=False, **kwargs):
    """Train a ``QuantumNetworkQLearning`` agent and return its Q approximator.

    Args:
        edges, goal_edges, p_swap, p_gen, max_age: Environment description,
            forwarded to the agent constructor.
        seed: Seed for both ``random`` and ``numpy.random``.
        plot_q_convergence: If truthy, call ``plot_q_value_convergence`` on
            the recorded TD errors after training.
        **kwargs: Optional hyperparameters. Recognised keys are ``alpha``,
            ``gamma``, ``epsilon``, ``softmax``, ``temperature``,
            ``temperature_decay``, ``reward_mode``, ``reward_alpha``,
            ``reward_epsilon``, ``num_episodes`` and ``max_steps``; any other
            key is ignored.

    Returns:
        The trained agent's ``Q`` attribute.
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

    # Fallbacks used for any hyperparameter the caller does not supply.
    hyper_defaults = {
        "alpha": 0.1,
        "gamma": 0.99,
        "epsilon": 0.001,
        "softmax": False,
        "temperature": 1.0,
        "temperature_decay": 0.9,
        "reward_mode": "basic",
        "reward_alpha": 0.5,
        "reward_epsilon": 0.001,
    }
    hyper = {name: kwargs.get(name, fallback) for name, fallback in hyper_defaults.items()}

    agent = QuantumNetworkQLearning(
        edges=edges,
        goalEdges=goal_edges,
        pSwap=p_swap,
        pGen=p_gen,
        maxAge=max_age,
        **hyper
    )

    # The diffs are only consumed by the optional convergence plot below.
    q_diffs, q_diffs_per_goal = agent.train(
        num_episodes=kwargs.get("num_episodes", 5),
        max_steps=kwargs.get("max_steps", 1000),
        plot=True
    )

    if plot_q_convergence:
        plot_q_value_convergence(q_diffs, q_diffs_per_goal)

    return agent.Q


# **Get Plotting**


In [None]:
# Bootstrap evaluation of the Q-learning policy on the 5-edge network.
# Requires bootstrap_policy_runs() and train_q_learning_policy to be defined
# already (bootstrap_policy_runs is not defined in this notebook export).

bootstrap_policy_runs(
    policy_train_fn=train_q_learning_policy,
    policy_name="Q-Learning",
    edges=[(0, 1), (1, 3), (2, 3), (3, 4), (4, 5)],
    goal_edges=[(0, 5), (2, 4)],
    p_swap=0.7,
    p_gen=0.7,
    max_age=3,
    num_runs=10,         # How many seeds to test across
    num_steps=200000,    # Length of each simulation (policy validation)
    train_kwargs={
        "nLookahead": 3,       # NOTE(review): train_q_learning_policy never reads this key; it is ignored if forwarded as-is
        "epsilon": 0.05,       # only consulted when "softmax" is False
        "gamma": 0.995,
        "alpha": 0.1,
        "softmax": True,
        "temperature": 3,
        "temperature_decay": 0.995,
        "reward_mode": "basic",
        "reward_alpha": 1,
        "reward_epsilon": 0.001
    },
    window=1000,          # Averaging window for final Jain/throughput stats
    plot=True             # Set to False if you just want the data back
)


In [None]:
def compare_reward_modes_across_param_full_qlearning(
    policy_train_fn,
    reward_modes,
    mode_labels,
    param_name,
    param_values,
    edges,
    goal_edges,
    p_gen,
    p_swap,
    max_age,
    num_runs,
    num_steps,
    num_simulations,
    base_train_kwargs,
    validate_kwargs=None
):
    """Sweep one environment parameter for several reward modes and plot the results.

    For every reward mode and every value in ``param_values`` the policy is
    trained and evaluated through ``run_policy_experiments``; the mean Jain
    index and mean total throughput (sum of per-goal EDRs) are then drawn in
    three panels: fairness vs parameter, throughput vs parameter, and a
    throughput/fairness curve.

    Args:
        policy_train_fn: Training function handed to ``run_policy_experiments``.
        reward_modes: Values written to ``train_kwargs['reward_mode']``.
        mode_labels: Display label for each entry of ``reward_modes``
            (indexed in parallel).
        param_name: ``'pGen'`` or ``'pSwap'`` - the parameter being swept.
        param_values: Values the swept parameter takes.
        edges, goal_edges, max_age: Network description.
        p_gen, p_swap: Fixed value used for whichever parameter is not swept.
        num_runs, num_steps, num_simulations: Forwarded to
            ``run_policy_experiments``; ``num_runs`` is also the number of
            per-goal EDR entries read back from the results.
        base_train_kwargs: Training kwargs; copied (shallowly) per run so the
            caller's dict is never modified.
        validate_kwargs: Extra kwargs for validation. ``None`` (the default)
            means an empty dict, created fresh on every call.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Imported here so the function does not depend on which notebook cell
    # happened to import pyplot first.
    import matplotlib.pyplot as plt

    assert param_name in ['pGen', 'pSwap'], "param_name must be either 'pGen' or 'pSwap'"

    # Avoid the shared-mutable-default pitfall: each call gets its own dict.
    if validate_kwargs is None:
        validate_kwargs = {}

    color_map = plt.get_cmap("tab10")
    fig, axs = plt.subplots(1, 3, figsize=(22, 6))
    ax_jain, ax_tp, ax_pareto = axs

    for i, mode in enumerate(reward_modes):
        avg_jains = []
        avg_throughputs = []

        for param_val in param_values:
            # Only the swept parameter changes; the other keeps its fixed value.
            curr_p_gen = param_val if param_name == 'pGen' else p_gen
            curr_p_swap = param_val if param_name == 'pSwap' else p_swap

            train_kwargs = base_train_kwargs.copy()
            train_kwargs['reward_mode'] = mode

            print(f"\n→ {mode_labels[i]} | {param_name} = {param_val}")
            results = run_policy_experiments(
                train_policy_fn=policy_train_fn,
                policy_name=f"{mode_labels[i]} | {param_name}={param_val}",
                edges=edges,
                goal_edges=goal_edges,
                p_gen=curr_p_gen,
                p_swap=curr_p_swap,
                max_age=max_age,
                num_runs=num_runs,
                num_steps=num_steps,
                num_simulations=num_simulations,
                train_kwargs=train_kwargs,
                validate_kwargs=validate_kwargs,
                plot=False
            )

            jains = results['jains']
            # Total throughput of a run = sum of that run's EDR over all goals.
            throughputs = [
                sum(results['edrs'][goal][run_i] for goal in goal_edges)
                for run_i in range(num_runs)
            ]

            avg_jains.append(np.mean(jains))
            avg_throughputs.append(np.mean(throughputs))

        ax_jain.plot(param_values, avg_jains, label=mode_labels[i], marker='o', linestyle='-', color=color_map(i))
        ax_tp.plot(param_values, avg_throughputs, label=mode_labels[i], marker='o', linestyle='-', color=color_map(i))
        ax_pareto.plot(avg_throughputs, avg_jains, label=mode_labels[i], marker='o', linestyle='-', color=color_map(i))

    ax_jain.set_title(f"Fairness vs {param_name}")
    ax_jain.set_xlabel(param_name)
    ax_jain.set_ylabel("Jain's Index")
    ax_jain.set_ylim(0.45, 1.05)
    ax_jain.grid(True)
    ax_jain.legend()

    ax_tp.set_title(f"Throughput vs {param_name}")
    ax_tp.set_xlabel(param_name)
    ax_tp.set_ylabel("Total Throughput")
    ax_tp.grid(True)
    ax_tp.legend()

    ax_pareto.set_title("Pareto Curve (Throughput vs Fairness)")
    ax_pareto.set_xlabel("Total Throughput")
    ax_pareto.set_ylabel("Jain's Index")
    ax_pareto.set_xlim(0, 1.05)
    ax_pareto.set_ylim(0.45, 1.05)
    ax_pareto.grid(True)
    ax_pareto.legend()

    plt.tight_layout()
    plt.show()

#####################################################################
# Now call the Q-learning version
#####################################################################

from linearApprox import *
# Edge lists and goal pairs for the two networks used in the sweeps below.
mainEdge = [(0, 1), (1, 3), (2, 3), (3, 4), (4, 5)]
mainGoals = [(0, 5), (2, 4)]

sparseEdge = [(0, 1), (1, 2), (2, 3), (2, 4), (3, 5), (5, 6)]
sparseGoals = [(0, 6), (0, 4)]

# Values taken by whichever parameter (pSwap or pGen) is being swept.
param_values = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

# Training hyperparameters shared by every sweep; reward_mode is filled in
# per run by compare_reward_modes_across_param_full_qlearning.
base_kwargs_q = {
    "alpha": 0.1,
    "gamma": 0.995,
    "epsilon": 0.05,
    "num_episodes": 20,
    "max_steps": 5000,
    "temperature": 5,
    "temperature_decay": 0.98,
    "softmax": False,
    "reward_alpha": 1,
    "reward_epsilon": 0.001,
}


def _sweep_reward_modes(param_name, edges, goal_edges, max_age=3, num_runs=3):
    """Run the four-reward-mode comparison for one network and one swept parameter."""
    compare_reward_modes_across_param_full_qlearning(
        policy_train_fn=train_q_learning_policy,
        reward_modes=["basic", "partial", "logless", "without+1"],
        mode_labels=["Basic (Log, Full)", "Partial Reward", "Linear (Logless)", "Log w/o + 1"],
        param_name=param_name,
        param_values=param_values,
        edges=edges,
        goal_edges=goal_edges,
        p_gen=0.6,
        p_swap=0.6,
        max_age=max_age,
        num_runs=num_runs,
        num_steps=10000,
        num_simulations=3,
        base_train_kwargs=base_kwargs_q,
    )


print("SPARSE — Q-LEARNING")
_sweep_reward_modes("pSwap", sparseEdge, sparseGoals)
_sweep_reward_modes("pGen", sparseEdge, sparseGoals)

print("MAIN — Q-LEARNING")
_sweep_reward_modes("pSwap", mainEdge, mainGoals)
# This last sweep deliberately keeps its own max_age / num_runs settings.
_sweep_reward_modes("pGen", mainEdge, mainGoals, max_age=2, num_runs=4)


In [None]:
# Notebook-wide defaults read by later cells (single run, parameter sweeps).
# Several of these names are reassigned by other cells, so the values actually
# used depend on which cell was executed last.
edges = [(0,1), (1,3),(2,3), (3, 4), (4, 5), (4, 6)]
goal_edges = [(0, 5), (2, 4)]
pSwap = 0.5
pGen = 0.5
maxAge = 2
max_steps = 10000
num_episodes = 100
nLookahead = 3  # NOTE(review): not referenced by any Q-learning code visible in this notebook
epsilon = 0.05
gamma = 0.99
alpha = 0.1
edr_window_size=100  # NOTE(review): not referenced by the visible cells (simulate_policy gets a literal)
temperature = 5
temperature_decay = 0.995

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import random

# Define network and training parameters.
# NOTE: this cell rebinds notebook-wide names (edges, goal_edges, num_episodes,
# max_steps, epsilon, gamma, ...) that other cells also read.
edges = [(0, 1), (1, 3), (2, 3), (3, 4), (4, 5)]
goal_edges = [(0, 5), (2, 4)]
max_age_params = [1, 5, 10, 15, 20]

p = 0.7  # used for both pSwap and pGen below
num_episodes = 1
max_steps = 40000
epsilon = 0.05
gamma = 0.995
alpha = 0.1
softmax = False
reward_mode = 'basic'
reward_alpha = 1
reward_epsilon = 0.001
temperature = 5
temperature_decay = 0.995
seed = 10

# Seeded once, before the loop: the sweep as a whole is reproducible, but each
# maxAge run continues the RNG stream left by the previous one.
random.seed(seed)
np.random.seed(seed)

# maxAge -> (global q-value diffs, per-goal q-value diffs) as returned by train().
results = {}

for max_age in max_age_params:
    print(max_age)  # progress indicator
    agent = QuantumNetworkQLearning(
        edges=edges,
        goalEdges=goal_edges,
        pSwap=p,
        pGen=p,
        maxAge=max_age,
        alpha=alpha,
        gamma=gamma,
        epsilon=epsilon,
        softmax=softmax,
        temperature=temperature,
        temperature_decay=temperature_decay,
        reward_mode=reward_mode,
        reward_alpha=reward_alpha,
        reward_epsilon=reward_epsilon
    )

    q_value_diffs, q_diffs_per_goal = agent.train(
        num_episodes=num_episodes,
        max_steps=max_steps,
        plot=False
    )

    results[max_age] = (q_value_diffs, q_diffs_per_goal)

# --- Helpers ---
def interpolate_nans(data):
    """Return a float64 copy of 1-D ``data`` with NaNs filled by linear interpolation.

    Gaps are interpolated between the nearest non-NaN neighbours; leading and
    trailing NaNs take the nearest valid value. If every entry is NaN (or the
    input is empty) the copy is returned unchanged. The input is not modified.
    """
    values = np.array(data, dtype=np.float64)
    missing = np.isnan(values)
    if missing.all():
        return values

    positions = np.arange(len(values))
    known = ~missing
    values[missing] = np.interp(positions[missing], positions[known], values[known])
    return values

def smooth(data, window=1000):
    """Return a moving average of 1-D ``data`` with the same length as the input.

    Args:
        data: 1-D sequence of numbers.
        window: Width of the box filter. Clamped to ``[1, len(data)]`` so a
            window longer than the series cannot produce an output longer
            than the input.

    Returns:
        A float64 array of ``len(data)`` values. Near both ends the window
        hangs over the series and the missing samples count as zero, so edge
        values are biased towards zero. An empty input yields an empty array.
    """
    samples = np.asarray(data, dtype=np.float64)
    if samples.size == 0:
        # np.convolve raises on empty input; an empty series smooths to itself.
        return samples

    width = max(1, min(int(window), samples.size))
    kernel = np.ones(width) / width
    return np.convolve(samples, kernel, mode='same')

# --- Plotting ---
# Two stacked panels sharing the timestep axis: per-goal TD-error curves on
# top, the global TD-error curve below, one colour per maxAge setting.
fig, axs = plt.subplots(2, 1, figsize=(12, 10), sharex=True)
window = 2000
colors = ['tab:blue', 'tab:purple', 'tab:green', 'tab:red', 'tab:orange']

# --- Per-goal Q-value diffs (top plot) ---
for idx, max_age in enumerate(max_age_params):
    # Cycle through the palette so extra maxAge values cannot raise IndexError.
    color = colors[idx % len(colors)]
    q_diffs_per_goal = results[max_age][1]

    # Short Goal: NaN gaps (steps that did not serve this goal) are
    # interpolated before smoothing so the moving average stays finite.
    short_goal_smoothed = smooth(interpolate_nans(q_diffs_per_goal[(2, 4)]), window)
    axs[0].plot(short_goal_smoothed, label=f'Short Goal (maxAge={max_age})', linestyle='--', color=color)

    # Long Goal
    long_goal_smoothed = smooth(interpolate_nans(q_diffs_per_goal[(0, 5)]), window)
    axs[0].plot(long_goal_smoothed, label=f'Long Goal (maxAge={max_age})', linestyle='-', color=color)

axs[0].set_ylabel('Per-Goal Q-value Update')
axs[0].set_title('Smoothed Q-value Updates (Per Goal)')
axs[0].legend()
axs[0].grid(True)

# --- Global Q-value diffs (bottom plot) ---
for idx, max_age in enumerate(max_age_params):
    color = colors[idx % len(colors)]
    q_value_diffs = results[max_age][0]
    global_smoothed = smooth(q_value_diffs, window)
    axs[1].plot(global_smoothed, label=f'{max_age}', color=color)

axs[1].set_xlabel('Timestep')
axs[1].set_ylabel('Global Q-value Update')
axs[1].set_title('Smoothed Global Q-value Convergence')
axs[1].legend(title='maxAge')
axs[1].grid(True)

plt.tight_layout()
plt.show()


In [None]:
# Single Run: train one Q-learning policy, then evaluate it in simulation.
# All lower-case/camelCase names below are notebook globals set by other
# cells; several cells rebind them (e.g. edges, num_episodes, max_steps), so
# the values used here depend on which cell was executed last.
Q = train_q_learning_policy(
    seed = 10,
    edges=edges,
    goal_edges=goal_edges,
    p_swap=pSwap,
    p_gen=pGen,
    max_age=maxAge,
    alpha=alpha,
    gamma=gamma,
    epsilon=epsilon,
    num_episodes=num_episodes,
    max_steps=max_steps,
    softmax=False,
    temperature=temperature,
    temperature_decay=temperature_decay,
    reward_mode="basic",     
    reward_alpha=1,
    plot_q_convergence= True
)

# Q is the trained agent's Q approximator (what train_q_learning_policy
# returns); it is handed to simulate_policy under the name Q_table.
simulate_policy(
    Q_table=Q,
    edges=edges,
    goal_edges=goal_edges,
    p_swap=pSwap,
    p_gen=pGen,
    max_age=maxAge,
    num_steps=100000,
    edr_window_size=10000,
    plot=True
)

In [None]:
# Sweep values 0.1 .. 1.0. Dividing by 10 gives the float closest to each
# decimal, whereas 0.1 * i drifts (0.1 * 3 == 0.30000000000000004), which
# would leak into any label or printout built from these values.
param_values = [i / 10 for i in range(1, 11)]

# Same experiment twice: once sweeping pGen, once sweeping pSwap. The value
# passed for the swept parameter (p_gen or p_swap) is presumably replaced by
# each entry of param_values inside compare_policies_across_param.
for swept_param in ("pGen", "pSwap"):
    compare_policies_across_param(
        policy_name="Q-Learning",
        policy_train_fn=train_q_learning_policy,
        param_name=swept_param,
        param_values=param_values,
        edges=edges,
        goal_edges=goal_edges,
        p_gen=pGen,
        p_swap=pSwap,
        max_age=maxAge,
        # Built inside the loop so each call receives its own dict.
        train_kwargs={
            "alpha": 0.1,
            "gamma": 0.99,
            "epsilon": 0.1,
            "num_episodes": 10,
            "max_steps": 10000,
            "temperature": 1.0,
            "temperature_decay": 0.98,
            "softmax": False,
        },
        validate_kwargs={},  # optional
        plot=True,
        num_runs=10,
        num_steps=20000,
        num_simulations=10,
    )


In [None]:
# Alpha values to compare. NOTE(review): entries above 1 suggest these are
# reward_alpha settings rather than the learning rate ("alpha" in train_kwargs
# stays 0.1) - confirm against compare_alpha_vs_env_param.
alpha_vals = [0.1, 0.5, 1.0, 1.5, 2.0]


# pSwap values forming the second axis of the comparison.
pswap_vals = [0.4, 0.6, 0.8]

# edges, goal_edges, pGen, pSwap and maxAge are notebook globals from earlier cells.
compare_alpha_vs_env_param(
    policy_name="Q-Learning",
    policy_train_fn=train_q_learning_policy,
    param_name="pSwap",
    param_values=pswap_vals,
    alpha_values=alpha_vals,
    edges=edges,
    goal_edges=goal_edges,
    p_gen=pGen,
    p_swap=pSwap,
    max_age=maxAge,
    train_kwargs={
        "alpha": 0.1,
        "gamma": 0.99,
        "epsilon": 0.05,
        "softmax": False,
        "temperature": 3,
        "temperature_decay": 0.99,
        "num_episodes": 5,
        "max_steps": 30000
    },
    num_runs=3,  # Lower for fast debugging
    num_steps=5000,
    num_simulations=5,
    plot=True
)

In [None]:
# Alpha values swept by compare_policies_across_alpha.
alpha_vals = [0.1, 0.3, 0.5, 0.7, 0.9, 1, 1.1, 1.3, 1.5, 1.7, 1.9 ]
# Define network and training parameters
edges = [(0,1), (1,3),(2,3), (3, 4), (4, 5)]
goal_edges = [(0, 5), (2, 4)]
pSwap = 0.7
pGen = 0.7
maxAge = 2
# NOTE(review): epsilon, gamma, alpha, edr_window_size, temperature and
# temperature_decay below are not forwarded in this cell's call - train_kwargs
# carries its own literals (e.g. gamma 0.99 vs 0.995 here). Confirm which
# values are intended.
epsilon = 0.05
gamma = 0.995
alpha = 0.1

edr_window_size = 1000
temperature = 5
temperature_decay = 0.995

seed = 10
random.seed(seed)
np.random.seed(seed)
results = compare_policies_across_alpha(
    policy_name="Q-Learning",
    policy_train_fn=train_q_learning_policy,
    alpha_values=alpha_vals,
    edges=edges,
    goal_edges=goal_edges,
    p_gen=pGen,
    p_swap=pSwap,
    max_age=maxAge,
    train_kwargs={
        "alpha": 0.1,
        "gamma": 0.99,
        "epsilon": 0.05,
        "softmax": False,
        "temperature": 3,
        "temperature_decay": 0.99,
        "num_episodes": 5,
        "max_steps": 30000,
    },
    num_runs=4,
    num_steps=20000,
    num_simulations=3,
    plot=True
)


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# === Setup ===
# Four-edge network with two goals; rebinds the notebook-wide edges,
# goal_edges and maxAge names.
edges = [(0, 1), (1, 2), (2, 3), (3, 4)]
goal_edges = [(0, 2), (4, 1)]
maxAge = 2

# 10 x 10 grid of evenly spaced probabilities from 0.1 to 1.0.
pGen_values = np.linspace(0.1, 1.0, 10)
pSwap_values = np.linspace(0.1, 1.0, 10)

# Result grids indexed [pSwap index, pGen index].
jains_matrix = np.zeros((len(pSwap_values), len(pGen_values)))
throughput_matrix = np.zeros((len(pSwap_values), len(pGen_values)))

# === Run Experiments Over Grid ===
for i, p_swap in enumerate(pSwap_values):
    for j, p_gen in enumerate(pGen_values):
        print(f"Running: pSwap={p_swap:.2f}, pGen={p_gen:.2f}")

        result = run_policy_experiments(
            train_policy_fn=train_q_learning_policy,
            policy_name=f"Q-Learning (pSwap={p_swap:.2f}, pGen={p_gen:.2f})",
            edges=edges,
            goal_edges=goal_edges,
            p_gen=p_gen,
            p_swap=p_swap,
            max_age=maxAge,
            num_runs=3,
            num_steps=5000,
            num_simulations=3,
            train_kwargs={
                "alpha": 0.1,
                "gamma": 0.99,
                "epsilon": 0.1,
                "num_episodes": 5,
                "max_steps": 8000,
                "temperature": 1.0,
                "temperature_decay": 0.98,
                "softmax": False,
            },
            plot=False
        )

        # Mean Jain index over runs for this grid cell.
        jains_matrix[i, j] = np.mean(result["jains"])
        # Mean over runs of the total throughput (sum of per-goal EDRs);
        # the number of runs is taken from the length of result["jains"].
        throughput_matrix[i, j] = np.mean([
            sum(result["edrs"][goal][run_i] for goal in goal_edges)
            for run_i in range(len(result["jains"]))
        ])

# === Plot Heatmaps ===
def _centered_extent(x_values, y_values):
    """Return an imshow extent that centres each cell on its grid value.

    imshow treats ``extent`` as the outer edges of the image, so the bounds
    are pushed out by half a grid step on every side. Assumes both axes are
    evenly spaced (they come from np.linspace).
    """
    def _bounds(values):
        if len(values) > 1:
            half_step = (values[-1] - values[0]) / (2 * (len(values) - 1))
        else:
            half_step = 0.5
        return values[0] - half_step, values[-1] + half_step

    return [*_bounds(x_values), *_bounds(y_values)]


fig, axs = plt.subplots(2, 1, figsize=(10, 12))

# Shared color scale so both panels can use one colorbar.
vmin = min(jains_matrix.min(), throughput_matrix.min())
vmax = max(jains_matrix.max(), throughput_matrix.max())

# Rows of the matrices are pSwap (y axis), columns are pGen (x axis).
heatmap_extent = _centered_extent(pGen_values, pSwap_values)

# Jain's Fairness Heatmap
im1 = axs[0].imshow(jains_matrix, cmap='viridis', origin='lower',
                    extent=heatmap_extent,
                    aspect='auto', vmin=vmin, vmax=vmax)
axs[0].set_title("Jain's Fairness Index")
axs[0].set_xlabel("pGen")
axs[0].set_ylabel("pSwap")

# Throughput Heatmap
im2 = axs[1].imshow(throughput_matrix, cmap='viridis', origin='lower',
                    extent=heatmap_extent,
                    aspect='auto', vmin=vmin, vmax=vmax)
axs[1].set_title("Total Throughput")
axs[1].set_xlabel("pGen")
axs[1].set_ylabel("pSwap")

# Shared colorbar at the top
cbar_ax = fig.add_axes([0.2, 0.92, 0.6, 0.02])
fig.colorbar(im2, cax=cbar_ax, orientation='horizontal')
cbar_ax.set_title("Shared Value Scale")

# Leave the top 10% of the figure free for the colorbar axes.
plt.tight_layout(rect=[0, 0, 1, 0.9])
plt.show()




