In [164]:
import pandas as pd
import random
import numpy as np
import copy
import matplotlib.pyplot as plt
import collections
from collections import deque  # Add this import
import math
!which python

/Users/berat/Desktop/quantum_entanglement/.venv/bin/python


In [165]:
class QuantumInternet():
    def __init__(self, initialEdges, pGen, cutOffAge, goalStates, goalWeights):
        self.initialEdges = initialEdges  
        self.currentEdges = {} 
        self.pGen = pGen
        self.cutOffAge = cutOffAge
        self.goalStates = goalStates
        self.goalWeights = goalWeights
        self.maxLinks = 1
        # Track EDR for each goal state separately
        self.total_timesteps = 0
        self.successful_links = {i: 1e-2 for i in range(len(goalStates))}  # Dictionary for each goal
    
    def get_edrs(self):
        """Returns current EDR for each goal state"""
        return {
            i: self.successful_links[i] / max(1, self.total_timesteps)
            for i in range(len(self.goalStates))
        }
    
    def reset(self):
        self.currentEdges = {}
        # Don't reset EDR counters here as they should persist across episodes
        self.globallyGenerateEntanglements()
        return self.currentEdges
    
    def reset_edr_stats(self):
        """Resets EDR tracking counters"""
        self.total_timesteps = 0
        self.successful_links = {i: 0 for i in range(len(self.goalStates))}
    
    def getState(self) -> dict:
        return self.currentEdges
                
    def generateEntanglement(self, node1, node2):
        edge = tuple(sorted([node1, node2]))
        if edge not in self.currentEdges:
            self.currentEdges[edge] = deque([0])
        else:
            if len(self.currentEdges[edge]) < self.maxLinks:
                self.currentEdges[edge].appendleft(0)

    def globallyGenerateEntanglements(self):
        for edge in self.initialEdges:
            if random.random() < self.pGen:
                self.generateEntanglement(*edge)
    
    def discardEntanglement(self, edge: tuple):
        if edge in self.currentEdges and len(self.currentEdges[edge]) > 0:
            self.currentEdges[edge].pop()
            if len(self.currentEdges[edge]) == 0:
                del self.currentEdges[edge]
    
    def ageEntanglements(self):
        edges_to_check = list(self.currentEdges.keys())
        for edge in edges_to_check:
            newAges = [age + 1 for age in self.currentEdges[edge] if age + 1 <= self.cutOffAge]
            self.currentEdges[edge] = deque(newAges)
            
            if len(self.currentEdges[edge]) == 0:
                self.discardEntanglement(edge)
        
    def isTerminal(self) -> tuple[bool, list]:
        graph = collections.defaultdict(set)
        for (a, b) in self.currentEdges:
            graph[a].add(b)
            graph[b].add(a)
        
        def has_path(start, end):
            if start == end:
                return True
            
            visited = set()
            stack = [start]
            
            while stack:
                current = stack.pop()
                if current not in visited:
                    visited.add(current)
                    
                    if current == end:
                        return True
                    
                    # Add unvisited neighbors to stack
                    stack.extend(
                        next_node for next_node in graph[current] 
                        if next_node not in visited
                    )
            
            return False
        
        matching = [goal for goal in self.goalStates if has_path(goal[0], goal[-1])]
        return bool(matching), matching
                
    def rewardForAction(self, action):
        self.total_timesteps += 1
        if not action:  # If waiting
            return -0.001  # Small penalty for waiting
            
        # Check if we have a successful path
        is_terminal, matching = self.isTerminal()
        
        if not is_terminal:
            return -0.001  # Penalty for failed attempt
        
        # Since we can only achieve one goal at a time, find which goal was matched
        for i, goal in enumerate(self.goalStates):
            if goal in matching:
                self.successful_links[i] += 1
                
                # Calculate EDRs for all goals
                edrs = {j: self.successful_links[j] / max(1, self.total_timesteps) 
                    for j in range(len(self.goalStates))}
                
                # Calculate proportional fairness using p²/EDR formula
                epsilon = 1e-10  # Small constant to avoid division by zero
                reward = 0
                for j in range(len(self.goalStates)):
                    # Square of weight divided by current EDR for that goal
                    reward += self.goalWeights[j] * (self.pGen**2 / (edrs[j]))
                
                return reward
        
        return -0.001  # Fallback case
                
        

In [166]:
def epsilon_greedy_policy(Q, state_key, epsilon):
    # state_key is already a tuple, no need to convert again
    if np.random.rand() < epsilon:
        # Exploration: randomly choose between attempting path or waiting
        return random.choice([True, False])
    else:
        # Exploitation: choose action with highest Q-value
        if state_key not in Q:
            Q[state_key] = {True: 0, False: 0}  # Initialize both actions
        
        return max(Q[state_key].items(), key=lambda x: x[1])[0]

In [171]:
Q = {}  # State-action value function

def n_step_sarsa(env, n, alpha, gamma, epsilon, num_episodes):
    global Q
    episode_rewards = np.zeros(num_episodes)
    
    for episode in range(num_episodes):
        if episode % 100 == 0:
            edrs = env.get_edrs()
            print(f"Episode {episode} EDRs:")
            for goal_idx, edr in edrs.items():
                print(f"Goal {env.goalStates[goal_idx]}: EDR = {edr:.3f}")
        
        state = env.reset()
        state_key = tuple(
            (edge, tuple(ages))
            for edge, ages in sorted(state.items())
        )
        
        action = epsilon_greedy_policy(Q, state_key, epsilon)
        
        T = float('inf')
        t = 0
        tau = 0
        
        # Store states, actions, rewards
        states = [state_key]
        actions = [action]
        rewards = []
        
        while tau < (T - 1):  # Add step limit check
            if t < T:
                # Take action and get reward
                reward = env.rewardForAction(action)
                rewards.append(reward)
                
                # Age entanglements and generate new ones
                env.ageEntanglements()
                env.globallyGenerateEntanglements()
                
                # Get next state
                next_state = env.getState()
                is_terminal, _ = env.isTerminal()
                
                # Convert next_state to hashable format
                next_state_key = tuple(
                    (edge, tuple(ages))
                    for edge, ages in sorted(next_state.items())
                )
                states.append(next_state_key)
                
                if is_terminal:
                    T = t + 1
                else:
                    next_action = epsilon_greedy_policy(Q, next_state_key, epsilon)
                    actions.append(next_action)
            
            tau = t - n + 1
            
            if tau >= 0:
                G = sum([gamma**(i - tau - 1) * rewards[i] for i in range(tau + 1, min(tau + n, T))])
                
                if tau + n < T:
                    if states[tau + n] not in Q:
                        Q[states[tau + n]] = {True: 0, False: 0}
                    G += gamma**n * Q[states[tau + n]][actions[tau + n]]
                
                # Update Q-value
                if states[tau] not in Q:
                    Q[states[tau]] = {True: 0, False: 0}
                Q[states[tau]][actions[tau]] += alpha * (G - Q[states[tau]][actions[tau]])
            
            t += 1
            state = next_state
            action = next_action
        
        episode_rewards[episode] = sum(rewards)
    
    return Q, episode_rewards


# env
random.seed(27)
initialEdges = [(1,3), (2,3), (3,4), (4,5), (6,7), (6,8)]
goalStates = [(1, 4), (2,4)]
goalWeights = [0.3, 0.7]
pGen = 0.3
cutOffAge = 1
# sarsa
n = 1  
alpha = 0.1  
gamma = 0.90  # discount factor
epsilon = 0.1  # exploration rate
num_episodes = 10000
myNetwork = QuantumInternet(initialEdges, pGen, cutOffAge, goalStates, goalWeights)
myQ, myEpisodeRewards = n_step_sarsa(myNetwork, n, alpha, gamma, epsilon, num_episodes)
# THE WEIGHTS DID NOT CHANGE ANYTHING!
# Infact p^2 really disfavours the longer values signficiantly

Episode 0 EDRs:
Goal (1, 4): EDR = 0.010
Goal (2, 4): EDR = 0.010
Episode 100 EDRs:
Goal (1, 4): EDR = 0.007
Goal (2, 4): EDR = 0.010
Episode 200 EDRs:
Goal (1, 4): EDR = 0.022
Goal (2, 4): EDR = 0.014
Episode 300 EDRs:
Goal (1, 4): EDR = 0.022
Goal (2, 4): EDR = 0.015
Episode 400 EDRs:
Goal (1, 4): EDR = 0.026
Goal (2, 4): EDR = 0.020
Episode 500 EDRs:
Goal (1, 4): EDR = 0.024
Goal (2, 4): EDR = 0.020
Episode 600 EDRs:
Goal (1, 4): EDR = 0.027
Goal (2, 4): EDR = 0.020
Episode 700 EDRs:
Goal (1, 4): EDR = 0.027
Goal (2, 4): EDR = 0.024
Episode 800 EDRs:
Goal (1, 4): EDR = 0.027
Goal (2, 4): EDR = 0.024
Episode 900 EDRs:
Goal (1, 4): EDR = 0.026
Goal (2, 4): EDR = 0.023
Episode 1000 EDRs:
Goal (1, 4): EDR = 0.026
Goal (2, 4): EDR = 0.022
Episode 1100 EDRs:
Goal (1, 4): EDR = 0.026
Goal (2, 4): EDR = 0.023
Episode 1200 EDRs:
Goal (1, 4): EDR = 0.026
Goal (2, 4): EDR = 0.022
Episode 1300 EDRs:
Goal (1, 4): EDR = 0.027
Goal (2, 4): EDR = 0.023
Episode 1400 EDRs:
Goal (1, 4): EDR = 0.026
Go