In [2]:
import collections

def move_conveyor(conveyor):
    """
    Advance the conveyor belt by a single step.

    The belt is treated as circular: every job shifts one index to the right
    and the job in the last section re-enters at index 0.
    Returns a new list; the input is not modified.
    """
    sections = list(conveyor)
    # Last section wraps to the front, everything else follows in order.
    return sections[-1:] + sections[:-1]

def extract_observation(conveyor, agent_position, window_size):
    """
    Extracts the observation window for the agent.

    The agent observes the job at its own position and the preceding
    window_size-1 positions (lower indices). Indices wrap around the start of
    the list, matching the circular belt movement.

    Parameters:
      - conveyor: sequence of jobs, one entry per section (0 means empty).
      - agent_position: index of the section where the agent is located.
      - window_size: number of sections the agent can observe.

    Returns a list ordered from the farthest observed section to the agent's
    own section (the agent's section is the LAST element).
    If the conveyor has fewer than window_size sections, each section is
    reported once and the missing far end is padded with 0, so the result
    always has window_size entries; an empty conveyor yields only zeros.
    """
    belt_length = len(conveyor)
    # Never look at more sections than exist: wrapping further would report
    # the same job twice, and a zero-length belt would divide by zero below.
    visible = min(window_size, belt_length)
    # Walk from the farthest visible offset down to 0 (the agent's own section).
    window = [conveyor[(agent_position - offset) % belt_length]
              for offset in range(visible - 1, -1, -1)]
    # Pad the far end so the observation keeps a fixed length.
    return [0] * (window_size - visible) + window

def compute_wait_reward(state):
    """
    Return the placeholder reward for a 'wait' action.

    The value is a fixed constant for demonstration; `state` is accepted but
    not inspected. (The original note says the paper scales this reward by
    the distance.)
    """
    fixed_wait_reward = 0.5
    return fixed_wait_reward

def compute_accept_reward(state):
    """
    Return the placeholder reward for an 'accept' action.

    The value is a fixed constant; `state` is accepted but not inspected.
    """
    fixed_accept_reward = 1.0
    return fixed_accept_reward

def remove_job_at_position(conveyor, position):
    """
    Return a copy of the conveyor with the job at `position` removed.

    Removal is modelled by writing 0 (the "empty" marker) into that section.
    The caller's conveyor is left untouched.
    """
    empty_marker = 0
    cleared = conveyor.copy()  # work on a copy so the input stays unchanged
    cleared[position] = empty_marker
    return cleared

def environment_step(state, action):
    """
    Simulates a single timestep in the environment.

    `state` is a dictionary containing:
      - 'conveyor': list representing jobs on the conveyor (0 means empty)
      - 'agent_position': index where the agent is located
      - 'window_size': how many positions the agent can observe (including its own)
      - 'observation': the current observation window (list of job values)
      - 'internal_flag': internal flag indicating the agent's internal decision (e.g., 'waiting')

    `action` must be one of:
      - 'wait':    the conveyor advances one step; flag becomes 'waiting'.
      - 'accept':  the job at the agent's position is removed; flag becomes 'working'.
      - 'decline': the job at the agent's position is removed; flag is reset to None.

    Returns a tuple (next_state, reward). `next_state` is a shallow copy of
    `state` with 'conveyor', 'observation' and 'internal_flag' replaced; the
    input dictionary and its conveyor list are not mutated.

    Raises ValueError if `action` is not one of the supported actions, instead
    of silently returning an unchanged state with zero reward.
    """
    next_state = state.copy()
    position = state['agent_position']

    if action == 'wait':
        reward = compute_wait_reward(state)
        # The agent remains idle; however, the conveyor moves.
        next_state['conveyor'] = move_conveyor(state['conveyor'])
        next_state['internal_flag'] = 'waiting'
    elif action == 'accept':
        reward = compute_accept_reward(state)
        # Remove the job at the agent's position (simulate acceptance).
        next_state['conveyor'] = remove_job_at_position(state['conveyor'], position)
        next_state['internal_flag'] = 'working'
    elif action == 'decline':
        reward = 0.0
        # Declining also clears the agent's section, but earns nothing.
        next_state['conveyor'] = remove_job_at_position(state['conveyor'], position)
        next_state['internal_flag'] = None
    else:
        raise ValueError(
            f"Unknown action {action!r}; expected 'accept', 'wait', or 'decline'.")

    # Every action changes the conveyor, so the observation is always rebuilt
    # from the updated belt.
    next_state['observation'] = extract_observation(
        next_state['conveyor'], position, state['window_size'])
    return next_state, reward

def main():
    """
    Run a small demonstration of the environment.

    Builds a 10-section conveyor with an agent at index 3 observing 4
    sections, then prints the state after a 'wait', followed by an 'accept'
    and a 'decline' that are both applied to the post-wait state.
    """
    belt = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    position = 3  # The agent is adjacent to position index 3
    window = 4    # Own position plus 3 preceding positions

    state = {
        'conveyor': belt,
        'agent_position': position,
        'window_size': window,
        'observation': extract_observation(belt, position, window),
        'internal_flag': None,
    }

    def show(action, result, reward):
        # Print the outcome of one environment step.
        print(f"After action '{action}':")
        print("Conveyor:", result['conveyor'])
        print("Observation:", result['observation'])
        print("Internal Flag:", result['internal_flag'])
        print("Reward:", reward)

    print("Initial State:")
    print("Conveyor:", state['conveyor'])
    print("Agent Position:", state['agent_position'])
    print("Observation:", state['observation'])
    print("Internal Flag:", state['internal_flag'])
    print("---------")

    # Step 1: wait, starting from the initial state.
    waited, wait_reward = environment_step(state, 'wait')
    show('wait', waited, wait_reward)
    print("---------")

    # Step 2: accept, starting from the post-wait state.
    accepted, accept_reward = environment_step(waited, 'accept')
    show('accept', accepted, accept_reward)
    print("---------")

    # Step 3: decline, also starting from the post-wait state.
    declined, decline_reward = environment_step(waited, 'decline')
    show('decline', declined, decline_reward)

# Run the demo only when this code is executed as the main program, not when imported.
if __name__ == '__main__':
    main()


Initial State:
Conveyor: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Agent Position: 3
Observation: [1, 2, 3, 4]
Internal Flag: None
---------
After action 'wait':
Conveyor: [10, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Observation: [10, 1, 2, 3]
Internal Flag: waiting
Reward: 0.5
---------
After action 'accept':
Conveyor: [10, 1, 2, 0, 4, 5, 6, 7, 8, 9]
Observation: [10, 1, 2, 0]
Internal Flag: working
Reward: 1.0
---------
After action 'decline':
Conveyor: [10, 1, 2, 0, 4, 5, 6, 7, 8, 9]
Observation: [10, 1, 2, 0]
Internal Flag: None
Reward: 0.0


In [3]:
import collections

def move_conveyor(conveyor):
    """
    Move the conveyor belt forward by one step.

    The final element is taken off the end and placed at the front, so every
    other element ends up one position to the right. A new list is returned.
    """
    belt = list(conveyor)
    if belt:
        # Pop the tail and re-insert it at the head (one-step right rotation).
        belt.insert(0, belt.pop())
    return belt

def get_observation(conveyor, agent_position, window_size, 
                    agent_capabilities, current_op, global_status):
    """
    Constructs the observation vector ω₍r,t₎ for one agent.

    Parameters:
      - conveyor: The list representing jobs on the conveyor. Each index corresponds to a section
        (0 is used as the "empty" marker).
      - agent_position (yᵣ): The fixed position (index) where the agent is located.
      - window_size (w): The number of sections the agent can observe (its own plus w-1 preceding sections).
      - agent_capabilities (Oᵣ): A list representing the set of operations the agent can perform.
      - current_op (O₍r,t₎): The current operation in use (0 if idle).
      - global_status (Zₜ): A list representing the statuses of all agents.

    Returns a dictionary with the keys 'agent_position', 'capabilities',
    'current_operation', 'global_status' and 'jobs_window'.

    'jobs_window' always has window_size entries, ordered from the farthest
    observed section to the agent's own section (the agent's section is the
    LAST element). Indices wrap around the start of the list. When the
    conveyor has fewer than window_size sections, each section appears once
    and the missing far end is padded with 0; an empty conveyor yields zeros.

    NOTE(review): the paper-style notation lists the agent's own section
    first; this function keeps the existing farthest-first order so recorded
    outputs stay valid. Confirm which order downstream consumers expect.

    The capabilities and global status lists are stored by reference, not copied.
    """
    belt_length = len(conveyor)
    # Cap the scan at the belt length: wrapping further would show the same
    # job twice, and a zero-length belt would divide by zero below.
    visible = min(window_size, belt_length)
    # Offsets run from the farthest visible section down to 0 (own section).
    nearby_jobs = [conveyor[(agent_position - offset) % belt_length]
                   for offset in range(visible - 1, -1, -1)]
    jobs_window = [0] * (window_size - visible) + nearby_jobs

    observation = {
        'agent_position': agent_position,    # yᵣ
        'capabilities': agent_capabilities,    # Oᵣ
        'current_operation': current_op,       # O₍r,t₎ (0 if idle)
        'global_status': global_status,        # Zₜ (list of statuses for all agents)
        'jobs_window': jobs_window             # farthest section first, own section last
    }
    return observation

def compute_wait_reward(state):
    """
    Dummy reward for choosing to wait.

    Always yields the same constant for demonstration purposes and ignores
    `state`. (Per the original note, the paper scales this reward by the
    distance in sections.)
    """
    demo_wait_reward = 0.5
    return demo_wait_reward

def compute_accept_reward(state):
    """
    Dummy reward for accepting a job.

    Always yields the same constant and ignores `state`.
    """
    demo_accept_reward = 1.0
    return demo_accept_reward

def remove_job_at_position(conveyor, position):
    """
    Produce a conveyor in which the section at `position` is empty.

    The input is copied first, then the chosen section is overwritten with 0
    (the empty marker). The original conveyor is not modified.
    """
    vacant = 0
    result = conveyor.copy()
    result[position] = vacant  # mark the section as empty on the copy only
    return result

def environment_step(state, action):
    """
    Simulates one timestep in the environment.

    The state is a dictionary containing:
      - 'conveyor': list of jobs (each position corresponds to Ŝ₍q,t₎)
      - 'agent_position': fixed index yᵣ of the agent
      - 'window_size': w (number of observable sections)
      - 'capabilities': Oᵣ (agent's operation capabilities)
      - 'current_operation': O₍r,t₎ (0 if idle)
      - 'global_status': Zₜ (dummy list of statuses for all agents)
      - 'internal_flag': internal marker for the agent (e.g. 'waiting')
      - 'observation': current observation vector (dict)

    The action must be one of:
      - 'wait':    the conveyor advances one step; flag becomes 'waiting';
                   'current_operation' is carried over unchanged.
      - 'accept':  the job at the agent's position is removed; flag becomes
                   'working'; 'current_operation' is set to 1.
      - 'decline': the job at the agent's position is removed; flag is reset
                   to None; 'current_operation' is carried over unchanged.

    Returns a tuple (next_state, reward). `next_state` is a shallow copy of
    `state` with the affected keys replaced; the input dictionary and its
    conveyor list are not mutated.

    Raises ValueError if `action` is not one of the supported actions, instead
    of silently returning an unchanged state with zero reward.
    """
    next_state = state.copy()
    position = state['agent_position']

    if action == 'wait':
        reward = compute_wait_reward(state)
        # The conveyor moves while the agent stays as it was.
        next_state['conveyor'] = move_conveyor(state['conveyor'])
        next_state['internal_flag'] = 'waiting'
    elif action == 'accept':
        reward = compute_accept_reward(state)
        # Remove the job at the agent's position (simulate acceptance).
        next_state['conveyor'] = remove_job_at_position(state['conveyor'], position)
        next_state['internal_flag'] = 'working'
        # 1 is the simulated "processing" marker.
        next_state['current_operation'] = 1
    elif action == 'decline':
        reward = 0.0
        # Decline: clear the job from the agent's section, earning nothing.
        next_state['conveyor'] = remove_job_at_position(state['conveyor'], position)
        next_state['internal_flag'] = None
    else:
        raise ValueError(
            f"Unknown action {action!r}; expected 'accept', 'wait', or 'decline'.")

    # Rebuild the observation once, from the updated conveyor and the
    # (possibly updated) current operation.
    next_state['observation'] = get_observation(
        next_state['conveyor'],
        position,
        state['window_size'],
        state['capabilities'],
        next_state['current_operation'],
        state['global_status']
    )
    return next_state, reward

def main():
    """
    Run a small demonstration of the environment with dict observations.

    Builds a 10-section conveyor with an agent at index 3 observing 4
    sections, then prints the state after a 'wait', followed by an 'accept'
    and a 'decline' that are both applied to the post-wait state.
    """
    belt = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    position = 3             # Agent is located at index 3.
    window = 4               # Own section plus 3 preceding indices.
    operations = [1, 2]      # Dummy operation capabilities.
    idle_op = 0              # 0 indicates no operation in use.
    statuses = [0, 0, 0]     # Dummy global statuses for all agents.

    state = {
        'conveyor': belt,
        'agent_position': position,
        'window_size': window,
        'capabilities': operations,
        'current_operation': idle_op,
        'global_status': statuses,
        'internal_flag': None,
        'observation': get_observation(belt, position, window,
                                       operations, idle_op, statuses),
    }

    def show(action, result, reward):
        # Print the outcome of one environment step.
        print(f"After action '{action}':")
        print("Conveyor:", result['conveyor'])
        print("Observation:", result['observation'])
        print("Internal Flag:", result['internal_flag'])
        print("Reward:", reward)

    print("Initial State:")
    print("Conveyor:", state['conveyor'])
    print("Observation:", state['observation'])
    print("Internal Flag:", state['internal_flag'])
    print("---------")

    # Step 1: wait, starting from the initial state.
    waited, wait_reward = environment_step(state, 'wait')
    show('wait', waited, wait_reward)
    print("---------")

    # Step 2: accept, starting from the post-wait state.
    accepted, accept_reward = environment_step(waited, 'accept')
    show('accept', accepted, accept_reward)
    print("---------")

    # Step 3: decline, also starting from the post-wait state.
    declined, decline_reward = environment_step(waited, 'decline')
    show('decline', declined, decline_reward)

# Run the demo only when this code is executed as the main program, not when imported.
if __name__ == '__main__':
    main()


Initial State:
Conveyor: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Observation: {'agent_position': 3, 'capabilities': [1, 2], 'current_operation': 0, 'global_status': [0, 0, 0], 'jobs_window': [1, 2, 3, 4]}
Internal Flag: None
---------
After action 'wait':
Conveyor: [10, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Observation: {'agent_position': 3, 'capabilities': [1, 2], 'current_operation': 0, 'global_status': [0, 0, 0], 'jobs_window': [10, 1, 2, 3]}
Internal Flag: waiting
Reward: 0.5
---------
After action 'accept':
Conveyor: [10, 1, 2, 0, 4, 5, 6, 7, 8, 9]
Observation: {'agent_position': 3, 'capabilities': [1, 2], 'current_operation': 1, 'global_status': [0, 0, 0], 'jobs_window': [10, 1, 2, 0]}
Internal Flag: working
Reward: 1.0
---------
After action 'decline':
Conveyor: [10, 1, 2, 0, 4, 5, 6, 7, 8, 9]
Observation: {'agent_position': 3, 'capabilities': [1, 2], 'current_operation': 0, 'global_status': [0, 0, 0], 'jobs_window': [10, 1, 2, 0]}
Internal Flag: None
Reward: 0.0


In [4]:
import collections

def move_conveyor(conveyor):
    """
    Simulate one timestep of conveyor movement.

    Each section receives the job that was in the section before it, and
    section 0 receives the job from the last section (circular belt).
    Returns a new list.
    """
    sections = list(conveyor)
    count = len(sections)
    # New index i holds what used to be at i-1; index 0 wraps to the last one.
    return [sections[(index - 1) % count] for index in range(count)]

def get_observation(conveyor, agent_position, window_size, 
                    agent_capabilities, current_op, global_status):
    """
    Constructs the observation vector ω₍r,t₎ for one agent.

    The returned dictionary includes:
      - 'agent_position' (yᵣ)
      - 'capabilities' (Oᵣ)
      - 'current_operation' (O₍r,t₎, 0 if idle)
      - 'global_status' (Zₜ)
      - 'jobs_window': the jobs in the agent's own section and the
        window_size-1 preceding sections.

    'jobs_window' always has window_size entries, ordered from the farthest
    observed section to the agent's own section (the agent's section is the
    LAST element). Indices wrap around the start of the list. When the
    conveyor has fewer than window_size sections, each section appears once
    and the missing far end is padded with 0 (the empty marker); an empty
    conveyor yields only zeros.

    NOTE(review): the paper-style notation lists the agent's own section
    first; this function keeps the existing farthest-first order so recorded
    outputs stay valid. Confirm which order downstream consumers expect.
    """
    section_count = len(conveyor)
    # Looking back further than the belt is long would repeat jobs, and a
    # zero-length belt would make the modulo below divide by zero.
    in_view = min(window_size, section_count)
    seen = []
    for back in range(in_view - 1, -1, -1):  # farthest first, own section last
        seen.append(conveyor[(agent_position - back) % section_count])
    jobs_window = [0] * (window_size - in_view) + seen

    observation = {
        'agent_position': agent_position,    # yᵣ
        'capabilities': agent_capabilities,    # Oᵣ
        'current_operation': current_op,       # O₍r,t₎ (0 if idle)
        'global_status': global_status,        # Zₜ (list for all agents)
        'jobs_window': jobs_window             # farthest section first, own section last
    }
    return observation

def compute_wait_reward(state, distance):
    """
    Dummy reward for waiting on a job that is `distance` sections away.

    A base value of 0.5 is divided by `distance`, so nearer jobs pay more.
    `state` is accepted but not inspected. A distance of 0 raises
    ZeroDivisionError.
    """
    base_reward = 0.5
    scaled_reward = base_reward / distance
    return scaled_reward

def compute_accept_reward(state):
    """
    Placeholder accept reward: a constant that does not depend on `state`.
    """
    constant_accept_reward = 1.0
    return constant_accept_reward

def remove_job_at_position(conveyor, position):
    """
    Take the job out of the section at `position`.

    Works on a copy of the conveyor and writes 0 (empty) into the chosen
    section, leaving the caller's conveyor as it was.
    """
    no_job = 0
    emptied = conveyor.copy()
    emptied[position] = no_job  # only the copy is altered
    return emptied

def environment_step(state, action):
    """
    Simulate one timestep in the environment.

    The state is a dictionary containing:
      - 'conveyor': list representing jobs on the conveyor (each section is Ŝ₍q,t₎)
      - 'agent_position': fixed index yᵣ of the agent
      - 'window_size': number of observable sections (w)
      - 'capabilities': Oᵣ (dummy list)
      - 'current_operation': O₍r,t₎ (0 if idle)
      - 'global_status': Zₜ (dummy list for all agents)
      - 'internal_flag': an internal marker (e.g., 'waiting')
      - 'observation': current observation vector (dict)

    The action is a string. Allowed actions are:
      'accept'          - Accept job at current position (A₀): the job is
                          removed, 'current_operation' becomes 1 and the flag
                          becomes 'working'.
      'wait_<distance>' - Wait for the job at the given positive integer
                          distance, e.g. 'wait_1' or 'wait_2' (A₁ or A₂): the
                          conveyor advances and the flag becomes
                          'waiting_for_distance_<distance>'.
      'decline'         - Decline the current job (A_d): the job is removed
                          and the flag is reset to None.
      'continue'        - Continue with no decision (A_c): the conveyor
                          advances and the flag is left unchanged.

    Returns a tuple (next_state, reward). `next_state` is a shallow copy of
    `state` with the affected keys replaced; the input dictionary and its
    conveyor list are not mutated.

    Raises ValueError for a malformed wait action (missing, non-numeric or
    non-positive distance) and for any unrecognised action, instead of failing
    with an unpacking/division error or silently doing nothing.

    NOTE(review): the wait distance is not checked against 'window_size';
    confirm whether distances beyond the observation window should be rejected.
    """
    next_state = state.copy()
    position = state['agent_position']

    if action.startswith('wait'):
        # Expect exactly 'wait_<digits>'; anything else is rejected up front so
        # the reward computation never sees a zero or negative distance.
        prefix, _, distance_text = action.partition('_')
        distance = int(distance_text) if distance_text.isdecimal() else 0
        if prefix != 'wait' or distance < 1:
            raise ValueError(
                f"Malformed wait action {action!r}; expected 'wait_<distance>' "
                "with a positive integer distance, e.g. 'wait_1'.")
        reward = compute_wait_reward(state, distance)
        # The agent does not pick the job at its own section; the conveyor moves.
        next_state['conveyor'] = move_conveyor(state['conveyor'])
        # Record which distance the agent is waiting for.
        next_state['internal_flag'] = f'waiting_for_distance_{distance}'
    elif action == 'accept':
        reward = compute_accept_reward(state)
        # The agent picks the job at its own section (position yᵣ).
        next_state['conveyor'] = remove_job_at_position(state['conveyor'], position)
        next_state['internal_flag'] = 'working'
        # Non-zero indicates that the agent is processing the job.
        next_state['current_operation'] = 1
    elif action == 'decline':
        reward = 0.0
        # The agent rejects the current job without waiting for an upcoming one.
        next_state['conveyor'] = remove_job_at_position(state['conveyor'], position)
        next_state['internal_flag'] = None
    elif action == 'continue':
        reward = 0.0
        # No decision: the environment simply moves on; the flag is untouched.
        next_state['conveyor'] = move_conveyor(state['conveyor'])
    else:
        raise ValueError(
            f"Unknown action {action!r}; expected 'accept', 'wait_<distance>', "
            "'decline', or 'continue'.")

    # Rebuild the observation once, from the updated conveyor and the
    # (possibly updated) current operation.
    next_state['observation'] = get_observation(
        next_state['conveyor'],
        position,
        state['window_size'],
        state['capabilities'],
        next_state['current_operation'],
        state['global_status']
    )
    return next_state, reward

def main():
    """
    Run a small demonstration of the environment with distance-based waits.

    Builds a 10-section conveyor with an agent at index 3 observing 3
    sections, then prints the results of: 'wait_1' on the initial state,
    'accept' on the post-wait state, and 'decline' and 'continue' each applied
    to the initial state.
    """
    belt = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    position = 3             # The agent is adjacent to section at index 3.
    window = 3               # Own section plus 2 preceding sections.
    operations = [1, 2]      # Dummy operation capabilities.
    idle_op = 0              # Initially idle.
    statuses = [0, 0, 0]     # Dummy statuses for all agents.

    state = {
        'conveyor': belt,
        'agent_position': position,
        'window_size': window,
        'capabilities': operations,
        'current_operation': idle_op,
        'global_status': statuses,
        'internal_flag': None,
        'observation': get_observation(belt, position, window,
                                       operations, idle_op, statuses),
    }

    def show(action, result, reward):
        # Print the outcome of one environment step.
        print(f"After action '{action}':")
        print("Conveyor:", result['conveyor'])
        print("Observation:", result['observation'])
        print("Internal Flag:", result['internal_flag'])
        print("Reward:", reward)

    print("Initial State:")
    print("Conveyor:", state['conveyor'])
    print("Observation:", state['observation'])
    print("Internal Flag:", state['internal_flag'])
    print("---------")

    # Wait for the job at distance 1, starting from the initial state.
    waited, wait_reward = environment_step(state, 'wait_1')
    show('wait_1', waited, wait_reward)
    print("---------")

    # Accept, starting from the post-wait state.
    accepted, accept_reward = environment_step(waited, 'accept')
    show('accept', accepted, accept_reward)
    print("---------")

    # Decline, starting from the original state.
    declined, decline_reward = environment_step(state, 'decline')
    show('decline', declined, decline_reward)
    print("---------")

    # Continue, also starting from the original state.
    continued, continue_reward = environment_step(state, 'continue')
    show('continue', continued, continue_reward)

# Run the demo only when this code is executed as the main program, not when imported.
if __name__ == '__main__':
    main()


Initial State:
Conveyor: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
Observation: {'agent_position': 3, 'capabilities': [1, 2], 'current_operation': 0, 'global_status': [0, 0, 0], 'jobs_window': [2, 3, 4]}
Internal Flag: None
---------
After action 'wait_1':
Conveyor: [10, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Observation: {'agent_position': 3, 'capabilities': [1, 2], 'current_operation': 0, 'global_status': [0, 0, 0], 'jobs_window': [1, 2, 3]}
Internal Flag: waiting_for_distance_1
Reward: 0.5
---------
After action 'accept':
Conveyor: [10, 1, 2, 0, 4, 5, 6, 7, 8, 9]
Observation: {'agent_position': 3, 'capabilities': [1, 2], 'current_operation': 1, 'global_status': [0, 0, 0], 'jobs_window': [1, 2, 0]}
Internal Flag: working
Reward: 1.0
---------
After action 'decline':
Conveyor: [1, 2, 3, 0, 5, 6, 7, 8, 9, 10]
Observation: {'agent_position': 3, 'capabilities': [1, 2], 'current_operation': 0, 'global_status': [0, 0, 0], 'jobs_window': [2, 3, 0]}
Internal Flag: None
Reward: 0.0
---------
After action 'continue