In [28]:
import yaml
import pandas as pd

# Sample report inlined as YAML. It has two top-level mappings:
#   comm_name_to_numa_id   keys like "Task1->Task3", values like "NUMA IDs = 0 1"
#   exec_name_to_locality  keys like "Task2", values like "NUMA ID = 0, Core ID = 2, ..."
# Every value is a plain YAML scalar, so it is loaded as one unparsed string.
document = """
comm_name_to_numa_id:
  Task1->Task3: NUMA IDs = 0 1 
  Task1->Task2: NUMA IDs = 0 1 

exec_name_to_locality:
  Task2: NUMA ID = 0, Core ID = 2, Voluntary CS = 0, Involuntary CS = 83, Core Migrations = 0
  Task3: NUMA ID = 0, Core ID = 20, Voluntary CS = 1, Involuntary CS = 27, Core Migrations = 0
  Task1: NUMA ID = 0, Core ID = 23, Voluntary CS = 0, Involuntary CS = 33, Core Migrations = 0
"""

# safe_load only ever builds plain Python containers and scalars, which is all
# this document needs; the result is a dict of two dicts of strings.
data = yaml.safe_load(document)

# Extract relevant data
exec_name_to_locality = data["exec_name_to_locality"]
comm_name_to_numa_id = data["comm_name_to_numa_id"]

# Build one row per (communication edge, endpoint task, memory NUMA node).
rows = []

for task_name, locality in exec_name_to_locality.items():
    # "NUMA ID = 0, Core ID = 2, ..." -> "0". Kept as a string so it compares
    # directly with the mem_node strings extracted below.
    cpu_node = locality.split("NUMA ID = ")[1].split(",")[0].strip()

    for comm_name, numa_ids in comm_name_to_numa_id.items():
        # Match the task against the two endpoints exactly. A substring test
        # (`task_name in comm_name`) would also match "Task1" inside
        # "Task10->Task2" and then mislabel that access as a read.
        writer, _, reader = (part.strip() for part in comm_name.partition("->"))

        # Left-hand endpoint writes the data item, right-hand endpoint reads it.
        if task_name == writer:
            access_type = "write"
        elif task_name == reader:
            access_type = "read"
        else:
            continue  # this edge does not involve task_name

        # "NUMA IDs = 0 1" -> ["0", "1"]
        mem_nodes = numa_ids.split("NUMA IDs = ")[1].split()

        # One row per memory node; comm_name doubles as the data_item label.
        rows.extend(
            [comm_name, task_name, cpu_node, mem_node, access_type]
            for mem_node in mem_nodes
        )

# Create a DataFrame
df = pd.DataFrame(rows, columns=["data_item", "task_name", "cpu_node", "mem_node", "access_type"])

# Display the DataFrame
print(df)

      data_item task_name cpu_node mem_node access_type
0  Task1->Task2     Task2        0        0        read
1  Task1->Task2     Task2        0        1        read
2  Task1->Task3     Task3        0        0        read
3  Task1->Task3     Task3        0        1        read
4  Task1->Task3     Task1        0        0       write
5  Task1->Task3     Task1        0        1       write
6  Task1->Task2     Task1        0        0       write
7  Task1->Task2     Task1        0        1       write


In [27]:
# Function to aggregate based on equal or different cpu_node and mem_node
def aggregate_data(df, equal=True):
    """Count rows per (cpu_node, mem_node) pair for local or remote accesses.

    Args:
        df: DataFrame with at least the columns 'cpu_node' and 'mem_node'.
        equal: If truthy, keep only rows where cpu_node == mem_node;
            otherwise keep only rows where cpu_node != mem_node.

    Returns:
        DataFrame with columns 'cpu_node', 'mem_node' and 'count', one row
        per distinct pair among the selected rows.
    """
    cpu, mem = df['cpu_node'], df['mem_node']

    # Pick the row mask once; the grouping pipeline is the same either way.
    selector = (cpu == mem) if equal else (cpu != mem)

    pair_sizes = df[selector].groupby(['cpu_node', 'mem_node']).size()
    return pair_sizes.reset_index(name='count')

# Accesses where the task's CPU node and the memory node coincide
equal_aggregated_df = aggregate_data(df, equal=True)
print("Aggregated Data (cpu_node == mem_node):", equal_aggregated_df, sep="\n")

# Accesses where the task's CPU node and the memory node differ
different_aggregated_df = aggregate_data(df, equal=False)
print("\nAggregated Data (cpu_node != mem_node):", different_aggregated_df, sep="\n")

Aggregated Data (cpu_node == mem_node):
  cpu_node mem_node  count
0        0        0      4

Aggregated Data (cpu_node != mem_node):
  cpu_node mem_node  count
0        0        1      4
