In [54]:
import os
import json
import regex

from collections import (
    namedtuple,
    defaultdict,
)


from pathlib import Path


import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd


In [1]:
"""
PY:
- Every function has 2 graphs:
	- 1 graph for memory
	- 1 graph for performance
	
	- Each graph contains 2 lines or 2 bar charts:
		- 1 for the object
		- 1 for the tuple
		
	- Each Graph:
		- X axis: Depends on the parameter that is being varied for the particular function
			- E.g: node position or node count
"""

'\nPY:\n- Every function has 2 graphs:\n\t- 1 graph for memory\n\t- 1 graph for performance\n\t\n\t- Each graph contains 2 lines or 2 bar charts:\n\t\t- 1 for the object\n\t\t- 1 for the tuple\n\t\t\n\t- Each Graph:\n\t\t- X axis: Depends on the parameter that is being varied for the particular function\n\t\t\t- E.g: node position or node count\n'

In [55]:
# Powers of ten from 1 up to 1,000,000 (presumably the node counts on the X axis -- confirm).
NODE_SIZES = [10 ** exponent for exponent in range(7)]

# All artifacts live in the "output" directory next to the current working directory.
OUTPUT_DIR = (Path.cwd() / ".." / "output").resolve()
OUTPUT_DIR.mkdir(exist_ok=True)

# Rendered figures are written below the output directory.
PLOT_DIR = OUTPUT_DIR / "plots"
PLOT_DIR.mkdir(exist_ok=True)

# JSON report that the notebook loads its benchmark records from.
BENCHMARK_JSON = OUTPUT_DIR / "benchmark.report.json"

# Normalized view of one benchmark record: its spec text, the measurement
# type, and the single recorded value.
ReportEntry = namedtuple("ReportEntry", ["spec", "measurement", "entry_value"])


# OUTPUT_DIR is already resolved above, so it can be printed as-is.
print(OUTPUT_DIR)

/home/dimitriy/Effect_Healthcare/TS-Mapper-DataStructures/output


In [61]:
def load_json_report(fp: Path):
    """Read a JSON report from disk and return its decoded content.

    Args:
        fp: Location of the JSON file to read.

    Returns:
        The Python object decoded by ``json.load`` (the rest of the notebook
        iterates over it as a sequence of benchmark records).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Parse from the handle owned by the ``with`` block so the file is closed
    # deterministically; previously a second, never-closed handle was parsed.
    with open(fp, "r", encoding="utf-8") as report_file:
        return json.load(report_file)

# Decode the whole benchmark report once; the cells below iterate over it as a
# sequence of records that each carry a "measurement" mapping.
benchmarks_data = load_json_report(BENCHMARK_JSON)


# Normalize each raw benchmark record into a ReportEntry, then split the entries by measurement type ("performance" vs. "memory")

def extract(benchmark):
    """Convert one raw benchmark record into a ``ReportEntry``.

    Only the first key of the record's "measurement" mapping is considered;
    it becomes the entry's measurement type. The recorded value is read from
    "time" when that type is "performance" and from "size" for any other type.

    Args:
        benchmark: Mapping with a "measurement" key and, optionally, a "spec" key.

    Returns:
        ReportEntry with the spec (``None`` when absent), the measurement type
        and the recorded value.

    Raises:
        ValueError: If the record has no "measurement" key.
    """
    if "measurement" not in benchmark:
        raise ValueError("The benchmark does not have a measurement key")

    measurements = benchmark["measurement"]
    # The mapping is keyed by measurement type; take the first key.
    kind = list(measurements.keys())[0]
    value_key = "time" if kind == "performance" else "size"
    return ReportEntry(benchmark.get("spec"), kind, measurements[kind].get(value_key))
    

# A record is selected by the presence of the measurement key on the raw record.
performance_benchmarks = [
    extract(record)
    for record in benchmarks_data
    if "performance" in record["measurement"]
]
memory_benchmarks = [
    extract(record)
    for record in benchmarks_data
    if "memory" in record["measurement"]
]

# Sanity check: number of entries per measurement type.
print(len(performance_benchmarks))
print(len(memory_benchmarks))

278
279


In [57]:
# Split the benchmark specs based on the data structure used
# The spec description is in the form of "Given <Object|Tuple> Data Structure..."

def split_benchmarks(benchmarks):
    """Separate entries by the data structure named in their spec text.

    An entry is selected by substring match on ``entry.spec``, so a spec that
    mentions both "Object" and "Tuple" lands in both lists, and one that
    mentions neither is dropped.

    Args:
        benchmarks: Entries exposing a ``spec`` string.

    Returns:
        Tuple ``(object_entries, tuple_entries)``, each in input order.
    """
    with_object = [entry for entry in benchmarks if "Object" in entry.spec]
    with_tuple = [entry for entry in benchmarks if "Tuple" in entry.spec]
    return with_object, with_tuple

# Split each measurement type by the data structure mentioned in the spec.
object_performance_benchmarks, tuple_performance_benchmarks = split_benchmarks(performance_benchmarks)
object_memory_benchmarks, tuple_memory_benchmarks = split_benchmarks(memory_benchmarks)

# Sanity check of the group sizes. Note the print order:
# Object/memory, Tuple/performance, then Object/performance, Tuple/memory.
print(len(object_memory_benchmarks))
print(len(tuple_performance_benchmarks))

print(len(object_performance_benchmarks))
print(len(tuple_memory_benchmarks))

140
139
139
139


In [58]:
def create_size_dict(benchmarks):
    """Group entries by the first integer that appears in their spec text.

    Args:
        benchmarks: Entries exposing a ``spec`` string that contains digits.

    Returns:
        ``defaultdict(list)`` mapping that integer to the entries carrying it;
        keys appear in first-seen order and entries keep their input order.
    """
    grouped = defaultdict(list)
    for entry in benchmarks:
        first_number = regex.search(r"\d+", entry.spec)
        grouped[int(first_number.group())].append(entry)
    return grouped

def sort_benchmarks(benchmarks):
    """Return a new list of entries ordered by the first integer in their spec.

    The sort is ascending and stable, and the input is left untouched.

    Args:
        benchmarks: Entries exposing a ``spec`` string that contains digits.

    Returns:
        A new, sorted list of the same entries.
    """
    def first_number(entry):
        return int(regex.search(r"\d+", entry.spec).group())

    ordered = list(benchmarks)
    ordered.sort(key=first_number)
    return ordered

def split_on_details(benchmarks):
    """Split entries into "small" and "large" variants, grouped by size.

    An entry counts as small when its spec contains "small" but not "large",
    and as large in the opposite case; entries matching both or neither word
    are left out. Each side is sorted by the first integer in the spec and
    then grouped by that integer.

    Args:
        benchmarks: Entries exposing a ``spec`` string.

    Returns:
        Tuple ``(small_by_size, large_by_size)`` of size-keyed dictionaries.
    """
    small_entries = [
        entry for entry in benchmarks
        if "small" in entry.spec and "large" not in entry.spec
    ]
    large_entries = [
        entry for entry in benchmarks
        if "large" in entry.spec and "small" not in entry.spec
    ]

    small_sorted = sort_benchmarks(small_entries)
    large_sorted = sort_benchmarks(large_entries)

    # Key each side by the size parsed from the spec; values are the entries.
    return create_size_dict(small_sorted), create_size_dict(large_sorted)
            
# For every (data structure, measurement) group, build the size-keyed
# dictionaries of its "small" and "large" spec variants.
object_small_performance_benchmarks, object_large_performance_benchmarks = split_on_details(object_performance_benchmarks)
object_small_memory_benchmarks, object_large_memory_benchmarks = split_on_details(object_memory_benchmarks)

tuple_small_performance_benchmarks, tuple_large_performance_benchmarks = split_on_details(tuple_performance_benchmarks)
tuple_small_memory_benchmarks, tuple_large_memory_benchmarks = split_on_details(tuple_memory_benchmarks)

# Sanity check: print each size key with the number of entries grouped under it.
for key, value in object_large_memory_benchmarks.items():
    print(key, len(value))



1 10
10 10
100 10
1000 10
10000 10
100000 10
1000000 10


In [59]:

def create_dataframe(benchmarks_dict):
    """Flatten size-grouped benchmark entries into one DataFrame row per entry.

    Args:
        benchmarks_dict: Mapping of node size to a list of entries exposing
            ``spec``, ``measurement`` and ``entry_value``.

    Returns:
        DataFrame with the columns ``data_structure`` ("Object" when the spec
        contains "Object", otherwise "Tuple"), ``node_size`` and
        ``measurement``, plus, depending on the measurement type:

        - ``time_(s)`` and ``time_(s)_max_scaled`` for "performance" entries,
        - ``size_(bytes)`` and ``size_(bytes)_max_scaled`` for all others.

        The ``*_max_scaled`` columns hold the value divided by the column
        maximum. An empty mapping yields an empty DataFrame.
    """
    records = []
    for size, benchmarks in benchmarks_dict.items():
        for benchmark in benchmarks:
            if benchmark.measurement == "performance":
                # Recorded times are divided by 1000 and reported as seconds
                # (assumes the report stores milliseconds -- TODO confirm).
                value_column = "time_(s)"
                value = benchmark.entry_value / 1000
            else:
                value_column = "size_(bytes)"
                value = benchmark.entry_value

            records.append({
                "data_structure": "Object" if "Object" in benchmark.spec else "Tuple",
                "node_size": size,
                "measurement": benchmark.measurement,
                value_column: value,
            })
    df = pd.DataFrame(records)

    # Max-scale every value column that is present. The column names must
    # match the ones written above exactly: the memory check used to look for
    # "size (bytes)", so the scaled memory column was never created.
    for value_column in ("time_(s)", "size_(bytes)"):
        if value_column in df.columns:
            df[f"{value_column}_max_scaled"] = df[value_column] / df[value_column].max()

    return df



# One DataFrame per (data structure, spec variant, measurement) combination.
object_small_performance_df = create_dataframe(object_small_performance_benchmarks)
object_large_performance_df = create_dataframe(object_large_performance_benchmarks)

object_small_memory_df = create_dataframe(object_small_memory_benchmarks)
object_large_memory_df = create_dataframe(object_large_memory_benchmarks)

tuple_small_performance_df = create_dataframe(tuple_small_performance_benchmarks)
tuple_large_performance_df = create_dataframe(tuple_large_performance_benchmarks)

tuple_small_memory_df = create_dataframe(tuple_small_memory_benchmarks)
tuple_large_memory_df = create_dataframe(tuple_large_memory_benchmarks)

# The same eight frames, grouped once by measurement type and once by data structure.
memory_dfs = [object_small_memory_df, object_large_memory_df, tuple_small_memory_df, tuple_large_memory_df]
performance_dfs = [object_small_performance_df, object_large_performance_df, tuple_small_performance_df, tuple_large_performance_df]
object_dfs = [object_small_memory_df, object_large_memory_df, object_small_performance_df, object_large_performance_df]
tuple_dfs = [tuple_small_memory_df, tuple_large_memory_df, tuple_small_performance_df, tuple_large_performance_df]

def plot_memory(df, title, output_fp):
    """Save a line plot of memory size against node size.

    Lines are colored by ``data_structure`` and styled by ``measurement``.

    Args:
        df: DataFrame with the columns "node_size", "size_(bytes)",
            "data_structure" and "measurement".
        title: Title drawn above the plot.
        output_fp: Destination handed to ``Figure.savefig``.
    """
    # The style is applied before the axes exist so that it takes effect on them.
    sns.set_style("whitegrid")
    fig, ax = plt.subplots(figsize=(12, 8))
    try:
        sns.lineplot(
            x="node_size",
            y="size_(bytes)",
            hue="data_structure",
            style="measurement",
            markers=True,
            data=df,
            ax=ax,
        )
        ax.set_title(title)
        fig.savefig(output_fp)
    finally:
        # Release the figure even when plotting or saving fails; otherwise it
        # stays registered with pyplot and leaks into later cells.
        plt.close(fig)


# Plot the memory benchmarks against the node size for both the object and tuple data structures

#plot_memory(object_small_memory_df, "Memory Usage vs Node Size (Object Data Structure)", PLOT_DIR / "memory_object_small.png")



# Display the Tuple / "large" performance frame as a visual sanity check.
tuple_large_performance_df

Unnamed: 0,data_structure,node_size,measurement,time_(s),time_(s)_max_scaled
0,Tuple,1,performance,0.0001,0.000033
1,Tuple,1,performance,0.0002,0.000066
2,Tuple,1,performance,0.0002,0.000066
3,Tuple,1,performance,0.0000,0.000000
4,Tuple,1,performance,0.0001,0.000033
...,...,...,...,...,...
65,Tuple,1000000,performance,0.0288,0.009476
66,Tuple,1000000,performance,0.5301,0.174421
67,Tuple,1000000,performance,0.0000,0.000000
68,Tuple,1000000,performance,0.0359,0.011812


In [60]:


# The raw values of the benchmarks are not very useful for visualization, so we will normalize the values by dividing them by the maximum value in the data frame

"""
Number of Graphs:
    - 1 per operation (3*3 + 1) = 10 (Assuming performance is Y1 and memory is Y2)
        - Else: 20 graphs, but only 1 Y axis 


Generic Graph:
- Axes:
    - Y:
        - Y1: Performance (Time in seconds)
        - Y2: Memory (Size in bytes)
    - X: Node Size (1, 10, 100, 1000, 10000, 100000, 1000000)

- Data (Series 1 => Small | Series 2 => Large):
    - Data Point:

What we want to see:
    - Difference in performance between the object and tuple data structures
    - 


"""


'\nNumber of Graphs:\n    - 1 per operation (3*3 + 1) = 10 (Assuming performance is Y1 and memory is Y2)\n        - Else: 20 graphs, but only 1 Y axis \n\n\nGeneric Graph:\n- Axes:\n    - Y:\n        - Y1: Performance (Time in seconds)\n        - Y2: Memory (Size in bytes)\n    - X: Node Size (1, 10, 100, 1000, 10000, 100000, 1000000)\n\n- Data (Series 1 => Small | Series 2 => Large):\n    - Data Point:\n\nWhat we want to see:\n    - Difference in performance between the object and tuple data structures\n    - \n\n\n'