In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.ticker import FuncFormatter
from matplotlib.ticker import MaxNLocator, MultipleLocator

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas/matplotlib diagnostics; consider
# narrowing the filter once the notebook runs clean.
warnings.filterwarnings('ignore')

# Render all figure text in a serif face, preferring Times New Roman.
plt.rcParams.update({
    "font.family": "serif",
    "font.serif": ["Times New Roman"],
})

In [2]:
# Root directory of the project; result files are resolved relative to it.
proj_path = "/home/4cv/project/gc_openPMD-viewer/"

# Alternative input file: proj_path + "results/final/middle.csv"
df_path = f"{proj_path}results/final/middle_max_1GB.csv"

# The first row of the CSV supplies the column names.
df = pd.read_csv(df_path, header=[0])

In [3]:
def data_process(df, target_percentage="0.01%", select_sets=None):
    """Average the per-phase timings of one target percentage.

    The rows of ``df`` whose ``target_percentage`` equals the requested value
    are reduced to three phase totals plus the pass-through metrics, then
    averaged per ``(select_set, test_type)`` pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Benchmark results. Must contain ``target_percentage``, ``select_set``,
        ``envelope``, ``test_type``, the nine ``*_time_elapsed`` component
        columns summed below, ``total_time_elapsed``, ``query_result_size``
        and ``chunk_range_size``.
    target_percentage : str, optional
        Value of the ``target_percentage`` column to keep.
    select_sets : iterable of str, optional
        ``select_set`` values (tuple literals stored as strings) to keep in
        the result. Defaults to the six nested sets from ``('ux',)`` up to
        ``('ux','uy','uz','x','y','z')``.

    Returns
    -------
    pandas.DataFrame
        One row per kept ``(select_set, test_type)`` with the mean of
        ``indexing``, ``IO_time``, ``calculation``, ``total_time_elapsed``,
        ``query_result_size`` and ``chunk_range_size``, plus
        ``select_set_length``; sorted by ``select_set_length``.

    Raises
    ------
    ValueError, SyntaxError
        If a kept ``select_set`` string is not a valid Python literal.
    """
    # Local import keeps the function self-contained; literal_eval parses the
    # tuple strings without executing arbitrary code read from the CSV.
    import ast

    if select_sets is None:
        select_sets = (
            "('ux',)",
            "('ux','uy')",
            "('ux','uy','uz')",
            "('ux','uy','uz','x')",
            "('ux','uy','uz','x','y')",
            "('ux','uy','uz','x','y','z')",
        )

    group_keys = ['target_percentage', 'select_set']
    # Columns that make up each reported phase.
    phase_components = {
        'indexing': ['query_index_time_elapsed',
                     'remove_duplication_time_elapsed',
                     'sort_block_metadata_time_elapsed',
                     'find_optimal_read_solution_time_elapsed'],
        'IO_time': ['get_target_data_time_elapsed',
                    'get_support_data_time_elapsed'],
        'calculation': ['data_calculation_time_elapsed',
                        'data_apply_select_time_elapsed',
                        'apply_particle_level_select_array_time_elapsed'],
    }
    id_columns = ['select_set', 'test_type']
    passthrough_columns = ['total_time_elapsed', 'query_result_size', 'chunk_range_size']

    print("Before Sampling, each group size:")
    # NOTE(review): 7 is presumably the number of rows recorded per envelope
    # (one per test type?) -- confirm against the benchmark writer.
    print(df.groupby(group_keys).size() / 7)

    df_filtered = df[df['target_percentage'] == target_percentage]

    # Envelope sampling is currently disabled, so every distinct
    # (select_set, envelope) pair is kept. The merge is retained as the hook
    # for re-enabling sampling; it also resets the index to 0..n-1.
    kept_envelopes = df_filtered[['select_set', 'envelope']].drop_duplicates()
    filtered_df = df_filtered.merge(kept_envelopes, on=['select_set', 'envelope'])

    print("")
    print(filtered_df.groupby(group_keys).size() / 7)

    # Sum the components from filtered_df itself. Mixing in columns of the
    # unfiltered `df` would align on index labels and, because the merge reset
    # the index, silently add timings that belong to unrelated rows.
    # skipna=False keeps the NaN propagation of a plain `a + b + ...`.
    phase_totals = pd.DataFrame({
        phase: filtered_df[columns].sum(axis=1, skipna=False)
        for phase, columns in phase_components.items()
    })
    summary = pd.concat(
        [filtered_df[id_columns], phase_totals, filtered_df[passthrough_columns]],
        axis=1,
    )

    # Group by select_set and test_type, then compute the average value.
    averaged_df = summary.groupby(id_columns).mean().reset_index()

    # Keep only the requested select sets; .copy() so the column added below
    # is written to an independent frame rather than a slice.
    averaged_df = averaged_df[averaged_df['select_set'].isin(list(select_sets))].copy()

    # Number of attributes in each select set, used as the sort key.
    averaged_df['select_set_length'] = averaged_df['select_set'].apply(
        lambda literal: len(ast.literal_eval(literal))
    )
    averaged_df = averaged_df.sort_values('select_set_length')

    print(averaged_df)
    return averaged_df


In [4]:
# Aggregate the benchmark rows, then keep only the rows recorded for test_type 1.
averaged_df = data_process(df)

# .copy() gives plot_df its own data, so the column assignment below does not
# write into a slice of averaged_df (which raises SettingWithCopyWarning).
plot_df = averaged_df[averaged_df['test_type'].isin([1])].copy()
print(plot_df)

# Round the total time to two decimals for display.
plot_df['total_time_elapsed'] = plot_df['total_time_elapsed'].round(2)
print(plot_df['total_time_elapsed'])

Before Sampling, each group size:
target_percentage  select_set                  
0.01%              ('ux','uy')                     15.0
                   ('ux','uy','uz')                23.0
                   ('ux','uy','uz','x')            15.0
                   ('ux','uy','uz','x','y')        15.0
                   ('ux','uy','uz','x','y','z')    28.0
                   ('ux','uy','x','y')             15.0
                   ('ux',)                         15.0
                   ('x','y')                       15.0
                   ('x','y','z')                   20.0
                   ('x',)                          15.0
dtype: float64

target_percentage  select_set                  
0.01%              ('ux','uy')                     15.0
                   ('ux','uy','uz')                23.0
                   ('ux','uy','uz','x')            15.0
                   ('ux','uy','uz','x','y')        15.0
                   ('ux','uy','uz','x','y','z')    28.0
              