In [1]:
import pandas as pd
import os
import json
import numpy as np

In [2]:
# Placeholder written into a column when a benchmark's params carry no value
# for it (normalize_params uses it for 'payloadType' and 'workload').
EMPTY = '_'

### Normalize Parameters

In [3]:
def normalize_params(df):
    """Copy the benchmark parameters of one result row into top-level fields.

    Despite its name, ``df`` is a single row: the function is applied with
    ``DataFrame.apply(..., axis=1)``. The row must provide ``params`` (a
    mapping of parameter name to value) and ``benchmark`` (a dot-separated
    name).

    Fields written, in this order:
      * ``impl``        - ``params['impl']`` when present, otherwise the
                          second-to-last dot-separated part of ``benchmark``.
      * ``payloadType`` - ``params['payloadType']`` or ``EMPTY``.
      * ``workload``    - ``params['workload']`` or ``EMPTY``.
      * ``size``        - ``params['size']``, looked up unconditionally.

    Returns the same row object with the new fields added.
    """
    row_params = df.params

    # No explicit implementation parameter: derive it from the benchmark name.
    df['impl'] = (row_params['impl'] if 'impl' in row_params
                  else df['benchmark'].split('.')[-2])

    # Optional parameters fall back to the EMPTY placeholder.
    for optional in ('payloadType', 'workload'):
        df[optional] = row_params[optional] if optional in row_params else EMPTY

    # 'size' has no fallback; the lookup fails if the parameter is absent.
    df['size'] = row_params['size']

    return df

### Normalize Benchmark Fields

In [4]:
import re

# Fully qualified benchmark name: <package prefix>.<bench_id>.<Class>.<method>
#   group 1 - bench_id: lower-case, possibly dotted, sub-package
#   group 2 - class:    identifier starting with an upper-case letter
#   group 3 - method:   remaining name (letters, digits, '_' and '.')
# The dots of the package prefix are escaped so they match literal dots only.
bmatch = re.compile(
    r"^de\.heidelberg\.pvs\.container_bench\.benchmarks"
    r"\.([a-z0-9.]+)\.([A-Z][A-Za-z0-9]+)\.([a-zA-Z0-9_.]+)$")


def normalize_benchmark(df):
    """Split a row's fully qualified benchmark name into its components.

    ``df`` is a single row (the function is applied with
    ``DataFrame.apply(..., axis=1)``) whose ``benchmark`` field must match
    ``bmatch``.

    Adds the fields ``bench_id``, ``class`` and ``method`` and returns the
    same row object.

    Raises:
        ValueError: if ``benchmark`` is not a string or does not match the
            expected naming scheme. The message contains the offending value.
    """
    fullbench = df.benchmark
    match = bmatch.match(fullbench) if isinstance(fullbench, str) else None
    if match is None:
        # Fail with the offending name instead of an opaque
        # "'NoneType' object is not subscriptable".
        raise ValueError('Unexpected benchmark name: %r' % (fullbench,))

    df['bench_id'] = match.group(1)
    df['class'] = match.group(2)
    df['method'] = match.group(3)
    return df
    


### Aggregate Results

#### JSON Structure

##### PrimaryMetric

- score
- scoreError
- scoreConfidence
- scorePercentiles
- scoreUnit
- rawDataHistogram

##### Secondary Metric

- Mostly the percentiles (needs further exploration)

In [5]:
from scipy import stats

def aggregate_results_from_sample(df):
    """Lift the summary statistics out of a row's ``primaryMetric`` entry.

    ``df`` is a single row (the function is applied with
    ``DataFrame.apply(..., axis=1)``) whose ``primaryMetric`` field is a
    mapping with the keys ``score``, ``scoreError``, ``scoreConfidence`` and
    ``scorePercentiles``; the latter must contain the keys ``'0.0'``,
    ``'50.0'`` and ``'100.0'``.

    Fields written: ``mean``, ``error``, ``confidence``, ``median``, ``min``,
    ``max`` and ``scorePercentiles`` (the full percentile mapping).

    Returns the same row object.
    """
    metric = df.primaryMetric

    df['mean'] = metric['score']
    df['error'] = metric['scoreError']
    df['confidence'] = metric['scoreConfidence']

    percentiles = metric['scorePercentiles']
    # Median, minimum and maximum are read from the 50th, 0th and 100th percentile.
    for column, key in (('median', '50.0'), ('min', '0.0'), ('max', '100.0')):
        df[column] = percentiles[key]
    df['scorePercentiles'] = percentiles

    return df

In [6]:
def aggregate_results_from_ss(df):
    """Compute summary statistics from the raw measurements of one result row.

    ``df`` is a single row providing ``rawData`` (a possibly nested sequence
    of numbers) and ``measurementIterations`` (the expected total number of
    measurements).

    Fields written:
      * ``mean``, ``median``, ``min``, ``max`` - over all raw measurements.
      * ``error`` - standard error of the mean (``scipy.stats.sem``, ddof=1).
      * ``scorePercentiles`` - array holding the 0th, 50th, 90th, 95th, 99th
        and 100th percentile, in that order.

    Returns the same row object.

    Raises:
        AssertionError: if the number of raw measurements differs from
            ``measurementIterations``.
    """
    raw = np.array(df['rawData']).flatten()

    # Validate the array.
    # NOTE(review): this assumes rawData holds exactly `measurementIterations`
    # values in total - confirm for results with several forks.
    expect = df["measurementIterations"]
    assert raw.shape[0] == expect, (
        'expected %s raw measurements, found %s' % (expect, raw.shape[0]))

    # Mean and standard error
    df['mean'] = np.mean(raw)
    df['error'] = stats.sem(raw, ddof=1)
    df['median'] = np.median(raw)
    df['min'] = np.min(raw)
    df['max'] = np.max(raw)
    # np.percentile takes q on a 0-100 scale; passing fractions such as 0.5
    # would return the 0.5th percentile instead of the median.
    df['scorePercentiles'] = np.percentile(raw, q=[0, 50, 90, 95, 99, 100])

    return df

### Save to Disk

Remove very long columns first 
- primaryMetric
- secondaryMetric

### Execute all Steps


In [7]:
# Named batches of raw result files (JSON). Exactly one batch is selected
# below and processed by the aggregation loop.
FILE_BATCHES = {
    'sets_100_to_1M': [
        '20180822_intsets_100_100k.json',
        '20180828_intsets_1M.json',
        '20180822_sets_100_100k.json',
        '20180827_sets_1M.json',
    ],
    'singleoperations': [
        '20180701_singleoperations_lists.json',
        '20180712_singleoperations_maps.json',
        '20180713_singleoperations_sets.json',
        '20180715_wordcount_sample.json',
        '20180714_singleoperations_intmaps.json',
        '20180714_singleoperations_intsets.json',
    ],
    'concurrency': [
        '20180808_concurrency_lists.json',
        '20180808_concurrency_sets.json',
        '20180808_concurrency_maps.json',
    ],
    'concurrency_maps': [
        '20180808_concurrency_maps.json',
    ],
    'hppc_and_1M': [
        '20180713_singleoperations_hppcmaps.json',
        '20180713_singleoperations_hppcsets.json',
        '20180827_sets_1M.json',
        '20180828_intsets_1M.json',
    ],
    'intsets_1M': [
        '20180828_intsets_1M.json',
    ],
}

# Batch to process; change the key to aggregate a different set of files.
files = list(FILE_BATCHES['intsets_1M'])

# Directory the raw JSON files are read from / the aggregated CSVs are written to.
folder = '../results/EMSE/'
output_folder = '../results/EMSE/'



In [8]:


for f in files:

    print('Reading file %s' % f)
    # Build the output name from the file stem: a name without a '.json'
    # suffix must never map back onto the input file itself.
    output = os.path.splitext(f)[0] + '_aggregated.csv'

    try:
        df = pd.read_json(os.path.join(folder, f))
    except ValueError as e:
        # The file is not a single valid JSON document (e.g. "Trailing data"):
        # report it and continue with the next file.
        print('Error while parsing the file. ERROR %s' % e)
        continue

    # Each step receives one row at a time and adds flat fields to it.
    df = df.apply(normalize_params, axis=1)
    df = df.apply(normalize_benchmark, axis=1)
    df = df.apply(aggregate_results_from_sample, axis=1)

    # Remove the very long nested columns before saving; a result file
    # without one of them is not an error.
    df = df.drop(columns=['primaryMetric', 'secondaryMetrics'], errors='ignore')

    print('Saving file %s' % output)
    df.to_csv(os.path.join(output_folder, output))


Reading file 20180828_intsets_1M.json
Error while parsing the file. ERROR Trailing data


### Append all Results

In [9]:
# Per-file aggregates (CSV stems, without extension) to merge into one table.
aggregated_files = [
    '20180701_singleoperations_lists_aggregated',
    '20180712_singleoperations_maps_aggregated',
    '20180713_singleoperations_hppcmaps_aggregated',
    '20180713_singleoperations_hppcsets_aggregated',
    '20180713_singleoperations_sets_aggregated',
    '20180714_singleoperations_intmaps_aggregated',
    '20180714_singleoperations_intsets_aggregated',
    '20180715_wordcount_sample_aggregated',
    '20180808_concurrency_lists_aggregated',
    '20180808_concurrency_maps_aggregated',
    '20180808_concurrency_sets_aggregated',
    # 20180912_intsets_1M_aggregated,
    '20180822_intsets_100_100k_aggregated',
    '20180822_sets_100_100k_aggregated',
    '20180827_sets_1M_aggregated',
]

# Read all parts first and concatenate once: DataFrame.append was removed in
# pandas 2.0, and growing a frame inside a loop copies it on every iteration.
frames = [pd.read_csv(os.path.join(folder, f + '.csv')) for f in aggregated_files]
# pd.concat rejects an empty list, so fall back to an empty frame.
aggregated_df = pd.concat(frames) if frames else pd.DataFrame()

aggregated_df.to_csv(os.path.join(folder, 'aggregated_results.csv'))

# Last expression of the cell: displays the number of aggregated rows.
len(aggregated_df)