In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib as mpl
import numpy.linalg as la
import matplotlib.style as style

# Global figure typography: STIX fonts for both math text and regular text,
# with a slightly enlarged base font size.
mpl.rcParams.update({
    'mathtext.fontset': 'stix',
    'font.family': 'STIXGeneral',
    'font.size': 13.5,
})

# Secyan

In [9]:
# Load the Secyan timing CSVs (one file per data size), drop invalid runs, and
# summarise the per-query median / standard deviation of `time_taken` for each
# data size.
#
# Result: secyan_data_dict[<query number>] ->
#   {'secyan_medians': Series indexed by data_size,
#    'secyan_stdevs':  Series indexed by data_size}

data_sizes = ['1MB', '10MB', '100MB', '1GB']
# Numeric counterpart of each entry in data_sizes (same order); used as the
# groupby key so that sizes sort numerically.
data_size_nums = [1, 10, 100, 1000]
# The CSVs identify queries by position 0..4; query_order[position] is the
# query number used as the dictionary key.
query_nums = range(5)
query_order = [3, 10, 18, 8, 9]

cols = ['query_num', 'time_taken', 'data_size']

# Read every file first and concatenate once. Growing a frame with pd.concat
# inside the loop is quadratic, and seeding it with an empty DataFrame raises a
# FutureWarning on pandas >= 2.1 (empty entries will start to affect the
# resulting dtypes).
frames = []
for data_size, data_size_num in zip(data_sizes, data_size_nums):
    path = f'../datastation-escrow/data_comparisons/secyan/secyan_data/{data_size}.csv'
    df = pd.read_csv(path, names=cols, header=None)

    # Tag every row with the numeric data size of the file it came from.
    df['data_size'] = data_size_num

    # Rows with time_taken == -1 are discarded
    # (presumably a sentinel for a failed/aborted run -- TODO confirm).
    frames.append(df[df['time_taken'] != -1])

total_data = pd.concat(frames)

# Per-query statistics, grouped by data size. A data size with no valid rows
# for a query is simply absent from that query's Series.
secyan_data_dict = {}
for query_num in query_nums:
    query_data = total_data[total_data['query_num'] == query_num]
    times_by_size = query_data.groupby('data_size')['time_taken']

    secyan_medians = times_by_size.median()
    secyan_stdevs = times_by_size.std()

    secyan_data_dict[query_order[query_num]] = {'secyan_medians': secyan_medians, 'secyan_stdevs': secyan_stdevs}
print(secyan_data_dict)


{3: {'secyan_medians': data_size
1         0.142
10        1.102
100      11.013
1000    128.675
Name: time_taken, dtype: float64, 'secyan_stdevs': data_size
1       0.003055
10      0.033724
100     0.289614
1000    1.949647
Name: time_taken, dtype: float64}, 10: {'secyan_medians': data_size
1         0.144
10        1.084
100      10.716
1000    125.215
Name: time_taken, dtype: float64, 'secyan_stdevs': data_size
1       0.002517
10      0.009292
100     0.212919
1000    0.825508
Name: time_taken, dtype: float64}, 18: {'secyan_medians': data_size
1         0.275
10        2.146
100      20.437
1000    224.919
Name: time_taken, dtype: float64, 'secyan_stdevs': data_size
1       0.010263
10      0.033307
100     0.102137
1000    1.778212
Name: time_taken, dtype: float64}, 8: {'secyan_medians': data_size
1       0.862
10      6.836
100    71.433
Name: time_taken, dtype: float64, 'secyan_stdevs': data_size
1      0.009609
10     0.064583
100    0.222855
Name: time_taken, dtype: float64},

# Data Station

In [10]:
# Load the Data Station timing CSVs (one file per data size), drop invalid
# runs, and summarise the per-query median / standard deviation of
# `time_taken` for each data size.
#
# Result: ds_data_dict[<query number>] ->
#   {'ds_medians': Series indexed by data_size,
#    'ds_stdevs':  Series indexed by data_size}

data_sizes = ['1MB', '10MB', '100MB', '1GB']
# Numeric counterpart of each entry in data_sizes (same order); used as the
# groupby key so that sizes sort numerically.
data_size_nums = [1, 10, 100, 1000]
query_nums = [3, 10, 18, 8, 9]

cols = ['query_num', 'time_taken', 'dec_time', 'data_size']

# The 1MB file numbers its queries 1..5; translate them to the query numbers
# used by the other files.
query_num_remap_1mb = {1: 3, 2: 10, 3: 18, 4: 8, 5: 9}

# Read every file first and concatenate once. Growing a frame with pd.concat
# inside the loop is quadratic, and seeding it with an empty DataFrame raises a
# FutureWarning on pandas >= 2.1 (empty entries will start to affect the
# resulting dtypes).
frames = []
for data_size, data_size_num in zip(data_sizes, data_size_nums):
    # The 1MB measurements live under the secyan comparison directory, the
    # larger sizes under the baseline comparison directory.
    if data_size == '1MB':
        path = f'../datastation-escrow/data_comparisons/secyan/ds_data/{data_size}_2.csv'
    else:
        path = f'../datastation-escrow/data_comparisons/baseline/ds_data/{data_size}_2.csv'
    df = pd.read_csv(path, names=cols, header=None)

    # Tag every row with the numeric data size of the file it came from.
    df['data_size'] = data_size_num

    if data_size == '1MB':
        # Remap on the freshly read frame (before filtering) so the assignment
        # never targets a filtered slice. Values missing from the map become NaN.
        df['query_num'] = df['query_num'].map(query_num_remap_1mb)

    # Rows with time_taken == -1 are discarded
    # (presumably a sentinel for a failed/aborted run -- TODO confirm).
    frames.append(df[df['time_taken'] != -1])

total_data = pd.concat(frames)

# Per-query statistics, grouped by data size. A data size with no valid rows
# for a query is simply absent from that query's Series.
ds_data_dict = {}
for query_num in query_nums:
    query_data = total_data[total_data['query_num'] == query_num]
    times_by_size = query_data.groupby('data_size')['time_taken']

    ds_medians = times_by_size.median()
    ds_stdevs = times_by_size.std()

    ds_data_dict[query_num] = {'ds_medians': ds_medians, 'ds_stdevs': ds_stdevs}

print(ds_data_dict)

{3: {'ds_medians': data_size
1        1.686737
10       1.833328
100      3.419430
1000    19.017711
Name: time_taken, dtype: float64, 'ds_stdevs': data_size
1       0.010307
10      0.015920
100     0.018942
1000    0.029519
Name: time_taken, dtype: float64}, 10: {'ds_medians': data_size
1        1.675160
10       1.843133
100      3.424211
1000    19.131534
Name: time_taken, dtype: float64, 'ds_stdevs': data_size
1       0.013255
10      0.012362
100     0.005296
1000    0.066212
Name: time_taken, dtype: float64}, 18: {'ds_medians': data_size
1        1.666968
10       1.851890
100      3.439401
1000    19.305278
Name: time_taken, dtype: float64, 'ds_stdevs': data_size
1       0.003096
10      0.014962
100     0.019313
1000    0.112374
Name: time_taken, dtype: float64}, 8: {'ds_medians': data_size
1        1.681176
10       1.868598
100      3.478058
1000    19.479981
Name: time_taken, dtype: float64, 'ds_stdevs': data_size
1       0.007243
10      0.004205
100     0.012989
1000    0