In [1]:
import warnings
# Suppress all Python warnings for the rest of the session; the intent is to
# hide the warnings printed by the R magic commands.
warnings.filterwarnings('ignore')

In [2]:
import os
import glob

In [3]:
# Import pandas
import pandas as pd

In [4]:
# Load R magic
%load_ext rpy2.ipython
# Import ggplot2
%R require(ggplot2);

In [5]:
# If experiment parameters change, modify this function
def AddParametersAsColumns(df, folder):
    """Add the experiment parameters encoded in a folder name to ``df`` as columns.

    The final path component of ``folder`` is split on '.'; sections 1, 2 and 3
    are each split on '_' and read positionally as the cloud, optimizer and
    dynamic parameters. Every extracted value is assigned to ``df`` as a
    constant column. ``df`` is modified in place.

    Args:
        df: pandas DataFrame to annotate.
        folder: Path of the experiment folder. Only its last path component is
            parsed, so dots in parent directories (e.g. './logs/' or 'run.v2/')
            and a trailing path separator do not disturb the field positions.

    Returns:
        None.

    Raises:
        IndexError: if the folder name has fewer '.'- or '_'-separated fields
            than the positions read below.
        ValueError: if a field expected to be an integer cannot be parsed.
    """
    # normpath drops a trailing separator so basename never returns ''.
    folderName = os.path.basename(os.path.normpath(folder))
    expParams = folderName.split('.')
    cloudParams = expParams[1].split('_')
    optimizerParams = expParams[2].split('_')
    dynamicParams = expParams[3].split('_')

    # Cloud section
    df['groupSizeDist'] = cloudParams[9]
    df['placementDist'] = cloudParams[10]
    df['colocateNumHostsPerLeaf'] = int(cloudParams[11])
    df['seed'] = int(cloudParams[14])

    # Optimizer section
    df['algorithm'] = optimizerParams[1]
    df['numBitmaps'] = int(optimizerParams[2])
    df['numLeafsPerBitmap'] = int(optimizerParams[3])
    df['redundancyPerBitmap'] = int(optimizerParams[4])
    df['numRulesPerLeaf'] = int(optimizerParams[5])

    # Dynamic section
    df['numEvents'] = int(dynamicParams[0])
    
# Turns all files of a given filename across a set of folders into a single dataframe
def DataSetAsDataFrame(filename, folders, headers, header=None):
    """Read ``filename`` from every folder and concatenate the results.

    Each file is read with ``pd.read_csv`` and then annotated in place by
    ``AddParametersAsColumns`` using the folder it came from. The per-file row
    indices are kept as-is by the concatenation, so index labels may repeat
    across folders.

    Args:
        filename: Name of the CSV file expected inside each folder.
        folders: Iterable of folder paths to read from.
        headers: Column names, passed to ``read_csv`` as ``names``.
        header: Passed to ``read_csv`` as ``header`` (default ``None``).

    Returns:
        A single pandas DataFrame holding the rows of all files.

    Raises:
        ValueError: if ``folders`` is empty.
    """
    folders = list(folders)
    if not folders:
        # Fail with a message that points at the likely cause (a folder pattern
        # that matched nothing) instead of pandas' generic concat error.
        raise ValueError("No folders to read '%s' from; check the folder "
                         "pattern and directory" % filename)

    # Read one folder's copy of the dataset and tag it with that folder's parameters
    def ReadDataSet(folder):
        df = pd.read_csv(os.path.join(folder, filename), sep=',', header=header, names=headers)
        AddParametersAsColumns(df, folder)
        return df

    # Combine into a single dataframe
    return pd.concat([ReadDataSet(folder) for folder in folders])

In [6]:
%%R 
# Shared ggplot2 theme: no legend title, legend on top, small legend text
plotTheme <- theme(
    legend.title = element_blank(),
    legend.position = "top",
    legend.text = element_text(size = 6)
)

In [7]:
# Directory searched for the 'dynamic-logs.*' experiment folders. Keep the
# trailing '/': paths are built from it by plain string concatenation.
directory = '/mnt/sdb1/baseerat/numerical-evals/12-4-2017/logs-100K-random/'
# folders = glob.glob("logs*_0.*")
# glob returns matches in arbitrary, filesystem-dependent order; sort them so
# the concatenated dataframes (and any CSV written from them) have the same
# row order on every run.
folders = sorted(glob.glob(directory + "dynamic-logs.*"))

In [8]:
# Load 'switch_event_types_to_update_count.csv' from every experiment folder
# into one dataframe. header=0 tells read_csv to treat each file's first line
# as a header row and replace it with the column names given here.
switchEventTypesToUpdateCount = DataSetAsDataFrame('switch_event_types_to_update_count.csv',
                                                   folders, ['updates', 'switch', 'event'],
                                                   header=0)

In [9]:
# Preview the first rows, including the parameter columns added from the folder name
switchEventTypesToUpdateCount.head()

Unnamed: 0,updates,switch,event,groupSizeDist,placementDist,colocateNumHostsPerLeaf,seed,algorithm,numBitmaps,numLeafsPerBitmap,redundancyPerBitmap,numRulesPerLeaf,numEvents
0,1,virtual,join,uniform,colocate-random-random,12,0,exact-match,30,3,0,64000,1000000
1,46,virtual,join,uniform,colocate-random-random,12,0,exact-match,30,3,0,64000,1000000
2,62,virtual,join,uniform,colocate-random-random,12,0,exact-match,30,3,0,64000,1000000
3,88,virtual,join,uniform,colocate-random-random,12,0,exact-match,30,3,0,64000,1000000
4,31,virtual,join,uniform,colocate-random-random,12,0,exact-match,30,3,0,64000,1000000


In [10]:
# Write the combined dataframe as one CSV directly under `directory`
# (index=False: the row index is not written)
switchEventTypesToUpdateCount.to_csv(directory + 'switch_event_types_to_update_count.csv', index=False)

In [19]:
# %%R -i switchEventTypesToUpdateCount
# ggplot(switchEventTypesToUpdateCount) +
#    geom_boxplot(aes(y=updates, x=factor(event), 
#                     fill=interaction(algorithm, numLeafsPerBitmap, redundancyPerBitmap,
#                                     numRulesPerLeaf, colocateNumHostsPerLeaf)),
#                 outlier.size=0.5) +
#    facet_grid(groupSizeDist ~ switch) +
#    xlab("Event Type") +
#    ylab("Number of updates") +
#    plotTheme

In [18]:
# %%R -i switchEventTypesToUpdateCount
# ggplot(switchEventTypesToUpdateCount) +
#    stat_ecdf(aes(x=updates, col=interaction(algorithm, numLeafsPerBitmap, redundancyPerBitmap, 
#                                             numRulesPerLeaf, colocateNumHostsPerLeaf))) +
#    facet_grid(groupSizeDist * numBitmaps ~ switch * event) +
#    ggtitle("Event updates distribution") +
#    xlab("Number of updates") +
#    plotTheme

In [11]:
# Load 'switch_event_types_to_update_count_normalized.csv' from every
# experiment folder into one dataframe. header=0 tells read_csv to treat each
# file's first line as a header row and replace it with the names given here.
switchEventTypesToUpdateCountNormalized = DataSetAsDataFrame('switch_event_types_to_update_count_normalized.csv',
                                                             folders, ['updates', 'switch', 'event'],
                                                             header=0)

In [12]:
# Preview the first rows, including the parameter columns added from the folder name
switchEventTypesToUpdateCountNormalized.head()

Unnamed: 0,updates,switch,event,groupSizeDist,placementDist,colocateNumHostsPerLeaf,seed,algorithm,numBitmaps,numLeafsPerBitmap,redundancyPerBitmap,numRulesPerLeaf,numEvents
0,0.009259,virtual,join,uniform,colocate-random-random,12,0,exact-match,30,3,0,64000,1000000
1,0.5,virtual,join,uniform,colocate-random-random,12,0,exact-match,30,3,0,64000,1000000
2,0.5,virtual,join,uniform,colocate-random-random,12,0,exact-match,30,3,0,64000,1000000
3,0.5,virtual,join,uniform,colocate-random-random,12,0,exact-match,30,3,0,64000,1000000
4,0.455882,virtual,join,uniform,colocate-random-random,12,0,exact-match,30,3,0,64000,1000000


In [13]:
# Show the 'wve' / virtual-switch / join rows whose normalized update count
# is greater than 1.
switchEventTypesToUpdateCountNormalized.loc[
    switchEventTypesToUpdateCountNormalized['groupSizeDist'].eq('wve')
    & switchEventTypesToUpdateCountNormalized['switch'].eq('virtual')
    & switchEventTypesToUpdateCountNormalized['event'].eq('join')
    & switchEventTypesToUpdateCountNormalized['updates'].gt(1)
]

Unnamed: 0,updates,switch,event,groupSizeDist,placementDist,colocateNumHostsPerLeaf,seed,algorithm,numBitmaps,numLeafsPerBitmap,redundancyPerBitmap,numRulesPerLeaf,numEvents


In [14]:
# Summary statistics of the normalized update counts for
# 'wve' / virtual-switch / join rows.
switchEventTypesToUpdateCountNormalized.loc[
    switchEventTypesToUpdateCountNormalized['groupSizeDist'].eq('wve')
    & switchEventTypesToUpdateCountNormalized['switch'].eq('virtual')
    & switchEventTypesToUpdateCountNormalized['event'].eq('join'),
    'updates',
].describe()

count    951824.000000
mean          0.345944
std           0.240312
min           0.000200
25%           0.010309
50%           0.493524
75%           0.514555
max           1.000000
Name: updates, dtype: float64

In [15]:
# Summary statistics of the normalized update counts for
# 'wve' / virtual-switch / leave rows.
switchEventTypesToUpdateCountNormalized.loc[
    switchEventTypesToUpdateCountNormalized['groupSizeDist'].eq('wve')
    & switchEventTypesToUpdateCountNormalized['switch'].eq('virtual')
    & switchEventTypesToUpdateCountNormalized['event'].eq('leave'),
    'updates',
].describe()

count    962192.000000
mean          0.506770
std           0.079629
min           0.000200
25%           0.492683
50%           0.507395
75%           0.526144
max           1.000000
Name: updates, dtype: float64

In [16]:
# Write the combined normalized dataframe as one CSV directly under `directory`
# (index=False: the row index is not written)
switchEventTypesToUpdateCountNormalized.to_csv(directory + 'switch_event_types_to_update_count_normalized.csv', index=False)

In [16]:
# %%R -i switchEventTypesToUpdateCountNormalized
# plt <- ggplot(switchEventTypesToUpdateCountNormalized) +
#    geom_boxplot(aes(y=updates, x=factor(event), 
#                     fill=interaction(algorithm, numLeafsPerBitmap, redundancyPerBitmap,
#                                     numRulesPerLeaf, colocateNumHostsPerLeaf)),
#                 outlier.size=0.5) +
#    facet_grid(groupSizeDist ~ switch) +
#    xlab("Event Type") +
#    ylab("Number of updates (normalized by group sizes)") +
#    plotTheme


In [17]:
# %%R -i switchEventTypesToUpdateCountNormalized
# ggplot(switchEventTypesToUpdateCountNormalized) +
#    stat_ecdf(aes(x=updates, col=interaction(algorithm, numLeafsPerBitmap, redundancyPerBitmap, 
#                                             numRulesPerLeaf, colocateNumHostsPerLeaf))) +
#    facet_grid(groupSizeDist * numBitmaps ~ switch * event) +
#    ggtitle("Event updates distribution") +
#    xlab("Number of updates") +
#    plotTheme