In [1]:
import warnings
warnings.filterwarnings('ignore')
# For ignoring warnings printed by R magic commands

In [2]:
import os
import glob

In [3]:
# Import pandas
import pandas as pd

In [4]:
# Load R magic
%load_ext rpy2.ipython
# Import ggplot2
%R require(ggplot2);

In [5]:
import progressbar

def bar_range(x, desc):
    """Wrap a count or an iterable in a labelled console progress bar.

    Args:
        x: Either a count accepted by ``range()`` (the bar then iterates over
            ``range(x)``), or an iterable such as a list, range or tuple,
            which is handed to the bar unchanged.
        desc: Label shown in front of the percentage.

    Returns:
        Whatever calling the ``progressbar.ProgressBar`` instance on the
        items returns; iterate over it to advance the bar.
    """
    widgets = [
        '%s: ' % desc, progressbar.Percentage(),
        ' ', progressbar.Bar(),
        ' ', progressbar.ETA(),
    ]
    bar = progressbar.ProgressBar(widgets=widgets)
    try:
        # range() only accepts integer-like counts, so a TypeError means x is
        # not a count and should be iterated directly. This keeps lists and
        # ranges working and additionally admits tuples and other iterables.
        items = range(x)
    except TypeError:
        items = x
    return bar(items)

In [6]:
# If experiment parameters change, modify this function
def AddParametersAsColumns(df, folder):
    """Add the experiment parameters encoded in a log-folder name to ``df``.

    The last path component of ``folder`` is split on '.'. Section 1 holds the
    '_'-separated cloud parameters; section 2 and, when present, section 3
    each hold the '_'-separated optimizer parameters of one node type. Every
    parameter becomes a constant column of ``df``.

    Args:
        df: DataFrame to annotate. It is modified in place.
        folder: Path of the log folder. Trailing '/' characters are ignored.

    Returns:
        None.

    Raises:
        IndexError: If the folder name has fewer sections or fields than the
            positions read below.
        ValueError: If a field that is converted with ``int()`` is not an
            integer literal.
    """
    # Strip trailing separators first: 'a/b/'.split('/')[-1] is '' and the
    # folder name would be lost.
    expParams = folder.rstrip('/').split('/')[-1].split('.')

    cloudParams = expParams[1].split('_')
    df['groupSizeDist'] = cloudParams[10]
    df['placementDist'] = cloudParams[11]
    # A hosts-per-leaf value of -1 is reported as 'uniform'; other values stay strings.
    df['placementNumHostsPerLeaf'] = 'uniform' if cloudParams[12] == '-1' else cloudParams[12]
    df['seed'] = int(cloudParams[15])

    # Section 2 is mandatory (indexing it raises if it is missing); section 3
    # is optional. Both share one layout, so a single loop handles them.
    for section in [expParams[2]] + expParams[3:4]:
        optimizerParams = section.split('_')
        # Field 7 names the node type and prefixes the column names.
        node_type = optimizerParams[7]
        df['%sAlgorithm' % node_type] = optimizerParams[0]
        df['%sNumBitmaps' % node_type] = int(optimizerParams[1])
        df['%sNumNodesPerBitmap' % node_type] = int(optimizerParams[2])
        df['%sRedundancyPerBitmap' % node_type] = int(optimizerParams[3])
        df['%sNumRules' % node_type] = int(optimizerParams[4])
    
# Turns all files of a given filename across a set of folders into a single dataframe
def DataSetAsDataFrame(filename, folders, headers, header=None, reset_index=True, add_columns=True):
    """Read ``filename`` from every folder and stack the results into one DataFrame.

    Args:
        filename: Name of the CSV file expected inside each folder.
        folders: Iterable of folder paths.
        headers: Column names given to the CSV columns.
        header: Passed to ``pd.read_csv`` as ``header``; the default ``None``
            treats every line as data.
        reset_index: When true the result gets a fresh 0..n-1 index; otherwise
            each file's own row index is kept.
        add_columns: When true, ``AddParametersAsColumns`` annotates each
            per-folder frame with the parameters parsed from its folder name.

    Returns:
        The concatenated DataFrame.

    Raises:
        ValueError: If ``folders`` is empty (``pd.concat`` has nothing to
            concatenate).
    """
    def ReadDataSet(folder):
        df = pd.read_csv(folder + '/' + filename, sep=',', header=header, names=headers)
        if add_columns:
            AddParametersAsColumns(df, folder)
        return df

    frames = [ReadDataSet(folder) for folder in folders]
    # ignore_index renumbers the rows directly. The earlier
    # reset_index()/drop('index') pair failed when a data column was itself
    # called 'index'.
    return pd.concat(frames, ignore_index=bool(reset_index))

In [7]:
%%R 
plotTheme <- theme(legend.title=element_blank(), legend.position="top", legend.text=element_text(size=6))

In [8]:
# Root of the experiment logs. NOTE(review): absolute, machine-specific path.
directory = '/mnt/sdc1/baseerat/numerical-evals/1-26-2018/logs-1M/'
# Every entry directly under the root whose name starts with "logs.".
folders = glob.glob(os.path.join(directory, "logs.*"))

In [9]:
# vmCountPerTenant = DataSetAsDataFrame('vm_count_per_tenant.csv', 
#                                       folders, 
#                                       ['vmCount'])

In [10]:
# vmCountPerTenant.head(2)

In [11]:
# criterion = ~((vmCountPerTenant['groupSizeDist'] == 'wve') & 
#               (vmCountPerTenant['placementDist'] == 'colocate-uniform') & 
#               (vmCountPerTenant['placementNumHostsPerLeaf'] == 'uniform') & 
#               (vmCountPerTenant['podsAlgorithm'] == 'exact-match') & 
#               (vmCountPerTenant['podsNumBitmaps'] == 1) & 
#               (vmCountPerTenant['podsNumNodesPerBitmap'] == 3) &
#               (vmCountPerTenant['podsRedundancyPerBitmap'] == 0) &
#               (vmCountPerTenant['podsNumRules'] == 10000))
# vmCountPerTenant.drop(vmCountPerTenant.index[criterion], inplace=True)
# vmCountPerTenant.drop(['groupSizeDist', 'placementDist', 'placementNumHostsPerLeaf',
#                       'podsAlgorithm', 'podsNumBitmaps', 'podsNumNodesPerBitmap', 'podsRedundancyPerBitmap',
#                       'podsNumRules'], axis=1, inplace=True)

In [12]:
# # vmCountPerTenant['vmCount'].quantile([0.0, 0.5, 0.99, 0.999, 0.9999, 1.0])
# vmCountPerTenant['vmCount'].describe()

In [13]:
# vmCountPerTenant.to_csv(directory + 'vm_count_per_tenant.csv', index=False)

In [14]:
# groupCountPerTenant = DataSetAsDataFrame('group_count_per_tenant.csv', 
#                                          folders, ['groupCount'])

In [15]:
# criterion = ~((groupCountPerTenant['groupSizeDist'] == 'wve') & 
#               (groupCountPerTenant['placementDist'] == 'colocate-uniform') & 
#               (groupCountPerTenant['placementNumHostsPerLeaf'] == 'uniform') & 
#               (groupCountPerTenant['podsAlgorithm'] == 'exact-match') & 
#               (groupCountPerTenant['podsNumBitmaps'] == 1) & 
#               (groupCountPerTenant['podsNumNodesPerBitmap'] == 3) &
#               (groupCountPerTenant['podsRedundancyPerBitmap'] == 0) &
#               (groupCountPerTenant['podsNumRules'] == 10000))
# groupCountPerTenant.drop(groupCountPerTenant.index[criterion], inplace=True)
# groupCountPerTenant.drop(['groupSizeDist', 'placementDist', 'placementNumHostsPerLeaf',
#                           'podsAlgorithm', 'podsNumBitmaps', 'podsNumNodesPerBitmap', 'podsRedundancyPerBitmap',
#                           'podsNumRules'], axis=1, inplace=True)

In [16]:
# # groupCountPerTenant['groupCount'].quantile([0.0, 0.5, 0.99, 0.999, 0.9999, 1.0])
# groupCountPerTenant['groupCount'].describe()

In [17]:
# groupCountPerTenant.to_csv(directory + 'group_count_per_tenant.csv', index=False)

In [9]:
# _folders = glob.glob(directory + "logs.*_0.random-fuzzy-match_*_pods")

In [18]:
# groupSizePerGroupPerTenant = DataSetAsDataFrame('group_size_per_group_per_tenant.csv', 
#                                                 _folders, ['groupSize'])

In [19]:
# len(groupSizePerGroupPerTenant)

In [20]:
# criterion = ~((groupSizePerGroupPerTenant['placementDist'] == 'colocate-uniform') & 
#               (groupSizePerGroupPerTenant['placementNumHostsPerLeaf'] == 'uniform') & 
#               (groupSizePerGroupPerTenant['podsAlgorithm'] == 'exact-match') & 
#               (groupSizePerGroupPerTenant['podsNumBitmaps'] == 1) & 
#               (groupSizePerGroupPerTenant['podsNumNodesPerBitmap'] == 3) &
#               (groupSizePerGroupPerTenant['podsRedundancyPerBitmap'] == 0) &
#               (groupSizePerGroupPerTenant['podsNumRules'] == 10000))
# groupSizePerGroupPerTenant.drop(groupSizePerGroupPerTenant.index[criterion], inplace=True)
# groupSizePerGroupPerTenant.drop(['placementDist', 'placementNumHostsPerLeaf',
#                                  'podsAlgorithm', 'podsNumBitmaps', 'podsNumNodesPerBitmap', 
#                                  'podsRedundancyPerBitmap', 'podsNumRules'], axis=1, inplace=True)

In [21]:
# groupSizePerGroupPerTenant.groupby('groupSizeDist')['groupSize'].quantile(
#     [0.0, 0.5, 0.8, 0.99, 0.991, 0.992, 0.993, 0.994, 0.995, 0.996, 0.999, 0.9999, 1.0])

In [22]:
# groupSizePerGroupPerTenant.to_csv(directory + 'group_size_per_group_per_tenant.csv', index=False)

In [10]:
# _folders = glob.glob(directory + "logs.*_0.*_pods") + glob.glob(directory + "logs.*_1.*_pods")

In [11]:
# leafCountPerGroupPerTenant = DataSetAsDataFrame('leaf_count_per_group_per_tenant.csv', 
#                                                 _folders, ['leafCount'])

In [12]:
# criterion = ~((leafCountPerGroupPerTenant['podsAlgorithm'] == 'exact-match') & 
#               (leafCountPerGroupPerTenant['podsNumBitmaps'] == 1) & 
#               (leafCountPerGroupPerTenant['podsNumNodesPerBitmap'] == 3) &
#               (leafCountPerGroupPerTenant['podsRedundancyPerBitmap'] == 0) &
#               (leafCountPerGroupPerTenant['podsNumRules'] == 10000))
# leafCountPerGroupPerTenant.drop(leafCountPerGroupPerTenant.index[criterion], inplace=True)
# leafCountPerGroupPerTenant.drop(['podsAlgorithm', 'podsNumBitmaps', 'podsNumNodesPerBitmap', 
#                                  'podsRedundancyPerBitmap', 'podsNumRules'], axis=1, inplace=True)

In [23]:
# # leafCountPerGroupPerTenant.groupby(['groupSizeDist', 'placementDist', 'placementNumHostsPerLeaf'])[
# #     'leafCount'].quantile([0.0, 0.5, 0.8, 0.99, 0.999, 0.9999, 1.0])
# leafCountPerGroupPerTenant.groupby(['groupSizeDist', 'placementDist', 'placementNumHostsPerLeaf'])[
#     'leafCount'].describe()

In [14]:
# leafCountPerGroupPerTenant.to_csv(directory + 'leaf_count_per_group_per_tenant.csv', index=False)

In [8]:
# _folders = glob.glob(directory + "logs.*_0.*_pods") + glob.glob(directory + "logs.*_1.*_pods")

In [None]:
# podCountPerGroupPerTenant = DataSetAsDataFrame('pod_count_per_group_per_tenant.csv', 
#                                                 _folders, ['podCount'])

In [10]:
# criterion = ~((podCountPerGroupPerTenant['podsAlgorithm'] == 'exact-match') & 
#               (podCountPerGroupPerTenant['podsNumBitmaps'] == 1) & 
#               (podCountPerGroupPerTenant['podsNumNodesPerBitmap'] == 3) &
#               (podCountPerGroupPerTenant['podsRedundancyPerBitmap'] == 0) &
#               (podCountPerGroupPerTenant['podsNumRules'] == 10000))
# podCountPerGroupPerTenant.drop(podCountPerGroupPerTenant.index[criterion], inplace=True)
# podCountPerGroupPerTenant.drop(['podsAlgorithm', 'podsNumBitmaps', 'podsNumNodesPerBitmap', 
#                                 'podsRedundancyPerBitmap', 'podsNumRules'], axis=1, inplace=True)

In [24]:
# # podCountPerGroupPerTenant.groupby(['groupSizeDist', 'placementDist', 'placementNumHostsPerLeaf'])[
# #     'podCount'].quantile([0.0, 0.5, 0.8, 0.99, 0.999, 0.9999, 1.0])
# podCountPerGroupPerTenant.groupby(['groupSizeDist', 'placementDist', 'placementNumHostsPerLeaf'])[
#     'podCount'].describe()

In [25]:
# podCountPerGroupPerTenant.to_csv(directory + 'pod_count_per_group_per_tenant.csv', index=False)

In [26]:
# _folders = glob.glob(directory + "logs.*_0.random-fuzzy-match_*_pods") # + glob.glob(directory + "logs.*_1.*_pods")

In [27]:
# podsAlgorithmElapseTime = DataSetAsDataFrame('pods_algorithm_elapse_time.csv', 
#                                               _folders, ['elapseTime'])

In [28]:
# len(podsAlgorithmElapseTime)

In [29]:
# podsAlgorithmElapseTime.head(2)

In [11]:
# podsAlgorithmElapseTime.drop(['groupSizeDist', 'placementDist', 'placementNumHostsPerLeaf'], axis=1, inplace=True)

In [30]:
# # podsAlgorithmElapseTime.groupby(['podsAlgorithm', 'podsNumBitmaps', 'podsNumNodesPerBitmap', 
# #                                 'podsRedundancyPerBitmap', 'podsNumRules'])['elapseTime'].describe()
# podsAlgorithmElapseTime.groupby('podsAlgorithm')['elapseTime'].describe()

In [None]:
# podsAlgorithmElapseTime.to_csv(directory + 'pods_algorithm_elapse_time.csv', index=False)

In [15]:
# podsAlgorithmElapseTime['groupSize'] = groupSizePerGroupPerTenant['groupSize']

In [16]:
# podsAlgorithmElapseTimeTrim = podsAlgorithmElapseTime[
#     (((podsAlgorithmElapseTime['placementDist'] == 'colocate-uniform') &
#       (podsAlgorithmElapseTime['placementNumHostsPerLeaf'] == 'uniform')) |
#      ((podsAlgorithmElapseTime['placementDist'] == 'colocate-colocate-uniform') &
#       (podsAlgorithmElapseTime['placementNumHostsPerLeaf'] == '12'))) &
#     (podsAlgorithmElapseTime['podsNumRules'] == 64000)
# ]

In [31]:
# podsAlgorithmElapseTimeTrim['placementNumHostsPerLeaf'].unique()

In [32]:
# podsAlgorithmElapseTimeTrim.to_csv(directory + 'pods_algorithm_elapse_time_trim.csv', index=False)

In [None]:
# @Lalith

In [13]:
# podsAlgorithmElapseTime = pd.read_csv(directory + 'pods_algorithm_elapse_time.csv')

In [17]:
# podsAlgorithmElapseTimeTrim = podsAlgorithmElapseTime[
#     (((podsAlgorithmElapseTime['placementDist'] == 'colocate-uniform') &
#       (podsAlgorithmElapseTime['placementNumHostsPerLeaf'] == 'uniform')) |
#      ((podsAlgorithmElapseTime['placementDist'] == 'colocate-colocate-uniform') &
#       (podsAlgorithmElapseTime['placementNumHostsPerLeaf'] == '12'))) &
#     (podsAlgorithmElapseTime['seed'] == 0) &
#     (podsAlgorithmElapseTime['podsAlgorithm'] == 'random-fuzzy-match') &
#     (podsAlgorithmElapseTime['podsNumRules'] == 64000)
# ]

In [None]:
# podsAlgorithmElapseTimeTrim['podsNumRules'].unique()

In [None]:
# podsAlgorithmElapseTimeTrim.to_csv(directory + 'pods_algorithm_elapse_time_trim.csv', index=False)

In [33]:
# _leafs_folders = (glob.glob(directory + "logs.*_0.*.random-fuzzy-match_*_3_0_*_2_3_leafs_*") +
#                   glob.glob(directory + "logs.*_0.*.random-fuzzy-match_*_3_6_*_2_3_leafs_*") +
#                   glob.glob(directory + "logs.*_0.*.random-fuzzy-match_*_3_12_*_2_3_leafs_*"))

In [34]:
# leafsAlgorithmElapseTime = DataSetAsDataFrame('leafs_algorithm_elapse_time.csv', 
#                                               _leafs_folders, ['elapseTime'])

In [35]:
# len(leafsAlgorithmElapseTime)

In [36]:
# leafsAlgorithmElapseTime.head(2)

In [38]:
# leafsAlgorithmElapseTime.drop(['groupSizeDist', 'placementDist', 'placementNumHostsPerLeaf',
#                                'podsAlgorithm', 'podsNumBitmaps', 'podsNumNodesPerBitmap', 
#                                'podsRedundancyPerBitmap', 'podsNumRules'], axis=1, inplace=True)

In [39]:
# leafsAlgorithmElapseTime.groupby('leafsAlgorithm')['elapseTime'].describe()

In [40]:
# leafsAlgorithmElapseTime.to_csv(directory + 'leafs_algorithm_elapse_time.csv', index=False)

In [41]:
# leafsAlgorithmElapseTime['groupSize'] = groupSizePerGroupPerTenant['groupSize']

In [42]:
# leafsAlgorithmElapseTimeTrim = leafsAlgorithmElapseTime[
#     (((leafsAlgorithmElapseTime['placementDist'] == 'colocate-uniform') &
#       (leafsAlgorithmElapseTime['placementNumHostsPerLeaf'] == 'uniform')) |
#      ((leafsAlgorithmElapseTime['placementDist'] == 'colocate-colocate-uniform') &
#       (leafsAlgorithmElapseTime['placementNumHostsPerLeaf'] == '12'))) &
#     (leafsAlgorithmElapseTime['podsNumRules'] == 64000)
# ]

In [43]:
# leafsAlgorithmElapseTimeTrim['placementNumHostsPerLeaf'].unique()

In [44]:
# leafsAlgorithmElapseTimeTrim.to_csv(directory + 'leafs_algorithm_elapse_time_trim.csv', index=False)

In [45]:
# leafsAlgorithmElapseTimeTrim[leafsAlgorithmElapseTimeTrim['groupSize'] > 100][['groupSize', 'elapseTime']].head(10)

In [None]:
# @Lalith

In [10]:
# leafsAlgorithmElapseTime = pd.read_csv(directory + 'leafs_algorithm_elapse_time.csv')

In [12]:
# leafsAlgorithmElapseTimeTrim = leafsAlgorithmElapseTime[
#     (((leafsAlgorithmElapseTime['placementDist'] == 'colocate-uniform') &
#       (leafsAlgorithmElapseTime['placementNumHostsPerLeaf'] == 'uniform')) |
#      ((leafsAlgorithmElapseTime['placementDist'] == 'colocate-colocate-uniform') &
#       (leafsAlgorithmElapseTime['placementNumHostsPerLeaf'] == '12'))) &
#     (leafsAlgorithmElapseTime['seed'] == 0) &
#     (leafsAlgorithmElapseTime['leafsAlgorithm'] == 'random-fuzzy-match') &
#     (leafsAlgorithmElapseTime['leafsNumRules'] == 64000)
# ]

In [13]:
# leafsAlgorithmElapseTimeTrim.to_csv(directory + 'leafs_algorithm_elapse_time_trim.csv', index=False)

In [46]:
# groupsCoveredWithBitmapsOnlyForPods = DataSetAsDataFrame('groups_covered_with_bitmaps_only_for_pods.csv', 
#                                                          folders, 
#                                                          ['groupsCovered', 'groupsCoveredWithoutDefaultBitmap'],
#                                                          header=0)

In [9]:
# %%R -i groupsCoveredWithBitmapsOnlyForPods
# ggplot(groupsCoveredWithBitmapsOnlyForPods) +
#    geom_bar(aes(y=groupsCoveredWithoutDefaultBitmap, x=factor(podsNumBitmaps), 
#                 fill=interaction(podsAlgorithm, podsNumNodesPerBitmap, podsRedundancyPerBitmap)),
#             stat="identity",position=position_dodge()) +
#    facet_grid(groupSizeDist * podsNumRules ~ factor(placementNumHostsPerLeaf, levels=c('uniform','12','24','48'))) +
#    xlab("Number of bitmaps") +
#    ylab("Groups covered (without default bitmap)") +
#    plotTheme

In [47]:
# groupsCoveredWithBitmapsOnlyForPods.to_csv(directory + 'groups_covered_with_bitmaps_only_for_pods.csv', index=False)

In [48]:
# groupsCoveredWithBitmapsOnlyForLeafs = DataSetAsDataFrame('groups_covered_with_bitmaps_only_for_leafs.csv', 
#                                                           leafs_folders, 
#                                                           ['groupsCovered', 'groupsCoveredWithoutDefaultBitmap'],
#                                                           header=0)

In [12]:
# %%R -i groupsCoveredWithBitmapsOnlyForLeafs
# ggplot(groupsCoveredWithBitmapsOnlyForLeafs) +
#    geom_bar(aes(y=groupsCoveredWithoutDefaultBitmap, x=factor(leafsNumBitmaps), 
#                 fill=interaction(leafsAlgorithm, leafsNumNodesPerBitmap, leafsRedundancyPerBitmap)),
#             stat="identity",position=position_dodge()) +
#    facet_grid(groupSizeDist * leafsNumRules ~ factor(placementNumHostsPerLeaf, levels=c('uniform','12','24','48'))) +
#    xlab("Number of bitmaps") +
#    ylab("Groups covered (without default bitmap)") +
#    plotTheme

In [49]:
# groupsCoveredWithBitmapsOnlyForLeafs.to_csv(directory + 'groups_covered_with_bitmaps_only_for_leafs.csv', index=False)

In [50]:
# ruleCountForPods = DataSetAsDataFrame('rule_count_for_pods.csv', 
#                                       folders, ['numRules'])

In [15]:
# %%R -i ruleCountForPods
# ggplot(ruleCountForPods) +
#     geom_boxplot(aes(y=numRules, x=factor(podsNumBitmaps), 
#                      fill=interaction(podsAlgorithm, podsNumNodesPerBitmap, podsRedundancyPerBitmap)),
#                  outlier.size=0.5) +
#     facet_grid(groupSizeDist * podsNumRules ~ factor(placementNumHostsPerLeaf, levels=c('uniform','12','24','48'))) +
#     xlab("Number of bitmaps") +
#     ylab("Number of rules") +
#     plotTheme

In [51]:
# ruleCountForPods.to_csv(directory + 'rule_count_for_pods.csv', index=False)

In [52]:
# ruleCountForLeafs = DataSetAsDataFrame('rule_count_for_leafs.csv', 
#                                        leafs_folders, ['numRules'])

In [18]:
# %%R -i ruleCountForLeafs
# ggplot(ruleCountForLeafs) +
#     geom_boxplot(aes(y=numRules, x=factor(leafsNumBitmaps), 
#                      fill=interaction(leafsAlgorithm, leafsNumNodesPerBitmap, leafsRedundancyPerBitmap)),
#                  outlier.size=0.5) +
#     facet_grid(groupSizeDist * leafsNumRules ~ factor(placementNumHostsPerLeaf, levels=c('uniform','12','24','48'))) +
#     xlab("Number of bitmaps") +
#     ylab("Number of rules") +
#     plotTheme

In [53]:
# ruleCountForLeafs.to_csv(directory + 'rule_count_for_leafs.csv', index=False)

In [8]:
# trafficOverheadPerGroupPerTenantForPods = DataSetAsDataFrame(
#     'traffic_overhead_per_group_per_tenant_for_pods.csv', folders, ['trafficOverhead'])

In [9]:
# trafficOverheadPerGroupPerTenantForPods = trafficOverheadPerGroupPerTenantForPods.groupby(
#     ['groupSizeDist', 'placementDist', 'placementNumHostsPerLeaf', 'seed',
#      'podsAlgorithm', 'podsNumBitmaps', 'podsNumNodesPerBitmap', 'podsRedundancyPerBitmap',
#     'podsNumRules']).mean()

In [10]:
# trafficOverheadPerGroupPerTenantForPods = trafficOverheadPerGroupPerTenantForPods.reset_index()

In [11]:
# %%R -i trafficOverheadPerGroupPerTenantForPods
# ggplot(trafficOverheadPerGroupPerTenantForPods) +
#     geom_boxplot(aes(y=trafficOverhead, x=factor(podsNumBitmaps), 
#                      fill=interaction(podsAlgorithm, podsNumNodesPerBitmap, podsRedundancyPerBitmap))) +
#     facet_grid(groupSizeDist * podsNumRules ~ factor(placementNumHostsPerLeaf, levels=c('uniform','12','24','48'))) +
#     xlab("Number of bitmaps") +
#     ylab("Traffic overhead per group") +
#     plotTheme

In [12]:
# trafficOverheadPerGroupPerTenantForLeafs = DataSetAsDataFrame(
#     'traffic_overhead_per_group_per_tenant_for_leafs.csv', leafs_folders, ['trafficOverhead'])

In [13]:
# trafficOverheadPerGroupPerTenantForLeafs = trafficOverheadPerGroupPerTenantForLeafs.groupby(
#     ['groupSizeDist', 'placementDist', 'placementNumHostsPerLeaf', 'seed',
#      'podsAlgorithm', 'podsNumBitmaps', 'podsNumNodesPerBitmap', 'podsRedundancyPerBitmap',
#      'podsNumRules', 
#      'leafsAlgorithm', 'leafsNumBitmaps', 'leafsNumNodesPerBitmap', 'leafsRedundancyPerBitmap',
#      'leafsNumRules']).mean()

In [14]:
# trafficOverheadPerGroupPerTenantForLeafs = trafficOverheadPerGroupPerTenantForLeafs.reset_index()

In [15]:
# %%R -i trafficOverheadPerGroupPerTenantForLeafs
# ggplot(trafficOverheadPerGroupPerTenantForLeafs) +
#     geom_boxplot(aes(y=trafficOverhead, x=factor(leafsNumBitmaps), 
#                      fill=interaction(leafsAlgorithm, leafsNumNodesPerBitmap, leafsRedundancyPerBitmap))) +
#     facet_grid(groupSizeDist * leafsNumRules ~ factor(placementNumHostsPerLeaf, levels=c('uniform','12','24','48'))) +
#     xlab("Number of bitmaps") +
#     ylab("Traffic overhead per group") +
#     plotTheme

In [None]:
# trafficPerGroupPerTenant = DataSetAsDataFrame('traffic_per_group_per_tenant.csv', 
#                                               leafs_folders, 
#                                               ['multicastTraffic', 'unicastTraffic', 
#                                                'overlayTraffic', 'baseeratTraffic'],
#                                               header=0)

In [None]:
# trafficPerGroupPerTenant = trafficPerGroupPerTenant.groupby(
#     ['groupSizeDist', 'placementDist', 'placementNumHostsPerLeaf', 'seed',
#      'podsAlgorithm', 'podsNumBitmaps', 'podsNumNodesPerBitmap', 'podsRedundancyPerBitmap', 'podsNumRules', 
#      'leafsAlgorithm', 'leafsNumBitmaps', 'leafsNumNodesPerBitmap', 'leafsRedundancyPerBitmap', 'leafsNumRules'])[
#     ['unicastTraffic', 'multicastTraffic', 'baseeratTraffic', 'overlayTraffic']].agg('sum')
# trafficPerGroupPerTenant.reset_index(inplace=True)

In [None]:
# trafficPerGroupPerTenant.to_csv(directory + 'traffic_per_group_per_tenant.csv', index=False)

In [9]:
# If experiment parameters change, modify this function
def AddParametersAsColumns(df, folder):
    """Add the experiment parameters encoded in a log-folder name to ``df``.

    NOTE(review): this cell repeats the definition from an earlier cell of the
    notebook; keep the two in sync or remove one.

    The last path component of ``folder`` is split on '.'. Section 1 holds the
    '_'-separated cloud parameters; section 2 and, when present, section 3
    each hold the '_'-separated optimizer parameters of one node type. Every
    parameter becomes a constant column of ``df``.

    Args:
        df: DataFrame to annotate. It is modified in place.
        folder: Path of the log folder. Trailing '/' characters are ignored.

    Returns:
        None.

    Raises:
        IndexError: If the folder name has fewer sections or fields than the
            positions read below.
        ValueError: If a field that is converted with ``int()`` is not an
            integer literal.
    """
    # Strip trailing separators first: 'a/b/'.split('/')[-1] is '' and the
    # folder name would be lost.
    expParams = folder.rstrip('/').split('/')[-1].split('.')

    cloudParams = expParams[1].split('_')
    df['groupSizeDist'] = cloudParams[10]
    df['placementDist'] = cloudParams[11]
    # A hosts-per-leaf value of -1 is reported as 'uniform'; other values stay strings.
    df['placementNumHostsPerLeaf'] = 'uniform' if cloudParams[12] == '-1' else cloudParams[12]
    df['seed'] = int(cloudParams[15])

    # Section 2 is mandatory (indexing it raises if it is missing); section 3
    # is optional. Both share one layout, so a single loop handles them.
    for section in [expParams[2]] + expParams[3:4]:
        optimizerParams = section.split('_')
        # Field 7 names the node type and prefixes the column names.
        node_type = optimizerParams[7]
        df['%sAlgorithm' % node_type] = optimizerParams[0]
        df['%sNumBitmaps' % node_type] = int(optimizerParams[1])
        df['%sNumNodesPerBitmap' % node_type] = int(optimizerParams[2])
        df['%sRedundancyPerBitmap' % node_type] = int(optimizerParams[3])
        df['%sNumRules' % node_type] = int(optimizerParams[4])
    
# Turns all files of a given filename across a set of folders into a single dataframe
def DataSetAsDataFrame(filename, filename_bytes, folders, headers, headers_bytes, header=None, header_bytes=None,
                       reset_index=True, add_columns=True, message_sizes=(64, 512, 1500),
                       encapsulation_overhead=50):
    """Read a traffic CSV and its header-bytes CSV from every folder and stack them.

    NOTE(review): this shadows the earlier ``DataSetAsDataFrame(filename,
    folders, headers, ...)`` in this notebook with an incompatible signature;
    the commented-out cells above were written against the earlier one.

    For every message size S, four byte-count columns are derived from the
    packet-count columns of the traffic file::

        <col>SB            = <col> * (S + encapsulation_overhead)
        baseeratTrafficSB  = baseeratTraffic * (S + encapsulation_overhead)
                             + baseeratHeaderTraffic

    where ``<col>`` is multicastTraffic, unicastTraffic and
    overlayTrafficCorrected.

    Args:
        filename: Traffic CSV expected inside each folder.
        filename_bytes: CSV in the same folder providing ``baseeratHeaderTraffic``.
        folders: Iterable of folder paths.
        headers: Column names for the traffic CSV. Must include
            multicastTraffic, unicastTraffic, overlayTrafficCorrected and
            baseeratTraffic.
        headers_bytes: Column names for the bytes CSV. Must include
            baseeratHeaderTraffic.
        header: ``header`` argument of ``pd.read_csv`` for the traffic CSV.
        header_bytes: ``header`` argument of ``pd.read_csv`` for the bytes CSV.
        reset_index: When true the result gets a fresh 0..n-1 index; otherwise
            each file's own row index is kept.
        add_columns: When true, ``AddParametersAsColumns`` annotates each
            per-folder frame with the parameters parsed from its folder name.
        message_sizes: Payload sizes in bytes to derive columns for.
        encapsulation_overhead: Bytes added to every message size (the
            original code labelled the value 50 as "for VXLAN").

    Returns:
        The concatenated DataFrame.

    Raises:
        ValueError: If the two CSVs of a folder have different row counts, or
            if ``folders`` is empty.
    """
    packet_columns = ('multicastTraffic', 'unicastTraffic', 'overlayTrafficCorrected')

    def ReadDataSet(folder):
        df = pd.read_csv(folder + '/' + filename, sep=',', header=header, names=headers)
        df_bytes = pd.read_csv(folder + '/' + filename_bytes, sep=',', header=header_bytes, names=headers_bytes)

        # The two files are combined row by row. With unequal lengths pandas
        # would align on the index and silently produce NaN or ignore rows.
        if len(df) != len(df_bytes):
            raise ValueError(
                "Row count mismatch in %s: %s has %d rows but %s has %d"
                % (folder, filename, len(df), filename_bytes, len(df_bytes)))

        for size in message_sizes:
            bytes_per_message = size + encapsulation_overhead
            for column in packet_columns:
                df['%s%dB' % (column, size)] = df[column] * bytes_per_message
            # Baseerat additionally pays for its own headers, already given in bytes.
            df['baseeratTraffic%dB' % size] = (
                df['baseeratTraffic'] * bytes_per_message + df_bytes['baseeratHeaderTraffic'])

        if add_columns:
            AddParametersAsColumns(df, folder)
        return df

    frames = [ReadDataSet(folder) for folder in folders]
    # ignore_index renumbers the rows directly. The earlier
    # reset_index()/drop('index') pair failed when a data column was itself
    # called 'index'.
    return pd.concat(frames, ignore_index=bool(reset_index))

In [10]:
# Folders under `directory` whose name matches logs.*_0.*_leafs*.
# Presumably the seed-0 runs that include a leafs optimizer section (the "_0."
# would then be the end of the cloud-parameter section) -- TODO confirm.
leafs_folders_0 = glob.glob(directory + "logs.*_0.*_leafs*")

In [11]:
# Load the traffic counters and the Baseerat header bytes from every folder in
# leafs_folders_0 into one frame.
# header=0: the first line of the traffic CSV is a header row and is replaced
# by the names in `headers`; the bytes CSV is read with the default
# header_bytes=None, i.e. every line is data.
# reset_index=False: each file's own row index is kept in the combined frame.
trafficPerGroupPerTenant_0 = DataSetAsDataFrame(
    filename='traffic_per_group_per_tenant.csv', 
    filename_bytes='traffic_per_group_per_tenant_for_baseerat_bytes.csv',
    folders=leafs_folders_0, 
    headers=['multicastTraffic', 'unicastTraffic', 'overlayTraffic', 'overlayTrafficCorrected', 'baseeratTraffic'],
    headers_bytes=['baseeratHeaderTraffic'],
    header=0, reset_index=False)

In [12]:
trafficPerGroupPerTenant_0.columns

Index(['multicastTraffic', 'unicastTraffic', 'overlayTraffic',
       'overlayTrafficCorrected', 'baseeratTraffic', 'multicastTraffic64B',
       'unicastTraffic64B', 'overlayTrafficCorrected64B', 'baseeratTraffic64B',
       'multicastTraffic512B', 'unicastTraffic512B',
       'overlayTrafficCorrected512B', 'baseeratTraffic512B',
       'multicastTraffic1500B', 'unicastTraffic1500B',
       'overlayTrafficCorrected1500B', 'baseeratTraffic1500B', 'groupSizeDist',
       'placementDist', 'placementNumHostsPerLeaf', 'seed', 'podsAlgorithm',
       'podsNumBitmaps', 'podsNumNodesPerBitmap', 'podsRedundancyPerBitmap',
       'podsNumRules', 'leafsAlgorithm', 'leafsNumBitmaps',
       'leafsNumNodesPerBitmap', 'leafsRedundancyPerBitmap', 'leafsNumRules'],
      dtype='object')

In [13]:
# Sum every traffic counter over the rows that share one experiment
# configuration, then turn the grouping keys back into ordinary columns.
trafficPerGroupPerTenant_0 = (
    trafficPerGroupPerTenant_0
    .groupby([
        'groupSizeDist', 'placementDist', 'placementNumHostsPerLeaf', 'seed',
        'podsAlgorithm', 'podsNumBitmaps', 'podsNumNodesPerBitmap',
        'podsRedundancyPerBitmap', 'podsNumRules',
        'leafsAlgorithm', 'leafsNumBitmaps', 'leafsNumNodesPerBitmap',
        'leafsRedundancyPerBitmap', 'leafsNumRules',
    ])[[
        'unicastTraffic', 'multicastTraffic', 'baseeratTraffic', 'overlayTrafficCorrected',
        'unicastTraffic64B', 'multicastTraffic64B', 'baseeratTraffic64B', 'overlayTrafficCorrected64B',
        'unicastTraffic512B', 'multicastTraffic512B', 'baseeratTraffic512B', 'overlayTrafficCorrected512B',
        'unicastTraffic1500B', 'multicastTraffic1500B', 'baseeratTraffic1500B', 'overlayTrafficCorrected1500B',
    ]]
    .agg('sum')
    .reset_index()
)

In [20]:
trafficPerGroupPerTenant_0.to_csv(directory + 'traffic_per_group_per_tenant_0.csv', index=False)

In [17]:
(trafficPerGroupPerTenant_0['baseeratTraffic64B']/trafficPerGroupPerTenant_0['multicastTraffic64B']).head(2)

0    1.134503
1    1.026009
dtype: float64

In [21]:
# Folders under `directory` whose name matches logs.*_1.*_leafs*.
# Presumably the seed-1 runs that include a leafs optimizer section (the "_1."
# would then be the end of the cloud-parameter section) -- TODO confirm.
leafs_folders_1 = glob.glob(directory + "logs.*_1.*_leafs*")

In [22]:
# Load the traffic counters and the Baseerat header bytes from every folder in
# leafs_folders_1 into one frame.
# header=0: the first line of the traffic CSV is a header row and is replaced
# by the names in `headers`; the bytes CSV is read with the default
# header_bytes=None, i.e. every line is data.
# reset_index=False: each file's own row index is kept in the combined frame.
trafficPerGroupPerTenant_1 = DataSetAsDataFrame(
    filename='traffic_per_group_per_tenant.csv', 
    filename_bytes='traffic_per_group_per_tenant_for_baseerat_bytes.csv',
    folders=leafs_folders_1, 
    headers=['multicastTraffic', 'unicastTraffic', 'overlayTraffic', 'overlayTrafficCorrected', 'baseeratTraffic'],
    headers_bytes=['baseeratHeaderTraffic'],
    header=0, reset_index=False)

In [23]:
# Sum every traffic counter over the rows that share one experiment
# configuration, then turn the grouping keys back into ordinary columns.
trafficPerGroupPerTenant_1 = (
    trafficPerGroupPerTenant_1
    .groupby([
        'groupSizeDist', 'placementDist', 'placementNumHostsPerLeaf', 'seed',
        'podsAlgorithm', 'podsNumBitmaps', 'podsNumNodesPerBitmap',
        'podsRedundancyPerBitmap', 'podsNumRules',
        'leafsAlgorithm', 'leafsNumBitmaps', 'leafsNumNodesPerBitmap',
        'leafsRedundancyPerBitmap', 'leafsNumRules',
    ])[[
        'unicastTraffic', 'multicastTraffic', 'baseeratTraffic', 'overlayTrafficCorrected',
        'unicastTraffic64B', 'multicastTraffic64B', 'baseeratTraffic64B', 'overlayTrafficCorrected64B',
        'unicastTraffic512B', 'multicastTraffic512B', 'baseeratTraffic512B', 'overlayTrafficCorrected512B',
        'unicastTraffic1500B', 'multicastTraffic1500B', 'baseeratTraffic1500B', 'overlayTrafficCorrected1500B',
    ]]
    .agg('sum')
    .reset_index()
)

In [24]:
trafficPerGroupPerTenant_1.to_csv(directory + 'traffic_per_group_per_tenant_1.csv', index=False)

In [25]:
(trafficPerGroupPerTenant_1['baseeratTraffic64B']/trafficPerGroupPerTenant_1['multicastTraffic64B']).head(2)

0    1.094125
1    1.026967
dtype: float64

In [27]:
# Folders under `directory` whose name matches logs.*_2.*_leafs*.
# Presumably the seed-2 runs that include a leafs optimizer section (the "_2."
# would then be the end of the cloud-parameter section) -- TODO confirm.
leafs_folders_2 = glob.glob(directory + "logs.*_2.*_leafs*")

In [28]:
# Load the traffic counters and the Baseerat header bytes from every folder in
# leafs_folders_2 into one frame.
# header=0: the first line of the traffic CSV is a header row and is replaced
# by the names in `headers`; the bytes CSV is read with the default
# header_bytes=None, i.e. every line is data.
# reset_index=False: each file's own row index is kept in the combined frame.
trafficPerGroupPerTenant_2 = DataSetAsDataFrame(
    filename='traffic_per_group_per_tenant.csv', 
    filename_bytes='traffic_per_group_per_tenant_for_baseerat_bytes.csv',
    folders=leafs_folders_2, 
    headers=['multicastTraffic', 'unicastTraffic', 'overlayTraffic', 'overlayTrafficCorrected', 'baseeratTraffic'],
    headers_bytes=['baseeratHeaderTraffic'],
    header=0, reset_index=False)

In [29]:
# Sum every traffic counter over the rows that share one experiment
# configuration, then turn the grouping keys back into ordinary columns.
trafficPerGroupPerTenant_2 = (
    trafficPerGroupPerTenant_2
    .groupby([
        'groupSizeDist', 'placementDist', 'placementNumHostsPerLeaf', 'seed',
        'podsAlgorithm', 'podsNumBitmaps', 'podsNumNodesPerBitmap',
        'podsRedundancyPerBitmap', 'podsNumRules',
        'leafsAlgorithm', 'leafsNumBitmaps', 'leafsNumNodesPerBitmap',
        'leafsRedundancyPerBitmap', 'leafsNumRules',
    ])[[
        'unicastTraffic', 'multicastTraffic', 'baseeratTraffic', 'overlayTrafficCorrected',
        'unicastTraffic64B', 'multicastTraffic64B', 'baseeratTraffic64B', 'overlayTrafficCorrected64B',
        'unicastTraffic512B', 'multicastTraffic512B', 'baseeratTraffic512B', 'overlayTrafficCorrected512B',
        'unicastTraffic1500B', 'multicastTraffic1500B', 'baseeratTraffic1500B', 'overlayTrafficCorrected1500B',
    ]]
    .agg('sum')
    .reset_index()
)

In [30]:
trafficPerGroupPerTenant_2.to_csv(directory + 'traffic_per_group_per_tenant_2.csv', index=False)

In [31]:
(trafficPerGroupPerTenant_2['baseeratTraffic64B']/trafficPerGroupPerTenant_2['multicastTraffic64B']).head(2)

0    1.112323
1    1.026310
dtype: float64

In [32]:
trafficPerGroupPerTenant_0 = pd.read_csv(directory + 'traffic_per_group_per_tenant_0.csv', sep=',')

In [33]:
trafficPerGroupPerTenant_1 = pd.read_csv(directory + 'traffic_per_group_per_tenant_1.csv', sep=',')

In [34]:
trafficPerGroupPerTenant_2 = pd.read_csv(directory + 'traffic_per_group_per_tenant_2.csv', sep=',')

In [35]:
trafficPerGroupPerTenant = pd.concat([trafficPerGroupPerTenant_0, trafficPerGroupPerTenant_1, trafficPerGroupPerTenant_2])

In [36]:
trafficPerGroupPerTenant.to_csv(directory + 'traffic_per_group_per_tenant.csv', index=False)