In [1]:
import pandas as pd
import numpy as np

from influxdb_client import InfluxDBClient, Point, Dialect

import re
import time
import datetime

import warnings
from influxdb_client.client.warnings import MissingPivotFunction

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.colors as colors

import pandasql as ps
import sqlite3

import csv

import warnings 
from influxdb_client.client.warnings import MissingPivotFunction
# Ignore every warning of the MissingPivotFunction category (imported above from
# influxdb_client.client.warnings) for the rest of the session.
warnings.simplefilter("ignore", MissingPivotFunction)

import gc

# Raise pandas' 'display.max_rows' option to 500 for displayed frames.
pd.set_option('display.max_rows', 500)

In [2]:
def _parse_line(line):

    rx_dict = {
    'token': re.compile(r'var token = "(?P<token>.*)"\n'),
    'url': re.compile(r'var url = "(?P<url>.*)"\n'),
    'org': re.compile(r'var org = "(?P<org>.*)"\n'),
    'bucket': re.compile(r'var bucket = "(?P<bucket>.*)"\n'),
    }   

    """
    Do a regex search against all defined regexes and
    return the key and match result of the first matching regex

    """
    for key, rx in rx_dict.items():
        match = rx.search(line)
        if match:
            return key, match
    # if there are no matches
    return None, None

filepath = '/root/flexi-pipe/config.go'

# Walk the config file line by line and keep the value of every setting that
# _parse_line recognises.  A setting that never appears in the file is simply
# left undefined.
with open(filepath, 'r') as file_object:
    for line in file_object:
        key, match = _parse_line(line)
        if match is None:
            # Not one of the settings we care about.
            continue

        if key == 'token':
            token = match.group('token')
        elif key == 'url':
            url = match.group('url')
        elif key == 'org':
            org = match.group('org')
        elif key == 'bucket':
            bucket = match.group('bucket')

# The url read from the config file is deliberately overridden here.
# Alternative endpoint: url="http://192.168.20.58:8086"
url = "http://localhost:8086"

In [3]:
# Analysis window, in Unix epoch seconds.  These are passed to `experiment`,
# which keeps the runs whose `startUnix` lies between the two values.
start = 1692978196
end = 1693513472

In [4]:
def _epoch_seconds(timestamps):
    """Convert a Series of timestamp strings to whole Unix epoch seconds (int64)."""
    parsed = pd.to_datetime(timestamps, format="mixed")
    # Pin the resolution to nanoseconds before taking the integer view, so the
    # divisor below is right whatever resolution pandas picked while parsing.
    # Integer floor division replaces the former float -> timedelta -> float -> int
    # round trip.
    return parsed.dt.as_unit("ns").astype("int64") // 10**9


def experiment(start_time, end_time, filepath):
    """
    Load the experiment log and return the runs that started inside a time window.

    Parameters
    ----------
    start_time, end_time : int
        Bounds of the window in Unix epoch seconds.  A run is kept when its
        `startUnix` lies between the two values (both bounds inclusive).
    filepath : str
        Path to a header-less CSV file with at least 14 columns, in this order:
        start, end, topology, runtime, parameter, d, dlo, dhi, dscore, dlazy,
        dout, gossipFactor, initialDelay, interval.

    Returns
    -------
    pandas.DataFrame
        One row per selected run, sorted by `start`, with a fresh 0..n-1 index.
        The `experiment` column holds the run's 0-based row position in the CSV
        file.  `startUnix` / `endUnix` hold the start and end as whole epoch
        seconds.  The `runtime` and `initialDelay` columns are dropped.

    Raises
    ------
    KeyError
        If the file has fewer columns than expected (`rename(..., errors='raise')`).
    """
    column_names = {
        0: "start", 1: "end", 2: "topology", 3: "runtime", 4: "parameter",
        5: "d", 6: "dlo", 7: "dhi", 8: "dscore", 9: "dlazy", 10: "dout",
        11: "gossipFactor", 12: "initialDelay", 13: "interval",
    }
    # read_csv already returns a DataFrame; no extra wrapping needed.
    experiments = pd.read_csv(filepath, header=None).rename(columns=column_names, errors='raise')

    # Keep only the first 27 characters of each timestamp, i.e. the length of
    # "YYYY-MM-DD HH:MM:SS.fffffff"; anything after that is discarded.
    experiments["start"] = experiments["start"].str.slice(0, 27)
    experiments["end"] = experiments["end"].str.slice(0, 27)

    # Timestamp strings -> whole Unix epoch seconds.
    experiments["startUnix"] = _epoch_seconds(experiments["start"])
    experiments["endUnix"] = _epoch_seconds(experiments["end"])

    # Drop the fields we don't need for the moment.
    exp = experiments.drop(columns=["runtime", "initialDelay"]).sort_values(by=["start"])

    # Keep the runs that started inside the window; the surviving original row
    # labels become the `experiment` column.
    in_window = exp["startUnix"].between(start_time, end_time)
    expTime = exp[in_window].reset_index().rename(columns={"index": "experiment"})

    return expTime

# Load the experiment runs that started inside the [start, end] window and show
# up to the first 500 of them.
experiments = experiment(start, end, '../experiments.csv')
experiments.head(500)

Unnamed: 0,experiment,start,end,topology,parameter,d,dlo,dhi,dscore,dlazy,dout,gossipFactor,interval,startUnix,endUnix
0,150,2023-08-25 15:43:16.3243323,2023-08-25 16:13:16.3255309,unl,reference,8,6,12,4,8,2,0.25,1.0,1692978196,1692979996
1,151,2023-08-25 16:16:49.6961810,2023-08-25 16:46:49.6974978,unl,reference,8,6,12,4,8,2,0.25,1.0,1692980209,1692982009
2,152,2023-08-25 16:50:23.6899992,2023-08-25 17:20:23.6921991,unl,reference,8,6,12,4,8,2,0.25,1.0,1692982223,1692984023
3,153,2023-08-25 17:23:57.7272734,2023-08-25 17:53:57.8279711,unl,interval,8,6,12,4,8,2,0.25,0.5,1692984237,1692986037
4,154,2023-08-25 17:57:30.6350173,2023-08-25 18:27:30.7363305,unl,interval,8,6,12,4,8,2,0.25,0.5,1692986250,1692988050
5,155,2023-08-25 18:31:03.6332371,2023-08-25 19:01:03.6935448,unl,interval,8,6,12,4,8,2,0.25,0.5,1692988263,1692990063
6,156,2023-08-25 19:04:36.9266682,2023-08-25 19:34:36.9337235,unl,interval,8,6,12,4,8,2,0.25,30.0,1692990276,1692992076
7,157,2023-08-25 19:38:10.1780791,2023-08-25 20:08:10.1796217,unl,interval,8,6,12,4,8,2,0.25,30.0,1692992290,1692994090
8,158,2023-08-25 20:11:43.6270210,2023-08-25 20:41:43.6285805,unl,interval,8,6,12,4,8,2,0.25,30.0,1692994303,1692996103
9,159,2023-08-25 20:45:16.8407778,2023-08-25 21:15:16.8427550,unl,interval,8,6,12,4,8,2,0.25,3.0,1692996316,1692998116


In [5]:
def from_influx(url, token, org, measurement, start_time, end_time, grouping_key, bucket="gs"):
    """
    Fetch every point of one measurement inside a time range from InfluxDB.

    Parameters
    ----------
    url, token, org : str
        Connection settings handed to `InfluxDBClient`.
    measurement : str
        Value `_measurement` must equal.
    start_time, end_time
        Range bounds; they are interpolated into the Flux `range(start:, stop:)`
        call via `str()`.
    grouping_key : str
        Name of the one column, besides `_time`, to keep in the result.
    bucket : str, optional
        Bucket to query.  Defaults to "gs", the value that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns `_time` (converted with `pd.to_datetime`) and `grouping_key`,
        sorted by `_time`, with a fresh 0..n-1 index.  When the query returns
        no rows the frame is empty but still has both columns.
    """
    flux = (
        'from(bucket: "' + bucket + '") '
        + ' |> range(start: ' + str(start_time) + ', stop:' + str(end_time) + ') '
        + ' |> filter(fn: (r) => r._measurement == "' + measurement + '") '
        + ' |> group(columns: ["_measurement", "_field"], mode: "by") '
        + ' |> pivot(rowKey:["_time"], columnKey: ["_field"], valueColumn: "_value")'
    )

    # NOTE(review): the client library appears to take `timeout` in milliseconds
    # (900_000 would be 15 minutes) - confirm against the influxdb_client docs.
    client = InfluxDBClient(url=url, token=token, org=org, timeout=900_000)
    try:
        data_frame = client.query_api().query_data_frame(flux)
    finally:
        # Release the connection even when the query raises.
        client.close()

    # NOTE(review): query_data_frame is documented to possibly return a list of
    # frames; merge such a list into a single frame - confirm against the docs.
    if isinstance(data_frame, list):
        data_frame = pd.concat(data_frame) if data_frame else pd.DataFrame()

    if data_frame.empty:
        # Hand back an empty frame with the expected columns instead of failing
        # with a KeyError on the column selection below.  `utc=True` gives the
        # empty `_time` column a timezone-aware dtype, matching the "+00:00"
        # timestamps shown in this notebook's outputs.
        empty = pd.DataFrame(columns=['_time', grouping_key])
        empty['_time'] = pd.to_datetime(empty['_time'], utc=True)
        return empty

    df = data_frame[['_time', grouping_key]].sort_values(by=["_time"]).reset_index(drop=True)
    df["_time"] = pd.to_datetime(df["_time"])

    return df

In [6]:
def from_influx_count(url, token, org, start_time, end_time, grouping_key,
                      measurement="deliverMessage", bucket="gs"):
    """
    Count the points of one measurement inside a time range in InfluxDB.

    The Flux query groups by `_measurement` and `_field` and applies `count()`;
    the smallest of the resulting `_value` counts is returned.

    Parameters
    ----------
    url, token, org : str
        Connection settings handed to `InfluxDBClient`.
    start_time, end_time
        Range bounds; they are interpolated into the Flux `range(start:, stop:)`
        call via `str()`.
    grouping_key
        Unused; kept so existing callers keep working.
    measurement : str, optional
        Value `_measurement` must equal.  Defaults to "deliverMessage", the
        value that used to be hard-coded.
    bucket : str, optional
        Bucket to query.  Defaults to "gs", the value that used to be hard-coded.

    Returns
    -------
    int
        The minimum `_value` of the count result, or 0 when the query returns
        no rows.
    """
    flux = (
        'from(bucket: "' + bucket + '") '
        + ' |> range(start: ' + str(start_time) + ', stop:' + str(end_time) + ') '
        + '|> filter(fn: (r) => r._measurement == "' + measurement + '") '
        + '|> group(columns: ["_measurement", "_field"], mode: "by") '
        + '|> count()'
    )

    # NOTE(review): the client library appears to take `timeout` in milliseconds
    # (900_000 would be 15 minutes) - confirm against the influxdb_client docs.
    client = InfluxDBClient(url=url, token=token, org=org, timeout=900_000)
    try:
        data_frame = client.query_api().query_data_frame(flux)
    finally:
        # Release the connection even when the query raises.
        client.close()

    # NOTE(review): query_data_frame is documented to possibly return a list of
    # frames; merge such a list into a single frame - confirm against the docs.
    if isinstance(data_frame, list):
        data_frame = pd.concat(data_frame) if data_frame else pd.DataFrame()

    if data_frame.empty:
        return 0
    # Plain int in both branches (the non-empty branch used to give a numpy integer).
    return int(data_frame["_value"].min())



In [7]:
#Validate data
validate = pd.DataFrame()

for index, row in experiments.iterrows():
    count = from_influx_count(url, token, org, row["startUnix"], row["endUnix"],"_measurement")
    d = {'experiment': [row['experiment']], 'count': [count], 'topology': [row["topology"]], 'd': [row["d"]],'dhi': [row["dhi"]],'dlo': [row["dlo"]],'dlazy': [row["dlazy"]],
        'dscore': [row["dscore"]],'dout': [row["dout"]],'gossipFactor': [row["gossipFactor"]],'interval': [row["interval"]],}
    aux = pd.DataFrame(data=d)
    validate = pd.concat([validate, aux])

# validate.head(200)

exps = experiments.merge(validate, on=['experiment', 'topology','d','dhi','dlo','dlazy','dscore','dout','gossipFactor','interval'])
exps = exps.loc[exps["count"]>1000]
exps.to_csv('exp_filtered.csv')
exps.head(10)

Unnamed: 0,experiment,start,end,topology,parameter,d,dlo,dhi,dscore,dlazy,dout,gossipFactor,interval,startUnix,endUnix,count
2,152,2023-08-25 16:50:23.6899992,2023-08-25 17:20:23.6921991,unl,reference,8,6,12,4,8,2,0.25,1.0,1692982223,1692984023,17416
3,153,2023-08-25 17:23:57.7272734,2023-08-25 17:53:57.8279711,unl,interval,8,6,12,4,8,2,0.25,0.5,1692984237,1692986037,75673
4,154,2023-08-25 17:57:30.6350173,2023-08-25 18:27:30.7363305,unl,interval,8,6,12,4,8,2,0.25,0.5,1692986250,1692988050,153137
5,155,2023-08-25 18:31:03.6332371,2023-08-25 19:01:03.6935448,unl,interval,8,6,12,4,8,2,0.25,0.5,1692988263,1692990063,125112
6,156,2023-08-25 19:04:36.9266682,2023-08-25 19:34:36.9337235,unl,interval,8,6,12,4,8,2,0.25,30.0,1692990276,1692992076,215731
7,157,2023-08-25 19:38:10.1780791,2023-08-25 20:08:10.1796217,unl,interval,8,6,12,4,8,2,0.25,30.0,1692992290,1692994090,274279
8,158,2023-08-25 20:11:43.6270210,2023-08-25 20:41:43.6285805,unl,interval,8,6,12,4,8,2,0.25,30.0,1692994303,1692996103,232539
9,159,2023-08-25 20:45:16.8407778,2023-08-25 21:15:16.8427550,unl,interval,8,6,12,4,8,2,0.25,3.0,1692996316,1692998116,172720
10,160,2023-08-25 21:18:50.1678891,2023-08-25 21:48:50.1692797,unl,interval,8,6,12,4,8,2,0.25,3.0,1692998330,1693000130,195350
11,161,2023-08-25 21:52:23.8902604,2023-08-25 22:22:23.8915283,unl,interval,8,6,12,4,8,2,0.25,3.0,1693000343,1693002143,171780


In [8]:
# Target column layout:
#experiment,n_nodes,topology,d,dlo,dhi,dscore,dlazy,dout,gossipFactor,interval,start,end,overhead,propTime,bandwidth,totalMessages,totalrpc

# One row per distinct parameter configuration: strip the per-run columns,
# de-duplicate, and expose the label of the first run of each configuration
# as `identifier`.
per_run_columns = ['experiment', 'start', 'end', 'startUnix', 'endUnix', 'parameter', 'count']
final = (
    exps
    .drop(columns=per_run_columns)
    .drop_duplicates()
    .assign(n_nodes=23)
    .reset_index()
    .rename(columns={'index': 'identifier'})
)
final.head(10)
 

Unnamed: 0,identifier,topology,d,dlo,dhi,dscore,dlazy,dout,gossipFactor,interval,n_nodes
0,2,unl,8,6,12,4,8,2,0.25,1.0,23
1,3,unl,8,6,12,4,8,2,0.25,0.5,23
2,6,unl,8,6,12,4,8,2,0.25,30.0,23
3,9,unl,8,6,12,4,8,2,0.25,3.0,23
4,12,unl,24,6,12,4,8,2,0.25,1.0,23
5,15,unl,12,6,12,4,8,2,0.25,1.0,23
6,18,unl,6,6,12,4,8,2,0.25,1.0,23
7,21,unl,8,2,12,4,8,2,0.25,1.0,23
8,24,unl,8,3,12,4,8,2,0.25,1.0,23
9,27,unl,8,8,12,4,8,2,0.25,1.0,23


In [9]:
# final = final.loc[final["identifier"]<9]
# final.head(10)

In [11]:
#Bandwidth
# Columns that together identify one parameter configuration.
config_columns = ["topology", "d", "dlo", "dhi", "dscore", "dlazy", "dout", "gossipFactor", "interval"]

# Every per-run frame is collected here and concatenated once at the end;
# concatenating onto a growing `message` inside the loop re-copied all rows
# gathered so far on every iteration.
frames = []

for index, row in final.iterrows():
    # Select every run that used exactly this row's parameter configuration.
    same_config = pd.Series(True, index=exps.index)
    for column in config_columns:
        same_config &= exps[column] == row[column]
    execs = exps.loc[same_config]

    # The loop variable is `run`, not `exec`, so the builtin is not shadowed.
    for idx, run in execs.iterrows():
        query_mess = from_influx(url, token, org, "deliverMessage", run["startUnix"], run["endUnix"], "_measurement")
        query_rpc = from_influx(url, token, org, "recvRPC", run["startUnix"], run["endUnix"], "_measurement")

        # Tag both frames with the configuration and the run they belong to.
        # recvRPC rows are appended before deliverMessage rows, matching the
        # previous row order of `message`.
        for frame in (query_rpc, query_mess):
            frame["identifier"] = row["identifier"]
            frame["experiment"] = run["experiment"]
            frame["start"] = run["start"]
            frame["end"] = run["end"]
            frames.append(frame)

        # Drop the loop-local references; the frames stay alive through `frames`.
        del query_rpc
        del query_mess

# from_influx already returns `_time` as datetimes, so no further conversion is
# needed here.  pd.concat raises on an empty list, hence the guard.
message = pd.concat(frames) if frames else pd.DataFrame()
del frames
gc.collect()

message.head(10)


KeyboardInterrupt: 

In [46]:
# Resample to one-second buckets: count the rows that fall into each second of
# every experiment run.
dfNoIndex = message.reset_index()

one_second = pd.Grouper(key="_time", freq='1s')
group_keys = ['experiment', 'start', 'end', 'identifier', one_second]
by_time = dfNoIndex.groupby(group_keys)["_measurement"].count().reset_index()
dfAggTime = by_time.rename(columns={"_measurement": "count"})

# Make sure all three time columns are datetimes before doing arithmetic on them.
for time_column in ('_time', 'start', 'end'):
    dfAggTime[time_column] = pd.to_datetime(dfAggTime[time_column], format='mixed')

# Length of each run in (fractional) seconds.
run_length = dfAggTime["end"] - dfAggTime["start"]
dfAggTime["duration"] = run_length / pd.Timedelta(seconds=1)

dfAggTime.head(10)

Unnamed: 0,experiment,start,end,identifier,_time,count,duration
0,152,2023-08-25 16:50:23.689999200,2023-08-25 17:20:23.692199100,2,2023-08-25 17:04:39+00:00,1171,1800.0022
1,152,2023-08-25 16:50:23.689999200,2023-08-25 17:20:23.692199100,2,2023-08-25 17:04:40+00:00,390,1800.0022
2,152,2023-08-25 16:50:23.689999200,2023-08-25 17:20:23.692199100,2,2023-08-25 17:04:41+00:00,83,1800.0022
3,152,2023-08-25 16:50:23.689999200,2023-08-25 17:20:23.692199100,2,2023-08-25 17:04:44+00:00,54,1800.0022
4,152,2023-08-25 16:50:23.689999200,2023-08-25 17:20:23.692199100,2,2023-08-25 17:04:47+00:00,52,1800.0022
5,152,2023-08-25 16:50:23.689999200,2023-08-25 17:20:23.692199100,2,2023-08-25 17:04:50+00:00,108,1800.0022
6,152,2023-08-25 16:50:23.689999200,2023-08-25 17:20:23.692199100,2,2023-08-25 17:04:53+00:00,106,1800.0022
7,152,2023-08-25 16:50:23.689999200,2023-08-25 17:20:23.692199100,2,2023-08-25 17:04:56+00:00,106,1800.0022
8,152,2023-08-25 16:50:23.689999200,2023-08-25 17:20:23.692199100,2,2023-08-25 17:04:59+00:00,107,1800.0022
9,152,2023-08-25 16:50:23.689999200,2023-08-25 17:20:23.692199100,2,2023-08-25 17:05:02+00:00,108,1800.0022


In [47]:
# Show the last ten rows of the per-second counts.
dfAggTime.tail(10)

Unnamed: 0,experiment,start,end,identifier,_time,count,duration
181822,300,2023-08-31 17:06:48.914120100,2023-08-31 17:36:48.916668,2,2023-08-31 17:36:38+00:00,237,1800.002548
181823,300,2023-08-31 17:06:48.914120100,2023-08-31 17:36:48.916668,2,2023-08-31 17:36:39+00:00,366,1800.002548
181824,300,2023-08-31 17:06:48.914120100,2023-08-31 17:36:48.916668,2,2023-08-31 17:36:40+00:00,356,1800.002548
181825,300,2023-08-31 17:06:48.914120100,2023-08-31 17:36:48.916668,2,2023-08-31 17:36:41+00:00,248,1800.002548
181826,300,2023-08-31 17:06:48.914120100,2023-08-31 17:36:48.916668,2,2023-08-31 17:36:42+00:00,382,1800.002548
181827,300,2023-08-31 17:06:48.914120100,2023-08-31 17:36:48.916668,2,2023-08-31 17:36:43+00:00,356,1800.002548
181828,300,2023-08-31 17:06:48.914120100,2023-08-31 17:36:48.916668,2,2023-08-31 17:36:44+00:00,265,1800.002548
181829,300,2023-08-31 17:06:48.914120100,2023-08-31 17:36:48.916668,2,2023-08-31 17:36:45+00:00,376,1800.002548
181830,300,2023-08-31 17:06:48.914120100,2023-08-31 17:36:48.916668,2,2023-08-31 17:36:46+00:00,355,1800.002548
181831,300,2023-08-31 17:06:48.914120100,2023-08-31 17:36:48.916668,2,2023-08-31 17:36:47+00:00,265,1800.002548


In [48]:
# Total the per-second counts of every run, then divide by the run's duration
# to get its average number of rows per second.
df = dfAggTime.drop(columns=["start", "end", "_time"])

run_totals = df.groupby(['experiment', 'identifier', 'duration']).sum().reset_index()
avgPropExp = run_totals.assign(mean=run_totals['count'] / run_totals['duration'])

avgPropExp.head(10)

Unnamed: 0,experiment,identifier,duration,count,mean
0,152,2,1800.0022,48857,27.142745
1,153,3,1800.100698,426586,236.978965
2,154,3,1800.101313,965473,536.343701
3,155,3,1800.060308,841747,467.621555
4,156,6,1800.007055,1622959,901.640355
5,157,6,1800.001543,2189148,1216.192291
6,158,6,1800.001559,1895668,1053.147976
7,159,9,1800.001977,1237109,687.282023
8,160,9,1800.001391,1341499,745.276646
9,161,9,1800.001268,1442626,801.458324


In [49]:
# Collapse the per-run rates to one row per configuration: the mean and the
# standard deviation of the rate across that configuration's runs.
avgProp = avgPropExp.drop(columns=['experiment', 'duration', 'count'])
avgBand = avgProp.groupby('identifier')['mean'].agg(['mean', 'std']).reset_index()

avgBand.head(10)

Unnamed: 0,identifier,mean,std
0,2,349.473952,455.845166
1,3,413.648073,156.810917
2,6,1056.993541,157.311225
3,9,744.672331,57.09055
4,12,824.463965,510.581462
5,15,961.139534,160.53804
6,18,621.270477,29.856764
7,21,604.560808,396.792143
8,24,360.85954,263.001297
9,27,287.544853,267.921622


In [50]:
# Attach the mean rate to each configuration as `bandwidth`, discard the
# standard deviation, and persist the table.
finalBandwidth = (
    final
    .merge(avgBand, on='identifier')
    .drop(columns=['std'])
    .rename(columns={'mean': 'bandwidth'})
)

finalBandwidth.to_csv('bandwidth.csv')
finalBandwidth.head(10)

Unnamed: 0,identifier,topology,d,dlo,dhi,dscore,dlazy,dout,gossipFactor,interval,n_nodes,bandwidth
0,2,unl,8,6,12,4,8,2,0.25,1.0,23,349.473952
1,3,unl,8,6,12,4,8,2,0.25,0.5,23,413.648073
2,6,unl,8,6,12,4,8,2,0.25,30.0,23,1056.993541
3,9,unl,8,6,12,4,8,2,0.25,3.0,23,744.672331
4,12,unl,24,6,12,4,8,2,0.25,1.0,23,824.463965
5,15,unl,12,6,12,4,8,2,0.25,1.0,23,961.139534
6,18,unl,6,6,12,4,8,2,0.25,1.0,23,621.270477
7,21,unl,8,2,12,4,8,2,0.25,1.0,23,604.560808
8,24,unl,8,3,12,4,8,2,0.25,1.0,23,360.85954
9,27,unl,8,8,12,4,8,2,0.25,1.0,23,287.544853
