In [1]:
import pandas as pd
import numpy as np

from influxdb_client import InfluxDBClient, Point, Dialect

import re
import time
import datetime

import warnings
from influxdb_client.client.warnings import MissingPivotFunction

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.colors as colors

import pandasql as ps
import sqlite3

import csv


pd.set_option('display.max_rows', 100)

In [2]:
def _parse_line(line):

    rx_dict = {
    'token': re.compile(r'var token = "(?P<token>.*)"\n'),
    'url': re.compile(r'var url = "(?P<url>.*)"\n'),
    'org': re.compile(r'var org = "(?P<org>.*)"\n'),
    'bucket': re.compile(r'var bucket = "(?P<bucket>.*)"\n'),
    }   

    """
    Do a regex search against all defined regexes and
    return the key and match result of the first matching regex

    """
    for key, rx in rx_dict.items():
        match = rx.search(line)
        if match:
            return key, match
    # if there are no matches
    return None, None

filepath = '/root/flexi-pipe/config.go'
# open the file and read through it line by line
with open(filepath, 'r') as file_object:
    line = file_object.readline()
    while line:
        # at each line check for a match with a regex
        key, match = _parse_line(line)

        if key == 'token':
            token = match.group('token')
        elif key == 'url':
            url = match.group('url')
        elif key == 'org':
            org = match.group('org')
        elif key == 'bucket':
            bucket = match.group('bucket')
        
        line = file_object.readline()
# url="http://192.168.20.58:8086"
url = "http://localhost:8086"

In [3]:
start_time = 1692622102
end_time = 1692772911

In [4]:
def experiment(start_time, end_time, filepath):
    # Retrieve experiments data from csv
    data = pd.read_csv(filepath, header=None)
    df = pd.DataFrame(data)

    #Rename columns
    experiments = df.rename(columns={0: "start", 1: "end", 2: "topology", 3: "runtime", 4: "parameter", 5: "d", 6: "dlo", 7: "dhi", 8: "dscore", 9: "dlazy", 10: "dout", 11: "gossipFactor", 12: "initialDelay", 13: "interval"}, errors='raise')

    #Correct timestamp
    experiments["start"] = experiments["start"].str.slice(0, 27)
    experiments["end"] = experiments["end"].str.slice(0, 27)

    #String to timestamp
    # experiments['startUnix'] = pd.to_datetime(experiments["start"],format="%Y-%m-%d %H:%M:%S.%f").astype('int64') / 10**9
    # experiments['endUnix'] = pd.to_datetime(experiments["end"],format="%Y-%m-%d %H:%M:%S.%f").astype('int64') / 10**9
    experiments['startUnix'] = pd.to_datetime(experiments["start"],format="mixed").astype('int64') / 10**9
    experiments['endUnix'] = pd.to_datetime(experiments["end"],format="mixed").astype('int64') / 10**9


    experiments['startUnix'] = pd.to_timedelta(experiments['startUnix'], unit='s').dt.total_seconds().astype(int)#.astype(str)
    experiments['endUnix'] = pd.to_timedelta(experiments['endUnix'], unit='s').dt.total_seconds().astype(int)#.astype(str)

    #Drop fields we don't mneed for the moment
    exp = experiments.drop(columns=["runtime", "initialDelay"]).sort_values(by=["start"])

    #Get times for different intervals
    # intervals = exp["interval"].drop_duplicates().sort_values().reset_index(drop=True)
    # intervals.head(10)

    expTime = exp[exp['startUnix'].astype(int).between(start_time, end_time)]
    # expTime['experiment'] = expTime.index
    expTime = expTime.reset_index().rename({'index':'experiment'}, axis = 'columns')

    return expTime

experiments = experiment(start_time, end_time, '../experiments.csv')
experiments.head(40)

Unnamed: 0,experiment,start,end,topology,parameter,d,dlo,dhi,dscore,dlazy,dout,gossipFactor,interval,startUnix,endUnix
0,0,2023-08-21 12:48:22.3741823,2023-08-21 13:18:22.3952804,unl,reference,8,6,12,4,8,2,0.25,1.0,1692622102,1692623902
1,1,2023-08-21 13:21:55.6321388,2023-08-21 13:51:55.6334369,unl,reference,8,6,12,4,8,2,0.25,1.0,1692624115,1692625915
2,2,2023-08-21 13:55:28.9783578,2023-08-21 14:25:28.9820206,unl,reference,8,6,12,4,8,2,0.25,1.0,1692626128,1692627928
3,3,2023-08-21 14:29:02.5866046,2023-08-21 14:59:02.6871797,unl,interval,8,6,12,4,8,2,0.25,0.5,1692628142,1692629942
4,4,2023-08-21 15:02:35.9708968,2023-08-21 15:32:36.0272486,unl,interval,8,6,12,4,8,2,0.25,0.5,1692630155,1692631956
5,5,2023-08-21 15:36:09.3380555,2023-08-21 16:06:09.3391894,unl,interval,8,6,12,4,8,2,0.25,0.5,1692632169,1692633969
6,6,2023-08-21 16:09:43.1463718,2023-08-21 16:39:43.1814633,unl,interval,8,6,12,4,8,2,0.25,30.0,1692634183,1692635983
7,7,2023-08-21 16:43:16.7045785,2023-08-21 17:13:16.7785160,unl,interval,8,6,12,4,8,2,0.25,30.0,1692636196,1692637996
8,8,2023-08-21 17:16:50.1424767,2023-08-21 17:46:50.1924402,unl,interval,8,6,12,4,8,2,0.25,30.0,1692638210,1692640010
9,9,2023-08-21 17:50:23.0310174,2023-08-21 18:20:23.0333186,unl,interval,8,6,12,4,8,2,0.25,3.0,1692640223,1692642023


In [5]:
def from_influx(url, token, org, measurement, start_time, end_time,grouping_key):
    client = InfluxDBClient(url=url, token=token, org=org,  timeout=900_000)

    # write_api = client.write_api(write_options=SYNCHRONOUS)
    query_api = client.query_api()

    data_frame = query_api.query_data_frame('from(bucket: "gs") '
                                        ' |> range(start: '+str(start_time)+', stop:'+str(end_time)+') '
                                        ' |> filter(fn: (r) => r._measurement == "'+measurement+'") '
                                        ' |> group(columns: ["_measurement", "_field"], mode: "by") '
                                        ' |> pivot(rowKey:["_time"], columnKey: ["_field"], valueColumn: "_value")')
    client.close()

    # df = data_frame.drop(columns=['result', 'table','_start', '_stop', '_measurement', 'topic', 'receivedFrom']).sort_values(by=["_time"]).reset_index(drop=True)
    data_frame.reset_index(inplace=True)
    df = data_frame[['_time', grouping_key]].sort_values(by=["_time"]).reset_index(drop=True)
    df["_time"] = pd.to_datetime(df["_time"])

    return df

In [6]:
ref = experiments.loc[experiments["parameter"] == "reference"]
start_reference = ref["startUnix"].min().astype(int)
end_reference = ref["endUnix"].max().astype(int)

reference 		= from_influx(url, token, org, "deliverMessage", start_reference, end_reference, "_measurement")
reference_rpc 	= from_influx(url, token, org, "recvRPC", start_reference, end_reference, "_measurement")
reference.head(10)

Unnamed: 0,_time,_measurement
0,2023-08-21 14:09:45.838643+00:00,deliverMessage
1,2023-08-21 14:09:45.838787+00:00,deliverMessage
2,2023-08-21 14:09:45.839210+00:00,deliverMessage
3,2023-08-21 14:09:45.839463+00:00,deliverMessage
4,2023-08-21 14:09:45.839751+00:00,deliverMessage
5,2023-08-21 14:09:45.839764+00:00,deliverMessage
6,2023-08-21 14:09:45.840053+00:00,deliverMessage
7,2023-08-21 14:09:45.840195+00:00,deliverMessage
8,2023-08-21 14:09:45.840359+00:00,deliverMessage
9,2023-08-21 14:09:45.840382+00:00,deliverMessage


In [7]:
reference_rpc.head(10)

Unnamed: 0,_time,_measurement
0,2023-08-21 13:02:38.480207+00:00,recvRPC
1,2023-08-21 13:02:38.480868+00:00,recvRPC
2,2023-08-21 13:02:38.489501+00:00,recvRPC
3,2023-08-21 13:02:38.490327+00:00,recvRPC
4,2023-08-21 13:02:38.490556+00:00,recvRPC
5,2023-08-21 13:02:38.491121+00:00,recvRPC
6,2023-08-21 13:02:38.506134+00:00,recvRPC
7,2023-08-21 13:02:38.506757+00:00,recvRPC
8,2023-08-21 13:02:38.506913+00:00,recvRPC
9,2023-08-21 13:02:38.506925+00:00,recvRPC


In [8]:
par = experiments.loc[experiments["parameter"] == "d"]
start_query = par["startUnix"].min().astype(int)
end_query = par["endUnix"].max().astype(int)

rpc  		= from_influx(url, token, org, "recvRPC", start_query, end_query, '_measurement')
received 	= from_influx(url, token, org, "deliverMessage",start_query, end_query, '_measurement')

rpc.head(10)
# received.head(10)

Unnamed: 0,_time,_measurement
0,2023-08-21 19:45:18.202280+00:00,recvRPC
1,2023-08-21 19:45:18.202973+00:00,recvRPC
2,2023-08-21 19:45:18.225958+00:00,recvRPC
3,2023-08-21 19:45:18.227187+00:00,recvRPC
4,2023-08-21 19:45:18.227602+00:00,recvRPC
5,2023-08-21 19:45:18.230511+00:00,recvRPC
6,2023-08-21 19:45:18.236934+00:00,recvRPC
7,2023-08-21 19:45:18.237525+00:00,recvRPC
8,2023-08-21 19:45:18.237893+00:00,recvRPC
9,2023-08-21 19:45:18.237954+00:00,recvRPC


In [9]:
received.head(10)

Unnamed: 0,_time,_measurement
0,2023-08-21 19:45:22.353238+00:00,deliverMessage
1,2023-08-21 19:45:22.353271+00:00,deliverMessage
2,2023-08-21 19:45:22.353590+00:00,deliverMessage
3,2023-08-21 19:45:22.353753+00:00,deliverMessage
4,2023-08-21 19:45:22.354024+00:00,deliverMessage
5,2023-08-21 19:45:22.354079+00:00,deliverMessage
6,2023-08-21 19:45:22.354127+00:00,deliverMessage
7,2023-08-21 19:45:22.354324+00:00,deliverMessage
8,2023-08-21 19:45:22.354398+00:00,deliverMessage
9,2023-08-21 19:45:22.354477+00:00,deliverMessage


In [10]:
exp = experiments.loc[experiments['topology'] == "unl"]
exp = exp.loc[exp['parameter'] == "d"]
exp = pd.concat([exp, ref])

received 	= pd.concat([received, reference])
rpc 		= pd.concat([rpc, reference_rpc])

# df = calcBandwidth(received, rpc, exp, graph['parameter'])
rpc.head(10)

Unnamed: 0,_time,_measurement
0,2023-08-21 19:45:18.202280+00:00,recvRPC
1,2023-08-21 19:45:18.202973+00:00,recvRPC
2,2023-08-21 19:45:18.225958+00:00,recvRPC
3,2023-08-21 19:45:18.227187+00:00,recvRPC
4,2023-08-21 19:45:18.227602+00:00,recvRPC
5,2023-08-21 19:45:18.230511+00:00,recvRPC
6,2023-08-21 19:45:18.236934+00:00,recvRPC
7,2023-08-21 19:45:18.237525+00:00,recvRPC
8,2023-08-21 19:45:18.237893+00:00,recvRPC
9,2023-08-21 19:45:18.237954+00:00,recvRPC


In [11]:
received.head(10)

Unnamed: 0,_time,_measurement
0,2023-08-21 19:45:22.353238+00:00,deliverMessage
1,2023-08-21 19:45:22.353271+00:00,deliverMessage
2,2023-08-21 19:45:22.353590+00:00,deliverMessage
3,2023-08-21 19:45:22.353753+00:00,deliverMessage
4,2023-08-21 19:45:22.354024+00:00,deliverMessage
5,2023-08-21 19:45:22.354079+00:00,deliverMessage
6,2023-08-21 19:45:22.354127+00:00,deliverMessage
7,2023-08-21 19:45:22.354324+00:00,deliverMessage
8,2023-08-21 19:45:22.354398+00:00,deliverMessage
9,2023-08-21 19:45:22.354477+00:00,deliverMessage


In [12]:
# def calcBandwidth(message, rpc, expTime, parameter):
message = received
rpc = rpc
expTime = exp 
parameter = "d"

message = message[['_time', '_measurement']].reset_index(drop=True)
rpc = rpc[['_time', '_measurement']].reset_index(drop=True)

joined = pd.concat([rpc, message])#on=['receivedFrom', 'topic'])
joined["_time"] = pd.to_datetime(joined["_time"])

joined.tail(100)

Unnamed: 0,_time,_measurement
879447,2023-08-21 14:16:42.274595+00:00,deliverMessage
879448,2023-08-21 14:16:42.274658+00:00,deliverMessage
879449,2023-08-21 14:16:42.274725+00:00,deliverMessage
879450,2023-08-21 14:16:42.275045+00:00,deliverMessage
879451,2023-08-21 14:17:02.181425+00:00,deliverMessage
879452,2023-08-21 14:17:02.181562+00:00,deliverMessage
879453,2023-08-21 14:17:02.181787+00:00,deliverMessage
879454,2023-08-21 14:17:02.182546+00:00,deliverMessage
879455,2023-08-21 14:17:02.243481+00:00,deliverMessage
879456,2023-08-21 14:17:02.243885+00:00,deliverMessage


In [13]:
#Make the db in memory
conn = sqlite3.connect(':memory:')
#write the tables
joined.to_sql('df', conn, index=False)
expTime.to_sql('expTime', conn, index=False)

qry = '''
            select  
                df._time,
                expTime.start as min,
                expTime.end as max,
                expTime.experiment,
                expTime.'''+parameter+''',
                df._measurement
            from
                df join expTime on
                df._time between expTime.start and expTime.end
	    '''
dfNew = pd.read_sql_query(qry, conn)
dfNew = dfNew.set_index('experiment')#.rename(columns={"_time": "min"})#.drop(columns=["messageID"])

#dfNew['min'] = 
dfNew['min'] = pd.to_datetime(dfNew["min"], format='mixed')
dfNew['max'] = pd.to_datetime(dfNew["max"], format='mixed')
dfNew['_time'] = pd.to_datetime(dfNew["_time"], format='mixed')

dfNew.head(100)

Unnamed: 0_level_0,_time,min,max,d,_measurement
experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
12,2023-08-21 19:45:18.202280+00:00,2023-08-21 19:31:02.923742,2023-08-21 20:01:02.925166800,24,recvRPC
12,2023-08-21 19:45:18.202973+00:00,2023-08-21 19:31:02.923742,2023-08-21 20:01:02.925166800,24,recvRPC
12,2023-08-21 19:45:18.225958+00:00,2023-08-21 19:31:02.923742,2023-08-21 20:01:02.925166800,24,recvRPC
12,2023-08-21 19:45:18.227187+00:00,2023-08-21 19:31:02.923742,2023-08-21 20:01:02.925166800,24,recvRPC
12,2023-08-21 19:45:18.227602+00:00,2023-08-21 19:31:02.923742,2023-08-21 20:01:02.925166800,24,recvRPC
12,2023-08-21 19:45:18.230511+00:00,2023-08-21 19:31:02.923742,2023-08-21 20:01:02.925166800,24,recvRPC
12,2023-08-21 19:45:18.236934+00:00,2023-08-21 19:31:02.923742,2023-08-21 20:01:02.925166800,24,recvRPC
12,2023-08-21 19:45:18.237525+00:00,2023-08-21 19:31:02.923742,2023-08-21 20:01:02.925166800,24,recvRPC
12,2023-08-21 19:45:18.237893+00:00,2023-08-21 19:31:02.923742,2023-08-21 20:01:02.925166800,24,recvRPC
12,2023-08-21 19:45:18.237954+00:00,2023-08-21 19:31:02.923742,2023-08-21 20:01:02.925166800,24,recvRPC


In [14]:
#Try resampling for every seconds
dfNoIndex = dfNew.reset_index()
# dfNoIndex.head(10)

by_time = dfNoIndex.groupby([dfNoIndex['experiment'],dfNoIndex[parameter],dfNoIndex["min"],dfNoIndex["max"],pd.Grouper(key="_time", freq='1s')])["_measurement"].count().reset_index()
dfAggTime = by_time.rename(columns={"_measurement": "count"})

date_list = pd.date_range(dfAggTime['min'].min(), dfAggTime['max'].max(),freq='1s',tz=None)

dates = pd.DataFrame(date_list).rename(columns={0:"_time"})
dates['count'] = 0

dates['_time'] = pd.to_datetime(dates["_time"], format='mixed')#.tz_localize(None)
dfAggTime['_time'] = pd.to_datetime(dfAggTime["_time"], format='mixed')#.tz_localize(None)
dfAggTime['min'] = pd.to_datetime(dfAggTime["min"], format='mixed')#.tz_localize(None)
dfAggTime['max'] = pd.to_datetime(dfAggTime["max"], format='mixed')#.tz_localize(None)

dates["_time"] = dates["_time"].dt.tz_localize(None)
dfAggTime["_time"] = dfAggTime["_time"].dt.tz_localize(None)
dfAggTime["min"] = dfAggTime["min"].dt.tz_localize(None)
dfAggTime["max"] = dfAggTime["max"].dt.tz_localize(None)

dfAggTime.head(10)

Unnamed: 0,experiment,d,min,max,_time,count
0,0,8,2023-08-21 12:48:22.374182300,2023-08-21 13:18:22.395280400,2023-08-21 13:02:38,246
1,1,8,2023-08-21 13:21:55.632138800,2023-08-21 13:51:55.633436900,2023-08-21 13:36:11,88
2,1,8,2023-08-21 13:21:55.632138800,2023-08-21 13:51:55.633436900,2023-08-21 13:36:12,140
3,2,8,2023-08-21 13:55:28.978357800,2023-08-21 14:25:28.982020600,2023-08-21 14:09:45,886
4,2,8,2023-08-21 13:55:28.978357800,2023-08-21 14:25:28.982020600,2023-08-21 14:09:46,1842
5,2,8,2023-08-21 13:55:28.978357800,2023-08-21 14:25:28.982020600,2023-08-21 14:10:02,129
6,2,8,2023-08-21 13:55:28.978357800,2023-08-21 14:25:28.982020600,2023-08-21 14:10:15,1
7,2,8,2023-08-21 13:55:28.978357800,2023-08-21 14:25:28.982020600,2023-08-21 14:10:22,26
8,2,8,2023-08-21 13:55:28.978357800,2023-08-21 14:25:28.982020600,2023-08-21 14:10:42,26
9,2,8,2023-08-21 13:55:28.978357800,2023-08-21 14:25:28.982020600,2023-08-21 14:11:02,26


In [15]:
 #write the tables
dfAggTime.to_sql('aggTime', conn, index=False)
dates.to_sql('dates', conn, index=False)

qry = '''
        select distinct
            dates._time as _time,
            aggTime.min,
            aggTime.max,
            aggTime.'''+parameter+''',
            aggTime.experiment,
            dates.count
        from
            dates join aggTime on
            dates._time between aggTime.min and aggTime.max
        '''
dfFill = pd.read_sql_query(qry, conn)

dfFill['_time'] = pd.to_datetime(dfFill["_time"], format='mixed')#.tz_localize(None)
dfAggTime['_time'] = pd.to_datetime(dfAggTime["_time"], format='mixed')#.tz_localize(None)

dfFill["_time"] = dfFill["_time"].dt.tz_localize(None)
dfAggTime["_time"] = dfAggTime["_time"].dt.tz_localize(None)

#write the tables
dfFill.to_sql('fill', conn, index=False)
dfAggTime.to_sql('agg', conn, index=False)

qry = '''
        select distinct
           experiment,
           '''+parameter+''',
           _time,
           count
        from fill
        where fill._time not in (SELECT DISTINCT _time FROM agg)
        '''
dfMissingTime = pd.read_sql_query(qry, conn).reset_index(drop=True).drop_duplicates()
# dfNew['min'] = pd.to_datetime(dfNew["min"], format='mixed')

df = pd.concat([dfMissingTime.reset_index(drop=True), dfAggTime.drop(columns=['min','max']).reset_index(drop=True)])#.sort_values(by=['_time'])
df["_time"] = pd.to_datetime(df["_time"], format='mixed')
df = df.sort_values(by=['_time']).drop_duplicates()
# df = dfAggTime.drop(columns=['min'])
df.head(100)

Unnamed: 0,experiment,d,_time,count
0,0,8,2023-08-21 12:48:22.374182,0
1,0,8,2023-08-21 12:48:23.374182,0
2,0,8,2023-08-21 12:48:24.374182,0
3,0,8,2023-08-21 12:48:25.374182,0
4,0,8,2023-08-21 12:48:26.374182,0
5,0,8,2023-08-21 12:48:27.374182,0
6,0,8,2023-08-21 12:48:28.374182,0
7,0,8,2023-08-21 12:48:29.374182,0
8,0,8,2023-08-21 12:48:30.374182,0
9,0,8,2023-08-21 12:48:31.374182,0


In [16]:
avgPropExp = df.groupby(['experiment']).agg('mean')
avgPropExp.reset_index(inplace=True)
avgPropExp = avgPropExp.drop(columns=['experiment'])

# avgPropExp.head(10)
# print(avgPropExp)

avgProp = avgPropExp.groupby([parameter]).agg({'count':['mean','std']})
avgProp.columns = avgProp.columns.droplevel(0)
avgProp.reset_index(inplace=True)

avgProp.head(100)

Unnamed: 0,d,mean,std
0,6.0,374.769478,120.335008
1,8.0,0.73033,1.03718
2,12.0,382.578775,40.892249
3,24.0,289.955793,91.492883
