# Evaluation Of Duet Benchmarking Results from Distributed Database "Cassandra"

### Imports

In [34]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
import os
from pathlib import Path

### Import Benchmarking Measurements as csv file and create Pandas Dataframe for further Analysis

In [35]:
cwd = Path.cwd()
filepath = os.path.join(cwd, "Results", "10_000ops_lokal", "load_measurements_10000.csv")
print(filepath)
df = pd.read_csv(filepath)

C:\Users\Felix Medicus\Desktop\Thesis_MCC\DuetBenchmarking\Results
C:\Users\Felix Medicus\Desktop\Thesis_MCC\DuetBenchmarking\Results\Results\10_000ops_lokal\load_measurements_10000.csv


### Insert latencies in ms into the measurement table

In [36]:
df["latency_ms"] = df["received"]- df["sent"]

### Restructure Timeline

In [37]:
firstSent = df["sent"].min()
df["sent_sec"] = (df["sent"] - firstSent)/1000

### Create Function to prune Measurements

In [38]:
def pruneMeasurements(dataframe): 
    print("Hello from the other side")
    
pruneMeasurements(df)


Hello from the other side


### Create subsets of dataframe (for Version A and B, and for each worker for Version A and B) 

In [39]:
dfA1 = df.loc[(df["workerId"] == "w1-vA")]
dfA2 = df.loc[(df["workerId"] == "w2-vA")]
dfA3 = df.loc[(df["workerId"] == "w3-vA")]
dfB1 = df.loc[(df["workerId"] == "w1-vB")]
dfB2 = df.loc[(df["workerId"] == "w2-vB")]
dfB3 = df.loc[(df["workerId"] == "w3-vB")]
dfA = df.loc[(df["workerId"] == "w1-vA") | (df["workerId"] == "w2-vA") | (df["workerId"] == "w3-vA")]
dfB = df.loc[(df["workerId"] == "w1-vB") | (df["workerId"] == "w2-vB") | (df["workerId"] == "w3-vB")]

### Return benchmark run time (First request send to last response received)

In [40]:
totalTime = df["received"].max() - df["sent"].min()
print(totalTime/1000/60, "minuten")

1.6583833333333333 minuten


In [41]:
dfA.describe()

Unnamed: 0,sent,received,latency_ms,sent_sec
count,10000.0,10000.0,10000.0,10000.0
mean,1657968000000.0,1657968000000.0,29.6772,49.610312
std,28546.19,28546.08,5.263437,28.546194
min,1657968000000.0,1657968000000.0,25.0,0.0
25%,1657968000000.0,1657968000000.0,28.0,24.9385
50%,1657968000000.0,1657968000000.0,28.0,49.5825
75%,1657968000000.0,1657968000000.0,29.0,74.32125
max,1657968000000.0,1657968000000.0,82.0,99.076


In [42]:
dfB.describe()

Unnamed: 0,sent,received,latency_ms,sent_sec
count,10000.0,10000.0,10000.0,10000.0
mean,1657968000000.0,1657968000000.0,29.7348,49.657313
std,28610.14,28610.08,5.24479,28.610143
min,1657968000000.0,1657968000000.0,25.0,0.0
25%,1657968000000.0,1657968000000.0,28.0,25.00825
50%,1657968000000.0,1657968000000.0,28.0,49.634
75%,1657968000000.0,1657968000000.0,29.0,74.4065
max,1657968000000.0,1657968000000.0,102.0,99.474


### Min. Latencies for Worker 2 (A & B)

In [43]:
dfA2.loc[dfA2["latency_ms"].idxmin()]

ValueError: attempt to get argmin of an empty sequence

In [None]:
dfB2.loc[dfB2["latency_ms"].idxmin()]

### Some checks

#### Time difference in ms when first query was sent by worker-1 to Version A and B 

In [None]:

dfA2["sent"].min() - dfB2["sent"].min()

#### Time difference in ms when last query was sent by worker-1 to Version A and B 

In [None]:
dfA1["sent"].max() - dfB1["sent"].max() 


## Create Figures 

### Create Boxplots

In [None]:
sns.set_theme(style="darkgrid")

In [None]:
ax = plt.boxplot(x=dfA["latency_ms"])


### Create Flowchart

In [None]:
sns.lmplot(x='sent_sec', y='latency_ms', data=dfA, fit_reg=False)