# NFVSDN'17 paper RAW data cleanup

Prepares the raw data from the NFVSDN'17 measurements so that it can be used as input for an nfv-t-cp pmodel.

In [7]:
import pandas as pd

# Single-machine results (ab throughput, 1GB).
# NOTE(review): pd.read_pickle unpickles arbitrary objects, so these files
# must come from a trusted source.

# Runs whose file name lists a single VNF.
D_tp_nginx = pd.read_pickle(
    "data/nfvsdn17rawdata/singlenode/170408-1120-tp-nginxlb.pkl"
)
D_tp_socat = pd.read_pickle(
    "data/nfvsdn17rawdata/singlenode/170408-1238-tp-socat.pkl"
)
# The "redir" variables hold the squid measurements.
D_tp_redir = pd.read_pickle(
    "data/nfvsdn17rawdata/singlenode/170408-1355-tp-squid.pkl"
)

# Runs whose file name lists three VNFs, one file per ordering.
D_tp_nginx_socat_redir = pd.read_pickle(
    "data/nfvsdn17rawdata/singlenode/170409-1533-tp-nginxlb-socat-squid.pkl"
)
D_tp_socat_redir_nginx = pd.read_pickle(
    "data/nfvsdn17rawdata/singlenode/170410-1712-tp-socat-squid-nginxlb.pkl"
)
D_tp_redir_nginx_socat = pd.read_pickle(
    "data/nfvsdn17rawdata/singlenode/170412-2153-tp-squid-nginxlb-socat.pkl"
)

# Multi-machine (maxinet) results (ab throughput, 1GB).
# NOTE(review): pd.read_pickle unpickles arbitrary objects, so these files
# must come from a trusted source.

# Runs whose file name lists a single VNF.
D_tp_nginx_maxi = pd.read_pickle(
    "data/nfvsdn17rawdata/multinode/170412-1737-tp-nginxlb-maxinet.pkl"
)
D_tp_socat_maxi = pd.read_pickle(
    "data/nfvsdn17rawdata/multinode/170412-1856-tp-socat-maxinet.pkl"
)
# The "redir" variables hold the squid measurements.
D_tp_redir_maxi = pd.read_pickle(
    "data/nfvsdn17rawdata/multinode/170412-2015-tp-squid-maxinet.pkl"
)

# Runs whose file name lists three VNFs, one file per ordering.
D_tp_nginx_socat_redir_maxi = pd.read_pickle(
    "data/nfvsdn17rawdata/multinode/170413-2115-tp-nginxlb-socat-squid-maxinet.pkl"
)
D_tp_socat_redir_nginx_maxi = pd.read_pickle(
    "data/nfvsdn17rawdata/multinode/170414-0829-tp-socat-squid-nginxlb-maxinet.pkl"
)
D_tp_redir_nginx_socat_maxi = pd.read_pickle(
    "data/nfvsdn17rawdata/multinode/170414-1951-tp-squid-nginxlb-socat-maxinet.pkl"
)

In [8]:
# Show every column of the single-node nginxlb-socat-squid frame whose name
# contains "param_", one name per line.
param_columns = [
    name for name in D_tp_nginx_socat_redir.columns.values if "param_" in name
]
for name in param_columns:
    print(name)

param_duration
param_in:addr_eth0
param_in:cpu_bw
param_in:cpu_cores
param_in:enabled
param_in:environment
param_in:image
param_in:mem
param_name
param_out:addr_eth0
param_out:cpu_bw
param_out:cpu_cores
param_out:enabled
param_out:environment
param_out:image
param_out:mem
param_repetition
param_run_id
param_vnf1:addr_input
param_vnf1:addr_output
param_vnf1:cpu_bw
param_vnf1:cpu_cores
param_vnf1:environment
param_vnf1:image
param_vnf1:mem
param_vnf2:addr_input
param_vnf2:addr_output
param_vnf2:cpu_bw
param_vnf2:cpu_cores
param_vnf2:environment
param_vnf2:image
param_vnf2:mem
param_vnf3:addr_input
param_vnf3:addr_output
param_vnf3:cpu_bw
param_vnf3:cpu_cores
param_vnf3:environment
param_vnf3:image
param_vnf3:mem


## Cleanup and build a single DF with all data

Keep only the needed columns and rename them to simplify later use.
Combine the dataframes into a single one; the `topology` field makes it possible to select the rows of each original dataframe.

In [15]:
def cleanup_df(in_df):
    """Reduce a raw measurement frame to the renamed columns of interest.

    A copy of the selected columns is taken, so ``in_df`` itself is not
    modified. The values of the resulting ``topology`` column are shortened
    by a fixed sequence of substring substitutions.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Raw frame; it must contain every key of ``column_names`` below,
        otherwise the column selection raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns ``topology``, ``repetition_id``,
        ``duration``, ``vnf1cpu``, ``vnf2cpu``, ``vnf3cpu`` and
        ``throughput_kbyte_per_second`` (in that order).
    """
    # Raw column name -> simplified name; dict order fixes the column order.
    # "param_run_id" is currently not carried over.
    column_names = {
        "param_name": "topology",
        "param_repetition": "repetition_id",
        "param_duration": "duration",
        "param_vnf1:cpu_bw": "vnf1cpu",
        "param_vnf2:cpu_bw": "vnf2cpu",
        "param_vnf3:cpu_bw": "vnf3cpu",
        "in_ab_transfer_rate_kbyte_per_second": "throughput_kbyte_per_second",
    }
    # (old, new) substitutions applied to the topology label, in this order.
    topology_substitutions = [
        ("tp-", ""),
        ("nginxlb", "nx"),
        ("socat", "sc"),
        ("squid", "sq"),
        ("maxinet", "phys-isolated"),
    ]

    selected = in_df[list(column_names)].copy()
    out_df = selected.rename(columns=column_names)
    for old, new in topology_substitutions:
        out_df["topology"] = out_df["topology"].str.replace(old, new)
    return out_df

# Clean each three-VNF frame (single-node first, then maxinet) and stack them
# into one dataframe. The row labels of the source frames are kept as they
# are, because no index reset is requested from pd.concat.
raw_chain_frames = [
    D_tp_nginx_socat_redir,
    D_tp_socat_redir_nginx,
    D_tp_redir_nginx_socat,
    D_tp_nginx_socat_redir_maxi,
    D_tp_socat_redir_nginx_maxi,
    D_tp_redir_nginx_socat_maxi,
]
df = pd.concat([cleanup_df(frame) for frame in raw_chain_frames])
df

Unnamed: 0,topology,repetition_id,duration,vnf1cpu,vnf2cpu,vnf3cpu,throughput_kbyte_per_second
0,nx-sc-sq,3,60,0.16,0.64,0.64,134028.40
1,nx-sc-sq,6,60,0.16,0.64,1.00,131123.94
2,nx-sc-sq,0,60,0.64,0.16,1.00,71830.73
3,nx-sc-sq,4,60,1.00,0.16,0.32,70274.05
4,nx-sc-sq,1,60,0.32,0.64,0.64,289443.26
5,nx-sc-sq,2,60,0.16,0.16,0.64,74075.87
6,nx-sc-sq,7,60,1.00,0.16,0.32,71434.81
7,nx-sc-sq,5,60,0.32,0.32,1.00,160597.55
8,nx-sc-sq,5,60,1.00,0.16,0.32,73981.00
9,nx-sc-sq,7,60,0.32,1.00,0.32,278727.55


### List some details about data

We have single physical node measurements and isolated node measurements (done with Maxinet) in a single dataframe:

#### Topologies
* Topology: 'nx-sc-sq', 'sc-sq-nx', 'sq-nx-sc' = Containernet only (single physical node) measurements. Isolation done through core pinning.
* Topology: 'sq-nx-sc-phys-isolated', 'nx-sc-sq-phys-isolated', 'sc-sq-nx-phys-isolated' = Maxinet measurements

#### VNFs

* nx: Nginx loadbalancer
* sc: Socat TCP L4 forwarder
* sq: Squid proxy (w/o caching enabled)

In [10]:
# Summarise the distinct values found in the combined dataframe.
# sorted() keeps the printed order stable between runs; the iteration order
# of a plain set is arbitrary.
print("Topologies: {}".format(sorted(set(df["topology"]))))
print("VNF1 CPU bandwidth values: {}".format(sorted(set(df["vnf1cpu"]))))
print("VNF2 CPU bandwidth values: {}".format(sorted(set(df["vnf2cpu"]))))
print("VNF3 CPU bandwidth values: {}".format(sorted(set(df["vnf3cpu"]))))
# This line counts distinct repetition ids; it previously carried the
# copy-pasted "VNF3 CPU bandwidth values" label.
print("Number of repetitions: {}".format(len(set(df["repetition_id"]))))

Topologies: ['nx-sc-sq', 'sq-nx-sc-phys-isolated', 'sc-sq-nx-phys-isolated', 'nx-sc-sq-phys-isolated', 'sc-sq-nx', 'sq-nx-sc']
VNF1 CPU bandwidth values: [0.16, 0.64, 0.32, 1.0]
VNF2 CPU bandwidth values: [0.64, 1.0, 0.32, 0.16]
VNF3 CPU bandwidth values: [0.64, 1.0, 0.32, 0.16]
VNF3 CPU bandwidth values: 10


## Output Dataframe

Store the unified and cleaned dataframe as a reusable Pickle file and as a CSV file, so that it can be distributed and published as an open dataset.

In [11]:
# Write the combined dataframe to disk twice: once as a pickle, once as CSV.
pickle_path = "data/peuster_karl_ieeenfvsdn17_3vnf_sfc_profile.pkl"
csv_path = "data/peuster_karl_ieeenfvsdn17_3vnf_sfc_profile.csv"

df.to_pickle(pickle_path)
df.to_csv(csv_path)

## Examples: Selection

Each configuration has 10 different measurements that can be used in the simulation.

In [12]:
# Rows of the "nx-sc-sq" topology in which all three VNFs have CPU value 1.0.
full_cpu_mask = (
    (df["topology"] == "nx-sc-sq")
    & (df["vnf1cpu"] == 1.0)
    & (df["vnf2cpu"] == 1.0)
    & (df["vnf3cpu"] == 1.0)
)
df[full_cpu_mask]

Unnamed: 0,topology,repetition_id,vnf1cpu,vnf2cpu,vnf3cpu,throughput_kbyte_per_second
78,nx-sc-sq,4,1.0,1.0,1.0,461700.57
113,nx-sc-sq,2,1.0,1.0,1.0,450820.75
205,nx-sc-sq,9,1.0,1.0,1.0,451898.89
251,nx-sc-sq,6,1.0,1.0,1.0,451634.75
299,nx-sc-sq,0,1.0,1.0,1.0,456678.65
302,nx-sc-sq,3,1.0,1.0,1.0,457911.74
412,nx-sc-sq,1,1.0,1.0,1.0,450517.17
482,nx-sc-sq,5,1.0,1.0,1.0,454041.55
564,nx-sc-sq,7,1.0,1.0,1.0,457362.45
610,nx-sc-sq,8,1.0,1.0,1.0,447014.84


## Examples: Prototype for nfv-t-cp

The simulation model takes a configuration tuple as input, e.g. (0.16, 0.32, 0.64), and returns a single performance value that is randomly selected from the 10 available measurements.

In [13]:
import random

def measure_service(df, topology, config, rng=None):
    """Return one randomly chosen throughput measurement for a configuration.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``topology``, ``vnf1cpu``, ``vnf2cpu``,
        ``vnf3cpu`` and ``throughput_kbyte_per_second``.
    topology : str
        Value the ``topology`` column must equal.
    config : sequence
        ``config[0]``, ``config[1]`` and ``config[2]`` must equal the
        ``vnf1cpu``, ``vnf2cpu`` and ``vnf3cpu`` columns respectively.
        The comparison is an exact ``==``, so pass the values as stored.
    rng : object with a ``choice`` method, optional
        Source of randomness, e.g. ``random.Random(seed)`` for reproducible
        draws. Defaults to the global ``random`` module.

    Returns
    -------
    One element of the matching ``throughput_kbyte_per_second`` values.

    Raises
    ------
    AssertionError
        If no row matches ``topology`` and ``config``.
    """
    # select results that match given topology and config
    selected = df[(df["topology"] == topology)
                  & (df["vnf1cpu"] == config[0])
                  & (df["vnf2cpu"] == config[1])
                  & (df["vnf3cpu"] == config[2])]
    candidate_results = list(selected["throughput_kbyte_per_second"])
    if not candidate_results:
        # Raised explicitly (not via `assert`) so the check survives
        # `python -O`; the exception type is unchanged for existing callers.
        raise AssertionError(
            "no measurements for topology {!r} and config {!r}"
            .format(topology, config))
    # randomly return one of the candidate results
    chooser = random if rng is None else rng
    return chooser.choice(candidate_results)

# Demo: draw one measurement for every (topology, config) combination,
# iterating topologies in the outer loop and configs in the inner loop.
topologies = [
    "nx-sc-sq",
    "sc-sq-nx",
    "sq-nx-sc",
]
configs = [
    (0.16, 0.16, 0.16),
    (0.32, 0.16, 0.16),
    (0.64, 0.16, 0.16),
]

for topology in topologies:
    for config in configs:
        throughput = measure_service(df, topology, config)
        print("topology '{}' config '{}': {} kbyte/s"
              .format(topology, config, throughput))

topology 'nx-sc-sq' config '(0.16, 0.16, 0.16)': 73261.18 kbyte/s
topology 'nx-sc-sq' config '(0.32, 0.16, 0.16)': 69596.38 kbyte/s
topology 'nx-sc-sq' config '(0.64, 0.16, 0.16)': 72750.02 kbyte/s
topology 'sc-sq-nx' config '(0.16, 0.16, 0.16)': 61831.76 kbyte/s
topology 'sc-sq-nx' config '(0.32, 0.16, 0.16)': 143652.93 kbyte/s
topology 'sc-sq-nx' config '(0.64, 0.16, 0.16)': 173442.64 kbyte/s
topology 'sq-nx-sc' config '(0.16, 0.16, 0.16)': 20361.27 kbyte/s
topology 'sq-nx-sc' config '(0.32, 0.16, 0.16)': 47588.18 kbyte/s
topology 'sq-nx-sc' config '(0.64, 0.16, 0.16)': 66365.01 kbyte/s


KeyError: 'param_duration'

# TODO

* The 1.0 configs are missing from the sq-nx-sc results and from all Maxinet results!