# Startup Delay Inference

In [None]:
# Read dataset
import json

# Read from a file
# Parse the saved regression results and list their top-level keys.
results_path = 'data/Video_Startup_Delay_Inference/regression.json'
with open(results_path, 'r') as file:
    results = json.loads(file.read())

print(results.keys())

In [None]:
# Dump every run: a header line with the run key, then one "name: value" line per entry.
for run_key, run in results.items():
    print(f"Run {run_key}:")
    for field, value in run.items():
        print(f"{field}: {value}")


In [None]:
# Partition the run keys by their suffix/prefix:
#   - keys that do not end in "_0" go to burst_loss_keys;
#   - keys that end in "_0" but do not start with "0_" go to prob_loss_keys;
#   - a key that both starts with "0_" and ends with "_0" lands in neither list.
burst_loss_keys = []
prob_loss_keys = []
for run_key in results:
    if not run_key.endswith('_0'):
        burst_loss_keys.append(run_key)
    elif not run_key.startswith('0_'):
        prob_loss_keys.append(run_key)

print(burst_loss_keys)
print(prob_loss_keys)

In [None]:
import pandas as pd

# Long-format frame of signed errors (real - predicted): one row per sample,
# labelled with the setting it came from ("No loss" for run "0_0").
labelled_runs = [("No loss", results["0_0"])]
labelled_runs += [
    # The label shows the first key component divided by 100.
    ("$p$={}\n ".format(float(key.split("_")[0]) / 100.0), results[key])
    for key in prob_loss_keys
]

dfs = [
    pd.DataFrame({
        "key": setting,
        "data": [real - pred for real, pred in zip(run["real"], run["predicted"])],
    })
    for setting, run in labelled_runs
]

dfprob = pd.concat(dfs)

In [None]:
import pandas as pd

# Long-format frame of signed errors (real - predicted) for the "0_0" baseline
# and every run listed in burst_loss_keys.
baseline = results["0_0"]
dfs = [pd.DataFrame({
    "key": "No loss",
    "data": [real - pred for real, pred in zip(baseline["real"], baseline["predicted"])],
})]

for key in burst_loss_keys:
    parts = key.split("_")
    # First key component is divided by 100 for display; the second is shown as is.
    setting = "$p_1$={} \n$1-p_2$={}".format(float(parts[0]) / 100.0, float(parts[1]))
    run = results[key]
    errors = [real - pred for real, pred in zip(run["real"], run["predicted"])]
    dfs.append(pd.DataFrame({"key": setting, "data": errors}))

dfburst = pd.concat(dfs)

In [None]:
# Median signed error for each setting in dfprob.
prob_medians = dfprob.groupby("key")["data"].median()
print(prob_medians)

In [None]:
import matplotlib
# Turn on Matplotlib's text.usetex option (original note: solves a fonttype error).
matplotlib.rcParams.update({'text.usetex': True})


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Box plot of the signed error in dfprob, one box per "key" value;
# flier markers are drawn at size 0.
fig, ax = plt.subplots(figsize=(6, 3))
sns.boxplot(data=dfprob, x="key", y="data", fliersize=0, ax=ax)
ax.set_xlabel('Loss rate')
ax.set_ylabel('Error in ms \n(real - predicted)')
ax.grid(False)
ax.set_ylim([-4000, 3000])
fig.tight_layout()
fig.savefig('data/Video_Startup_Delay_Inference/regression_prob_box.pdf')
plt.show()

In [None]:
# Median signed error for each setting in dfburst.
burst_medians = dfburst.groupby("key")["data"].median()
print(burst_medians)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Box plot of the signed error in dfburst for a hand-picked subset of settings.
print(dfburst["key"].unique())

# Settings drawn on the x axis, left to right.
order = [
    "No loss",
    '$p_1$=0.005 \n$1-p_2$=0.1',
    '$p_1$=0.005 \n$1-p_2$=0.01',
    '$p_1$=0.01 \n$1-p_2$=0.1',
    '$p_1$=0.01 \n$1-p_2$=0.01',
]
# Settings removed from the frame before plotting.
# NOTE: this name shadows the builtin `filter`; it is kept because a later
# cell reads the variable.
filter = [
    '$p_1$=0.005 \n$1-p_2$=0.0001',
    '$p_1$=0.01 \n$1-p_2$=0.0001',
    '$p_1$=0.005 \n$1-p_2$=0.001',
    '$p_1$=0.01 \n$1-p_2$=0.001',
]
shown = dfburst[~dfburst["key"].isin(filter)]

fig, ax = plt.subplots(figsize=(6, 3))
sns.boxplot(data=shown, x="key", y="data", order=order, fliersize=0, ax=ax)
ax.set_xlabel('Loss rate')
ax.set_ylabel('Error in ms \n(real - predicted)')
ax.grid(False)
ax.set_ylim([-10000, 2700])
fig.tight_layout()
fig.savefig('data/Video_Startup_Delay_Inference/regression_burst_box.pdf')
plt.show()

In [None]:
import pandas as pd
import numpy as np

# Median of the absolute error per setting, as a two-column table.
df = dfburst.copy()
result = (
    df.groupby('key')['data']
    .apply(lambda errors: np.median(np.abs(errors)))
    .reset_index()
    .rename(columns={'data': 'median_absolute_value'})
)

print(result)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Empirical CDF of the signed error in dfprob, one curve per "key" value.
plt.figure(figsize=(5, 2.5))
ax = sns.ecdfplot(data=dfprob, x="data", hue="key")
# The box plot of this same frame labels the error in ms and the axis range
# below is in the thousands, so the label uses ms rather than seconds.
plt.xlabel('Error in ms (real - predicted)')
plt.ylabel('CDF')
plt.grid(False)
plt.xlim([-5000, 5000])
plt.tight_layout()
plt.savefig('data/Video_Startup_Delay_Inference/regression_prob_cdf.pdf')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Empirical CDF of the signed error in dfburst, one curve per "key" value.
plt.figure(figsize=(5, 2.5))
ax = sns.ecdfplot(data=dfburst, x="data", hue="key")
# The box plot of this same frame labels the error in ms and the axis range
# below is in the thousands, so the label uses ms rather than seconds.
plt.xlabel('Error in ms (real - predicted)')
plt.ylabel('CDF')
plt.grid(False)
plt.xlim([-5000, 5000])
plt.tight_layout()
plt.savefig('data/Video_Startup_Delay_Inference/regression_burst_cdf.pdf')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Box plot of segment_duration_avg per capture file, for the no-loss capture
# and four bursty-loss captures.
# NOTE(review): this cell reads data/merged_cleaned_loss_dataset.pkl while the
# training cell reads the file of the same name under
# data/Video_Startup_Delay_Inference/ — confirm which path is intended.
df = pd.read_pickle("data/merged_cleaned_loss_dataset.pkl")

df = df[df["startup_time"] > 0]

print(df["pcap_file"].unique())

# Capture file -> tick label, in plotting order. The first number in the file
# name is divided by 100 for display (0.5 -> 0.005, 1.0 -> 0.01), matching the
# labels used for the bursty-loss error plots in this notebook.
pcap_labels = {
    "cleaned.pcap": "No loss",
    "loss_rate_0.5_0.1.pcap": "$p_1$=0.005 \n$1-p_2$=0.1",
    "loss_rate_0.5_0.01.pcap": "$p_1$=0.005 \n$1-p_2$=0.01",
    "loss_rate_1.0_0.1.pcap": "$p_1$=0.01 \n$1-p_2$=0.1",
    "loss_rate_1.0_0.01.pcap": "$p_1$=0.01 \n$1-p_2$=0.01",
}

# .copy() so the relabelling writes to an independent frame instead of a
# slice of the filtered one (avoids pandas' SettingWithCopyWarning).
df = df[df["pcap_file"].isin(list(pcap_labels))].copy()
df["pcap_file"] = df["pcap_file"].replace(pcap_labels)

plt.figure(figsize=(6, 3))
sns.boxplot(data=df, y="segment_duration_avg", x="pcap_file", fliersize=0,
            order=list(pcap_labels.values()))
plt.xlabel('Loss rate')
plt.ylabel('Average time \n(seconds)')
plt.grid(False)
plt.ylim([-0.1, 1.5])
plt.tight_layout()
plt.savefig('data/Video_Startup_Delay_Inference/regression_segment_time_box.pdf')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Load the pickled dataset and display its column names.
df = pd.read_pickle("data/Video_Startup_Delay_Inference/cleaned_loss_dataset.pkl")

df.columns.to_numpy()

In [None]:
# Median signed error for each setting left after removing the ones listed in
# `filter`. Iterating the DataFrame directly walks its column names, which
# printed the overall median once per column instead of one value per setting.
shown = dfburst[~dfburst["key"].isin(filter)]
print(shown.groupby("key")["data"].median())

In [None]:
import pandas as pd

# Load the dataset, keep rows with a positive startup_time, and display the
# shape of the array of distinct "folder" values.
df = pd.read_pickle("data/Video_Startup_Delay_Inference/cleaned_loss_dataset.pkl")
positive_startup = df["startup_time"] > 0
df = df.loc[positive_startup]

# Get code to train models from server
# Train the two models and run test sets

df["folder"].unique().shape


In [None]:
import json
import re
import pandas as pd
import numpy as np
from sklearn import metrics
from autogluon.tabular import TabularDataset, TabularPredictor

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split


def do_train(df, target="startup_time", fts=None, n_jobs=None, params=None):
    """Fit an AutoGluon ``TabularPredictor`` with a single ``'RF'`` hyperparameter entry.

    Args:
        df: Training frame; must contain the ``target`` column.
        target: Name of the label column.
        fts: Optional list of feature column names. When given, training uses
            only these columns plus ``target``; otherwise the whole frame.
        n_jobs: Accepted for backward compatibility; not used.
        params: Accepted for backward compatibility; not used.

    Returns:
        The object returned by ``TabularPredictor(...).fit(...)``.
    """
    hyperparameters = {
        'RF': {'max_depth': 10, 'min_samples_split': 10},
    }
    # `fts` is a list and `target` a string, so the label has to be wrapped in
    # a list before concatenating (list + str raises TypeError).
    train_data = df if fts is None else df[list(fts) + [target]]
    clf = TabularPredictor(label=target).fit(
        train_data=train_data,
        hyperparameters=hyperparameters,
    )
    return clf


def do_test(df, model, fts=None, target="startup_time"):
    """Score ``model`` on ``df`` and return a dict of regression metrics.

    Args:
        df: Evaluation frame containing the ``target`` column.
        model: Object exposing ``predict(features)``.
        fts: Optional feature column selection; when omitted, every column
            except ``target`` is passed to the model.
        target: Name of the ground-truth column.

    Returns:
        Dict with mean/median absolute error, mean squared error, root mean
        squared error and R2 score, computed with ``sklearn.metrics``.
    """
    features = df.drop(target, axis=1) if fts is None else df[fts]
    truth = df[target]
    predictions = model.predict(features)

    return {
        "mean_absolute_error": metrics.mean_absolute_error(truth, predictions),
        "mean_squared_error": metrics.mean_squared_error(truth, predictions),
        "median_absolute_error": metrics.median_absolute_error(truth, predictions),
        "root_mean_squared_error": metrics.root_mean_squared_error(truth, predictions),
        "r2_score": metrics.r2_score(truth, predictions),
    }


# Train on the no-loss capture, then evaluate on held-out folders for the
# no-loss capture and each (loss_rate, burst_rate) pair. This is done twice:
# once with all features ("All model") and once without the segment_*
# features ("Network model"). Each set of scores is written to its own JSON file.
print("Prepare the dataset")
df = pd.read_pickle("data/Video_Startup_Delay_Inference/merged_cleaned_loss_dataset.pkl")

df = df[df["startup_time"] > 0]

# Split by folder (80/20) so that rows from one folder never appear in both
# the training and the test set.
unique_folders = df['folder'].unique()
train, test = train_test_split(unique_folders, test_size=0.2, random_state=42)

# Columns dropped before training and testing in both model variants.
META_COLUMNS = ['folder', 'filename', 'pcap_file', 'interval_start',
                'interval_end', 'loss_rate', 'burst_rate']
# Additional columns dropped for the "Network model" variant.
SEGMENT_COLUMNS = ['segment_count',
                   'segment_size_min', 'segment_size_max', 'segment_size_avg',
                   'segment_size_std', 'segment_duration_min', 'segment_duration_max',
                   'segment_duration_avg', 'segment_duration_std', 'segment_gap_min',
                   'segment_gap_max', 'segment_gap_avg', 'segment_gap_std']

# Masks are built on the filtered frame and combined with `&`. Chaining
# df[mask_a][mask_b] applies mask_b to a frame with a different index, which
# makes pandas reindex the mask and emit a UserWarning.
is_clean = df["pcap_file"] == "cleaned.pcap"
in_train = df['folder'].isin(train)
in_test = df['folder'].isin(test)

train_df = df[is_clean & in_train].drop(columns=META_COLUMNS)

# (loss_rate, burst_rate) settings to evaluate. Pairs evaluated in an earlier
# revision but currently disabled: [2.0, 0], [5.0, 0], [10.0, 0],
# [1, 0.0001], [0.5, 0.0001].
pairs = [[0.5, 0], [1.0, 0],
         [1, 0.1], [1, 0.01], [1, 0.001],
         [0.5, 0.1], [0.5, 0.01], [0.5, 0.001]]


def _evaluate_on_test_sets(model, extra_drop=()):
    """Return {"<loss>_<burst>": metrics} for the held-out folders.

    ``extra_drop`` lists columns to drop in addition to META_COLUMNS.
    """
    dropped = META_COLUMNS + list(extra_drop)
    print("Test the model on 0 0")
    scores = {"0_0": do_test(df[is_clean & in_test].drop(columns=dropped), model)}
    for pair in pairs:
        print("Test the model on {} {}".format(pair[0], pair[1]))
        subset = df[(df["loss_rate"] == pair[0]) & (df["burst_rate"] == pair[1]) & in_test]
        scores["{}_{}".format(pair[0], pair[1])] = do_test(subset.drop(columns=dropped), model)
    return scores


print("All model")

model = do_train(train_df)
results = _evaluate_on_test_sets(model)

with open('data/Video_Startup_Delay_Inference/all.json', 'w') as file:
    json.dump(results, file, indent=4)

print("Network model")

model = do_train(train_df.drop(columns=SEGMENT_COLUMNS))
# A fresh dict per variant, so network.json can never carry over entries
# from the "All model" run.
results = _evaluate_on_test_sets(model, extra_drop=SEGMENT_COLUMNS)

with open('data/Video_Startup_Delay_Inference/network.json', 'w') as file:
    json.dump(results, file, indent=4)

# Service Identification

In [None]:
from autogluon.tabular import TabularDataset, TabularPredictor
from itertools import combinations
import pandas as pd
import json
import sys

# Dataset location, label column and the loss settings that have a test file.
folder = "data/service_recognition"
label = 'service'
loss_rate = [0.0, 0.5, 1.0, 2.0, 5.0, 10.0]
markov = [0.1, 0.01, 0.001, 0.0001]

train_data = TabularDataset(f"{folder}/TRAIN.csv.xz")

# One test set per entry of loss_rate.
test_data = [TabularDataset(f"{folder}/TEST_{loss}.csv.xz") for loss in loss_rate]

# One test set per "<loss>_<m>" tag, for the second and third loss rates only.
list_markov = [f"{loss}_{m}" for m in markov for loss in loss_rate[1:3]]
markov_test = [TabularDataset(f"{folder}/TEST_{tag}.csv.xz") for tag in list_markov]

# Feature names, grouped by family. Each group is named here and the groups
# are then collected, in the same order, into feature_subsets.

# L3
_l3_features = [
    "out_counter", "in_counter", "out_bytes", "in_bytes",
]

# IAT
_iat_features = [
    "out_std_inter_arrival_time", "out_avg_inter_arrival_time",
    "out_med_inter_arrival_time", "out_max_inter_arrival_time",
    "out_min_inter_arrival_time", "out_ske_inter_arrival_time",
    "out_kur_inter_arrival_time",
    "in_std_inter_arrival_time", "in_avg_inter_arrival_time",
    "in_med_inter_arrival_time", "in_max_inter_arrival_time",
    "in_min_inter_arrival_time", "in_ske_inter_arrival_time",
    "in_kur_inter_arrival_time",
]

# TCP
_tcp_features = [
    "out_std_bytes_per_packet", "out_avg_bytes_per_packet",
    "out_max_bytes_per_packet", "out_min_bytes_per_packet",
    "in_std_bytes_per_packet", "in_avg_bytes_per_packet",
    "in_max_bytes_per_packet", "in_min_bytes_per_packet",
    "out_std_rwnd", "out_avg_rwnd", "out_max_rwnd", "out_min_rwnd",
    "in_std_rwnd", "in_avg_rwnd", "in_max_rwnd", "in_min_rwnd",
    "out_syn_flags", "in_syn_flags",
    "out_ack_flags", "in_ack_flags",
    "out_psh_flags", "in_psh_flags",
    "out_urg_flags", "in_urg_flags",
    "out_rst_flags", "in_rst_flags",
    "out_fin_flags", "in_fin_flags",
    "out_goodput", "in_goodput",
    "out_max_rtt",
]

# TCP Stat
_tcp_stat_features = [
    "out_med_bytes_per_packet", "out_kur_bytes_per_packet",
    "out_ske_bytes_per_packet",
    "in_med_bytes_per_packet", "in_kur_bytes_per_packet",
    "in_ske_bytes_per_packet",
    "out_med_rwnd", "out_kur_rwnd", "out_ske_rwnd",
    "in_med_rwnd", "in_kur_rwnd", "in_ske_rwnd",
    "out_kur_rtt",
]

# Bytes in flight
_bytes_in_flight_features = [
    "out_std_bytes_in_flight", "out_avg_bytes_in_flight",
    "out_max_bytes_in_flight", "out_min_bytes_in_flight",
    "out_ske_bytes_in_flight", "out_kur_bytes_in_flight",
    "out_str_bytes_in_flight", "out_end_bytes_in_flight",
    "in_std_bytes_in_flight", "in_avg_bytes_in_flight",
    "in_max_bytes_in_flight", "in_min_bytes_in_flight",
    "in_ske_bytes_in_flight", "in_kur_bytes_in_flight",
    "in_str_bytes_in_flight", "in_end_bytes_in_flight",
]

# Retransmit
_retransmit_features = [
    "out_std_retransmit", "out_avg_retransmit", "out_med_retransmit",
    "out_max_retransmit", "out_min_retransmit", "out_ske_retransmit",
    "out_kur_retransmit", "out_zero_retransmit", "out_one_retransmit",
    "out_two_retransmit", "out_x_retransmit",
    "in_std_retransmit", "in_avg_retransmit", "in_med_retransmit",
    "in_max_retransmit", "in_min_retransmit", "in_ske_retransmit",
    "in_kur_retransmit", "in_zero_retransmit", "in_one_retransmit",
    "in_two_retransmit", "in_x_retransmit",
    "out_ooo_packets", "out_ooo_bytes",
    "in_ooo_packets", "in_ooo_bytes",
]

feature_subsets = [
    _l3_features,
    _iat_features,
    _tcp_features,
    _tcp_stat_features,
    _bytes_in_flight_features,
    _retransmit_features,
]

# Fit one predictor on the concatenation of all feature groups.
hyperparameters = {'RF': {'max_depth': 10, 'min_samples_split': 10}}

# Flatten the groups into a single list, preserving group and in-group order.
feature_combo = [name for subset in feature_subsets for name in subset]
print(f"Training model with features: {feature_combo}")

training_columns = feature_combo + [label]
predictor = TabularPredictor(label=label).fit(
    train_data=train_data[training_columns],
    hyperparameters=hyperparameters,
)

models = [(predictor, feature_combo)]
print(f"Evaluate model with features: {feature_combo}")
metadata = {"features": feature_combo}


# Evaluate the predictor on every test set and write one JSON report per set.
basis = test_data[0]


def _evaluate_and_dump(frame, tag):
    """Evaluate ``predictor`` on ``frame`` and write the report for ``tag``.

    The frame is re-indexed onto the index of the first test set; rows that
    end up without a label get the placeholder label 'WRONG'.
    """
    aligned = frame.reset_index(drop=True).reindex(basis.index)
    aligned[label] = aligned[label].fillna('WRONG')
    report = predictor.evaluate(aligned, detailed_report=True)
    # Converted with to_json() so the report can be passed to json.dump.
    report['confusion_matrix'] = report['confusion_matrix'].to_json()

    # The separator after `folder` was missing, which placed the files next to
    # the dataset directory instead of inside it.
    output = f"{folder}/ml_output_{tag}.json"
    # Opening with 'w+' truncated the file before it was read, so the file
    # always ended up holding exactly this one entry; write it directly.
    with open(output, 'w') as f:
        json.dump([{'features': feature_combo, 'result': report}], f, indent=4)
    return report


for rate, frame in zip(loss_rate, test_data):
    result = _evaluate_and_dump(frame, rate)

for tag, frame in zip(list_markov, markov_test):
    result = _evaluate_and_dump(frame, tag)