In [2]:
# Load the autoreload extension and reload all modules before each cell runs,
# so edits to the imported project modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline


In [6]:
import os
import pickle

import h5py
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from sklearn import metrics
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from tqdm import tqdm

from preprocess import Preprocess
from training import Server
from utils import accuracy

# Expose only GPU 0 to CUDA. NOTE(review): this is set after `import tensorflow`;
# it is assumed TensorFlow has not initialised its devices yet -- confirm.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [7]:
# Training configs for the multi-proxy (M) and single-proxy (S) models.
# These file names are handed to Preprocess and Server in the cells below.
config_yml_M = "config-8k-M.yml"
config_yml_S = "config-8k-S.yml"

## Multi-proxy training

In [8]:
# Get the multi-proxy data ready for training.
# `p` is kept as a notebook global: the next cell reads `p.folds` from it.
p = Preprocess(config_yml=config_yml_M)
p.main_run_cv()

Pickle path:  data/tc_M_flowlen-100.pkl ; h5 path:  data/tc_M_flowlen-100_neg-49_cv-5.h5
Cross-val in 5 folds
Folds h5 already exists: data/tc_M_flowlen-100_neg-49_cv-5.h5
data/tc_M_flowlen-100_neg-49_cv-5_indices.pkl exists already.


In [17]:
seeds = [1, 10, 101, 102] # seeds whose saved results are looked up in the results cell

run = seeds[1] # 10 -- the seed passed to Server(run=...) in the training cell

# One entry per cross-validation fold, taken from the Preprocess instance above.
folds = range(p.folds)

print(f"{p.folds}-fold cross-validation with seed {run}")

5-fold cross-validation with seed 10


In [None]:
# Train one multi-proxy model per cross-validation fold with the selected seed.
# `t` stays bound to the last Server created; the results cell reads `t.last_aux`.
for fold in folds:
    print(f"Evaluating multi-proxy fold {fold}...")
    t = Server(cf=config_yml_M, fold=fold, run=run)
    t.train()

In [None]:
def print_best_stats(pkl_path, fold=0):
    """Print the best average precision (in percent) and the epoch it was reached.

    Reads the pickled mapping stored at ``pkl_path`` and reports its
    ``"best_av_prec"`` and ``"best_av_prec_epoch"`` entries.
    ``fold`` is accepted but not used.
    """
    with open(pkl_path, "rb") as handle:
        stats = pickle.load(handle)
    best_percent = round(stats["best_av_prec"] * 100, 2)
    print(best_percent, "epoch", stats["best_av_prec_epoch"])

# Check best results on validation data.
# The loop variable is called `seed`, not `run`, so the global `run` chosen in
# the setup cell is not overwritten by the last entry of `seeds`.
for seed in seeds:
    print("Run with seed", seed)
    try:
        for fold in folds:
            # Rebuild the per-fold / per-seed file name from the path of the
            # last trained Server (`t` comes from the training cell).
            last_aux_path = t.last_aux.split("fold")[0] + f"fold{fold}-{seed}.pkl"
            print_best_stats(last_aux_path, fold)
    except FileNotFoundError:
        # A missing file ends the report for this seed; remaining folds are skipped.
        print("No results saved.")
    print("\n")

## Single-proxy training

In [None]:
# Get the single-proxy data ready for training.
# This rebinds `p`, so `p.folds` below refers to the single-proxy setup.
p = Preprocess(config_yml=config_yml_S)
p.main_run_cv()

In [None]:
seeds = [1, 10, 101, 102] # seeds whose saved results are looked up in the results cell

run = seeds[1] # 10 -- the seed passed to Server(run=...) in the training cell

# One entry per cross-validation fold, taken from the Preprocess instance above.
folds = range(p.folds)

print(f"{p.folds}-fold cross-validation with seed {run}")

In [None]:
# Train one single-proxy model per cross-validation fold with the selected seed.
# `t` stays bound to the last Server created; the results cell reads `t.last_aux`.
for fold in folds:
    print(f"Evaluating single-proxy fold {fold}...")
    t = Server(cf=config_yml_S, fold=fold, run=run)
    t.train()

In [None]:
# Check best results on validation data.
# The loop variable is called `seed`, not `run`: the test cells further down
# build `runs = [run]`, so `run` must keep the seed selected in the setup cell.
for seed in seeds:
    print("Run with seed", seed)
    try:
        for fold in folds:
            # Rebuild the per-fold / per-seed file name from the path of the
            # last trained Server (`t` comes from the training cell).
            last_aux_path = t.last_aux.split("fold")[0] + f"fold{fold}-{seed}.pkl"
            print_best_stats(last_aux_path, fold)
    except FileNotFoundError:
        # A missing file ends the report for this seed; remaining folds are skipped.
        print("No results saved.")
    print("\n")

## Test on N * N

In [32]:
def predict(server, x, y, save_props=True, batch_size=256):
    """Run ``server.model`` over ``x`` in batches and report the average loss.

    Args:
        server: object exposing ``model(inputs, training=False)`` and
            ``loss(predictions, targets)``.
        x: array of input samples; it is sliced along the first axis and
            cast to float32.
        y: array of targets aligned with ``x``; cast to float32.
        save_props: when True, also return every model output and target.
        batch_size: number of samples per forward pass. The last batch may
            be smaller.

    Returns:
        ``avg_loss`` -- the mean of the per-batch losses rounded to three
        decimals -- or ``(avg_loss, saved_props, saved_y_truth)`` when
        ``save_props`` is True. Both arrays are flat float64 arrays.

    Raises:
        ValueError: if ``x`` is empty (there is no batch to average over).
    """
    n_samples = len(x)
    if n_samples == 0:
        raise ValueError("predict() needs at least one sample")

    n_steps = int(np.ceil(n_samples / batch_size))
    print(n_steps)

    sum_loss = 0
    prop_chunks = []
    truth_chunks = []
    for i in tqdm(range(n_steps)):
        # Slicing past the end simply yields the shorter final batch.
        batch = slice(i * batch_size, (i + 1) * batch_size)
        xxx = tf.constant(x[batch].astype('float32'))
        yyy = tf.constant(y[batch].astype('float32'))
        props = server.model(xxx, training=False)
        props = tf.reshape(props, [-1]) # specific for tf.nn.sigmoid_cross_entropy_with_logits
        sum_loss += server.loss(props, yyy).numpy()
        if save_props:
            # Collect per-batch chunks and join them once after the loop.
            prop_chunks.append(props.numpy())
            truth_chunks.append(yyy.numpy())

    avg_loss = round(sum_loss / n_steps, 3)
    if save_props:
        # float64 keeps the dtype the earlier hstack-based accumulation produced.
        saved_props = np.concatenate(prop_chunks).astype(np.float64)
        saved_y_truth = np.concatenate(truth_chunks).astype(np.float64)
        return avg_loss, saved_props, saved_y_truth
    return avg_loss


def get_xy(test_pos_x, flow_size=300):
    """Build every mismatched pairing of the given positive samples.

    For each sample ``i`` and every other sample ``k`` one negative sample is
    assembled: channels 0, 3, 4 and 7 are copied from sample ``k`` and
    channels 1, 2, 5 and 6 from sample ``i``. The negatives of sample ``i``
    occupy rows ``i * (n - 1)`` to ``(i + 1) * (n - 1) - 1`` of the result.

    Args:
        test_pos_x: sequence of samples; each sample is indexed by channel
            0..7 and every channel must fit a ``(flow_size, 1)`` slot.
        flow_size: length of each channel in the output. Defaults to 300,
            the value that used to be hard-coded.

    Returns:
        ``(neg_x, neg_y)``: a float32 array of shape
        ``(n * (n - 1), 8, flow_size, 1)`` and an all-zero label vector of
        length ``n * (n - 1)``, where ``n = len(test_pos_x)``.
    """
    # Channels copied from the partner sample / from sample i itself.
    from_partner = (0, 3, 4, 7)
    from_own = (1, 2, 5, 6)

    n_pos = len(test_pos_x)
    n_neg_per_pos = n_pos - 1
    n_flows = n_neg_per_pos * n_pos

    neg_x = np.zeros((n_flows, 8, flow_size, 1), dtype=np.float32)
    print(neg_x.shape)
    neg_y = np.zeros((n_flows))

    for i in tqdm(range(n_pos), desc='Generating neg test x'):
        own = test_pos_x[i]
        partners = [k for k in range(n_pos) if k != i]
        for j, k in enumerate(partners):
            partner = test_pos_x[k]
            index = n_neg_per_pos * i + j
            for channel in from_partner:
                neg_x[index, channel] = partner[channel]
            for channel in from_own:
                neg_x[index, channel] = own[channel]
    return neg_x, neg_y
    
    
def makeNtimesN(h5_path, fold=-1):
    """Build the N*N test set for one fold.

    The positive test samples (label 1) are kept and every mismatched pairing
    of them is generated with ``get_xy`` as a negative sample.

    Args:
        h5_path: path of the HDF5 file holding ``data/x``, ``data/y`` and the
            test indices under ``indices``.
        fold: cross-validation fold; ``indices/test{fold}`` is read when
            ``fold >= 0``, otherwise ``indices/test``.

    Returns:
        ``(test_x, test_y)``: the positive samples followed by the generated
        negatives, concatenated along the first axis.
    """
    print("MAKE N*N TESTING")

    if fold >= 0:
        print(f"Fold {fold}")

    with h5py.File(h5_path, 'r') as h5f:
        index_key = f'test{fold}' if fold >= 0 else 'test'
        test_indices = list(h5f['indices'][index_key])
        # Read each dataset in one call instead of iterating it row by row.
        x = h5f['data']['x'][:]
        y = h5f['data']['y'][:]

    test_x = [x[index] for index in test_indices]
    test_y = [y[index] for index in test_indices]
    print("Test X:", len(test_x))

    test_pos_x = [sample for sample, label in zip(test_x, test_y) if label == 1]
    test_pos_y = [label for label in test_y if label == 1]

    test_neg_x, test_neg_y = get_xy(test_pos_x)

    print("Positive samples: ", len(test_pos_x))
    print("Negative samples: ", len(test_neg_x))
    print("N*N=", len(test_pos_x)*len(test_pos_x))

    test_x = np.concatenate((test_pos_x, test_neg_x), axis=0)
    test_y = np.concatenate((test_pos_y, test_neg_y), axis=0)
    print(test_x.shape, test_y.shape)

    return test_x, test_y


def save_results(props, y_truth, name):
    """Pickle ``[props, y_truth]`` to ``results/<name>.pkl``.

    Args:
        props: model outputs to store.
        y_truth: ground-truth labels to store.
        name: file stem without the ``.pkl`` extension. A leading
            ``results/`` is optional: it is prepended only when missing, so a
            stem such as ``"results/foo"`` is not written to
            ``results/results/foo.pkl``.
    """
    if not name.startswith("results/"):
        name = "results/" + name
    name = name + ".pkl"
    # Create the target folder on first use instead of failing in open().
    os.makedirs(os.path.dirname(name), exist_ok=True)
    with open(name, "wb") as out_file:
        pickle.dump([props, y_truth], out_file)
    print("Saved: " + name)
    
    
def load_results(name):
    """Load a pickled result from ``<name>.pkl`` and print its length.

    Args:
        name: path of the pickle file without the ``.pkl`` extension.

    Returns:
        The unpickled object.
    """
    name = name + ".pkl"
    # NOTE: pickle can execute arbitrary code; only load files you trust.
    with open(name, "rb") as in_file:
        res = pickle.load(in_file)
    print(len(res))
    return res

In [34]:
#test_datapath_S = "log_tc_S_neg-49/test/test_tc_S_NtimesN"
#test_datapath_M = "log_tc_M_neg-49/test/test_tc_M_NtimesN"

# HDF5 files passed to makeNtimesN() for the single (S) and multi (M) proxy data.
h5_path_S = "data/tc_S_flowlen-100_neg-49_cv-5.h5"
h5_path_M = "data/tc_M_flowlen-100_neg-49_cv-5.h5"

# Path stems for the N*N test results; "-fold{fold}-run{run}" is appended per run.
res_S = "results/test_tc_S_flowlen-100_neg-49_NtimesN"
res_M = "results/test_tc_M_flowlen-100_neg-49_NtimesN"

The following code will generate N * N datasets for single and multi proxy models for each fold (5 times), and then test a model for every fold on this new test dataset. This takes a very long time.

We already provide the final results in the "results/" folder; they were generated with this code using seed=10.

In [None]:
# Evaluate the multi-proxy model of every fold on that fold's N*N test set.
runs = [run] # = seeds

res = {}
save = True

print("Test Multi-proxy:")
for fold in folds:
    test_x, test_y = makeNtimesN(h5_path_M, fold=fold)
    #name = test_datapath_M + f"fold{fold}.pkl"
    #[test_x, test_y] = pickle.load(open(name, "rb"))
    # `seed` (not `run`) keeps the global `run` intact when `runs` is set to `seeds`.
    for seed in runs:
        print(f"Fold {fold}, run {seed}...", end='')

        # Multi-proxy data needs the multi-proxy config.
        t = Server(cf=config_yml_M, fold=fold, run=seed, pred_only=True, init_data=False)

        loss, props, y_truth = predict(t, test_x, test_y)
        av_prec = average_precision_score(y_truth, props)

        print(f"AP = {av_prec}")

        if save:
            res_path = res_M + f"-fold{fold}-run{seed}"
            # save_results() writes under "results/", so pass only the file stem.
            save_results(props, y_truth, os.path.basename(res_path))
            print("Results saved at", res_path)

        del t
    del test_x, test_y

In [None]:
# Evaluate the single-proxy model of every fold on that fold's N*N test set.
runs = [run] # = seeds

res = {}
save = True

print("Test Single-proxy:")
for fold in folds:
    test_x, test_y = makeNtimesN(h5_path_S, fold=fold)
    #name = test_datapath_S + f"fold{fold}.pkl"
    #[test_x, test_y] = pickle.load(open(name, "rb"))
    # `seed` (not `run`) keeps the global `run` intact when `runs` is set to `seeds`.
    for seed in runs:
        print(f"Fold {fold}, run {seed}...", end='')

        t = Server(cf=config_yml_S, fold=fold, run=seed, pred_only=True, init_data=False)

        loss, props, y_truth = predict(t, test_x, test_y)
        av_prec = average_precision_score(y_truth, props)

        print(f"AP = {av_prec}")

        if save:
            res_path = res_S + f"-fold{fold}-run{seed}"
            # save_results() writes under "results/", so pass only the file stem.
            save_results(props, y_truth, os.path.basename(res_path))
            print("Results saved at", res_path)

        del t
    del test_x, test_y