In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from graph_nets import blocks
from graph_nets import graphs
from graph_nets import modules
from graph_nets import utils_np
from graph_nets import utils_tf
import models
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import sonnet as snt
import tensorflow as tf
import os
import scipy.io
import torch.utils.data as Data
import glob
import math
import random

In [3]:
def data_loader(data_path):
    """Load one sample from a MATLAB ``.mat`` file.

    Reads the ``flow``, ``label``, ``global_att`` and ``timestamp`` entries,
    removes singleton dimensions from each, and divides ``flow`` by 255.0.

    Args:
        data_path: path of the ``.mat`` file to read.

    Returns:
        A tuple ``(flow, label, global_att, timestamp)`` of squeezed arrays,
        where ``flow`` has already been divided by 255.0.
    """
    contents = scipy.io.loadmat(data_path)

    # Scale the raw flow values by 1/255 before dropping singleton axes.
    flow = np.squeeze(contents['flow'] / 255.0)

    # The remaining entries are only squeezed, never rescaled.
    label, global_att, timestamp = (
        np.squeeze(contents[key])
        for key in ('label', 'global_att', 'timestamp')
    )

    return flow, label, global_att, timestamp

**Adjust the following variables to match your setup:**

datapath
- where you store the dataset

meta
- number of meta features

samplesize
- total number of samples (`.mat` files) across all folders

rate
- fraction of the samples in each folder used for training (the remainder is used for validation)

folders
- number of data folders; folder `i` is read from `datapath + str(i)`, so the folders must be numbered from 0 to `folders - 1`
- depends on how you structured/preprocessed your data

In [4]:
# --- Configuration -----------------------------------------------------------
# Path prefix of the dataset; folder i is read from datapath + str(i).
datapath = "datapath"
# Number of meta (global) features stored per sample.
meta = 7
# Upper bound on the total number of samples; used to preallocate data2.
samplesize = 13341
# Column c of data2 holds the meta features of the c-th sample read.
data2 = np.zeros((meta, samplesize))

# Fraction of each folder used for training (consumed by the next cell).
rate = 0.6
# Number of folders to scan.
folders = 18

# --- Collect the meta features of every sample -------------------------------
count = 0
for i in range(folders):

    print(i)

    for sample_path in glob.glob(datapath + str(i) + '/*.mat'):
        if count >= samplesize:
            # Fail with an explanation instead of a bare out-of-bounds error.
            raise IndexError(
                "found more than samplesize=%d samples; increase samplesize"
                % samplesize)
        # Only the global attributes are needed for the normalisation stats.
        _, _, gb, _ = data_loader(sample_path)
        data2[:, count] = gb[:]
        count += 1

# --- Per-feature maximum used for normalisation ------------------------------
# NOTE(review): the maximum is taken over ALL samples, including those that
# later end up in the validation split.
if count:
    # Ignore the unfilled (all-zero) columns when fewer than samplesize
    # samples were found, so they cannot distort the maximum.
    max_v = np.amax(data2[:, :count], axis=1)
else:
    print("Warning: no .mat samples found under", datapath)
    max_v = np.zeros(meta)

# A feature whose maximum is 0 would produce 0/0 = NaN when samples are
# divided by max_v; dividing by 1 leaves such features unchanged instead.
max_v[max_v == 0] = 1.0

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17


In [5]:
# Graph data dicts (inputs) and label dicts (targets) for each split.
training = []
ground_truth = []

validation = []
val_ground_truth = []

# Dividing by a zero maximum would turn an all-zero feature into NaN;
# divide such features by 1 instead so they stay 0.
safe_max_v = np.where(max_v == 0, 1.0, max_v)

for i in range(folders):

    print(i)

    path = datapath + str(i) + '/*.mat'
    # glob returns files in arbitrary order; sort so that the
    # training/validation split is the same on every run and machine.
    samples = sorted(glob.glob(path))
    # The first sample_count files of each folder go to the training split.
    sample_count = math.ceil(len(samples)*rate)

    for j, data in enumerate(samples):
        node, gt, gb, timestamp = data_loader(data)

        # A usable flow is a 2-D array with one row per packet. Anything else
        # cannot be turned into a graph, so report it and skip the sample
        # rather than crashing on the slicing below or reusing the previous
        # sample's edges.
        if node.ndim == 1:
            print("only 1 node, something's wrong!!")
            continue
        if node.ndim != 2:
            print("something's wrong!!")
            continue

        if node.shape[1] >1500:
            print(node.shape)

        # put meta features of a flow to global
        # Cast to float first: assigning the quotient back into an
        # integer-typed array would silently truncate it.
        globals_0 = (gb.astype(np.float64) / safe_max_v).tolist()

        # put timestamps of a flow to edges
        # One directed edge k -> m for every packet pair k < m, in the same
        # row-major order as two nested loops; the edge feature is the time
        # difference between the two packets.
        node_count = node.shape[0]
        sender, receiver = np.triu_indices(node_count, k=1)
        edge = (timestamp[receiver] - timestamp[sender]).astype(np.float32).reshape(-1, 1)

        # put packets of a flow to nodes
        # keep only the first 100 values of each packet
        node = node[:,:100]

        data_dict_0 = {
            "globals": globals_0,
            "nodes": node.tolist(),
            "edges": edge,
            "senders": sender.astype(np.int32),
            "receivers": receiver.astype(np.int32)
        }
        data_dict_1 = {
            "globals": gt
        }

        if j<sample_count:
            training.append(data_dict_0)
            ground_truth.append(data_dict_1)
        else:
            validation.append(data_dict_0)
            val_ground_truth.append(data_dict_1)

print("Training size: ", len(training))
print("Validation size", len(validation))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
Training size:  8013
Validation size 5328


**Adjust the following variables to match your setup:**

cls
- number of classes to classify

**You can also modify `EncodeProcessDecode` in the `models` module (imported at the top of the notebook) to suit your case.**

In [6]:
# Full Graph Network Block
# Expose only CUDA device 0 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Number of classes; also the size of the model's global output.
cls = 6
OUTPUT_SIZE = cls

# Only the global output is sized; node and edge output sizes are None.
with tf.device('/GPU:0'):
    net = models.EncodeProcessDecode(
        global_output_size=OUTPUT_SIZE,
        node_output_size=None,
        edge_output_size=None,
    )

print(net)

koko-build()
EncodeProcessDecode(global_output_size=6)


In [None]:
# Display name of each class, indexed by class id.
cls_names = ["chat", "email", "file", "streaming", "torrent", "voip"]

def create_loss_ops(output, target):
    """Return the class-weighted softmax cross-entropy loss of a batch.

    The one-hot encoding of each target label is multiplied by a per-class
    weight before it is handed to the cross-entropy op. To train without
    class weights, pass the plain one-hot labels instead.

    Args:
        output: object whose ``globals`` attribute holds the logits.
        target: object whose ``globals`` attribute holds the class ids.

    Returns:
        The loss computed by ``tf.compat.v1.losses.softmax_cross_entropy``.
    """
    # Each weight is the reciprocal of one of these hard-coded values
    # (presumably the share of samples in each class; confirm against the
    # dataset), so smaller classes receive larger weights.
    class_fractions = [0.3019, 0.0223, 0.0709, 0.0441, 0.0337, 0.5268]
    weights = tf.constant([1.0 / fraction for fraction in class_fractions])

    labels = tf.cast(target.globals, tf.int32)
    weighted_onehot = weights * tf.one_hot(labels, cls)

    return tf.compat.v1.losses.softmax_cross_entropy(
        onehot_labels=weighted_onehot,
        logits=output.globals,
    )

def update_step(inputs_tr, targets_tr):
    """Run one optimisation step on a batch.

    The model is called the same way as in the training loop of this cell:
    ``net(inputs, 1)``, with the loss computed on element 0 of the result.
    NOTE(review): ``models.EncodeProcessDecode`` is not visible from this
    notebook; confirm that this call form matches its signature.

    Args:
        inputs_tr: batch of input graphs.
        targets_tr: batch of target graphs whose ``globals`` hold the labels.

    Returns:
        A tuple ``(outputs_tr, loss_tr)``: whatever the model returned for
        the batch, and the loss used for the update.
    """
    with tf.GradientTape() as tape:
        outputs_tr = net(inputs_tr, 1)
        # Loss.
        loss_tr = create_loss_ops(outputs_tr[0], targets_tr)

    # Differentiate outside the tape context, then apply the update.
    gradients = tape.gradient(loss_tr, net.trainable_variables)
    optimizer.apply(gradients, net.trainable_variables)

    return outputs_tr, loss_tr

# Define the number of epochs and the batch sizes for training & evaluation.
EPOCH = 300
BATCH_SIZE = 128
BATCH_SIZE_V = 350

# Best validation accuracy seen so far and the epoch it occurred in.
max_acc = 0.0
max_acc_epoch = 0

# Accuracy history, one row per epoch: column 0 = validation, column 1 = training.
my_plot = np.zeros((EPOCH, 2))

# set learning rate
learning_rate = 3.5e-4
optimizer = snt.optimizers.Adam(learning_rate)

# Only affects how the confusion matrices below are printed.
np.set_printoptions(precision=3, suppress=True)


def _evaluate(inputs, targets, batch_size):
    """Evaluate ``net`` on a dataset and return accuracy statistics.

    Every sample is counted, including a final batch that happens to be
    exactly ``batch_size`` long.

    Args:
        inputs: sequence of input graph data dicts.
        targets: sequence of target dicts whose ``'globals'`` is the class id.
        batch_size: number of samples evaluated per forward pass.

    Returns:
        ``(accuracy, totals, confusion)`` where ``totals[c]`` is the number of
        samples of class ``c`` and ``confusion[c, p]`` is the number of
        class-``c`` samples predicted as class ``p``.
    """
    totals = np.zeros(cls)
    confusion = np.zeros((cls, cls))
    metric = tf.keras.metrics.Accuracy()

    for start in range(0, len(inputs), batch_size):
        batch_targets = targets[start:start + batch_size]

        with tf.device('/GPU:0'):
            inputs_graph = utils_tf.data_dicts_to_graphs_tuple(inputs[start:start + batch_size])
            targets_graph = utils_tf.data_dicts_to_graphs_tuple(batch_targets)
            outputs = net(inputs_graph, 1)

        pred = tf.math.argmax(outputs[0].globals, 1)
        metric.update_state(targets_graph.globals, pred)

        # np.add.at accumulates correctly when the same index repeats.
        labels = np.array([int(t['globals']) for t in batch_targets], dtype=np.int64)
        np.add.at(totals, labels, 1)
        np.add.at(confusion, (labels, pred.numpy()), 1)

    return metric.result().numpy(), totals, confusion


def _print_class_report(totals, confusion):
    """Print the sample count and the accuracy of every class."""
    for yy in range(cls):
        print(cls_names[yy], ": ", int(totals[yy]), end=" ")
    print("")
    for yy in range(cls):
        # A class with no samples has no defined accuracy.
        class_acc = confusion[yy, yy] / totals[yy] if totals[yy] else float("nan")
        print(cls_names[yy], " acc: ", class_acc, end=" ")
    print("")


for i in range(EPOCH):
    # shuffle inputs and labels together so that they stay aligned
    temp = list(zip(training, ground_truth))
    random.shuffle(temp)
    train_2, gt_2 = zip(*temp)
    temp_loss = 0.0

    print("EPOCH:", i)

    # The trailing partial batch is dropped; reshuffling every epoch means a
    # different remainder is left out each time.
    n_batches = math.floor(len(train_2)/BATCH_SIZE)
    for k in range(n_batches):
        batch = slice(k*BATCH_SIZE, (k+1)*BATCH_SIZE)

        with tf.device('/GPU:0'):
            # get input data
            inputs_graph = utils_tf.data_dicts_to_graphs_tuple(train_2[batch])
            targets_graph = utils_tf.data_dicts_to_graphs_tuple(gt_2[batch])
            with tf.GradientTape() as tape:
                outputs_tr = net(inputs_graph, 1)
                # Loss.
                loss_tr = create_loss_ops(outputs_tr[0], targets_graph)

            gradients = tape.gradient(loss_tr, net.trainable_variables)
            optimizer.apply(gradients, net.trainable_variables)

        temp_loss += float(loss_tr.numpy())

    if n_batches:
        print("Train loss: ", temp_loss / n_batches)

    # validation acc
    val_acc, total2, cf = _evaluate(validation, val_ground_truth, BATCH_SIZE_V)
    my_plot[i, 0] = val_acc
    if val_acc > max_acc:
        max_acc = val_acc
        max_acc_epoch = i

    print("Max ACC: ", max_acc, "   / Max epoch: ", max_acc_epoch)
    print("Val ACC: ", val_acc)
    _print_class_report(total2, cf)
    print(cf)

    # check training acc (on the unshuffled training set)
    train_acc, total3, cf3 = _evaluate(training, ground_truth, BATCH_SIZE_V)
    my_plot[i, 1] = train_acc

    print("Training ACC: ", train_acc)
    _print_class_report(total3, cf3)
    print(cf3)

    print("\n\n")
