In [None]:
import tensorflow as tf
import tensorflow_federated as tff

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
import collections
import time

import seaborn as sns
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from joblib import dump, load

In [None]:
# --- Federation layout -------------------------------------------------------
# Number of per-device CSV files to read (files are numbered from 1).
NB_CLIENTS = 9
# 0-based client index to hold out entirely as the test set; None means every
# client contributes a slice of its own rows to the test set instead.
TEST_CLIENT = None

# --- Paths -------------------------------------------------------------------
DATA_PATH_PATTERN = "../data/Device #%d.csv"
PRETRAINED_MODEL = "../models/dl.h5"

# --- Labels ------------------------------------------------------------------
WHOLE_CLASSES = [
    'benign',
    'm_scan', 'm_ack', 'm_syn', 'm_udp', 'm_udpplain',
    'g_combo', 'g_junk', 'g_scan', 'g_tcp', 'g_udp',
]
NB_CLASSES = len(WHOLE_CLASSES)
# When True, labels collapse to a benign / not-benign flag in preprocessor().
BINARY_MODE = False

# --- Training schedule -------------------------------------------------------
BATCH_SIZE = 64
NB_EPOCHES = 1         # passed to Dataset.repeat() for each client dataset
NB_ROUNDS = 20         # default number of federated rounds
MX_RECORDS = 100_000   # rows sampled per client per round

In [None]:
# Preprocessing artefacts shared by every preprocessor() call: `sc` is used for
# its .transform() on the feature columns and `le` for its .transform() on the
# labels (presumably a fitted StandardScaler / LabelEncoder - verify against
# whatever produced these files).
# A `global` statement is a no-op at module level, so plain assignment is used.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only point these paths at artefacts you trust.
sc = load('../models/std_scaler.bin')
le = load('../models/lab_encoder.bin')


def preprocessor(_client_data: pd.DataFrame):
    """Shuffle one client's rows and turn them into model-ready arrays.

    Args:
        _client_data: Frame with a ``"type"`` label column; every other
            column is treated as a feature and passed to ``sc.transform``.

    Returns:
        ``(_x, _y)`` where ``_x`` is the output of ``sc.transform`` and ``_y``
        is either a one-hot array with ``NB_CLASSES`` columns or, when
        ``BINARY_MODE`` is set, a 1-D integer NumPy array in which 1 marks a
        ``"benign"`` row.
    """
    global sc, le
    _client_data = shuffle(_client_data)
    _x = _client_data.drop(["type"], axis=1)
    _y = _client_data["type"]
    _x = sc.transform(_x)

    # Fallback for the case where no encoder object is available.
    if le is None:
        le = LabelEncoder()
        le.fit(WHOLE_CLASSES)

    if BINARY_MODE:
        # Convert to a plain array: a Series would keep the shuffled row index,
        # so the positional indexing done in client_data() would select by
        # label (or raise) instead of by position.
        _y = (_y == "benign").astype(int).to_numpy()
    else:
        _y = le.transform(_y)
        _y = tf.keras.utils.to_categorical(_y, num_classes=NB_CLASSES)

    return _x, _y


def make_dataset(_x, _y):
    """Wrap features and labels in a ``tf.data.Dataset`` of ``{x, y}`` elements.

    Both inputs are converted with ``tf.constant`` and sliced along their first
    axis; each element is an ordered mapping with keys ``"x"`` then ``"y"``.
    """
    tensors = collections.OrderedDict()
    tensors["x"] = tf.constant(_x)
    tensors["y"] = tf.constant(_y)
    return tf.data.Dataset.from_tensor_slices(tensors)


def client_data(_x, _y):
    """Build one client's training pipeline from a random resample of its rows.

    ``MX_RECORDS`` row positions are drawn with ``np.random.randint`` (so rows
    can repeat), and the selected rows are shuffled, batched by ``BATCH_SIZE``
    and repeated ``NB_EPOCHES`` times.
    """
    picks = np.random.randint(len(_x), size=MX_RECORDS)
    sampled = make_dataset(_x[picks], _y[picks])
    shuffled = sampled.shuffle(MX_RECORDS)
    batched = shuffled.batch(BATCH_SIZE)
    return batched.repeat(NB_EPOCHES)

In [None]:
# Load every client's CSV, carve out the test rows, and preprocess what is left.
# (`global` is a no-op at module level, so these are plain assignments.)
raw_clients_data = []   # training-side raw DataFrame per client
processed_data = []     # (features, labels) pair per training client
test_parts = []         # test frames, concatenated once after the loop

for idx in range(NB_CLIENTS):
    # Device files are numbered from 1, client indices from 0.
    raw_content = shuffle(pd.read_csv(DATA_PATH_PATTERN % (idx + 1)))

    if TEST_CLIENT is not None:
        # Held-out mode: one whole client is the test set and never trains.
        if TEST_CLIENT == idx:
            test_parts.append(raw_content)
            continue
    else:
        # Pooled mode: every client gives up 20% of its rows to the test set.
        train_data, test_data = train_test_split(raw_content, test_size=0.2, random_state=101)
        test_parts.append(test_data)
        raw_content = train_data

    raw_clients_data.append(raw_content)
    processed_data.append(preprocessor(raw_content))

# A single concat avoids re-copying the growing test frame on every iteration;
# stays None when no test rows were collected, as before.
test_raw_data = pd.concat(test_parts) if test_parts else None

In [None]:
# One sampled tf.data pipeline per training client; the first pipeline supplies
# the element structure that model_fn() passes on as input_spec.
train_client_data = [client_data(features, labels) for features, labels in processed_data]
element_spec = train_client_data[0].element_spec

# Run the test rows through the same preprocessor as the training clients.
# Kept after the dataset construction so random draws happen in the same order.
test_x, test_y = preprocessor(test_raw_data)

In [None]:
def create_model():
    """Assemble the dense classifier and initialise it from ``PRETRAINED_MODEL``.

    Returns:
        An uncompiled ``tf.keras`` Sequential model taking 116 input features
        and ending in an ``NB_CLASSES``-way softmax, with weights loaded from
        ``PRETRAINED_MODEL``.
    """
    layers = tf.keras.layers

    # Input projection followed by group normalisation.
    stack = [
        layers.Dense(64, input_dim=116),
        layers.GroupNormalization(),
    ]
    # Three Dense + LeakyReLU stages; layers are created in the same order as
    # they appear in the network so the stored weights line up on load.
    for width in (64, 32, 32):
        stack.append(layers.Dense(width))
        stack.append(layers.LeakyReLU(alpha=0.05))
    stack.append(layers.Dense(NB_CLASSES, activation="softmax"))

    network = tf.keras.models.Sequential(stack)
    network.load_weights(PRETRAINED_MODEL)
    return network

def model_fn():
    """Build a new Keras model and wrap it as a TFF learning model.

    Each call constructs its own model via ``create_model()`` and pairs it with
    categorical cross-entropy loss and categorical accuracy, using the
    module-level ``element_spec`` as the input specification.
    """
    keras_model = create_model()
    loss_fn = tf.keras.losses.CategoricalCrossentropy()
    accuracy = tf.keras.metrics.CategoricalAccuracy()
    return tff.learning.models.from_keras_model(
        keras_model,
        input_spec=element_spec,
        loss=loss_fn,
        metrics=[accuracy],
    )

In [None]:
def _client_optimizer():
    """Return a new RMSprop optimizer constructed with its default arguments."""
    return tf.keras.optimizers.RMSprop()


# Weighted federated-averaging process; clients optimise with RMSprop.
trainer = tff.learning.algorithms.build_weighted_fed_avg(
    model_fn,
    client_optimizer_fn=_client_optimizer,
)

In [None]:
# Stand-alone Keras copy of the model, used to score the federated weights.
evaluation = create_model()
# The network ends in an NB_CLASSES-way softmax and the labels are one-hot, so
# it is compiled with the same categorical loss/metric that model_fn() uses.
# With 'binary_crossentropy', Keras resolves the 'acc' shortcut to element-wise
# binary accuracy, which overstates multi-class accuracy.
evaluation.compile(
    optimizer=tf.keras.optimizers.RMSprop(),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=[tf.keras.metrics.CategoricalAccuracy(name='acc')],
)
# Snapshot of the pretrained weights; evaluate() seeds the server state with it.
pretrained_weights = evaluation.get_weights()

In [None]:
def evaluate(nb_rounds=NB_ROUNDS):
    """Run federated training rounds starting from the pretrained weights.

    Args:
        nb_rounds: Number of federated rounds to execute.

    Returns:
        ``(state, history)``: the trainer state after the last round and a
        list holding each round's ``metrics['client_work']['train']`` entry.

    Side effects:
        Prints one line per round and copies the current trainable weights
        into the module-level ``evaluation`` model after every round.
    """
    initial_weights = tff.learning.models.ModelWeights(pretrained_weights, [])
    state = trainer.set_model_weights(trainer.initialize(), initial_weights)
    history = []

    for _ in range(nb_rounds):
        round_start = time.time()
        # Each round draws a fresh random sample of every client's rows.
        round_datasets = [client_data(_x, _y) for _x, _y in processed_data]
        state, metrics = trainer.next(state, round_datasets)
        train_metrics = metrics['client_work']['train']
        history.append(train_metrics)
        elapsed = time.time() - round_start
        print('train metrics {m}, round time {t:.2f} seconds'.format(m=train_metrics, t=elapsed))
        # Keep the Keras model in sync; no per-round test evaluation is run here.
        evaluation.set_weights(trainer.get_model_weights(state).trainable)

    return state, history

In [None]:
# Run the federated rounds with the default round count; keeps the final
# trainer state and the per-round training metrics.
state, history = evaluate()

In [None]:
# Copy the final federated weights into the Keras model and score it on the
# held-out rows.
final_weights = trainer.get_model_weights(state)
evaluation.set_weights(final_weights.trainable)
loss, acc = evaluation.evaluate(test_x, test_y)

In [None]:
# Report the metric value returned by evaluation.evaluate() on the test set.
print(f'Test Accuracy: { acc }')

In [None]:
# Predicted class probabilities for every test row.
pred_y = evaluation.predict(test_x)

# Despite its name, roc_matrix holds confusion counts:
# row = predicted class index, column = true class index.
# Assumes test_y is one-hot with one row per sample (BINARY_MODE off).
corr_idx = np.argmax(test_y, axis=1)
pred_idx = np.argmax(pred_y, axis=1)

roc_matrix = np.zeros((NB_CLASSES, NB_CLASSES))
# np.add.at accumulates repeated (pred, true) pairs; a fancy-indexed `+= 1`
# would count each distinct pair only once.
np.add.at(roc_matrix, (pred_idx, corr_idx), 1)

In [None]:
# Heatmap of the counts in roc_matrix. Rows are predicted class indices and
# columns are true class indices, matching how the matrix was filled.
fig, ax = plt.subplots()
sns.heatmap(roc_matrix, ax=ax)
# Labels are set after heatmap() because seaborn sets the axis labels itself.
ax.set_xlabel("True class index")
ax.set_ylabel("Predicted class index");