In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import torch
from torch_geometric.data import Data
import os
import torch.nn.functional as F
import json 
import warnings
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from torch_geometric.loader import NeighborLoader

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# Every model and graph tensor in this notebook is moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
from pprint import pprint
import gzip
from sklearn.manifold import TSNE
import json
import copy
import os

In [None]:
# Number of GNN models kept per client: init_gnns() builds this many, and
# define_categories() splits the process IDs into num_of_ctg - 1 buckets.
num_of_ctg = 10
# Number of federated rounds executed by the training loop.
learning_rounds = 10
# Local training epochs per model per round (read by train_gnn_func).
epochs = 20
# Client identifiers; they are the keys of data_cache.
hosts = ['cadets','theia','trace']

In [None]:
def prepare_graph(df):
    """Turn an event table into node documents, node labels and an edge list.

    Each row of ``df`` describes one event from an actor to an object and must
    provide the columns ``actorID``, ``actor_type``, ``objectID``, ``object``,
    ``action``, ``exec`` and ``path``.

    Returns a 6-tuple:
        features    -- one token list per node; every event a node takes part
                       in contributes ``exec``, ``action`` and, when it is not
                       the empty string, ``path``
        feat_labels -- integer type code per node (0 process, 1 file, 2 netflow)
        edge_index  -- ``[sources, destinations]`` as positions into ``mapp``
        mapp        -- node IDs in order of first appearance
        all_procs   -- IDs that appeared as an actor, in order of first appearance
        idx_to_proc -- ``{node position: node ID}`` for those actor nodes (int keys)

    Raises KeyError when ``actor_type`` / ``object`` is not one of the three
    known node types.
    """
    type_codes = {'SUBJECT_PROCESS': 0, 'FILE_OBJECT_FILE': 1, 'NetFlowObject': 2}

    nodes = {}
    labels = {}
    edges = []
    actors = set()

    # to_dict("records") materialises every row once; indexing with
    # df.iloc[i] builds a fresh Series per row and is far slower.
    for row in df.to_dict("records"):
        action = row["action"]
        tokens = [row['exec'], action]
        if row['path'] != '':
            tokens.append(row['path'])

        actorid = row["actorID"]
        nodes.setdefault(actorid, []).extend(tokens)
        labels[actorid] = type_codes[row['actor_type']]

        # The object receives the same tokens as the actor for this event.
        objectid = row["objectID"]
        nodes.setdefault(objectid, []).extend(tokens)
        labels[objectid] = type_codes[row['object']]

        edges.append((actorid, objectid))
        actors.add(actorid)

    # dicts preserve insertion order, so a node's position is the order in
    # which it was first seen.
    mapp = list(nodes)
    features = list(nodes.values())
    feat_labels = [labels[node_id] for node_id in mapp]
    index = {node_id: position for position, node_id in enumerate(mapp)}

    edge_index = [
        [index[src] for src, _ in edges],
        [index[dst] for _, dst in edges],
    ]

    idx_to_proc = {position: node_id for position, node_id in enumerate(mapp) if node_id in actors}
    # Built from mapp rather than from a set so the order is reproducible
    # across kernel restarts.
    all_procs = [node_id for node_id in mapp if node_id in actors]

    return features,feat_labels,edge_index,mapp,all_procs,idx_to_proc

In [None]:
from torch_geometric.nn import GCNConv
from torch_geometric.nn import SAGEConv, GATConv
import torch.nn.functional as F
import torch.nn as nn

class GCN(torch.nn.Module):
    """Two SAGEConv layers (in -> 32 -> 20) followed by a linear classification head.

    ``forward`` returns per-node class probabilities (softmax over dim 1);
    ``encode`` returns the 20-dimensional output of the second convolution.
    """

    def __init__(self,in_channel,out_channel):
        super().__init__()
        self.conv1 = SAGEConv(in_channel, 32, normalize=True)
        self.conv2 = SAGEConv(32, 20, normalize=True)
        self.linear = nn.Linear(in_features=20, out_features=out_channel)

    def forward(self, x: torch.Tensor, edge_index: torch.Tensor) -> torch.Tensor:
        """Encode the nodes, apply the linear head and normalise with softmax."""
        scores = self.linear(self.encode(x, edge_index))
        return F.softmax(scores, dim=1)

    def encode(self, x: torch.Tensor, edge_index: torch.Tensor) -> torch.Tensor:
        """Run conv1 -> ReLU -> dropout(0.5, training only) -> conv2."""
        hidden = F.relu(self.conv1(x, edge_index))
        hidden = F.dropout(hidden, p=0.5, training=self.training)
        return self.conv2(hidden, edge_index)

    def freeze_conv_layers(self):
        """Stop gradient updates for both convolution layers; the linear head is untouched."""
        for layer in (self.conv1, self.conv2):
            for weight in layer.parameters():
                weight.requires_grad = False

In [None]:
def add_attributes(d,p):
    """Enrich the event table ``d`` with ``exec`` and ``path`` read from a JSON-lines log.

    Parameters:
        d -- DataFrame with at least ``actorID``, ``objectID``, ``action`` and
             ``timestamp`` columns.
        p -- path of a file holding one JSON document per line; only lines
             containing the text "EVENT" are parsed.

    Every parsed event yields one record for its ``predicateObject`` and, when
    a ``predicateObject2`` UUID is present, a second record for that object
    (using ``predicateObject2Path`` as its path). A field that is absent
    becomes the empty string.

    Returns the inner join of ``d`` (cast to str) with those records on
    ``actorID``, ``objectID``, ``action`` and ``timestamp``, de-duplicated.
    The result is empty when the file contains no events.
    """
    event_key = 'com.bbn.tc.schema.avro.cdm18.Event'
    uuid_key = 'com.bbn.tc.schema.avro.cdm18.UUID'
    columns = ['actorID', 'objectID', 'action', 'timestamp', 'exec', 'path']
    absent = object()

    def dig(record, keys, default=''):
        # Walk nested containers; a missing key or a null intermediate level
        # yields `default` instead of raising.
        node = record
        try:
            for key in keys:
                node = node[key]
        except (KeyError, IndexError, TypeError):
            return default
        return node

    # The context manager closes the log even if a line fails to parse.
    with open(p) as f:
        data = [json.loads(line) for line in f if "EVENT" in line]

    info = []
    for record in data:
        event = dig(record, ('datum', event_key), default=None)

        action = dig(event, ('type',))
        actor = dig(event, ('subject', uuid_key))
        obj = dig(event, ('predicateObject', uuid_key))
        timestamp = dig(event, ('timestampNanos',))
        cmd = dig(event, ('properties', 'map', 'exec'))
        path = dig(event, ('predicateObjectPath', 'string'))
        path2 = dig(event, ('predicateObject2Path', 'string'))

        # The secondary object is optional; it is emitted before the primary one.
        obj2 = dig(event, ('predicateObject2', uuid_key), default=absent)
        if obj2 is not absent:
            info.append({'actorID':actor,'objectID':obj2,'action':action,'timestamp':timestamp,'exec':cmd, 'path':path2})

        info.append({'actorID':actor,'objectID':obj,'action':action,'timestamp':timestamp,'exec':cmd, 'path':path})

    # Passing the column names keeps the merge keys present when `info` is empty.
    rdf = pd.DataFrame(info, columns=columns).astype(str)
    d = d.astype(str)

    return d.merge(rdf,how='inner',on=['actorID','objectID','action','timestamp']).drop_duplicates()

In [None]:
def combine_word2vec_models(models):
    """Merge several Word2Vec models into one by averaging vectors of shared tokens.

    The result takes its hyper-parameters (vector_size, window, min_count, sg)
    and initial vocabulary from ``models[0]``. Each later model is folded in:
    tokens already known are replaced by the mean of the current and incoming
    vector, tokens seen for the first time are copied over unchanged.

    ``models`` must contain at least one model.
    """
    base = models[0]
    unified_model = Word2Vec(vector_size=base.vector_size, window=base.window, min_count=base.min_count, sg=base.sg)
    unified_model.build_vocab([list(base.wv.index_to_key)])

    # Seed the merged model with the first model's vectors.
    for token in unified_model.wv.index_to_key:
        unified_model.wv[token] = base.wv[token]

    for incoming in models[1:]:
        incoming_vocab = set(incoming.wv.index_to_key)
        new_tokens = incoming_vocab - set(unified_model.wv.index_to_key)

        unified_model.build_vocab([list(new_tokens)], update=True)

        # The overlap is computed after the vocabulary update, so tokens that
        # were just admitted are averaged here and then overwritten below.
        overlap = incoming_vocab.intersection(set(unified_model.wv.index_to_key))
        for token in overlap:
            unified_model.wv[token] = (unified_model.wv[token] + incoming.wv[token]) / 2.0
        for token in new_tokens:
            unified_model.wv[token] = incoming.wv[token]

    return unified_model

In [None]:
from gensim.models.callbacks import CallbackAny2Vec
import gensim
from gensim.models import Word2Vec
from multiprocessing import Pool
from itertools import compress
from tqdm import tqdm
import time

In [None]:
class EpochSaver(CallbackAny2Vec):
    """Training callback that saves the model to ``filename`` after every epoch.

    The same path is reused each time, so only the latest epoch is kept.
    """

    def __init__(self,filename):
        self.filename = filename
        self.epoch = 0

    def on_epoch_end(self, model):
        """Persist ``model`` and advance the epoch counter."""
        model.save(self.filename)
        self.epoch = self.epoch + 1

In [None]:
class EpochLogger(CallbackAny2Vec):
    """Training callback that prints a line when each epoch starts and ends."""

    def __init__(self):
        self.epoch = 0

    def on_epoch_begin(self, model):
        """Announce the epoch that is about to run."""
        current = self.epoch
        print("Epoch #{} start".format(current))

    def on_epoch_end(self, model):
        """Announce the finished epoch, then advance the counter."""
        current = self.epoch
        print("Epoch #{} end".format(current))
        self.epoch = current + 1

In [None]:
def train_client_word2vec_model(df, save_path=None, callbacks=None):
    """Train a client-side Word2Vec model on the node documents of ``df``.

    Parameters:
        df        -- event table accepted by prepare_graph.
        save_path -- optional file path; when given (and ``callbacks`` is not),
                     an EpochSaver writes the model there after every epoch.
        callbacks -- optional explicit list of gensim callbacks; overrides the
                     default EpochLogger / EpochSaver pair.

    Returns the trained Word2Vec model (30-dimensional vectors, 100 epochs).
    """
    # Only the token lists are needed here; the graph structure is ignored.
    phrases = prepare_graph(df)[0]

    # Build the callbacks locally: no module-level `saver` / `logger` objects
    # are defined in this notebook.
    if callbacks is None:
        callbacks = [EpochLogger()]
        if save_path is not None:
            callbacks.insert(0, EpochSaver(save_path))

    return Word2Vec(sentences=phrases, vector_size=30, window=5, min_count=1, workers=5, epochs=100, callbacks=callbacks)

In [None]:
from cryptography.fernet import Fernet
import base64

def generate_key():
    """Return a newly generated Fernet key."""
    fresh_key = Fernet.generate_key()
    return fresh_key

def encrypt_word2vec_model(word2vec_model, encryption_key):
    """Copy a Word2Vec model, replacing every vocabulary token by its Fernet ciphertext.

    The vectors themselves are copied unchanged; only the keys are encrypted.
    Returns a new model with the same ``vector_size``.

    NOTE(review): Fernet tokens appear to be randomised per call, so the same
    word encrypted for two clients would not collide in the merged vocabulary
    -- confirm this is acceptable for the averaging step.
    """
    cipher = Fernet(encryption_key)
    encrypted_model = gensim.models.Word2Vec(vector_size=word2vec_model.vector_size, min_count=1)

    plain_vectors = word2vec_model.wv
    for token in plain_vectors.index_to_key:
        ciphertext = cipher.encrypt(token.encode()).decode()
        encrypted_model.wv[ciphertext] = plain_vectors.get_vector(token)

    return encrypted_model

def decrypt_word2vec_model(word2vec_model, encryption_key):
    """Copy a Word2Vec model, replacing every encrypted vocabulary token by its plaintext.

    Inverse of encrypt_word2vec_model: vectors are copied unchanged and each
    key is decrypted with ``encryption_key``. Returns a new model with the
    same ``vector_size``.
    """
    cipher = Fernet(encryption_key)
    decrypted_model = gensim.models.Word2Vec(vector_size=word2vec_model.vector_size, min_count=1)

    cipher_vectors = word2vec_model.wv
    for token in cipher_vectors.index_to_key:
        plaintext = cipher.decrypt(token.encode()).decode()
        decrypted_model.wv[plaintext] = cipher_vectors.get_vector(token)

    return decrypted_model

In [None]:
# One shared key is used to encrypt every client's vocabulary and to decrypt
# the merged model afterwards.
key = generate_key()

# Load each client's saved Word2Vec model from the working directory and
# replace its vocabulary tokens by ciphertext before aggregation.
word_models = []
for m in ['cadets','theia','trace']:
    word2vec = Word2Vec.load(f"word2vec_{m}_E3.model")
    encrypted_model = encrypt_word2vec_model(word2vec,key)
    word_models.append(encrypted_model)

# Merge the encrypted models, restore the plaintext vocabulary and persist the
# result; later cells reload it from this file.
global_word = combine_word2vec_models(word_models)
global_word = decrypt_word2vec_model(global_word,key)
global_word.save("global_word2vec_E3.model")

In [None]:
from sklearn.utils import class_weight
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss

In [None]:
from collections import Counter
# Global embedding model read by infer(); run_evaluation() reloads it as well.
word2vec = Word2Vec.load("global_word2vec_E3.model")

def infer(doc):
    """Embed a token list with the global ``word2vec`` model.

    Each known token contributes its vector multiplied by the number of times
    it occurs in ``doc``; the sum is divided by the number of *distinct* known
    tokens. Tokens missing from the vocabulary are ignored.

    Returns a 1-D numpy array whose length is the model's vector size. When no
    token of ``doc`` is known (including an empty ``doc``) the zero vector is
    returned instead of dividing by zero.
    """
    global word2vec
    vectors = word2vec.wv

    emb = np.zeros(vectors.vector_size)
    count = 0
    for token, occurrences in Counter(doc).items():
        if token in vectors:
            emb = emb + vectors[token] * occurrences
            count = count + 1

    # 0 / 0 would fill the embedding with NaN and poison the GNN input.
    if count == 0:
        return emb
    return emb / count

In [None]:
def init_gnns():
    """Create ``num_of_ctg`` independent GCN(30, 3) models on the active device."""
    global num_of_ctg
    return [GCN(30,3).to(device) for _ in range(num_of_ctg)]

In [None]:
def define_categories(pids):
    """Split the distinct ``pids`` into ``num_of_ctg - 1`` near-equal sets.

    Bucket sizes differ by at most one; the first ``len % n`` buckets get the
    extra element. Which IDs land in which bucket follows the iteration order
    of ``set(pids)``.
    """
    global num_of_ctg
    bucket_count = num_of_ctg - 1
    unique = list(set(pids))
    base, extra = divmod(len(unique), bucket_count)

    buckets = []
    start = 0
    for position in range(bucket_count):
        size = base + 1 if position < extra else base
        buckets.append(set(unique[start:start + size]))
        start += size
    return buckets

In [None]:
def map_pids_to_category_indices(pids, categories):
    """Map each process ID to the index of the first category that contains it.

    Parameters:
        pids       -- iterable of hashable process IDs.
        categories -- sequence of collections (sets or lists) of process IDs.

    Returns ``{pid: category_index}`` in order of first appearance in ``pids``;
    IDs that belong to no category are left out.
    """
    # Invert the categories once instead of scanning every category for every
    # pid; setdefault keeps the lowest index when an ID occurs in several.
    first_category = {}
    for category_index, members in enumerate(categories):
        for member in members:
            first_category.setdefault(member, category_index)

    return {pid: first_category[pid] for pid in pids if pid in first_category}

In [None]:
from torch.nn import CrossEntropyLoss
import copy

# Freshly initialised models, one per slot. train_gnn_func deep-copies them as
# each client's starting point and server_aggregate deep-copies them as the
# aggregation targets.
templates = init_gnns()

def _out_neighbors(edge_index, nodes):
    """Return the distinct destination indices of edges whose source is in ``nodes``."""
    found = set()
    for node in nodes:
        found.update(edge_index[1, edge_index[0] == node].tolist())
    return torch.tensor(list(found), dtype=torch.long)


def train_gnn_func(nodes,labels,edges,mapp,pids,idx_to_pid):
    """Train one client's set of local GNN models for the current round.

    Parameters:
        nodes      -- per-node feature vectors (one row per entry of ``mapp``).
        labels     -- integer class label per node.
        edges      -- ``[sources, destinations]`` node positions.
        mapp       -- node IDs by position.
        pids       -- process IDs of this client.
        idx_to_pid -- node position -> process ID; positions may be int keys
                      (as produced by prepare_graph) or their str form.

    Uses the globals ``categories``, ``epochs``, ``templates`` and ``device``.

    Returns a list with one entry per template model. Entry ``i`` (all but the
    last) is trained on the process nodes of category ``i`` plus part of their
    neighbourhood, or is ``None`` when this client has no process in that
    category. The last entry is trained full-batch on the whole graph.

    NOTE(review): GCN.forward already applies softmax and CrossEntropyLoss
    applies log-softmax again -- confirm the double normalisation is intended.
    """
    global categories ,epochs

    pid_to_gnn_index = map_pids_to_category_indices(pids, categories)
    set_pids = set(pids)
    proc_index = [i for i in range(len(mapp)) if mapp[i] in set_pids]

    train_splits = [[] for _ in range(len(categories))]
    for i in proc_index:
        # prepare_graph keys this mapping by int; fall back to the str form for
        # mappings whose keys were stringified (e.g. after a JSON round trip).
        pname = idx_to_pid[i] if i in idx_to_pid else idx_to_pid[str(i)]
        train_splits[pid_to_gnn_index[pname]].append(int(i))

    local_models = [copy.deepcopy(x) for x in templates]

    # The graph is identical for every model, so build it once. Going through
    # np.array avoids the slow list-of-arrays tensor construction.
    graph = Data(
        x=torch.tensor(np.array(nodes), dtype=torch.float).to(device),
        y=torch.tensor(labels, dtype=torch.long).to(device),
        edge_index=torch.tensor(edges, dtype=torch.long).to(device),
    )

    for i in range(len(local_models)-1):

        if len(train_splits[i]) == 0:
            # Nothing to learn for this category on this client.
            local_models[i] = None
            continue

        model = local_models[i]

        # Resume from the previous round's aggregated weights when available.
        checkpoint = f"target_e3_global{i}.pth"
        if os.path.exists(checkpoint):
            model.load_state_dict(torch.load(checkpoint, map_location=device))

        optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
        criterion = CrossEntropyLoss()

        mask = torch.zeros(graph.num_nodes, dtype=torch.bool)
        mask[train_splits[i]] = True

        # NOTE(review): the one-hop neighbours are only used to reach the
        # second hop and are not themselves added to the mask -- confirm.
        one_hop_neighbors = _out_neighbors(graph.edge_index, train_splits[i])
        two_hop_neighbors = _out_neighbors(graph.edge_index, one_hop_neighbors)
        two_hop_neighbors = two_hop_neighbors[~mask[two_hop_neighbors]]
        mask[two_hop_neighbors] = True

        for epoch in range(epochs):
            loader = NeighborLoader(graph, num_neighbors=[-1,-1], batch_size=5000,input_nodes=mask)
            total_loss = 0
            for subg in loader:
                model.train()
                optimizer.zero_grad()
                out = model(subg.x, subg.edge_index)
                loss = criterion(out, subg.y)
                loss.backward()
                optimizer.step()
                total_loss += loss.item() * subg.batch_size
            print("Loss: ", total_loss / mask.sum().item(), '\n')

    # The final slot is a catch-all model trained full-batch on every node.
    optimizer = torch.optim.Adam(local_models[-1].parameters(), lr=0.01, weight_decay=5e-4)
    criterion = CrossEntropyLoss()

    for epoch in range(epochs):
        local_models[-1].train()
        optimizer.zero_grad()
        out = local_models[-1](graph.x, graph.edge_index)
        loss = criterion(out, graph.y)
        loss.backward()
        optimizer.step()
        print(f"Epoch: {epoch}, Loss: {loss.item()}")

    return local_models

In [None]:
# Module-level state filled in by load_clients_data():
# process IDs collected across all clients
procs_total = []
# client name -> [docs, labels, edges, mapp, pids, idx_to_pid]
data_cache = {}
# list of process-ID groups, one per category model (None until data is loaded)
categories = None

def generate_key():
    """Return a newly generated Fernet key.

    NOTE: this redefines, with the same body, the ``generate_key`` declared in
    the Word2Vec-encryption cell earlier in the notebook.
    """
    fresh_key = Fernet.generate_key()
    return fresh_key

def encrypt_data(data, key):
    """Encrypt the string form of every item in ``data``; returns a list of Fernet tokens (bytes)."""
    cipher = Fernet(key)
    tokens = []
    for item in data:
        tokens.append(cipher.encrypt(str(item).encode()))
    return tokens

def decrypt_data(nested_data, key):
    """Decrypt a collection of token groups; returns a list of lists of plaintext strings."""
    cipher = Fernet(key)
    decoded = []
    for group in nested_data:
        decoded.append([cipher.decrypt(token).decode() for token in group])
    return decoded

def load_clients_data():
    """Load every client's training graph and derive the shared process categories.

    For each client the tab-separated training file is read, enriched with
    ``exec`` / ``path`` attributes from the matching JSON log, restricted to
    process actors and file / netflow / process objects, and converted with
    prepare_graph.

    Side effects (module globals):
        data_cache[name] -- [docs, labels, edges, mapp, pids, idx_to_pid]
        procs_total      -- process IDs of all clients; rebuilt from scratch on
                            every call so re-running the cell does not
                            accumulate duplicates
        categories       -- groups of process IDs produced by
                            define_categories on the encrypted IDs, decrypted
                            back to plaintext
    """
    global data_cache,categories,procs_total

    # client name -> (training events, attribute log)
    sources = {
        'cadets': ('content/darpatc/cadets_train.txt', "content/ta1-cadets-e3-official.json.1"),
        'theia': ("content/darpatc/theia_train.txt", "content/ta1-theia-e3-official-1r.json"),
        'trace': ("content/darpatc/trace_train.txt", "content/ta1-trace-e3-official-1.json"),
    }

    key = generate_key()
    procs_total = []

    for name, (train_file, attribute_file) in sources.items():
        # The context manager closes the file once its content has been read.
        with open(train_file) as f:
            data = [line.split('\t') for line in f.read().split('\n')]

        df = pd.DataFrame (data, columns = ['actorID', 'actor_type','objectID','object','action','timestamp'])
        # Short rows (such as the empty trailing line) leave None cells and are dropped here.
        df = df.dropna()
        df = df.sort_values(by='timestamp', ascending=True)
        df = add_attributes(df,attribute_file)
        df = df[df['actor_type'] == 'SUBJECT_PROCESS']
        df = df[df['object'].isin(['FILE_OBJECT_FILE', 'NetFlowObject', 'SUBJECT_PROCESS'])]

        docs,labels,edges,mapp,pids,idx_to_pid = prepare_graph(df)
        data_cache[name] = [docs,labels,edges,mapp,pids,idx_to_pid]
        procs_total = procs_total + pids

    # Categories are computed on encrypted IDs and decrypted afterwards.
    encrypted_procs_total = encrypt_data(procs_total, key)
    categories = define_categories(encrypted_procs_total)
    categories = decrypt_data(categories, key)

In [None]:
def client_handling_loop(client_id):
    """Embed the cached documents of one client and train its local GNN models.

    Returns whatever train_gnn_func returns for that client.
    """
    docs,labels,edges,mapp,pids,idx_to_pid = data_cache[client_id]

    # One embedding per node document, in node order.
    nodes_feat = [infer(doc) for doc in docs]

    return train_gnn_func(nodes_feat,labels,edges,mapp,pids,idx_to_pid)

In [None]:
def server_aggregate(all_models):
    """Average the clients' models slot by slot and save each aggregated model.

    ``all_models[l]`` holds the models every client trained for slot ``l``
    (``None`` where a client skipped the slot). For each slot with at least
    one trained model, every state-dict entry is replaced by the element-wise
    mean across clients and the result is written to
    ``target_e3_global{l}.pth``. Slots nobody trained keep the template
    weights and are not saved.

    Returns the list of aggregated models (deep copies of ``templates``).
    """
    aggregated = copy.deepcopy(templates)

    for slot, candidates in enumerate(all_models):
        trained = [model for model in candidates if model is not None]
        if len(trained) == 0:
            continue

        client_states = [model.state_dict() for model in trained]
        merged = aggregated[slot].state_dict()
        for name in merged.keys():
            merged[name] = torch.stack([state[name] for state in client_states], 0).mean(0)

        aggregated[slot].load_state_dict(merged)
        torch.save(aggregated[slot].state_dict(), f"target_e3_global{slot}.pth")

    return aggregated

In [None]:
def perform_federated_learning(n_clients):
    """Run one local training pass per client; returns the per-client model lists in order.

    ``n_clients`` is an iterable of client identifiers (keys of data_cache).
    """
    return [client_handling_loop(client) for client in n_clients]

In [None]:
# Populate data_cache, procs_total and categories before training starts.
load_clients_data()

In [None]:
# Federated training: in every round each host trains its local models, then
# the server averages them per slot.
for r in range(learning_rounds):
    client_models = perform_federated_learning(hosts)
    # Transpose from [client][slot] to [slot][client] for aggregation.
    arranged_models =  [list(group) for group in zip(*client_models)]
    global_models = server_aggregate(arranged_models)

In [None]:
def Validate(graph, model):
    """Return the normalised confidence of every node the model misclassifies.

    Parameters:
        graph -- object exposing ``x``, ``edge_index`` and ``y`` tensors.
        model -- module called as ``model(graph.x, graph.edge_index)`` that
                 returns one score row per node.

    Confidence is the relative gap between the two highest scores of a node,
    shifted by the minimum gap and divided by the maximum gap. The result is
    a plain list with one float per misclassified node.
    """
    # The graph passed by the caller is evaluated as-is; it is no longer
    # rebuilt from notebook globals.
    model.eval()
    with torch.no_grad():
        out = model(graph.x, graph.edge_index)

    ranked, indices = out.sort(dim=1, descending=True)
    conf = (ranked[:, 0] - ranked[:, 1]) / ranked[:, 0]
    conf = (conf - conf.min()) / conf.max()

    pred = indices[:, 0]
    wrong = pred != graph.y

    return conf[wrong].tolist()

In [None]:
from itertools import compress
from torch_geometric import utils

def construct_neighborhood(ids,mapp,edges,hops):
    """Collect the node IDs reachable from ``ids`` within ``hops`` undirected steps.

    Parameters:
        ids   -- iterable of starting node IDs.
        mapp  -- node IDs by position.
        edges -- ``[sources, destinations]`` node positions.
        hops  -- number of expansion steps; 0 (or less) yields an empty set.

    Returns the union of the neighbour sets found at each step. Starting IDs
    appear in the result only when an edge leads back to them.
    """
    # A set keeps the membership tests O(1) even when the caller passes a list.
    frontier = set(ids)
    reached = set()

    for _ in range(hops):
        if not frontier:
            break
        next_frontier = set()
        for src, dst in zip(edges[0], edges[1]):
            src_id = mapp[src]
            dst_id = mapp[dst]
            # Edges are followed in both directions.
            if src_id in frontier:
                next_frontier.add(dst_id)
            if dst_id in frontier:
                next_frontier.add(src_id)
        reached |= next_frontier
        # The next step expands only from the nodes found in this one.
        frontier = next_frontier

    return reached

In [None]:
def helper(MP,all_pids,GP,edges,mapp):
    """Print neighbourhood-adjusted detection metrics and return the adjusted TP / FP sets.

    Parameters:
        MP       -- set of node IDs flagged by the model.
        all_pids -- set of all node IDs.
        GP       -- set of ground-truth malicious node IDs.
        edges    -- ``[sources, destinations]`` node positions.
        mapp     -- node IDs by position.

    Adjustment rules:
        * a false positive within two hops of a ground-truth node is not counted;
        * a missed ground-truth node within two hops of a true positive counts
          as detected.

    Returns ``(TPL, FPL)``: the adjusted true-positive and false-positive sets.
    A ratio whose denominator is zero is reported as 0.0 instead of raising.
    """
    def ratio(numerator, denominator):
        return numerator / denominator if denominator else 0.0

    raw_tp = MP.intersection(GP)
    raw_fp = MP - GP
    raw_fn = GP - MP
    true_negatives = all_pids - (GP | MP)

    two_hop_gp = construct_neighborhood(GP,mapp,edges,2)
    two_hop_tp = construct_neighborhood(raw_tp,mapp,edges,2)
    FPL = raw_fp - two_hop_gp
    TPL = raw_tp.union(raw_fn.intersection(two_hop_tp))
    missed = raw_fn - two_hop_tp

    TP,FP,FN,TN = len(TPL),len(FPL),len(missed),len(true_negatives)

    print(f"Number of True Positives: {TP}")
    print(f"Number of Fasle Positives: {FP}")
    print(f"Number of False Negatives: {FN}")
    print(f"Number of True Negatives: {TN}\n")

    prec = ratio(TP, TP + FP)
    print(f"Precision: {prec}")

    rec = ratio(TP, TP + FN)
    print(f"Recall: {rec}")

    fscore = ratio(2*prec*rec, prec + rec)
    print(f"Fscore: {fscore}\n")

    return TPL,FPL

In [None]:
# client name -> [docs, labels, edges, mapp, pids, idx_to_pid] for the test split;
# filled by load_data_test() and read by run_evaluation().
data_cache_mal = {}
def load_data_test():
    """Load every client's test graph into ``data_cache_mal``.

    For each client the tab-separated test file is read, enriched with
    ``exec`` / ``path`` attributes from the matching JSON log, restricted to
    process actors and file / netflow / process objects, and converted with
    prepare_graph. The result is stored as
    ``data_cache_mal[name] = [docs, labels, edges, mapp, pids, idx_to_pid]``.
    """
    # client name -> (test events, attribute log)
    sources = {
        'cadets': ('content/darpatc/cadets_test.txt', "content/ta1-cadets-e3-official-2.json"),
        'theia': ("content/darpatc/theia_test.txt", "content/ta1-theia-e3-official-6r.json.8"),
        'trace': ("content/darpatc/trace_test.txt", "content/ta1-trace-e3-official-1.json.4"),
    }

    for name, (test_file, attribute_file) in sources.items():
        # The context manager closes the file once its content has been read.
        with open(test_file) as f:
            data = [line.split('\t') for line in f.read().split('\n')]

        df = pd.DataFrame (data, columns = ['actorID', 'actor_type','objectID','object','action','timestamp'])
        # Short rows (such as the empty trailing line) leave None cells and are dropped here.
        df = df.dropna()
        df = df.sort_values(by='timestamp', ascending=True)

        df = add_attributes(df,attribute_file)

        df = df[df['actor_type'] == 'SUBJECT_PROCESS']
        df = df[df['object'].isin(['FILE_OBJECT_FILE', 'NetFlowObject', 'SUBJECT_PROCESS'])]

        docs,labels,edges,mapp,pids,idx_to_pid = prepare_graph(df)
        data_cache_mal[name] = [docs,labels,edges,mapp,pids,idx_to_pid]

In [None]:
# Populate data_cache_mal with the test graphs of all clients.
load_data_test()

In [None]:
def run_evaluation(data_name,thresh=0.8):
    """Evaluate the aggregated models on one client's test graph.

    Parameters:
        data_name -- client name; key of ``data_cache_mal`` and stem of the
                     ground-truth file ``{data_name}.txt`` (one node ID per line).
        thresh    -- minimum normalised confidence for a correct prediction to
                     clear a node.

    Every node starts out flagged. Each saved global model
    (``target_e3_global{i}.pth`` in the working directory, where
    server_aggregate writes them) clears the nodes whose type it predicts
    correctly with confidence >= ``thresh``. Nodes still flagged after all
    models have voted are reported as alerts and scored by helper().

    Returns ``(TPL, FPL, mapp_to_labels)``: adjusted true positives, adjusted
    false positives, and the node-ID -> label mapping.
    """
    global word2vec

    phrases,labels,edges,mapp,pids,idx_to_pid = data_cache_mal[data_name]
    all_ids = set(mapp)

    # Keep only ground-truth IDs that exist in this graph; membership is
    # tested against the set, not the list.
    with open(f"{data_name}.txt") as gt_file:
        GT_mal = {x for x in gt_file.read().split("\n") if x in all_ids}

    model = GCN(30,3).to(device)
    word2vec = Word2Vec.load("global_word2vec_E3.model")

    nodes = np.array([infer(x) for x in phrases])

    graph = Data(x=torch.tensor(nodes,dtype=torch.float).to(device),y=torch.tensor(labels,dtype=torch.long).to(device), edge_index=torch.tensor(edges,dtype=torch.long).to(device))

    flag = torch.ones(graph.num_nodes, dtype=torch.bool)

    for m_n in range(num_of_ctg):
        checkpoint = f"target_e3_global{m_n}.pth"
        # Check the same path that is loaded, and skip slots without a saved
        # model so stale or randomly initialised weights never vote.
        if not os.path.exists(checkpoint):
            continue
        model.load_state_dict(torch.load(checkpoint, map_location=device))

        model.eval()
        with torch.no_grad():
            out = model(graph.x, graph.edge_index)

        ranked, indices = out.sort(dim=1,descending=True)
        conf = (ranked[:,0] - ranked[:,1]) / ranked[:,0]
        conf = (conf - conf.min()) / conf.max()

        pred = indices[:,0]
        cond = (pred == graph.y) & (conf >= thresh)
        # `flag` lives on the CPU, so the device mask is moved before indexing.
        flag[cond.cpu()] = False

    index = utils.mask_to_index(flag).tolist()
    ids = set([mapp[x] for x in index])
    TPL,FPL = helper(set(ids),set(all_ids),GT_mal,edges,mapp)
    mapp_to_labels = {x:y for x,y in zip(mapp,labels)}
    return TPL,FPL,mapp_to_labels

In [None]:
# Score each client's test graph; the metrics are printed by helper(), and the
# returned sets are discarded.
_ = run_evaluation('cadets')
_ = run_evaluation('theia')
_ = run_evaluation('trace')