In [1]:
'''
Importing the require libraries here
'''
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import torch
from torch_geometric.data import Data
import os
import torch.nn.functional as F
import orjson as json
import warnings
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
warnings.filterwarnings('ignore')
from torch_geometric.loader import NeighborLoader
import multiprocessing

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
%matplotlib inline

In [2]:
%pwd

'/sfs/weka/scratch/wkw9be/Notebooks_FL'

## Loading libraries and setting up working directory

In [3]:
import os
'''
Setting working directory for rivana jupyter notebook
'''
os.chdir("../content")
%pwd

'/sfs/weka/scratch/wkw9be/content'

In [4]:
'''
Importing some additional libraries
'''
from pprint import pprint
import gzip
from sklearn.manifold import TSNE
import json
import copy
import os

## Defining functions for loading, cleaning and constructing features from the data

In [5]:
'''
This is the main featurizer. It constructs the graph for the cadets dataset.

Args:
    df (DataFrame): This is the main dataframe containing all the system events from the cadets dataset.

return:
    features (list): Contains word2vec encoded feature vectors for each node
    feat_labels (list): Contains label for each node
    edge_index (list): Contains information about edges between nodes in the graph.
    mapp (list): contains id of each node
'''

def prepare_graph(df):
    nodes = {}
    labels = {}
    edges = []
        
    dummies = {
                 'FILE_OBJECT_BLOCK': 0,
                 'FILE_OBJECT_CHAR': 1,
                 'FILE_OBJECT_DIR': 2,
                 'FILE_OBJECT_FILE': 3,
                 'FILE_OBJECT_LINK': 4,
                 'FILE_OBJECT_NAMED_PIPE': 5,
                 'FILE_OBJECT_PEFILE': 6,
                 'FILE_OBJECT_UNIX_SOCKET': 7,
                 'MemoryObject': 8,
                 'NetFlowObject': 9,
                 'PRINCIPAL_LOCAL': 10,
                 'PRINCIPAL_REMOTE': 11,
                 'SRCSINK_DATABASE': 12,
                 'SRCSINK_PROCESS_MANAGEMENT': 13,
                 'SRCSINK_UNKNOWN': 14,
                 'SUBJECT_PROCESS': 15,
                 'SUBJECT_THREAD': 16,
                 'SUBJECT_UNIT': 17,
                 'UnnamedPipeObject': 18,
                 'VALUE_TYPE_SRC': 19
                }

    for i in range(len(df)):
        x = df.iloc[i]
        action = x["action"]
        
        actorid = x["actorID"]
        if not (actorid in nodes):
            nodes[actorid] =  []
        nodes[actorid].append(x['exec'])
        nodes[actorid].append(action)
        if x['path'] != '':
            nodes[actorid].append(x['path'])
        labels[actorid] = dummies[x['actor_type']]

        objectid = x["objectID"]
        if not (objectid in nodes):
            nodes[objectid] =  []
        nodes[objectid].append(x['exec'])
        nodes[objectid].append(action)
        if x['path'] != '':
             nodes[objectid].append(x['path'])
        labels[objectid] = dummies[x['object']]

        edges.append(( actorid, objectid ))

    features = []
    feat_labels = []
    edge_index = [[],[]]
    index  = {}
    mapp = []

    for k,v in nodes.items():
      features.append(v)
      feat_labels.append(labels[k])
      index[k] = len(features) - 1
      mapp.append(k)

    for x in edges:
        src = index[x[0]]
        dst = index[x[1]]

        edge_index[0].append(src)
        edge_index[1].append(dst)

    return features,feat_labels,edge_index,mapp

In [6]:
'''
Defining the model. The model consists of two sageconv layers from the paper GraphSage
'''
#from torch_geometric.nn import SAGEConv, PDNConv
from torch_geometric.nn import SAGEConv
import torch.nn.functional as F
import torch.nn as nn

class GCN(torch.nn.Module):
    def __init__(self,in_channel,out_channel):
        super().__init__()
        self.conv1 = SAGEConv(in_channel, 32, normalize=True)
        self.conv2 = SAGEConv(32, out_channel, normalize=True)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training)

        x = self.conv2(x, edge_index)
        return F.softmax(x, dim=1)

In [7]:
'''
This function helps visualize the output of the model.
'''
def visualize(h, color):
    z = TSNE(n_components=2).fit_transform(h.detach().cpu().numpy())

    plt.figure(figsize=(10,10))
    plt.xticks([])
    plt.yticks([])

    plt.scatter(z[:, 0], z[:, 1], s=70, c=color, cmap="Set2")
    plt.show()

## Adding semantic attributes from the raw cadets data

In [8]:
'''
This function is used for attributing semnatic information like process names, executable paths,
file paths etc using the raw cadets data
'''

def add_attributes(d,p):
    
    f = open(p)
    data = [json.loads(x) for x in f if "EVENT" in x]

    info = []
    for x in data:
        try:
            action = x['datum']['com.bbn.tc.schema.avro.cdm18.Event']['type']
        except:
            action = ''
        try:
            actor = x['datum']['com.bbn.tc.schema.avro.cdm18.Event']['subject']['com.bbn.tc.schema.avro.cdm18.UUID']
        except:
            actor = ''
        try:
            obj = x['datum']['com.bbn.tc.schema.avro.cdm18.Event']['predicateObject']['com.bbn.tc.schema.avro.cdm18.UUID']
        except:
            obj = ''
        try:
            timestamp = x['datum']['com.bbn.tc.schema.avro.cdm18.Event']['timestampNanos']
        except:
            timestamp = ''
        try:
            cmd = x['datum']['com.bbn.tc.schema.avro.cdm18.Event']['properties']['map']['exec']
        except:
            cmd = ''
        try:
            path = x['datum']['com.bbn.tc.schema.avro.cdm18.Event']['predicateObjectPath']['string']
        except:
            path = ''
        try:
            path2 = x['datum']['com.bbn.tc.schema.avro.cdm18.Event']['predicateObject2Path']['string']
        except:
            path2 = ''
        try:
            obj2 = x['datum']['com.bbn.tc.schema.avro.cdm18.Event']['predicateObject2']['com.bbn.tc.schema.avro.cdm18.UUID']
            info.append({'actorID':actor,'objectID':obj2,'action':action,'timestamp':timestamp,'exec':cmd, 'path':path2})
        except:
            pass

        info.append({'actorID':actor,'objectID':obj,'action':action,'timestamp':timestamp,'exec':cmd, 'path':path})

    rdf = pd.DataFrame.from_records(info).astype(str)
    d = d.astype(str)

    return d.merge(rdf,how='inner',on=['actorID','objectID','action','timestamp']).drop_duplicates()

In [9]:
def combine_word2vec_models(models):
    # Create an empty unified model
    unified_model = Word2Vec(vector_size=models[0].vector_size, window=models[0].window, min_count=models[0].min_count, sg=models[0].sg)

    # Initialize the vocabulary with the words from the first model
    unified_model.build_vocab([list(models[0].wv.index_to_key)])

    # Copy the vectors from the first model to the unified model for the initial vocabulary
    for word in unified_model.wv.index_to_key:
        unified_model.wv[word] = models[0].wv[word]

    # Iterate through the remaining models and add their unique words and vectors
    for model in models[1:]:
        # Get the set of unique words in the current model's vocabulary
        unique_words = set(model.wv.index_to_key) - set(unified_model.wv.index_to_key)

        # Add the unique words to the unified model's vocabulary
        unified_model.build_vocab([list(unique_words)], update=True)

        # Copy the vectors for the unique words from the current model to the unified model
        for word in unique_words:
            unified_model.wv[word] = model.wv[word]

    return unified_model

In [10]:
from gensim.models.callbacks import CallbackAny2Vec
import gensim
from gensim.models import Word2Vec
from multiprocessing import Pool
from itertools import compress
from tqdm import tqdm
import time

In [11]:
#word_models = []
#for m in ['cadets','theia','trace','five']:
#    word2vec = Word2Vec.load(f"word2vec_{m}_E3.model")
#    word_models.append(word2vec)

#global_word = combine_word2vec_models(word_models)
#global_word.save("../Content_FL_Exp/global_word2vec_E3.model")

In [12]:
#phrases,labels,edges,mapp = prepare_graph(df)

In [13]:
#word2vec = Word2Vec(sentences=phrases, vector_size=30, window=5, min_count=1, workers=8,epochs=300,callbacks=[saver,logger])

In [14]:
'''
Defining the train and test function in this cell 
'''
from sklearn.utils import class_weight
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss

template = GCN(30,20).to(device)

In [15]:
'''
Encoding function for running word2vec inference
'''
from collections import Counter
word2vec = Word2Vec.load("../Content_FL_Exp/global_word2vec_E3.model")

def infer(doc):
  temp = dict(Counter(doc))
  emb = np.zeros(30)
  count = 0
  for k,v in temp.items():
    if k in word2vec.wv:
      emb = emb + word2vec.wv[k]*v
      count = count + 1
  emb = emb / count
  return emb

In [16]:
import copy

def train_client_gnn(data_name):
    ################################## Training Main Model #####################################
    model = copy.deepcopy(template)
    if "e3_global.pth" in os.listdir("../Content_FL_Exp"):
        model.load_state_dict(torch.load("../Content_FL_Exp/e3_global.pth"))
        
    with open(f"../Content_FL_Exp/{data_name}_cached_train.json", "r") as json_file:
        client_data = json.load(json_file)
        
    phrases,labels,edges,mapp = client_data

    criterion = CrossEntropyLoss()

    nodes = [infer(x) for x in phrases]
    nodes = np.array(nodes)  

    graph = Data(x=torch.tensor(nodes,dtype=torch.float).to(device),y=torch.tensor(labels,dtype=torch.long).to(device), edge_index=torch.tensor(edges,dtype=torch.long).to(device))
  
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
    criterion = CrossEntropyLoss()

    graph = Data(x=torch.tensor(nodes,dtype=torch.float).to(device),y=torch.tensor(labels,dtype=torch.long).to(device), edge_index=torch.tensor(edges,dtype=torch.long).to(device))
    for epoch in range(10):
        model.train()
        optimizer.zero_grad() 
        out = model(graph.x, graph.edge_index) 
        loss = criterion(out, graph.y) 
        loss.backward() 
        optimizer.step()      
        print(f"Epoch: {epoch}, Loss: {loss.item()}")
    
    return model

In [17]:
def server_aggregate(client_models):
    global_model = copy.deepcopy(template)
    global_dict = global_model.state_dict()
    for k in global_dict.keys():
        param_list = [client_models[i].state_dict()[k] for i in range(len(client_models))]
        global_dict[k] = torch.stack(param_list, 0).mean(0)
    global_model.load_state_dict(global_dict)
    torch.save(global_model.state_dict(), "../Content_FL_Exp/e3_global.pth")
    return global_model

In [18]:
def save_processed_data(name):
    
    train_file = None
    attribute_file = None
    
    if name == 'cadets':
        train_file = 'darpatc/cadets_train.txt'
        attribute_file = "ta1-cadets-e3-official.json.1"
        
    if name == 'theia':
        train_file = "darpatc/theia_train.txt"
        attribute_file = "ta1-theia-e3-official-1r.json"
    
    if name == 'fivedirections':
        train_file = "darpatc/fivedirections_train.txt"
        attribute_file = "ta1-fivedirections-e3-official-2.json"
        
    if name == 'trace':
        train_file = "darpatc/trace_train.txt"
        attribute_file = "ta1-trace-e3-official-1.json"  
        
    f = open(train_file)
    data = f.read().split('\n')
    data = [line.split('\t') for line in data]
    df = pd.DataFrame (data, columns = ['actorID', 'actor_type','objectID','object','action','timestamp'])
    df = df.dropna()
    df.sort_values(by='timestamp', ascending=True,inplace=True)
    
    df = add_attributes(df,attribute_file)
    
    out_data = prepare_graph(df)
    
    file_path = f"../Content_FL_Exp/{name}_cached_train.json"
    with open(file_path, "w") as json_file:
        json.dump(out_data, json_file)

In [19]:
#for x in ['cadets','theia','fivedirections','trace']:
#    print(f'Processing {x}')
#    save_processed_data(x)

In [20]:
learning_rounds = 10

for r in range(learning_rounds):
    print(f"Learning Round {r}")
   
    client_models = []
    for x in ['cadets','theia','fivedirections','trace']:
        print(f"Training Client Model: {x}")
        cmodel = train_client_gnn(x)
        client_models.append(cmodel)
    
    global_model = server_aggregate(client_models)

Learning Round 0
Training Client Model: cadets
Epoch: 0, Loss: 2.997450351715088
Epoch: 1, Loss: 2.97074556350708
Epoch: 2, Loss: 2.9803810119628906
Epoch: 3, Loss: 2.9546849727630615
Epoch: 4, Loss: 2.9504528045654297
Epoch: 5, Loss: 2.9424033164978027
Epoch: 6, Loss: 2.9457931518554688
Epoch: 7, Loss: 2.9458630084991455
Epoch: 8, Loss: 2.952678680419922
Epoch: 9, Loss: 2.9443485736846924
Training Client Model: theia
Epoch: 0, Loss: 2.9973065853118896
Epoch: 1, Loss: 2.982213020324707
Epoch: 2, Loss: 2.9687869548797607
Epoch: 3, Loss: 2.9582056999206543
Epoch: 4, Loss: 2.951622247695923
Epoch: 5, Loss: 2.947148561477661
Epoch: 6, Loss: 2.944131851196289
Epoch: 7, Loss: 2.9420642852783203
Epoch: 8, Loss: 2.9401135444641113
Epoch: 9, Loss: 2.9382224082946777
Training Client Model: fivedirections
Epoch: 0, Loss: 3.0068767070770264
Epoch: 1, Loss: 2.998281240463257
Epoch: 2, Loss: 2.9892704486846924
Epoch: 3, Loss: 2.981553792953491
Epoch: 4, Loss: 2.9752674102783203
Epoch: 5, Loss: 2.969

## Evaluation of the trained GNN model starts here

In [21]:
'''
This function is used for constructing neighborhood around a given 
set of nodes for backwards or forward tracking
'''
from itertools import compress
from torch_geometric import utils

def construct_neighborhood(ids,mapp,edges,hops):
    if hops == 0:
        return set()
    else:
        neighbors = set()
        for i in range(len(edges[0])):
            if mapp[edges[0][i]] in ids:
                neighbors.add(mapp[edges[1][i]])
            if mapp[edges[1][i]] in ids:
                neighbors.add(mapp[edges[0][i]])
        return neighbors.union( construct_neighborhood(neighbors,mapp,edges,hops-1) )

In [22]:
'''
This function logs the evaluation metrics.
'''

def helper(MP,all_pids,GP,edges,mapp):

    GN = all_pids - GP
    MN = all_pids - MP

    TP = MP.intersection(GP)
    FP = MP.intersection(GN)
    FN = MN.intersection(GP)
    TN = MN.intersection(GN)
    
    two_hop_gp = construct_neighborhood(GP,mapp,edges,2)
    two_hop_tp = construct_neighborhood(TP,mapp,edges,2)
    FPL = FP - two_hop_gp
    TPL = TP.union(FN.intersection(two_hop_tp))
    FN = FN - two_hop_tp
    
    alerts = TP.union(FP)

    TP,FP,FN,TN = len(TPL),len(FPL),len(FN),len(TN)
    
    FPR = FP / (FP+TN)
    TPR = TP / (TP+FN)

    print(f"Number of True Positives: {TP}")
    print(f"Number of Fasle Positives: {FP}")
    print(f"Number of False Negatives: {FN}")
    print(f"Number of True Negatives: {TN}\n")

    prec = TP / (TP + FP)
    print(f"Precision: {prec}")

    rec = TP / (TP + FN)
    print(f"Recall: {rec}")

    fscore = (2*prec*rec) / (prec + rec)
    print(f"Fscore: {fscore}\n")
    
    #return alerts
    return TPL,FPL

In [23]:
def save_processed_data_test(name):
    
    test_file = None
    attribute_file = None
    
    if name == 'cadets':
        test_file = 'darpatc/cadets_test.txt'
        attribute_file = "ta1-cadets-e3-official-2.json"
        
    if name == 'theia':
        test_file = "darpatc/theia_test.txt"
        attribute_file = "ta1-theia-e3-official-6r.json.8"
    
    if name == 'fivedirections':
        test_file = "darpatc/fivedirections_test.txt"
        attribute_file = "ta1-fivedirections-e3-official-2.json.23"
        
    if name == 'trace':
        test_file = "darpatc/trace_test.txt"
        attribute_file = "ta1-trace-e3-official-1.json.4"
        
    f = open(test_file)
    data = f.read().split('\n')
    data = [line.split('\t') for line in data]
    df = pd.DataFrame (data, columns = ['actorID', 'actor_type','objectID','object','action','timestamp'])
    df = df.dropna()
    df.sort_values(by='timestamp', ascending=True,inplace=True)
    
    df = add_attributes(df,attribute_file)
    
    out_data = prepare_graph(df)
    
    file_path = f"../Content_FL_Exp/{name}_cached_test.json"
    with open(file_path, "w") as json_file:
        json.dump(out_data, json_file)

In [24]:
#for x in ['cadets','theia','fivedirections','trace']:
#    print(f'Processing {x}')
#    save_processed_data_test(x)

In [30]:
def run_evaluation(data_name,is_fl):    
    global word2vec
    
    with open(f"../Content_FL_Exp/{data_name}_cached_test.json", "r") as json_file:
        client_data = json.load(json_file)
        
    gt = open(f"{data_name}.txt").read()
    GT_mal = set(gt.split("\n"))

    phrases,labels,edges,mapp = client_data
    
    if is_fl:
        model = GCN(30,20).to(device)
        word2vec = Word2Vec.load("../Content_FL_Exp/global_word2vec_E3.model")
        model.load_state_dict(torch.load(f'../Content_FL_Exp/e3_global.pth'))
    else:
        if data_name == 'fivedirections':
            model = GCN(30,13).to(device)
            word2vec = Word2Vec.load("word2vec_five_E3.model")
            model.load_state_dict(torch.load(f'word2vec_gnn_five13_E3.pth'))
            
        if data_name == 'cadets':
            model = GCN(30,6).to(device)
            word2vec = Word2Vec.load("word2vec_cadets_E3.model")
            model.load_state_dict(torch.load(f'word2vec_gnn_cadets0_E3.pth'))
            
        if data_name == 'theia':
            model = GCN(30,5).to(device)
            word2vec = Word2Vec.load("word2vec_theia_E3.model")
            model.load_state_dict(torch.load(f'word2vec_gnn_theia0_E3.pth'))
            
        if data_name == 'trace':
            model = GCN(30,11).to(device)
            word2vec = Word2Vec.load("word2vec_trace_E3.model")
            model.load_state_dict(torch.load(f'word2vec_gnn_trace0_E3.pth'))

    nodes = [infer(x) for x in phrases]
    nodes = np.array(nodes)  

    all_ids = set(mapp)
        
    graph = Data(x=torch.tensor(nodes,dtype=torch.float).to(device),y=torch.tensor(labels,dtype=torch.long).to(device), edge_index=torch.tensor(edges,dtype=torch.long).to(device))
    flag = torch.tensor([True]*graph.num_nodes, dtype=torch.bool)

    model.eval()
    out = model(graph.x, graph.edge_index)

    sorted, indices = out.sort(dim=1,descending=True)
    conf = (sorted[:,0] - sorted[:,1]) / sorted[:,0]
    conf = (conf - conf.min()) / conf.max()

    pred = indices[:,0]
    cond = (pred == graph.y)# & (conf >= 0.9)
    flag[cond] = torch.logical_and(flag[cond], torch.tensor([False]*len(flag[cond]), dtype=torch.bool))

    index = utils.mask_to_index(flag).tolist()
    ids = set([mapp[x] for x in index])
    TPL,FPL = helper(set(ids),set(all_ids),GT_mal,edges,mapp)
    mapp_to_labels = {x:y for x,y in zip(mapp,labels)}
    return TPL,FPL,mapp_to_labels

In [None]:
data_fl = []
for x in ['cadets','trace','fivedirections','theia']:
    print(f"Running Evaluation For {x}")
    temp = run_evaluation(x,True)
    data_fl.append(temp)