In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import torch
from torch_geometric.data import Data
import torch.nn.functional as F
import warnings
import os
warnings.filterwarnings('ignore')
from torch_geometric.loader import NeighborLoader

# Run tensors and models on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Move the working directory one level up; the relative paths used by later cells
# ("content/..." and "Content_FL/...") are resolved against the resulting directory.
# NOTE(review): this cell is not idempotent — running it again climbs another level.
os.chdir("..")
# IPython magic: show the resulting working directory.
%pwd

In [3]:
from pprint import pprint
import json
import copy

import gensim
from gensim.models import Word2Vec
from multiprocessing import Pool
from itertools import compress
from tqdm import tqdm
import time

import multiprocessing

In [4]:
import math
class PositionalEncoding():
    """Precomputed sinusoidal positional-encoding table.

    The table ``pe`` has shape ``(max_len, d_model)``; even columns hold
    ``sin(position * div_term)`` and odd columns hold ``cos(position * div_term)``.

    Parameters
    ----------
    d_model : int
        Width of the vectors the encoding is added to. Both even and odd
        values are supported.
    max_len : int, default 100000
        Number of positions to precompute.
    """

    def __init__(self, d_model ,max_len = 100000):
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        self.pe = torch.zeros(max_len, d_model)
        self.pe[:,0::2] = torch.sin(position * div_term)
        # When d_model is odd there is one fewer odd column than even column,
        # so the cosine half must use only the first d_model // 2 frequencies.
        self.pe[:,1::2] = torch.cos(position * div_term[: d_model // 2])

    def embed(self, x):
        """Return ``x`` plus the encodings of positions ``0 .. x.size(0) - 1``.

        ``x`` is indexed along its first dimension; it must not have more rows
        than the precomputed table, otherwise the addition fails to broadcast.
        """
        return x + self.pe[:x.size(0)]

# Shared encoder instance used by infer(); d_model=20 matches the vector_size=20
# passed to Word2Vec in train_word2vec_func.
encoder = PositionalEncoding(20)

In [5]:
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from collections import Counter
from gensim.models import Word2Vec
from multiprocessing import Pool
from itertools import compress
from tqdm import tqdm
import time

def infer(doc,word2vec):
    """Embed a token sequence as one fixed-size vector.

    Tokens that are present in ``word2vec.wv`` are looked up, the module-level
    ``encoder`` adds positional encodings (by index within the list of known
    tokens), and the result is averaged over the sequence.

    Parameters
    ----------
    doc : sequence of str
        Tokens describing one node.
    word2vec : gensim Word2Vec model
        Only ``word2vec.wv`` is used (membership test, lookup, ``vector_size``).

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``word2vec.wv.vector_size``; all zeros when no
        token of ``doc`` is in the vocabulary.
    """
    word_emb = [word2vec.wv[word] for word in doc if word in word2vec.wv]

    if len(word_emb) == 0:
        # Size the fallback from the model instead of a hard-coded 20 so the
        # function keeps working if the embedding width is changed.
        return np.zeros(word2vec.wv.vector_size)

    # Stack into a single ndarray first: building a tensor directly from a
    # list of ndarrays is a slow path in PyTorch.
    out_emb = torch.tensor(np.asarray(word_emb),dtype=torch.float)
    # Documents with 100000 or more tokens skip positional encoding; the
    # default table built by PositionalEncoding has 100000 rows.
    if len(doc) < 100000:
        out_emb = encoder.embed(out_emb)
    out_emb = out_emb.detach().cpu().numpy()
    return np.mean(out_emb,axis=0)

In [6]:
def preprocess(data):
    """Filter raw events and collapse duplicates.

    An event is kept when its ``object`` is one of PROCESS/FILE/FLOW/MODULE,
    its ``action`` is neither START nor TERMINATE, and its ``actorID`` differs
    from its ``objectID``. Kept events sharing the same
    (action, actorID, objectID, object, pid, ppid) tuple are collapsed into
    one entry: the position of the first occurrence, the payload of the last.

    Returns a list of the surviving event dicts (the same objects as in
    ``data``, not copies).
    """
    tracked_objects = ['PROCESS','FILE','FLOW','MODULE']
    lifecycle_actions = ['START','TERMINATE']

    unique_events = {}
    for event in data:
        is_tracked = event['object'] in tracked_objects
        is_lifecycle = event['action'] in lifecycle_actions
        is_self_loop = event['actorID'] == event['objectID']
        signature = (
            event['action'],
            event['actorID'],
            event['objectID'],
            event['object'],
            event['pid'],
            event['ppid'],
        )
        if not is_tracked or is_lifecycle or is_self_loop:
            continue
        unique_events[signature] = event
    return [*unique_events.values()]

In [7]:
def describe(x):
    """Turn one event into a list of tokens describing it.

    The phrase is built from ``x['action']`` and fields of ``x['properties']``
    selected by ``x['object']`` (PROCESS, FILE, FLOW; anything else is treated
    as a module event), then split on single spaces.

    Returns
    -------
    list of str
        The tokens. When a required property is missing, or ``properties`` is
        not subscriptable, the phrase is empty and the result is ``['']`` —
        never an empty list.
    """
    action = x["action"]
    props = x['properties']
    typ = x['object']

    try:
        if typ == 'PROCESS':
            phrase = f"{props['parent_image_path']} {action} {props['image_path']} {props['command_line']}"
        elif typ == 'FILE':
            phrase = f"{props['image_path']} {action} {props['file_path']}"
        elif typ == 'FLOW':
            # The double space after {action} is kept as-is: it produces an
            # empty-string token after splitting.
            phrase = f"{props['image_path']} {action}  {props['dest_ip']} {props['dest_port']} {props['direction']}"
        else:
            phrase = f"{props['image_path']} {action} {props['module_path']}"
    except (KeyError, TypeError):
        # Best effort: an event with incomplete properties gets an empty phrase.
        # Only lookup failures are swallowed, so KeyboardInterrupt and other
        # unrelated errors are no longer hidden as they were by a bare except.
        phrase = ''

    return phrase.split(' ')

In [8]:
def transform(text):
    """Build a time-ordered DataFrame of preprocessed events with token phrases.

    Parameters
    ----------
    text : iterable of dict
        Raw events; they are filtered and de-duplicated by ``preprocess``.

    Returns
    -------
    pandas.DataFrame
        One row per surviving event, with an added ``phrase`` column holding
        the tokens from ``describe`` and ``timestamp`` parsed to datetime,
        sorted by ascending timestamp.

    Notes
    -----
    The surviving event dicts are mutated in place (a ``phrase`` key is added).
    A ``KeyError`` on ``'timestamp'`` is raised when no event survives
    preprocessing, because the resulting frame has no columns.
    """
    data = preprocess(text)

    # Attach each phrase directly to the event it was derived from. The
    # previous version filtered the phrase list before indexing it by event
    # position; that filter never removed anything (describe() returns [''],
    # not [], for an empty phrase) and would have shifted phrases onto the
    # wrong events if it ever had.
    for event in data:
        event['phrase'] = describe(event)

    df = pd.DataFrame.from_dict(data)
    # Drop the last six characters of the timestamp string before parsing
    # (presumably a UTC offset such as "-04:00" — TODO confirm against the data).
    df['timestamp'] = df['timestamp'].str[:-6]
    df['timestamp'] = pd.to_datetime(df['timestamp'],infer_datetime_format=True)

    return df.sort_values(by='timestamp', ascending=True)

def load_data(dataset_id):
    """Load one host log and convert it into graph inputs.

    Reads ``content/data/hosts/{dataset_id}`` as JSON Lines (one JSON object
    per line; blank lines are skipped) and returns
    ``prepare_graph(transform(events))``.
    """
    # Context manager so the file handle is closed even if a line fails to parse.
    with open(f"content/data/hosts/{dataset_id}") as f:
        content = [json.loads(line) for line in f if line.strip()]
    return prepare_graph(transform(content))

In [9]:
def prepare_graph(df):
    """Convert an event DataFrame into node documents, labels and an edge list.

    Every row contributes its ``phrase`` tokens to both its actor node and its
    object node. The actor is labelled PROCESS (0); the object is labelled by
    the row's ``object`` type (PROCESS=0, FLOW=1, FILE=2, MODULE=3). A later
    row overwrites the label an earlier row gave to the same id.

    Each row also yields one edge. Its kind is ``properties['direction']`` for
    FLOW rows and ``action`` otherwise; edges of kind 'READ' or 'inbound' are
    stored object -> actor, all others actor -> object.

    Returns
    -------
    features : list of list
        Token list per node, in order of first appearance.
    labels : numpy.ndarray
        Integer type code per node, aligned with ``features``.
    edge_index : list of two lists
        ``[sources, targets]`` as positions into ``features``.
    mapp : list
        Node id at each position of ``features``.
    """
    type_codes = {'PROCESS':0,'FLOW':1,'FILE':2,'MODULE':3}

    tokens_by_node = {}
    label_by_node = {}
    raw_edges = []

    for _, row in df.iterrows():
        actor = row['actorID']
        tokens_by_node.setdefault(actor, []).extend(row['phrase'])
        label_by_node[actor] = type_codes['PROCESS']

        target = row["objectID"]
        tokens_by_node.setdefault(target, []).extend(row['phrase'])
        label_by_node[target] = type_codes[row['object']]

        if row['object'] == 'FLOW':
            kind = row['properties']['direction']
        else:
            kind = row['action']
        raw_edges.append((actor, target, kind))

    # Dicts preserve insertion order, so positions follow first appearance.
    mapp = list(tokens_by_node)
    features = list(tokens_by_node.values())
    feat_labels = [label_by_node[node_id] for node_id in mapp]
    position = {node_id: pos for pos, node_id in enumerate(mapp)}

    sources, targets = [], []
    for actor, target, kind in raw_edges:
        src, dst = position[actor], position[target]
        if kind in ['READ','inbound']:
            # Stored in the opposite direction: object -> actor.
            src, dst = dst, src
        sources.append(src)
        targets.append(dst)

    return features,np.array(feat_labels),[sources, targets],mapp

In [10]:
from torch_geometric.nn import GCNConv
from torch_geometric.nn import SAGEConv, GATConv
import torch.nn.functional as F
import torch.nn as nn

'''
Defining the model. The model consists of two GraphSAGE convolution layers followed by a linear classification layer.
'''
class GCN(torch.nn.Module):
    """Two SAGEConv layers (20 -> 32 -> 20) followed by a 20 -> 4 linear head.

    ``encode`` returns the 20-dimensional node embeddings produced by the
    second convolution; ``forward`` feeds those embeddings through the linear
    head and returns softmax probabilities over the 4 classes.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = SAGEConv(20, 32, normalize=True)
        self.conv2 = SAGEConv(32, 20, normalize=True)
        self.linear = nn.Linear(in_features=20,out_features=4)

    def forward(self, x, edge_index):
        """Return per-node class probabilities (each row sums to 1)."""
        scores = self.linear(self.encode(x, edge_index))
        return F.softmax(scores, dim=1)

    def encode(self, x, edge_index):
        """Return per-node embeddings from the second convolution layer."""
        hidden = F.relu(self.conv1(x, edge_index))
        # Dropout is only active while the module is in training mode.
        hidden = F.dropout(hidden, p=0.5, training=self.training)
        return self.conv2(hidden, edge_index)

In [11]:
from gensim.models.callbacks import CallbackAny2Vec
import gensim
from gensim.models import Word2Vec
from multiprocessing import Pool
from itertools import compress
from tqdm import tqdm
import time

class EpochSaver(CallbackAny2Vec):
    """Callback that writes the model to disk at the end of every epoch.

    The file ``Content_FL/{client_id}.model`` is overwritten each time.
    """

    def __init__(self,client_id):
        self.cid = client_id
        self.epoch = 0

    def on_epoch_end(self, model):
        checkpoint_path = f"Content_FL/{self.cid}.model"
        model.save(checkpoint_path)
        self.epoch = self.epoch + 1
        
class EpochLogger(CallbackAny2Vec):
    """Callback that counts finished epochs; it currently prints nothing."""

    def __init__(self):
        self.epoch = 0

    def on_epoch_begin(self, model):
        # Deliberately silent (per-epoch start logging is disabled).
        return None

    def on_epoch_end(self, model):
        # Deliberately silent; only the epoch counter advances.
        self.epoch = self.epoch + 1
        
def train_word2vec_func(docs,client_id):
    """Train a 20-dimensional Word2Vec model on one client's node documents.

    Parameters
    ----------
    docs : iterable of list of str
        Token lists, one per node.
    client_id : str
        Used by ``EpochSaver`` to name the checkpoint
        ``Content_FL/{client_id}.model``, which is rewritten after each of the
        100 epochs.

    Returns
    -------
    gensim.models.Word2Vec
        The trained model. Earlier versions returned ``None``; callers that
        ignore the return value and reload the checkpoint are unaffected.
    """
    logger = EpochLogger()
    saver = EpochSaver(client_id)
    word2vec = Word2Vec(sentences=docs, vector_size=20, window=5, min_count=1,workers=5,epochs=100,callbacks=[saver,logger])
    return word2vec

In [12]:
from torch.nn import CrossEntropyLoss
from sklearn.utils import class_weight

def train_gnn_func(nodes,labels,edges):
    """Train a local GCN on one client's graph for 50 full-batch epochs.

    The model starts from ``Content_FL/global.pth`` when that file exists,
    otherwise from a fresh initialisation.

    Parameters
    ----------
    nodes : sequence of 1-D arrays
        Node feature vectors.
    labels : array-like of int
        Node type codes, aligned with ``nodes``.
    edges : list of two lists
        ``[sources, targets]`` node positions.

    Returns
    -------
    GCN
        The trained model, left in eval mode. The training accuracy is printed.
    """
    model = GCN().to(device)
    global_path = "Content_FL/global.pth"
    if os.path.isfile(global_path):
        # map_location lets a checkpoint saved on GPU be loaded on a CPU-only host.
        model.load_state_dict(torch.load(global_path, map_location=device))

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

    l = np.asarray(labels)
    present = np.unique(l)
    balanced = class_weight.compute_class_weight(class_weight = "balanced",classes = present,y = l)
    # The loss needs one weight per output class. A client whose data lacks a
    # class would otherwise produce a weight vector that is too short; absent
    # classes get weight 1, which never contributes because no target uses them.
    class_weights = torch.ones(model.linear.out_features, dtype=torch.float)
    class_weights[torch.as_tensor(present, dtype=torch.long)] = torch.as_tensor(balanced, dtype=torch.float)
    class_weights = class_weights.to(device)

    graph = Data(
        # Stack first: building a tensor from a list of ndarrays is a slow path.
        x=torch.tensor(np.asarray(nodes),dtype=torch.float).to(device),
        y=torch.tensor(labels,dtype=torch.long).to(device),
        edge_index=torch.tensor(edges,dtype=torch.long).to(device),
    )

    model.train()
    for epoch in range(50):
        optimizer.zero_grad()
        probs = model(graph.x, graph.edge_index)
        # GCN.forward already applies softmax, so feeding its output to
        # CrossEntropyLoss would apply softmax twice. Use the negative
        # log-likelihood of the returned probabilities instead; the clamp
        # avoids log(0).
        log_probs = torch.log(probs.clamp_min(1e-12))
        loss = F.nll_loss(log_probs, graph.y, weight=class_weights)
        loss.backward()
        optimizer.step()

    model.eval()

    # No gradients are needed for the accuracy report.
    with torch.no_grad():
        out = model(graph.x, graph.edge_index)
    pred = out.argmax(dim=1)
    acc = (pred == graph.y).sum().item() / graph.num_nodes
    print(f"Local Model Accuracy: {acc}")

    return model

In [13]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
import pickle

def train_classifier(client_id):
    """Fit and save an XGBoost node-type classifier for one client.

    Features are the client's Word2Vec node vectors concatenated with the
    embeddings produced by the global GCN's ``encode``. The fitted classifier
    is pickled to ``Content_FL/{client_id}.pkl`` and its accuracy on the
    training data is printed.
    """
    docs,labels,edges,mapp = load_data(client_id)

    word2vec = Word2Vec.load(f"Content_FL/{client_id}.model")
    wemb = [infer(x,word2vec) for x in docs]

    model = GCN().to(device)
    model.load_state_dict(torch.load('Content_FL/global.pth', map_location=device))
    # A freshly constructed module is in training mode, which keeps dropout
    # active inside encode() and makes the embeddings random from call to call.
    # Switch to eval mode so the classifier is trained on deterministic features.
    model.eval()

    graph = Data(
        x=torch.tensor(np.asarray(wemb),dtype=torch.float).to(device),
        y=torch.tensor(labels,dtype=torch.long).to(device),
        edge_index=torch.tensor(edges,dtype=torch.long).to(device),
    )

    with torch.no_grad():
        gemb = model.encode(graph.x, graph.edge_index).cpu().numpy()
    x = np.hstack((wemb,gemb))
    y = np.array(labels)

    xgb_cl = xgb.XGBClassifier()

    xgb_cl.fit(x,y)
    # Context manager so the pickle file is flushed and closed.
    with open(f"Content_FL/{client_id}.pkl", "wb") as fh:
        pickle.dump(xgb_cl, fh)

    preds = xgb_cl.predict(x)
    print(f"Classification Accuracy :{accuracy_score(y, preds)}")

In [14]:
def client_handling_loop(client_id):
    """Run one client's local training step and return its trained GCN.

    Loads the client's graph, trains a Word2Vec model only if
    ``Content_FL/{client_id}.model`` is not already present, embeds every node
    document, trains the GCN, and saves its weights to
    ``Content_FL/{client_id}.pth``.
    """
    docs, labels, edges, _node_ids = load_data(client_id)

    w2v_filename = f"{client_id}.model"
    if w2v_filename not in os.listdir("Content_FL"):
        train_word2vec_func(docs, client_id)

    word2vec = Word2Vec.load(f"Content_FL/{client_id}.model")
    nodes_feat = [infer(tokens, word2vec) for tokens in docs]

    trained_client_model = train_gnn_func(nodes_feat, labels, edges)
    weights_path = f"Content_FL/{client_id}.pth"
    torch.save(trained_client_model.state_dict(), weights_path)
    return trained_client_model

In [15]:
import random
def perform_federated_learning(n_clients):
    """Train every client in turn and collect the resulting models.

    ``n_clients`` is iterated directly, so despite its name it is a
    collection of client identifiers, not a count. Returns the trained models
    in the same order.
    """
    def _train_one(client):
        print(f"Client: {client}")
        return client_handling_loop(client)

    return [_train_one(client) for client in n_clients]

In [16]:
def server_aggregate(client_models):
    """Average the clients' weights into a new global model and save it.

    Every entry of the state dict is replaced by the unweighted mean of that
    entry across ``client_models`` (cast to float). The result is written to
    ``Content_FL/global.pth`` and the model is returned.
    """
    global_model = GCN().to(device)
    averaged = global_model.state_dict()
    client_states = [client.state_dict() for client in client_models]

    for name in averaged.keys():
        stacked = torch.stack([state[name].float() for state in client_states], 0)
        averaged[name] = stacked.mean(0)

    global_model.load_state_dict(averaged)
    torch.save(global_model.state_dict(), "Content_FL/global.pth")
    return global_model

In [None]:
import random

# Client log files taking part in federated training; load_data() resolves each
# name under content/data/hosts/.
n_clients = ['201.txt','501.txt','SysClient0051.systemia.com.txt','SysClient0721.systemia.com.txt','SysClient0113.systemia.com.txt','SysClient0122.systemia.com.txt','SysClient0170.systemia.com.txt','SysClient0316.systemia.com.txt','SysClient0520.systemia.com.txt']

# Number of federated rounds; every round retrains all clients and re-averages.
learning_rounds = 3

for r in range(learning_rounds):
    print(f"Federated Learning Round Number: {r}\n")
    # Each client starts from Content_FL/global.pth when it exists (see train_gnn_func).
    client_models = perform_federated_learning(n_clients)
    # Average the client weights and overwrite Content_FL/global.pth.
    global_model = server_aggregate(client_models)
    
# Fit XGBoost classifiers for the first three clients only.
for cid in n_clients[:3]:
    print("Training Classifiers")
    train_classifier(cid)

In [18]:
from itertools import compress

def helper(MP,acts,objs,GP,edges,mapp):
    """Print and return detection metrics with a two-hop tolerance.

    Parameters
    ----------
    MP : set
        Ids flagged by the model (treated as predicted positives).
    acts, objs : set
        Actor ids and object ids; their union is the evaluated population.
    GP : set
        Ground-truth positive ids.
    edges : list of two lists
        ``[sources, targets]`` positions into ``mapp``.
    mapp : list
        Node id at each position.

    Counts are adjusted using two-hop neighbourhoods: a flagged id within two
    hops of a ground-truth id is not counted as a false positive, and a missed
    ground-truth id within two hops of a true positive is counted as detected.

    Returns
    -------
    dict
        Keys ``TP``, ``FP``, ``FN``, ``TN``, ``FPR``, ``TPR``, ``precision``,
        ``recall``, ``fscore``. Ratios with a zero denominator are reported as
        0.0 instead of raising ``ZeroDivisionError``.
    """
    def _ratio(numerator, denominator):
        # Guarded division for degenerate cases (e.g. nothing flagged).
        return numerator / denominator if denominator else 0.0

    all_pids = acts.union(objs)
    GN = all_pids - GP
    MN = all_pids - MP

    TP = MP.intersection(GP)
    FP = MP.intersection(GN)
    FN = MN.intersection(GP)

    two_hop_gp = construct_neighborhood(GP,mapp,edges,2)
    two_hop_tp = construct_neighborhood(TP,mapp,edges,2)
    FP = FP - two_hop_gp
    TP = TP.union(FN.intersection(two_hop_tp))
    FN = FN - two_hop_tp

    TP,FP,FN = len(TP),len(FP),len(FN)
    # The population is the union of actors and objects; adding the two set
    # sizes counted every id that appears in both roles twice.
    TN = len(all_pids) - TP - FP - FN

    FPR = _ratio(FP, FP + TN)
    TPR = _ratio(TP, TP + FN)

    print(f"Number of True Positives: {TP}")
    print(f"Number of False Positives: {FP}")
    print(f"Number of False Negatives: {FN}")
    print(f"Number of True Negatives: {TN}\n")

    prec = _ratio(TP, TP + FP)
    print(f"Precision: {prec}")

    rec = _ratio(TP, TP + FN)
    print(f"Recall: {rec}")

    fscore = _ratio(2*prec*rec, prec + rec)
    print(f"Fscore: {fscore}\n")

    return {
        'TP': TP, 'FP': FP, 'FN': FN, 'TN': TN,
        'FPR': FPR, 'TPR': TPR,
        'precision': prec, 'recall': rec, 'fscore': fscore,
    }

In [39]:
def ground_truth(data_id):
    """Load the evaluation population and ground truth for one host.

    Reads ``Content_FL/eval_data/SysClient0{data_id}.systemia.com.txt`` as
    JSON Lines (blank lines are skipped) to collect the actor and object ids.
    For the known hosts '501', '201' and '051' it also loads the matching
    ground-truth id list and confidence threshold.

    Returns
    -------
    tuple
        ``(GT_mal, acts, objs, thresh)``. For any other ``data_id`` the
        ground-truth set is empty and the threshold is 0.
    """
    # data_id -> (ground-truth file, confidence threshold)
    known_hosts = {
        '501': ('content/optc_gt_day2.json', 0.8),
        '201': ('content/optc_gt_day1.json', 0),
        '051': ('content/optc_gt_day3.json', 0.8),
    }

    path = f"Content_FL/eval_data/SysClient0{data_id}.systemia.com.txt"
    # Context managers close the handles that were previously left open.
    with open(path,'r') as f:
        content = [json.loads(x) for x in f if x.strip()]

    raw = pd.DataFrame.from_dict(content)
    acts = set(raw['actorID'])
    objs = set(raw['objectID'])

    thresh = 0
    GT_mal = set()
    if data_id in known_hosts:
        gt_path, thresh = known_hosts[data_id]
        with open(gt_path) as f:
            GT_mal = set(json.load(f))

    return GT_mal,acts,objs,thresh

In [40]:
from itertools import compress
from torch_geometric import utils

def construct_neighborhood(ids,mapp,edges,hops):
    """Collect the ids reachable from ``ids`` within ``hops`` undirected steps.

    ``edges`` is ``[sources, targets]`` holding positions into ``mapp``; edge
    direction is ignored. The starting ids are only included if they are
    reached again through an edge. Returns an empty set when ``hops`` is 0.
    """
    if hops == 0:
        return set()

    reached = set()
    for pos, src_pos in enumerate(edges[0]):
        src_id = mapp[src_pos]
        dst_id = mapp[edges[1][pos]]
        if src_id in ids:
            reached.add(dst_id)
        if dst_id in ids:
            reached.add(src_id)

    # Expand from the newly reached ids with one hop fewer.
    further = construct_neighborhood(reached, mapp, edges, hops - 1)
    return reached | further

In [41]:
def load_pkl(fname):
    """Unpickle and return the object stored in the file ``fname``.

    Unpickling can execute arbitrary code; only use this on files you created.
    """
    with open(fname, 'rb') as handle:
        return pickle.load(handle)

In [44]:
# Host to evaluate and the client whose Word2Vec model / classifier are used.
data_id = '051'
client_id = 'SysClient0051.systemia.com.txt'

# Context manager so the evaluation log is closed after reading; blank lines are skipped.
with open(f"Content_FL/eval_data/SysClient0{data_id}.systemia.com.txt") as f:
    content = [json.loads(line) for line in f if line.strip()]

docs,labels,edges,mapp = prepare_graph(transform(content))

word2vec = Word2Vec.load(f"Content_FL/{client_id}.model")
wemb = [infer(doc,word2vec) for doc in docs]

graph = Data(
    # Stack first: building a tensor from a list of ndarrays is a slow path.
    x=torch.tensor(np.asarray(wemb),dtype=torch.float).to(device),
    y=torch.tensor(labels,dtype=torch.long).to(device),
    edge_index=torch.tensor(edges,dtype=torch.long).to(device),
)

model = GCN().to(device)
# map_location lets a checkpoint saved on GPU be loaded on a CPU-only host.
model.load_state_dict(torch.load(f'Content_FL/global.pth', map_location=device))
model.eval()

# Inference only: skip building the autograd graph.
with torch.no_grad():
    gemb = model.encode(graph.x, graph.edge_index).cpu().numpy()
# Classifier input: Word2Vec features concatenated with GCN embeddings.
x = np.hstack((wemb,gemb))
y = np.array(labels)

gt,acts,objs,thresh = ground_truth(data_id)

In [None]:
from itertools import compress
import pickle
import time

xgb_cl = load_pkl(f"Content_FL/{client_id}.pkl")

pred = xgb_cl.predict(x)
proba = xgb_cl.predict_proba(x)

# Named proba_sorted rather than `sorted` so the builtin sorted() is not
# shadowed for every cell run afterwards in the same kernel.
proba_sorted = np.sort(proba, axis=1)
# Relative margin between the most likely and second most likely class.
conf = (proba_sorted[:,-1] - proba_sorted[:,-2]) / proba_sorted[:,-1]
# NOTE(review): this divides by conf.max() rather than (conf.max() - conf.min()),
# so the result does not span the full [0, 1] range; kept unchanged because the
# thresholds from ground_truth() are applied to this scale — confirm intent.
conf = (conf - conf.min()) / conf.max()

# A node is considered normal when its type is predicted correctly with
# confidence above the threshold; every other node is flagged.
check = (pred == y) & (conf > thresh)
flag = ~check

index = np.flatnonzero(flag).tolist()
ids = {mapp[i] for i in index}
helper(ids,acts,objs,gt,edges,mapp)