In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import torch
from torch_geometric.data import Data
import torch.nn.functional as F
import warnings
import os
warnings.filterwarnings('ignore')
from torch_geometric.loader import NeighborLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
%matplotlib inline

In [2]:
os.chdir("..")
%pwd

'/sfs/weka/scratch/wkw9be'

In [3]:
from pprint import pprint
import json
import copy

import gensim
from gensim.models import Word2Vec
from multiprocessing import Pool
from itertools import compress
from tqdm import tqdm
import time

import multiprocessing
import random
import xxhash

In [4]:
'''
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)
    torch.cuda.manual_seed_all(42)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
'''

'\ntorch.manual_seed(42)\nnp.random.seed(42)\nrandom.seed(42)\nif torch.cuda.is_available():\n    torch.cuda.manual_seed(42)\n    torch.cuda.manual_seed_all(42)\ntorch.backends.cudnn.deterministic = True\ntorch.backends.cudnn.benchmark = False\n'

In [5]:
import math
class PositionalEncoding():

    def __init__(self, d_model ,max_len = 100000):
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        self.pe = torch.zeros(max_len, d_model)
        self.pe[:,0::2] = torch.sin(position * div_term)
        self.pe[:,1::2] = torch.cos(position * div_term)

    def embed(self, x):
        x = x + self.pe[:x.size(0)]
        return x

encoder = PositionalEncoding(20)

In [6]:
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from collections import Counter
from gensim.models import Word2Vec
from multiprocessing import Pool
from itertools import compress
from tqdm import tqdm
import time

def infer(doc,word2vec):  
    word_emb = []
    for word in doc:
        if word in word2vec.wv:
            word_emb.append(word2vec.wv[word])
  
    if len(word_emb) == 0:
        return np.zeros(20)

    out_emb = torch.tensor(word_emb,dtype=torch.float)
    if len(doc) < 100000:
        out_emb = encoder.embed(out_emb)
    out_emb = out_emb.detach().cpu().numpy()
    out_emb = np.mean(out_emb,axis=0)
    return out_emb

In [7]:
def preprocess(data):
    new_data = {}
    for x in data:
        check1 = x['object'] in ['PROCESS','FILE','FLOW','MODULE']
        check2 = not (x['action'] in ['START','TERMINATE'])
        check3 = x['actorID'] != x['objectID']
        key = (x['action'],x['actorID'],x['objectID'],x['object'],x['pid'],x['ppid'])
        if check1 and check2 and check3:
            new_data[key] = x
    return list(new_data.values())

In [8]:
def describe(x):
    action = x["action"]
    props = x['properties']
    typ = x['object']

    phrase = ''
    try:
        if typ == 'PROCESS':
            phrase = f"{props['parent_image_path']} {action} {props['image_path']} {props['command_line']}"    

        elif typ == 'FILE':
            phrase = f"{props['image_path']} {action} {props['file_path']}"    

        elif typ == 'FLOW':
            phrase = f"{props['image_path']} {action}  {props['dest_ip']} {props['dest_port']} {props['direction']}"    

        else:
            phrase = f"{props['image_path']} {action} {props['module_path']}"
    except:
        phrase = ''
  
    return phrase.split(' ')

In [9]:
def transform(text):
    data = preprocess(text)

    temp = [describe(x) for x in data]
    temp = [x for x in temp if len(x) != 0]

    for i in range(len(data)):
        data[i]['phrase'] = temp[i]

    df = pd.DataFrame.from_dict(data)
    df['timestamp'] = df['timestamp'].str[:-6]
    df['timestamp'] = pd.to_datetime(df['timestamp'],infer_datetime_format=True)
    df.sort_values(by='timestamp', ascending=True,inplace=True)

    return df

def load_data(dataset_id):
    f = open(f"content/data/hosts/{dataset_id}")
    content = [json.loads(line) for line in f]
    return prepare_graph(transform(content))

In [10]:
def prepare_graph(df):
    nodes = {}
    labels = {}
    edges = []

    dummies = {'PROCESS':0,'FLOW':1,'FILE':2,'MODULE':3}

    for i in range(len(df)):
        x = df.iloc[i]

        actorid = x['actorID']
        if not (actorid in nodes):
            nodes[actorid] = []
        nodes[actorid] += x['phrase']
        labels[actorid] = dummies['PROCESS']

        objectid = x["objectID"]
        if not (objectid in nodes):
            nodes[objectid] = []
        nodes[objectid] += x['phrase']
        labels[objectid] = dummies[x['object']]

        if x['object'] == 'FLOW':
            edges.append(( actorid, objectid, x['properties']['direction'] ))
        else:
            edges.append(( actorid, objectid, x['action'] ))

    features = []
    feat_labels = []
    edge_index = [[],[]]
    index  = {}
    mapp = []
              
    for k,v in nodes.items():
        features.append(v)
        feat_labels.append(labels[k])
        index[k] = len(features) - 1
        mapp.append(k)

    for x in edges:
        src = index[x[0]]
        dst = index[x[1]]
    
        if x[2] in ['READ','inbound']:
            edge_index[0].append(dst)
            edge_index[1].append(src)    
        else:
            edge_index[0].append(src)
            edge_index[1].append(dst)    
    
    return features,feat_labels,edge_index,mapp

In [11]:
from torch_geometric.nn import GCNConv
from torch_geometric.nn import SAGEConv, GATConv
import torch.nn.functional as F
import torch.nn as nn

'''
Defining the model. The model consists mainly of graph sage and graph attention layers
'''
class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = SAGEConv(20, 32, normalize=True)
        self.conv2 = SAGEConv(32, 20, normalize=True)
        self.linear = nn.Linear(in_features=20,out_features=4)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training)

        x = self.conv2(x, edge_index)
        x = self.linear(x)
        return F.softmax(x, dim=1)
    
    def freeze_conv_layers(self):
        for param in self.conv1.parameters():
            param.requires_grad = False
        for param in self.conv2.parameters():
            param.requires_grad = False

In [12]:
from gensim.models.callbacks import CallbackAny2Vec
import gensim
from gensim.models import Word2Vec
from multiprocessing import Pool
from itertools import compress
from tqdm import tqdm
import time

class EpochSaver(CallbackAny2Vec):
    '''Callback to save model after each epoch.'''

    def __init__(self,client_id):
        self.epoch = 0
        self.cid = client_id

    def on_epoch_end(self, model):
        model.save(f"Content_FL_Exp/{self.cid}.model")
        self.epoch += 1
        
class EpochLogger(CallbackAny2Vec):
    '''Callback to log information about training'''

    def __init__(self):
        self.epoch = 0

    def on_epoch_begin(self, model):
        pass
        #print("Epoch #{} start".format(self.epoch))

    def on_epoch_end(self, model):
        #print("Epoch #{} end".format(self.epoch))
        self.epoch += 1
        
def train_word2vec_func(docs,client_id):
    logger = EpochLogger()
    saver = EpochSaver(client_id)
    word2vec = Word2Vec(sentences=docs, vector_size=20, window=5, min_count=1,workers=5,epochs=100,callbacks=[saver,logger])

In [13]:
from torch.nn import CrossEntropyLoss
from sklearn.utils import class_weight
import copy

template = GCN().to(device)

def train_gnn_func(nodes,labels,edges,tune=False):
    
    model = copy.deepcopy(template)
    if "global.pth" in os.listdir("Content_FL_Exp"):
        model.load_state_dict(torch.load("Content_FL_Exp/global.pth"))
    
    if tune:
        model.freeze_conv_layers()
        
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
    criterion = CrossEntropyLoss()

    graph = Data(x=torch.tensor(nodes,dtype=torch.float).to(device),y=torch.tensor(labels,dtype=torch.long).to(device), edge_index=torch.tensor(edges,dtype=torch.long).to(device))
    for epoch in range(10):
        model.train()
        optimizer.zero_grad() 
        out = model(graph.x, graph.edge_index) 
        loss = criterion(out, graph.y) 
        loss.backward() 
        optimizer.step()      
        print(f"Epoch: {epoch}, Loss: {loss.item()}")
    
    return model

In [14]:
with open('ben_proc.json', "r") as json_file:
    data_cache = json.load(json_file)

In [14]:
def client_handling_loop(client_id):
    docs,labels,edges,mapp = [None]*4
    
    if client_id in data_cache:
        docs,labels,edges,mapp = data_cache[client_id]
    else:
        docs,labels,edges,mapp = load_data(client_id)
        data_cache[client_id] = [docs,labels,edges,mapp]
        
    #if not f"{client_id}.model" in os.listdir("Content_FL_Exp"):
    #    train_word2vec_func(docs,client_id)
    
    nodes_feat = []
    word2vec = Word2Vec.load(f"Content_FL_Exp/word2vec4.model")
    for x in docs:
        nodes_feat.append( infer(x,word2vec) ) 
        
    trained_client_model = train_gnn_func(nodes_feat,labels,edges)
    torch.save(trained_client_model.state_dict(), f"Content_FL_Exp/{client_id}.pth")
    return trained_client_model

In [15]:
'''
data_cache = {}
for client_id in ['201.txt','501.txt','051.txt']:
        
    docs,labels,edges,mapp = load_data(client_id)
    data_cache[client_id] = [docs,labels,edges,mapp]

file_path = "ben_proc.json"

# Write the dictionary to the JSON file
with open(file_path, "w") as json_file:
    json.dump(data_cache, json_file)
'''

'\ndata_cache = {}\nfor client_id in [\'201.txt\',\'501.txt\',\'051.txt\']:\n        \n    docs,labels,edges,mapp = load_data(client_id)\n    data_cache[client_id] = [docs,labels,edges,mapp]\n\nfile_path = "ben_proc.json"\n\n# Write the dictionary to the JSON file\nwith open(file_path, "w") as json_file:\n    json.dump(data_cache, json_file)\n'

In [16]:
import random
def perform_federated_learning(n_clients):
    client_models = []
    for c in n_clients:
        gnn = client_handling_loop(c)
        client_models.append(gnn)
    return client_models

In [17]:
def server_aggregate(client_models):
    global_model = copy.deepcopy(template)
    global_dict = global_model.state_dict()
    for k in global_dict.keys():
        param_list = [client_models[i].state_dict()[k] for i in range(len(client_models))]
        global_dict[k] = torch.stack(param_list, 0).mean(0)
    global_model.load_state_dict(global_dict)
    torch.save(global_model.state_dict(), "Content_FL_Exp/global.pth")
    return global_model

In [19]:
!rm Content_FL_Exp/*.pth

rm: cannot remove 'Content_FL_Exp/*.pth': No such file or directory


In [18]:
def finetune(client_id):
    docs,labels,edges,mapp = data_cache[client_id]
        
    #if not f"{client_id}.model" in os.listdir("Content_FL_Exp"):
    #    train_word2vec_func(docs,client_id)
    
    nodes_feat = []
    word2vec = Word2Vec.load(f"Content_FL_Exp/word2vec4.model")
    for x in docs:
        nodes_feat.append( infer(x,word2vec) ) 
        
    trained_client_model = train_gnn_func(nodes_feat,labels,edges,tune=True)
    torch.save(trained_client_model.state_dict(), f"Content_FL_Exp/{client_id}.pth")

In [None]:
import random

n_clients = ['201.txt','501.txt','051.txt']
learning_rounds = 10

for r in range(learning_rounds):
    print(f"Federated Learning Round Number: {r}\n")
    client_models = perform_federated_learning(n_clients)
    global_model = server_aggregate(client_models)

for cid in n_clients:
    finetune(cid)

In [19]:
from itertools import compress

In [20]:
def helper(MP,acts,objs,GP,edges,mapp):

    all_pids = acts.union(objs)
    GN = all_pids - GP
    MN = all_pids - MP

    TP = MP.intersection(GP)
    FP = MP.intersection(GN)
    FN = MN.intersection(GP)
    
    two_hop_gp = construct_neighborhood(GP,mapp,edges,2)
    two_hop_tp = construct_neighborhood(TP,mapp,edges,2)
    FP = FP - two_hop_gp
    TP = TP.union(FN.intersection(two_hop_tp))
    FN = FN - two_hop_tp

    TP,FP,FN = len(TP),len(FP),len(FN)
    TN = (len(acts) + len(objs)) - TP - FP - FN
    
    FPR = FP / (FP+TN)
    TPR = TP / (TP+FN)
    
    return TP,FP,FN,TN

In [39]:
from itertools import compress
from torch_geometric import utils

def construct_neighborhood(ids,mapp,edges,hops):
    if hops == 0:
        return set()
    else:
        neighbors = set()
        for i in range(len(edges[0])):
            if mapp[edges[0][i]] in ids:
                neighbors.add(mapp[edges[1][i]])
            if mapp[edges[1][i]] in ids:
                neighbors.add(mapp[edges[0][i]])
        return neighbors.union( construct_neighborhood(neighbors,mapp,edges,hops-1) )

In [23]:
with open('mal_proc.json', "r") as json_file:
    data_cache_mal = json.load(json_file)

In [40]:
'''
path = f"Content_FL_Exp/eval_data/SysClient0402.systemia.com.txt"
f = open(path)
content = [json.loads(line) for line in f]
docs,labels,edges,mapp = prepare_graph(transform(content))
data_cache_mal['402'] = [docs,labels,edges,mapp]

file_path = "mal_proc.json"

# Write the dictionary to the JSON file
with open(file_path, "w") as json_file:
    json.dump(data_cache_mal, json_file)

train_word2vec_func(docs,'402.txt')

GT_mal,acts,objs = ground_truth('402')
with open('gt_402.json', "w") as json_file:
    json.dump([list(GT_mal,acts,objs)], json_file)
'''

'\npath = f"Content_FL_Exp/eval_data/SysClient0402.systemia.com.txt"\nf = open(path)\ncontent = [json.loads(line) for line in f]\ndocs,labels,edges,mapp = prepare_graph(transform(content))\ndata_cache_mal[\'402\'] = [docs,labels,edges,mapp]\n\nfile_path = "mal_proc.json"\n\n# Write the dictionary to the JSON file\nwith open(file_path, "w") as json_file:\n    json.dump(data_cache_mal, json_file)\n\ntrain_word2vec_func(docs,\'402.txt\')\n\nGT_mal,acts,objs = ground_truth(\'402\')\nwith open(\'gt_402.json\', "w") as json_file:\n    json.dump([list(GT_mal,acts,objs)], json_file)\n'

In [43]:
def run_evaluation(fm):
    
    TP,FP,FN,TN = 0,0,0,0
    for data_id in ['201','501','051']:
        
        #path = f"Content_FL_Exp/eval_data/SysClient0{data_id}.systemia.com.txt"
        #f = open(path)
        #content = [json.loads(line) for line in f]
        #docs,labels,edges,mapp = prepare_graph(transform(content))
        docs,labels,edges,mapp = data_cache_mal[data_id]

        nodes_feat = []
        word2vec = Word2Vec.load(f"Content_FL_Exp/word2vec4.model")
        for x in docs:
            nodes_feat.append( infer(x,word2vec) ) 

        #gt,acts,objs = ground_truth(data_id)
        
        with open(f"gt_{data_id}.json", "r") as json_file:
            gt,acts,objs = json.load(json_file)  
        
        gt,acts,objs = set(gt),set(acts),set(objs)
            
        graph = Data(x=torch.tensor(nodes_feat,dtype=torch.float).to(device),y=torch.tensor(labels,dtype=torch.long).to(device), edge_index=torch.tensor(edges,dtype=torch.long).to(device))
        graph.x = nn.functional.normalize(graph.x)

        model = GCN().to(device)
        
        if fm:
            model.load_state_dict(torch.load(f"Content_FL_Exp/global.pth",map_location=torch.device('cpu')))
        else:
            model.load_state_dict(torch.load(f"Content_FL_Exp/{data_id}.txt.pth",map_location=torch.device('cpu')))

            
        model.eval()
        out = model(graph.x, graph.edge_index)

        sorted, indices = out.sort(dim=1,descending=True)
        conf = (sorted[:,0] - sorted[:,1]) / sorted[:,0]
        conf = (conf - conf.min()) / conf.max()

        pred = indices[:,0]
        cond = (pred == graph.y)#  & (conf > 0.9)

        index = utils.mask_to_index(~cond).tolist()
        ids = set([mapp[x] for x in index])
        metrics = helper(set(ids),acts,objs,gt,edges,mapp) 
        
        TP = TP + metrics[0]
        FP = FP + metrics[1]
        FN = FN + metrics[2]
        TN = TN + metrics[3]
    
    print(f"Number of True Positives: {TP}")
    print(f"Number of Fasle Positives: {FP}")
    print(f"Number of False Negatives: {FN}")
    print(f"Number of True Negatives: {TN}\n")

    prec = TP / (TP + FP)
    print(f"Precision: {prec}")

    rec = TP / (TP + FN)
    print(f"Recall: {rec}")

    fscore = (2*prec*rec) / (prec + rec)
    print(f"Fscore: {fscore}\n")

In [29]:
run_evaluation(True)

Number of True Positives: 624
Number of Fasle Positives: 71
Number of False Negatives: 26
Number of True Negatives: 1287284

Precision: 0.897841726618705
Recall: 0.96
Fscore: 0.9278810408921933

