# GNNExplainer Results

| Dataset      | True Positives (TP) | False Positives (FP) | False Negatives (FN) | True Negatives (TN) | Precision           | Recall             | F1-Score           |
|-------------|----------------------|----------------------|-----------------------|----------------------|---------------------|--------------------|--------------------|
| OPTC A1     | 4                    | 3,021                | 65                    | 501,987              | 0.0013              | 0.0580             | 0.0026             |
| OPTC A2     | 29                   | 3,377                | 595                   | 582,940              | 0.0085              | 0.0465             | 0.0144             |
| OPTC A3     | 4                    | 373                  | 87                    | 170,363              | 0.0106              | 0.0440             | 0.0171             |
| Theia    | 20                   | 0                    | 39                    | 5,829,109            | 1.0000              | 0.3390             | 0.5063             |

# PGExplainer Results


| Dataset   | True Positives (TP) | False Positives (FP) | False Negatives (FN) | True Negatives (TN) | Precision         | Recall            | F1-Score         |
|-----------|---------------------|----------------------|----------------------|---------------------|-------------------|-------------------|------------------|
| OPTC A1   | 4                   | 4064                 | 65                   | 500944              | 0.00098           | 0.05797           | 0.00193          |
| OPTC A2   | 29                  | 4946                 | 595                  | 581371              | 0.00583           | 0.04647           | 0.01036          |
| OPTC A3   | 4                   | 519                  | 87                   | 170217              | 0.00765           | 0.04396           | 0.01303          |
| Theia     | 20                  | 0                    | 39                   | 5829109             | 1.00000           | 0.33898           | 0.50633          |

In [17]:
from pprint import pprint
import gzip
from sklearn.manifold import TSNE
import json
import copy
import os

In [18]:
import pandas as pd
import numpy as np
import gdown
import gzip
import os

def store_events_to_disk(events, path):
    """Append already-serialized events to a file.

    The strings are written back-to-back exactly as given: no separator or
    newline is inserted between them, so callers that want one event per
    line must pass strings that already end in a newline.

    Args:
        events: Iterable of strings to write.
        path: Destination file. It is opened in append mode, so repeated
            calls accumulate output instead of overwriting it.
    """
    # The context manager closes the handle even when writelines() raises
    # (for example because an element is not a string).
    with open(path, 'a') as out:
        out.writelines(events)
    
def load_processed_events(path):
    """Read a text file and return its lines as a list.

    Args:
        path: File to read.

    Returns:
        A list with one string per line, in file order. Line terminators are
        kept, so each element (except possibly the last) ends in a newline.
    """
    # The context manager guarantees the handle is closed even if reading
    # fails part-way through.
    with open(path, 'r') as inp:
        return list(inp)

def read_raw_events(path_to_gz_file):
    """Collect the events of the three hosts of interest from a gzipped log.

    The archive is read line by line; every non-blank line is expected to be
    one JSON object carrying a ``hostname`` key. Lines whose hostname is
    SysClient0501, SysClient0201 or SysClient0051 (``.systemia.com``) are
    kept in their original text form, including the line terminator.

    Side effect: the kept lines are appended to the file
    ``all_system_events`` through ``store_events_to_disk``.

    Args:
        path_to_gz_file: Path of the gzip-compressed, line-delimited JSON log.

    Returns:
        List of the kept event strings, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an event has no ``hostname`` key.
    """
    # Built once instead of once per event. A tuple keeps plain equality
    # semantics, matching the original comparison.
    target_hosts = tuple(
        f'SysClient0{hid}.systemia.com' for hid in ['501', '201', '051']
    )
    host_events = []

    # The context manager closes the archive even if decoding or parsing
    # raises part-way through.
    with gzip.open(path_to_gz_file, 'rb') as f:
        for line in f:
            event_text = line.decode('utf-8')
            if not event_text.strip():
                # Blank lines carry no event; json.loads would reject them.
                continue
            if json.loads(event_text)['hostname'] in target_hosts:
                host_events.append(event_text)

    store_events_to_disk(host_events, 'all_system_events')
    return host_events
    
def generate_ground_labels(events):
    """Select the events that fall inside the ground-truth time windows.

    ``gt_pdf.txt`` (read from the current working directory) holds one
    ``<timestamp>--<host>`` entry per line. For each entry, every event whose
    hostname contains ``<host>`` (case-insensitive) and whose timestamp lies
    within 300 seconds after ``<timestamp>`` (both ends inclusive) is
    selected.

    Args:
        events: List of JSON strings, each with at least ``timestamp`` and
            ``hostname`` keys.

    Returns:
        List of JSON strings, one per selected event, grouped in the order of
        the ground-truth entries. Every field is converted to ``str`` before
        serialization, so the output is not byte-identical to the input. An
        event matched by several entries appears once per entry.

    Raises:
        FileNotFoundError: If ``gt_pdf.txt`` does not exist.
        ValueError: If a non-blank ground-truth line does not split into
            exactly two parts on ``--``.
    """
    if not events:
        # An empty frame has no 'timestamp' column; nothing can match anyway.
        return []

    df = pd.DataFrame.from_dict([json.loads(x) for x in events])
    # Drop the last six characters of each timestamp before parsing
    # (presumably a UTC offset such as "-04:00" -- TODO confirm).
    df['timestamp'] = df['timestamp'].str[:-6]
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    with open('gt_pdf.txt') as gt_file:
        # Skip blank lines: a trailing newline would otherwise yield [''] and
        # break the two-way unpacking below.
        gt_pdf = [
            line.split('--')
            for line in gt_file.read().split('\n')
            if line.strip()
        ]

    window = pd.Timedelta('300 sec')
    gt_events = []

    for timestamp, host in gt_pdf:
        # NOTE(review): str.contains treats `host` as a regular expression by
        # default; pass regex=False if entries are meant to be literal.
        host_df = df[df['hostname'].str.contains(host, case=False)]
        start = pd.to_datetime(timestamp)
        in_between = host_df['timestamp'].between(start, start + window)

        records = host_df[in_between].astype(str).to_dict(orient='records')
        gt_events.extend(json.dumps(record) for record in records)

    return gt_events
    
    
# Host marker -> substrings that flag an event of that host as part of the
# attack. Kept at module level so the table is built once rather than once per
# event. Insertion order matters: it fixes the order in which an event that
# mentions several hosts is emitted.
_ATTACK_KEYWORDS_BY_HOST = {
    "SysClient0501": (
        '"ppid":648', '"ppid":5076', '"ppid":1748', 'payroll.docx', '202.6.172.98',
        'bypassuac_eventvwr', 'privesc', 'GPP SYSVOL', 'privesc/bypassuac_env', 'bypassuac',
        'privesc/bypassuac_fodhelper', 'WMI', 'winenum', 'findtrusteddocuments',
        'fileTransfer1000.exe', 'plink.exe',
    ),
    "SysClient0201": (
        '"ppid":5452', '"ppid":2952', 'runme.bat', 'mimikatz', 'psinject',
        'news.com:8000', '142.20.56.0/24', 'invoke_wmi', 'ping.exe',
        # Backslashes are escaped explicitly; the string value is unchanged
        # (single backslashes) but no longer relies on invalid escape
        # sequences such as "\M", which newer Python versions warn about.
        'HKCU:Software\\Microsoft\\Windows\\CurrentVersion\\Debug',
    ),
    "SysClient0051": (
        '"ppid":2712', 'update.exe', '142.20.56.0/22', 'cKfGW.exe ', 'lsass',
        '"ppid":568', 'C:\\Windows\\TEMP\\myHbYXTpViwX.vbx', 'timestomp', 'mimikatz',
        'HKLM\\Software\\Microsoft\\Windows\\CurrentVersion\\Run\\RTqWaEHv',
        'get_gui',
    ),
    "SysClient0402": ('NEK5H8GX', '142.20.57.0/24', 'invoke_wmi'),
    "SysClient0660": (
        'ipconfig', 'Mimikatz', 'DS29HY41', 'psinject', 'zipfldr.dll', 'invoke_wmi',
    ),
}


def filter_events(events):
    """Keep the events that mention a host marker and one of its keywords.

    Matching is plain, case-sensitive substring search on the raw event text.

    Args:
        events: Iterable of event strings.

    Returns:
        List of matching event strings in input order. An event is appended
        once for every host marker it contains whose keyword list also
        matches, so text mentioning two hosts can appear twice.
        NOTE(review): confirm whether such duplicates are intended.
    """
    true_attack_events = []

    for e in events:
        for host_marker, keywords in _ATTACK_KEYWORDS_BY_HOST.items():
            if host_marker in e and any(k in e for k in keywords):
                true_attack_events.append(e)

    return true_attack_events
    
def final_check_ground_labels(events):
    """Drop events that mention an excluded process, then persist the rest.

    Any event whose text contains 'GoogleUpdate.exe' or 'svchost.exe' is
    discarded. The surviving events are appended, in their original order, to
    the file "ground_truth_events" via ``store_events_to_disk``.

    Args:
        events: Iterable of event strings.

    Returns:
        None; the result is only written to disk.
    """
    excluded_markers = ('GoogleUpdate.exe', 'svchost.exe')
    kept = [
        candidate
        for candidate in events
        if not any(marker in candidate for marker in excluded_markers)
    ]
    store_events_to_disk(kept, "ground_truth_events")

        
def download_datasets():
    """Download the eight shared Google Drive files listed below with gdown.

    Each link is handed to ``gdown.download`` with progress output enabled,
    cookies disabled and fuzzy link parsing turned on. Downloads run
    sequentially, in list order.
    """
    drive_links = (
        'https://drive.google.com/file/d/1HFSyvmgH0jvdnnnTdKfWRjZYOrLWoIkv/view?usp=drive_link',
        'https://drive.google.com/file/d/1pJLxJsDV8sngiedbfVajMetczIgM3PQd/view?usp=drive_link',
        'https://drive.google.com/file/d/1r4urs7OuKKTO6Y3-DOUMERwNasGEcRUN/view?usp=drive_link',
        'https://drive.google.com/file/d/1O8N3Tc-vN5BTlel9AvJPM5jhug2MjZkL/view?usp=drive_link',
        'https://drive.google.com/file/d/13o8h42QM9Gzv84UyRNwlgc9oWicvxExZ/view?usp=drive_link',
        'https://drive.google.com/file/d/1fRQqc68r8-z5BL7H_eAKIDOeHp7okDuM/view?usp=drive_link',
        'https://drive.google.com/file/d/1VfyGr8wfSe8LBIHBWuYBlU8c2CyEgO5C/view?usp=drive_link',
        'https://drive.google.com/file/d/10N9ZPolq_L8HivBqzf_jFKbwjSxddsZp/view?usp=drive_link',
    )

    for link in drive_links:
        gdown.download(link, quiet=False, use_cookies=False, fuzzy=True)

In [19]:
def main():
    """Run the ground-truth extraction pipeline over every downloaded archive.

    After downloading the datasets, every entry of the current directory
    whose name contains 'json.gz' goes through four stages: raw host-event
    extraction, time-window labelling, keyword filtering, and the final
    exclusion check that writes the result to disk.
    """
    download_datasets()

    archive_names = [name for name in os.listdir() if 'json.gz' in name]
    for archive in archive_names:
        print(f"Processing file: {archive}")
        host_events = read_raw_events(archive)
        windowed_events = generate_ground_labels(host_events)
        attack_events = filter_events(windowed_events)
        final_check_ground_labels(attack_events)

In [20]:
import json

In [21]:
# Load the whole ground-truth events file into memory as a single string.
# NOTE(review): final_check_ground_labels writes to "ground_truth_events"
# (no ".txt" suffix) -- confirm this is the same file under a different name.
with open("ground_truth_events.txt", "r") as file:
    content = file.read()  

In [22]:
def _split_concatenated_json(text):
    """Split a string of back-to-back JSON values into one string per value.

    Boundaries are found with ``json.JSONDecoder.raw_decode`` rather than by
    splitting on the literal "}{". That keeps a "}{" inside a string value
    from cutting an object in half, and it handles the edge cases the literal
    split got wrong: a text holding exactly one object (previously turned
    into "{{...}}") and an empty text (previously turned into "{}").

    Args:
        text: Zero or more JSON values, optionally separated by whitespace.

    Returns:
        List of substrings of ``text``, each one complete JSON value.

    Raises:
        json.JSONDecodeError: If the text is not a sequence of JSON values.
    """
    decoder = json.JSONDecoder()
    pieces = []
    position = 0
    length = len(text)
    while position < length:
        # raw_decode does not skip leading whitespace itself.
        if text[position].isspace():
            position += 1
            continue
        _, stop = decoder.raw_decode(text, position)
        pieces.append(text[position:stop])
        position = stop
    return pieces


# `content` goes from one big string to a list with one JSON text per event.
content = _split_concatenated_json(content)

In [23]:
# Decode every JSON text in `content` into a dict, preserving order.
events = [json.loads(raw_event) for raw_event in content]

In [24]:
import json
import re

def Extract_Semantic_Info(event):
    """Attach actor and object names to an event, based on its object type.

    ``event['properties']`` is scanned for ``'key': 'value'`` pairs written
    with single quotes; doubled backslashes in the values are collapsed to
    single ones. Depending on ``event['object']`` the following properties
    are required:

        PROCESS: parent_image_path (actor), image_path (object)
        FILE:    image_path (actor), file_path (object)
        MODULE:  image_path (actor), module_path (object)
        FLOW:    image_path (actor), dest_ip and dest_port (object, joined
                 with a space)

    Args:
        event: Dict with at least ``object`` and ``properties`` keys. It is
            modified in place when the lookup succeeds.

    Returns:
        The same ``event`` dict with ``actorname`` and ``objectname`` added,
        or ``None`` when the object type is not listed above or any required
        property is missing or empty.
    """
    kind = event['object']
    raw_properties = event['properties']

    parsed = {
        name: raw_value.replace("\\\\", "\\")
        for name, raw_value in re.findall(r"'([^']+)':\s*'([^']*)'", raw_properties)
    }

    required_by_kind = {
        "PROCESS": ('parent_image_path', 'image_path'),
        "FILE": ('image_path', 'file_path'),
        "MODULE": ('image_path', 'module_path'),
        "FLOW": ('image_path', 'dest_ip', 'dest_port')
    }

    required = required_by_kind.get(kind, None)
    if not required:
        return None

    values = [parsed.get(name) for name in required]
    if not all(values):
        # At least one required property is absent or an empty string.
        return None

    actor, *object_parts = values
    event["actorname"] = actor
    event["objectname"] = ' '.join(object_parts)
    return event

In [25]:
# Partition the parsed events by exact hostname, one list per host.
events_051 = [
    ev for ev in events
    if ev['hostname'] == 'SysClient0051.systemia.com'
]
events_201 = [
    ev for ev in events
    if ev['hostname'] == 'SysClient0201.systemia.com'
]
events_501 = [
    ev for ev in events
    if ev['hostname'] == 'SysClient0501.systemia.com'
]

In [26]:
# Run Extract_Semantic_Info over each host's events. The result lists keep
# the original length; entries are None where the function returned None.
events_051 = list(map(Extract_Semantic_Info, events_051))
events_201 = list(map(Extract_Semantic_Info, events_201))
events_501 = list(map(Extract_Semantic_Info, events_501))

In [27]:
# Discard the entries for which Extract_Semantic_Info returned None.
# Identity comparison is the idiomatic (and override-proof) test for None.
events_051 = [x for x in events_051 if x is not None]
events_201 = [x for x in events_201 if x is not None]
events_501 = [x for x in events_501 if x is not None]

In [28]:
import pandas as pd
import networkx as nx

In [29]:
import pandas as pd
import networkx as nx
from pyvis.network import Network

In [30]:
# Build and render an interaction graph for host SysClient0501 from its
# annotated events.
df = pd.DataFrame.from_dict(events_501)
# Keep only PROCESS and FILE events; every other object type is dropped.
df = df[df['object'].isin(['PROCESS','FILE'])]

# Reduce each name to the text after its last backslash, i.e. the final
# component of a backslash-separated path.
df['actorname'] = df['actorname'].str.split('\\').str[-1]
df['objectname'] = df['objectname'].str.split('\\').str[-1]

# One [actorname, objectname] pair per remaining event row.
edges = df[['actorname', 'objectname']].values.tolist()

# nx.Graph is an undirected simple graph: repeated pairs collapse into one
# edge and the actor -> object direction is not retained.
G = nx.Graph()
G.add_edges_from(edges)

# Remove edges whose two endpoints are the same name. The loops are
# materialized into a list first so the graph is not mutated while iterating.
self_loops = list(nx.selfloop_edges(G))
G.remove_edges_from(self_loops)

# NOTE(review): cdn_resources='in_line' presumably embeds the JS/CSS assets
# in the generated HTML -- confirm against the pyvis documentation.
net = Network(notebook=True, height="750px", width="100%", bgcolor="#222222", font_color="white",cdn_resources='in_line')

# Copy nodes and edges into the pyvis network; the node name doubles as both
# its label and its hover title.
for node in G.nodes:
    net.add_node(node, title=node, label=node)

for edge in G.edges:
    net.add_edge(edge[0], edge[1])

# Physics solver and node font settings, passed to pyvis as an options string.
net.set_options("""
var options = {
    "physics": {
        "forceAtlas2Based": {
            "gravitationalConstant": -26,
            "centralGravity": 0.005,
            "springLength": 230,
            "springConstant": 0.18
        },
        "maxVelocity": 146,
        "solver": "forceAtlas2Based",
        "timestep": 0.35,
        "stabilization": {"iterations": 150}
    },
    "nodes": {
        "font": {
            "size": 12
        }
    }
}
""")

# Render the network to network1.html.
net.show("network1.html")

network1.html


In [31]:
# Build and render an interaction graph for host SysClient0201 from its
# annotated events.
df = pd.DataFrame.from_dict(events_201)
# Keep only PROCESS and FILE events; every other object type is dropped.
df = df[df['object'].isin(['PROCESS','FILE'])]

# Reduce each name to the text after its last backslash, i.e. the final
# component of a backslash-separated path.
df['actorname'] = df['actorname'].str.split('\\').str[-1]
df['objectname'] = df['objectname'].str.split('\\').str[-1]

# One [actorname, objectname] pair per remaining event row.
edges = df[['actorname', 'objectname']].values.tolist()

# nx.Graph is an undirected simple graph: repeated pairs collapse into one
# edge and the actor -> object direction is not retained.
G = nx.Graph()
G.add_edges_from(edges)

# Remove edges whose two endpoints are the same name. The loops are
# materialized into a list first so the graph is not mutated while iterating.
self_loops = list(nx.selfloop_edges(G))
G.remove_edges_from(self_loops)

# NOTE(review): cdn_resources='in_line' presumably embeds the JS/CSS assets
# in the generated HTML -- confirm against the pyvis documentation.
net = Network(notebook=True, height="750px", width="100%", bgcolor="#222222", font_color="white",cdn_resources='in_line')

# Copy nodes and edges into the pyvis network; the node name doubles as both
# its label and its hover title.
for node in G.nodes:
    net.add_node(node, title=node, label=node)

for edge in G.edges:
    net.add_edge(edge[0], edge[1])

# Physics solver and node font settings, passed to pyvis as an options string.
net.set_options("""
var options = {
    "physics": {
        "forceAtlas2Based": {
            "gravitationalConstant": -26,
            "centralGravity": 0.005,
            "springLength": 230,
            "springConstant": 0.18
        },
        "maxVelocity": 146,
        "solver": "forceAtlas2Based",
        "timestep": 0.35,
        "stabilization": {"iterations": 150}
    },
    "nodes": {
        "font": {
            "size": 12
        }
    }
}
""")

# Render the network to network2.html.
net.show("network2.html")

network2.html


In [32]:
# Build and render an interaction graph for host SysClient0051 from its
# annotated events.
df = pd.DataFrame.from_dict(events_051)
# Keep only PROCESS and FILE events; every other object type is dropped.
df = df[df['object'].isin(['PROCESS','FILE'])]

# Reduce each name to the text after its last backslash, i.e. the final
# component of a backslash-separated path.
df['actorname'] = df['actorname'].str.split('\\').str[-1]
df['objectname'] = df['objectname'].str.split('\\').str[-1]

# One [actorname, objectname] pair per remaining event row.
edges = df[['actorname', 'objectname']].values.tolist()

# nx.Graph is an undirected simple graph: repeated pairs collapse into one
# edge and the actor -> object direction is not retained.
G = nx.Graph()
G.add_edges_from(edges)

# Remove edges whose two endpoints are the same name. The loops are
# materialized into a list first so the graph is not mutated while iterating.
self_loops = list(nx.selfloop_edges(G))
G.remove_edges_from(self_loops)

# NOTE(review): cdn_resources='in_line' presumably embeds the JS/CSS assets
# in the generated HTML -- confirm against the pyvis documentation.
net = Network(notebook=True, height="750px", width="100%", bgcolor="#222222", font_color="white",cdn_resources='in_line')

# Copy nodes and edges into the pyvis network; the node name doubles as both
# its label and its hover title.
for node in G.nodes:
    net.add_node(node, title=node, label=node)

for edge in G.edges:
    net.add_edge(edge[0], edge[1])

# Physics solver and node font settings, passed to pyvis as an options string.
net.set_options("""
var options = {
    "physics": {
        "forceAtlas2Based": {
            "gravitationalConstant": -26,
            "centralGravity": 0.005,
            "springLength": 230,
            "springConstant": 0.18
        },
        "maxVelocity": 146,
        "solver": "forceAtlas2Based",
        "timestep": 0.35,
        "stabilization": {"iterations": 150}
    },
    "nodes": {
        "font": {
            "size": 12
        }
    }
}
""")

# Render the network to network3.html.
net.show("network3.html")

network3.html
