In [86]:
'''
Importing the required libraries here
'''
# `os` is used on the next line, before Imports.py has been executed, so it is
# imported explicitly here; without this the cell raises NameError on a fresh kernel.
import os

os.chdir("/home/jovyan/aron_workdir/gestalt/code/Evaluation_Scripts")

# Imports.py is executed inside the notebook namespace so that every name it
# defines becomes a notebook global. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
# NOTE(review): presumably this is where torch, pd, np, json, copy, etc. come from - confirm.
with open("Imports.py") as imports_file:
    exec(imports_file.read())

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
%matplotlib inline

In [88]:
# Default path to the "Trustwatch" directory
default_directory = "/home/jovyan/Trustwatch"

# Prompt the user for a directory path, with the default if nothing is entered.
# Surrounding whitespace is stripped so that a stray space still selects the default.
user_directory = input(f"Enter directory path (Press Enter to use default '{default_directory}'): ").strip() or default_directory
# Expand "~" and make the path absolute so the comparison below is meaningful.
user_directory = os.path.abspath(os.path.expanduser(user_directory))
print(user_directory)
# Check if we're already in the desired directory. realpath() makes the check
# insensitive to trailing slashes, relative segments and symlinks.
if os.path.realpath(os.getcwd()) == os.path.realpath(user_directory):
    print("Already in the specified directory.")
else:
    print(f"Changing to directory: {user_directory}")
    os.chdir(user_directory)

# Confirm the current directory
print("Current directory:", os.getcwd())

Enter directory path (Press Enter to use default '/home/jovyan/Trustwatch'):  


/home/jovyan/Trustwatch
Already in the specified directory.
Current directory: /home/jovyan/Trustwatch


In [89]:
def train_test_split():
    """Load the cadets event table and return the early (training) slice per host.

    Reads 'e5_data/cadets_df.parquet', parses the 'timestamp' column, keeps the
    rows whose timestamp is on or before the cutoff below, and groups them by
    the 'hostid' column.

    Returns:
        dict: maps each hostid value to the DataFrame of that host's rows.
    """
    events = pd.read_parquet('e5_data/cadets_df.parquet')
    events['timestamp'] = pd.to_datetime(events['timestamp'], format='mixed')

    # Everything up to and including this instant is kept.
    cutoff = '2019-05-9 00:00:00'
    early_events = events[events['timestamp'] <= cutoff]

    return {host_id: host_events for host_id, host_events in early_events.groupby('hostid')}

In [90]:
# Per-host event frames (keyed by hostid) for the period up to the cutoff set in train_test_split.
hostdfs = train_test_split()

## Experiment configuration: federated-learning hyperparameters and host list

In [91]:
# Number of GNN models built by init_gnns; define_categories splits processes
# into num_of_ctg - 1 groups and the last model is trained on the whole graph.
num_of_ctg = 10
# Number of federated rounds run by the training loop cell.
learning_rounds = 5
# Local training epochs per model in train_gnn_func.
epochs = 10
# Client identifiers: one per hostid present in hostdfs.
hosts = list(hostdfs.keys())
# Gates the training / aggregation / checkpoint-deletion cells below.
TRAIN=False

## Defining functions for loading, cleaning and constructing features from the data

In [92]:
# Entity types kept in the graph; a type's position in this list is its class label.
tokens = ['SUBJECT_PROCESS',
          'FILE_OBJECT_FILE',
          'NETFLOW']

def prepare_graph(df):
    '''
    This is the main featurizer. It constructs the graph for the cadets dataset.

    Note: two columns ('actor_label' and 'object_label') are added to `df` in place.

    Args:
        df (DataFrame): system events; the columns read are 'actor_type', 'object',
            'actorID', 'objectID', 'exec', 'action' and 'path'.

    return:
        features (list): for each node, the list of raw tokens (exec, action and
            non-empty path of every event the node takes part in).
        feat_labels (list): label of each node (index of its type in `tokens`).
        edge_index (list): two parallel lists [sources, destinations] of node
            positions, one entry per event row.
        mapp (list): id of each node; the position in this list is the node index.
        all_procs (list): unique actorID values in order of first appearance.
        idx_to_proc (dict): position in all_procs -> actorID.
    '''
    global tokens
    label_of = {name: position for position, name in enumerate(tokens)}

    df['actor_label'] = df['actor_type'].map(label_of)
    df['object_label'] = df['object'].map(label_of)

    # Register actors first, then objects, so actor ids occupy the leading node positions.
    nodes = {}
    labels = {}
    for id_col, label_col in (('actorID', 'actor_label'), ('objectID', 'object_label')):
        for uid in df[id_col].unique():
            nodes[uid] = []
        labels.update(df.set_index(id_col)[label_col].to_dict())

    # Every event contributes its tokens to both endpoints.
    event_columns = zip(df['actorID'], df['objectID'], df['exec'], df['action'], df['path'])
    for actor_id, object_id, exec_name, action, path in event_columns:
        nodes[actor_id].extend([exec_name, action])
        nodes[object_id].extend([exec_name, action])
        if path != '':
            nodes[actor_id].append(path)
            nodes[object_id].append(path)

    mapp = list(nodes)
    position_of = {node_id: position for position, node_id in enumerate(mapp)}
    features = [nodes[node_id] for node_id in mapp]
    feat_labels = [labels[node_id] for node_id in mapp]
    edge_index = [
        [position_of[src] for src in df['actorID']],
        [position_of[dst] for dst in df['objectID']],
    ]

    all_procs = list(df['actorID'].unique())
    idx_to_proc = dict(enumerate(all_procs))

    return features, feat_labels, edge_index, mapp, all_procs, idx_to_proc

In [93]:
# from torch_geometric.nn import GCNConv
# from torch_geometric.nn import SAGEConv, GATConv
# import torch.nn.functional as F
# import torch.nn as nn

class GCN(torch.nn.Module):
    """Two-layer SAGEConv node classifier.

    `encode` returns the raw output of the second convolution; `forward`
    returns the same values passed through a softmax over dim 1.
    """

    def __init__(self,in_channel,out_channel):
        super().__init__()
        hidden_channel = 32
        self.conv1 = SAGEConv(in_channel, hidden_channel, normalize=True)
        self.conv2 = SAGEConv(hidden_channel, out_channel, normalize=True)

    def forward(self, x: torch.Tensor, edge_index: torch.Tensor) -> torch.Tensor:
        """Return per-node class probabilities (softmax of `encode`)."""
        scores = self.encode(x, edge_index)
        return F.softmax(scores, dim=1)

    def encode(self, x: torch.Tensor, edge_index: torch.Tensor) -> torch.Tensor:
        """Apply conv1, a tanh non-linearity, then conv2."""
        hidden = torch.tanh(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

    def freeze_conv_layers(self):
        """Disable gradient tracking for every parameter of both convolutions."""
        for layer in (self.conv1, self.conv2):
            for param in layer.parameters():
                param.requires_grad = False
In [94]:
def visualize(h, color):
    '''
    This function helps visualize the output of the model.

    Projects `h` to two dimensions with t-SNE and shows a scatter plot with
    one point per row, coloured by `color`.

    Args:
        h: tensor to project; it is detached and moved to the CPU first.
        color: per-point colour values passed to the scatter plot.
    '''
    embedding = h.detach().cpu().numpy()
    z = TSNE(n_components=2).fit_transform(embedding)
    xs, ys = z[:, 0], z[:, 1]

    plt.figure(figsize=(10,10))
    # Axis ticks carry no meaning for a t-SNE projection, so hide them.
    plt.xticks([])
    plt.yticks([])

    plt.scatter(xs, ys, s=70, c=color, cmap="Set2")
    plt.show()

In [95]:
def combine_word2vec_models(models):
    """Merge several Word2Vec models into one word -> vector dictionary.

    Each word's vector is the equal-weight mean of its vectors over all models
    that contain it. (Averaging pairwise, model after model, would give later
    models a larger weight as soon as more than two models share a word.)

    Args:
        models: sequence of objects exposing `.wv.index_to_key` (the
            vocabulary) and `.wv[word]` (the vector), as used below.

    Returns:
        dict: word -> vector as a plain list, ready for json.dump. Words keep
        the order in which they are first encountered. An empty `models`
        sequence yields an empty dict.
    """
    sums = {}
    counts = {}

    for model in models:
        for word in model.wv.index_to_key:
            vector = np.asarray(model.wv[word])
            if word in sums:
                # Out-of-place addition so the models' own vectors are never modified.
                sums[word] = sums[word] + vector
                counts[word] += 1
            else:
                sums[word] = vector
                counts[word] = 1

    return {word: (total / counts[word]).tolist() for word, total in sums.items()}

In [96]:
# from gensim.models.callbacks import CallbackAny2Vec
# import gensim
# from gensim.models import Word2Vec
# from multiprocessing import Pool
# from itertools import compress
# from tqdm import tqdm
# import time

In [97]:
class EpochSaver(CallbackAny2Vec):
    '''Training callback that saves the model to one fixed path after every epoch.'''

    def __init__(self,filename):
        # Destination handed to model.save; the same file is overwritten each epoch.
        self.filename = filename
        # Number of epochs completed so far.
        self.epoch = 0

    def on_epoch_end(self, model):
        # Save first, then count the epoch as completed.
        model.save(self.filename)
        self.epoch = self.epoch + 1

In [98]:
class EpochLogger(CallbackAny2Vec):
    '''Training callback that prints a line at the start and at the end of every epoch.'''

    def __init__(self):
        # Zero-based index of the epoch currently being reported.
        self.epoch = 0

    def on_epoch_begin(self, model):
        message = "Epoch #{} start".format(self.epoch)
        print(message)

    def on_epoch_end(self, model):
        message = "Epoch #{} end".format(self.epoch)
        print(message)
        # Advance only after the end line, so both lines carry the same number.
        self.epoch = self.epoch + 1

In [99]:
def train_word2vec_models():
    """Train one Word2Vec model per host and save it under Content_FL_Exp/.

    For every host in `hosts`, the host's events are restricted to rows whose
    actor is a SUBJECT_PROCESS and whose object type is in `tokens`; the
    per-node token lists produced by prepare_graph are the training sentences.
    The model file is written by the EpochSaver callback after each epoch;
    nothing is returned.
    """
    global hosts,hostdfs

    for h in hosts:
        print("Running host:",h)
        epoch_logger = EpochLogger()
        epoch_saver = EpochSaver(f"Content_FL_Exp/{h}.model")

        host_events = hostdfs[h]
        keep = (host_events['actor_type'] == 'SUBJECT_PROCESS') & host_events['object'].isin(tokens)
        df = host_events[keep]

        # Only the token lists are needed here; the other graph parts are discarded.
        phrases = prepare_graph(df)[0]
        Word2Vec(
            sentences=phrases,
            vector_size=30,
            window=5,
            min_count=1,
            workers=5,
            epochs=100,
            callbacks=[epoch_saver, epoch_logger],
        )

In [100]:
# Only when training: merge the per-host Word2Vec models into one global
# word -> vector dictionary and write it to disk as JSON.
if TRAIN:
    word_models = []
    for m in hosts:
        # Expects Content_FL_Exp/<host>.model to exist already (these are the
        # files written by the EpochSaver callback in train_word2vec_models).
        # NOTE(review): train_word2vec_models() is not called in the cells shown - confirm
        # the model files are produced elsewhere before enabling TRAIN.
        word2vec = Word2Vec.load(f"Content_FL_Exp/{m}.model")
        word_models.append(word2vec)
        
    global_word = combine_word2vec_models(word_models)

    # This is the file read back by load_word_model().
    with open('Content_FL_Exp/e5_cadets_word2vec_global.json', 'w') as json_file:
        json.dump(global_word, json_file)

In [101]:
def load_word_model():
    """Read the global word-vector JSON and return it as word -> numpy array.

    Returns:
        dict: every key of the JSON object mapped to np.array(<its value>).
    """
    model_path = 'Content_FL_Exp/e5_cadets_word2vec_global.json'
    with open(model_path, 'r') as handle:
        raw_vectors = json.load(handle)

    # JSON stores plain lists; convert them back to arrays for arithmetic.
    return {token: np.array(values) for token, values in raw_vectors.items()}

In [102]:
# from sklearn.utils import class_weight
# import torch.nn.functional as F
# from torch.nn import CrossEntropyLoss

In [103]:
from collections import Counter
# Global word -> vector lookup read by infer(); loaded from the JSON file on disk,
# so that file must already exist when this cell runs.
word2vec = load_word_model()

def infer(doc):
    """Embed a token list as a 30-dimensional vector using the global `word2vec` lookup.

    Each known token contributes its vector multiplied by its number of
    occurrences in `doc`; the sum is divided by the number of DISTINCT known
    tokens. Tokens missing from `word2vec` are ignored.

    NOTE(review): the divisor is the distinct-token count rather than the total
    occurrence count. It is kept unchanged because existing checkpoints were
    presumably trained on features computed this way - confirm before changing.

    Args:
        doc: iterable of hashable tokens.

    Returns:
        numpy.ndarray of shape (30,). A document with no known token (including
        an empty one) yields the zero vector instead of a 0/0 NaN vector.
    """
    global word2vec
    emb = np.zeros(30)
    count = 0
    for token, occurrences in Counter(doc).items():
        if token in word2vec:
            emb = emb + word2vec[token] * occurrences
            count += 1
    if count == 0:
        # Nothing to average: return zeros rather than dividing by zero.
        return emb
    return emb / count

In [104]:
def init_gnns():
    """Create `num_of_ctg` freshly initialised GCN models on `device`.

    Each model takes 30 input features and outputs one score per entry of `tokens`.

    Returns:
        list: the models, in creation order.
    """
    global num_of_ctg,tokens
    return [GCN(30, len(tokens)).to(device) for _ in range(num_of_ctg)]

In [105]:
def define_categories(pids):
    """Partition the distinct process ids into `num_of_ctg - 1` near-equal groups.

    The distinct ids are sorted before slicing, so the same ids always land in
    the same group. Iterating a plain set of strings can yield a different
    order in every interpreter session, which would silently re-assign
    processes to different per-category checkpoints between runs.

    Args:
        pids: iterable of process ids; duplicates are ignored.

    Returns:
        list of sets: `num_of_ctg - 1` disjoint sets; the first
        `len(distinct) % n` groups hold one extra id.
    """
    global num_of_ctg
    n = num_of_ctg - 1
    # key=str keeps the ordering well-defined even if ids are not all of one type.
    ctg = sorted(set(pids), key=str)
    k, m = divmod(len(ctg), n)
    return [set(ctg[i * k + min(i, m):(i + 1) * k + min(i + 1, m)]) for i in range(n)]

In [106]:
def map_pids_to_category_indices(pids, categories):
    """Map each pid to the position of the first category that contains it.

    Args:
        pids: iterable of process ids.
        categories: sequence of containers (sets) of process ids.

    Returns:
        dict: pid -> category position. Pids found in no category are omitted.
    """
    pid_to_category_index = {}

    for pid in pids:
        first_match = next(
            (position for position, members in enumerate(categories) if pid in members),
            None,
        )
        if first_match is not None:
            pid_to_category_index[pid] = first_match

    return pid_to_category_index

In [107]:
# from torch.nn import CrossEntropyLoss
# from sklearn.utils import class_weight
# import copy

# Untrained template models; train_gnn_func and server_aggregate deep-copy these.
templates = init_gnns()

def train_gnn_func(nodes,labels,edges,mapp,pids,idx_to_pid):
    """Train one client's local copy of every GNN model on that client's graph.

    Args:
        nodes: per-node feature vectors (converted to a float tensor).
        labels: per-node class labels (converted to a long tensor).
        edges: [sources, destinations] lists of node positions.
        mapp: node id for each node position.
        pids: process ids of this client.
        idx_to_pid: node position -> process id, looked up with STRING keys
            (str(i)). This matches a JSON-loaded cache, where keys are strings;
            prepare_graph itself produces integer keys.

    Returns:
        list: one entry per template model. Entries for categories with no
        local training node are None; the last entry is the "catch all" model
        trained on the whole graph.
    """
    
    global categories ,epochs
    
    # pid -> position of the category (and therefore of the model) it belongs to.
    pid_to_gnn_index = map_pids_to_category_indices(pids, categories)
    
    set_pids = set(pids)

    # Node positions whose id is one of this client's processes.
    proc_index = [i for i in range(len(mapp)) if mapp[i] in set_pids]

    # train_splits[c] collects the node positions assigned to category c.
    train_splits = [[] for _ in range(len(categories))]

    for i in proc_index:
        pname = idx_to_pid[str(i)]
        split_indx = pid_to_gnn_index[pname]
        train_splits[split_indx].append(int(i))
        
    local_models = [copy.deepcopy(x) for x in templates]
    
    # Every model except the last one is trained on its own category's nodes.
    for i in range(len(local_models)-1):
            
        if len(train_splits[i]) == 0:
            # No local node in this category: mark the model as absent so that
            # server_aggregate leaves this client out of the average.
            local_models[i] = None
        else:
            # Start from the latest aggregated weights when a checkpoint exists.
            if f"target_e5_cadets_global{i}.pth" in os.listdir("Content_FL_Exp"):
                local_models[i].load_state_dict(torch.load(f"Content_FL_Exp/target_e5_cadets_global{i}.pth"))

            optimizer = torch.optim.Adam(local_models[i].parameters(), lr=0.01, weight_decay=5e-4)
            # NOTE(review): GCN.forward already applies a softmax, while CrossEntropyLoss
            # is normally fed raw scores - confirm this is intended.
            criterion = CrossEntropyLoss()

            graph = Data(x=torch.tensor(nodes,dtype=torch.float).to(device),y=torch.tensor(labels,dtype=torch.long).to(device), edge_index=torch.tensor(edges,dtype=torch.long).to(device))
            # Seed-node mask; it is created on the CPU (no .to(device)).
            mask = torch.tensor([False]*graph.num_nodes, dtype=torch.bool)
            mask[train_splits[i]] = True
            
            # Destinations of all edges whose source is in `nodes`, de-duplicated.
            def get_neighbors(edge_index, nodes):
                neighbors = []
                for node in nodes:
                    mask = edge_index[0] == node
                    neighbors.extend(edge_index[1, mask].tolist())
                return torch.tensor(list(set(neighbors)), dtype=torch.long)

            one_hop_neighbors = get_neighbors(graph.edge_index, train_splits[i])
            two_hop_neighbors = get_neighbors(graph.edge_index, one_hop_neighbors)
            # Keep only second-hop nodes that are not already seed nodes, then add them.
            # NOTE(review): the one-hop neighbours themselves are not added to the mask.
            two_hop_neighbors = two_hop_neighbors[~mask[two_hop_neighbors]]
            mask[two_hop_neighbors] = True
            
            for epoch in range(epochs):
                print(f'Training GNN Category {i} Model for Epoch {epoch}')

                # The loader is rebuilt every epoch; the masked nodes are the input nodes.
                loader = NeighborLoader(graph, num_neighbors=[-1,-1], batch_size=5000,input_nodes=mask)
                total_loss = 0
                for subg in loader:
                    local_models[i].train()
                    optimizer.zero_grad() 
                    out = local_models[i](subg.x, subg.edge_index) 
                    # The loss covers every node of the sampled subgraph, not only its input nodes.
                    loss = criterion(out, subg.y) 
                    loss.backward() 
                    optimizer.step()      
                    total_loss += loss.item() * subg.batch_size
                print("Loss: ", total_loss / mask.sum().item(), '\n')
    
    # The last model ("catch all") is trained full-batch on the entire graph.
    graph = Data(x=torch.tensor(nodes,dtype=torch.float).to(device),y=torch.tensor(labels,dtype=torch.long).to(device), edge_index=torch.tensor(edges,dtype=torch.long).to(device))
    optimizer = torch.optim.Adam(local_models[-1].parameters(), lr=0.01, weight_decay=5e-4)
    criterion = CrossEntropyLoss()
    
    for epoch in range(epochs):
        print(f'Training Catch all GNN Category Model for Epoch {epoch}')    
        local_models[-1].train()
        optimizer.zero_grad() 
        out = local_models[-1](graph.x, graph.edge_index) 
        loss = criterion(out, graph.y) 
        loss.backward() 
        optimizer.step()      
        print(f"Epoch: {epoch}, Loss: {loss.item()}")

    return local_models

In [108]:
# Process ids of all clients, concatenated in host order.
procs_total = []
# Per-client graph data: name -> [docs, labels, edges, mapp, pids, idx_to_pid].
data_cache = {}
# Process-id groups produced by define_categories; None until data has been loaded.
categories = None

def load_clients_data():
    """Build every client's graph from `hostdfs` and populate the module-level caches.

    For each host the events are restricted to SUBJECT_PROCESS actors and to
    object types listed in `tokens`, converted with prepare_graph and stored in
    `data_cache`. `procs_total` and `categories` are then rebuilt from scratch,
    so calling this function again does not accumulate duplicate process ids.

    The idx_to_pid mapping is stored with STRING keys because train_gnn_func
    looks entries up with str(i), which is also what a cache loaded from JSON
    contains.
    """
    
    global data_cache,categories,procs_total,tokens,hostdfs,hosts
    
    collected_pids = []
    for name in hosts:
        df = hostdfs[name]
        df = df[df['actor_type'] == 'SUBJECT_PROCESS'] 
        # Copy so that prepare_graph adds its label columns to a private frame.
        df = df[df['object'].isin(tokens)].copy()

        docs,labels,edges,mapp,pids,idx_to_pid = prepare_graph(df)
        idx_to_pid = {str(index): pid for index, pid in idx_to_pid.items()}
        data_cache[name] = [docs,labels,edges,mapp,pids,idx_to_pid]
        collected_pids.extend(pids)

    procs_total = collected_pids
    categories = define_categories(procs_total)

In [109]:
def client_handling_loop(client_id):
    """Embed one client's node documents and train its local GNN models.

    Args:
        client_id: key into the global `data_cache`.

    Returns:
        Whatever train_gnn_func returns for this client's data.
    """
    print(f"Running Setup on Client {client_id} \n")

    docs,labels,edges,mapp,pids,idx_to_pid = data_cache[client_id]

    # One embedding per node, in node order.
    nodes_feat = [infer(doc) for doc in docs]

    return train_gnn_func(nodes_feat,labels,edges,mapp,pids,idx_to_pid)

In [110]:
def server_aggregate(all_models):
    """Average the clients' models position by position and save each average.

    Args:
        all_models: list indexed by model position; each entry is the list of
            that model's copies from all clients, with None for clients that
            did not train it.

    Returns:
        list: deep copies of `templates`; every position that received at least
        one client model holds the element-wise mean of the client parameters
        and has been written to Content_FL_Exp/target_e5_cadets_global<l>.pth.
        Other positions keep the template weights and are not saved.
    """
    global_models = copy.deepcopy(templates)

    for l, candidates in enumerate(all_models):
        current_models = [client for client in candidates if client is not None]
        if len(current_models) == 0:
            continue

        client_states = [client.state_dict() for client in current_models]
        global_dict = global_models[l].state_dict()

        for k in global_dict.keys():
            stacked = torch.stack([state[k] for state in client_states], 0)
            global_dict[k] = stacked.mean(0)

        global_models[l].load_state_dict(global_dict)
        torch.save(global_models[l].state_dict(), f"Content_FL_Exp/target_e5_cadets_global{l}.pth")

    return global_models

In [111]:
def perform_federated_learning(n_clients):
    """Run local training on every client, one after another.

    Args:
        n_clients: iterable of client ids (despite the name, not a count).

    Returns:
        list: client_handling_loop's result for each client, in input order.
    """
    return [client_handling_loop(client_id) for client_id in n_clients]

In [112]:
# Before a fresh training run, delete the aggregated checkpoints of earlier runs
# so that train_gnn_func does not resume from them. Done in pure Python instead
# of a shell `rm`, so it is portable and silent when no file matches.
if TRAIN:
    import glob

    for stale_checkpoint in glob.glob("Content_FL_Exp/target_e5_cadets*.pth"):
        os.remove(stale_checkpoint)

In [113]:
# Replace the (empty) in-memory data_cache with a pre-computed one read from JSON.
# Because it comes from JSON, all dictionary keys inside it are strings.
with open('Content_FL_Exp/e5_cadets_ensemble_ben.json', 'r') as f:
    data_cache = json.load(f)

# Concatenate every host's process-id list (second-to-last entry of its cache record).
# NOTE(review): this cell uses the name `proc_total`, whereas load_clients_data
# uses `procs_total`; the two globals are distinct.
proc_total = []
for x in hosts:
    proc_total = proc_total + data_cache[x][-2]
    
# Process-id groups read by train_gnn_func through the global `categories`.
categories = define_categories(proc_total)

In [114]:
#load_clients_data()

In [115]:
# Federated training: in each round every host trains locally, then the server
# averages the results. Skipped entirely unless TRAIN is set.
if TRAIN:
    for r in range(learning_rounds):
        print(f"Federated Learning Round Number: {r}\n")
        # client_models[c][m] is model m as trained by client c.
        client_models = perform_federated_learning(hosts)
        # Transpose to arranged_models[m][c] so each model position can be averaged across clients.
        arranged_models =  [list(group) for group in zip(*client_models)]
        global_models = server_aggregate(arranged_models)

## Evaluation of the trained GNN model starts here

In [116]:
'''
This function is used for constructing neighborhood around a given 
set of nodes for backwards or forward tracking
'''
# from itertools import compress
# from torch_geometric import utils

def construct_neighborhood(ids,mapp,edges,hops):
    """Collect the ids reachable from `ids` within `hops` steps, ignoring edge direction.

    Args:
        ids: container of starting node ids (membership is tested with `in`).
        mapp: node id for each node position.
        edges: [sources, destinations] lists of node positions.
        hops: number of expansion steps; 0 yields an empty set.

    Returns:
        set: ids of all nodes found at any step. The starting ids are included
        only when some later step reaches them again.
    """
    if hops == 0:
        return set()

    neighbors = set()
    for position, src_index in enumerate(edges[0]):
        src_id = mapp[src_index]
        dst_id = mapp[edges[1][position]]
        if src_id in ids:
            neighbors.add(dst_id)
        if dst_id in ids:
            neighbors.add(src_id)

    # Expand again from the nodes just found and merge the results.
    return neighbors | construct_neighborhood(neighbors,mapp,edges,hops-1)

In [117]:
'''
This function logs the evaluation metrics.
'''

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def helper(MP,all_pids,GP,edges,mapp):
    """Print and return detection metrics with a two-hop tolerance around the ground truth.

    Starting from plain set arithmetic on flagged ids (MP) versus ground-truth
    ids (GP), the counts are relaxed as follows:
      * a false positive within two hops of any ground-truth id is not counted;
      * a false negative within two hops of a true positive counts as detected.

    Empty denominators give a ratio of 0.0 instead of raising ZeroDivisionError.

    Args:
        MP: set of ids flagged by the model.
        all_pids: set of all ids in the evaluated graph.
        GP: set of ground-truth ids.
        edges: [sources, destinations] lists of node positions.
        mapp: node id for each node position.

    Returns:
        tuple: (TP, FP, FN, TN) counts after the relaxation (TN is unrelaxed).
    """
    true_pos = MP.intersection(GP)
    false_pos = MP - GP
    false_neg = GP - MP
    true_neg = all_pids - (GP | MP)

    two_hop_gp = construct_neighborhood(GP,mapp,edges,2)
    two_hop_tp = construct_neighborhood(true_pos,mapp,edges,2)
    relaxed_fp = false_pos - two_hop_gp
    relaxed_tp = true_pos.union(false_neg.intersection(two_hop_tp))
    relaxed_fn = false_neg - two_hop_tp

    TP,FP,FN,TN = len(relaxed_tp),len(relaxed_fp),len(relaxed_fn),len(true_neg)

    print(f"Number of True Positives: {TP}")
    print(f"Number of Fasle Positives: {FP}")
    print(f"Number of False Negatives: {FN}")

    prec = _safe_ratio(TP, TP + FP)
    print(f"Precision: {prec}")

    rec = _safe_ratio(TP, TP + FN)
    print(f"Recall: {rec}")

    fscore = _safe_ratio(2*prec*rec, prec + rec)
    print(f"Fscore: {fscore}\n")

    return TP, FP, FN, TN

In [118]:
def generate_groundtruth():
    """Return the ids of all entities involved in events inside the time windows below.

    Reads 'e5_data/cadets_df.parquet', keeps rows whose actor is a
    SUBJECT_PROCESS and whose object type is in `tokens`, selects the rows whose
    timestamp falls inside any of the hard-coded windows (both ends inclusive),
    and collects their actor and object ids.

    Returns:
        set: union of the 'actorID' and 'objectID' values of the selected rows.
    """
    df = pd.read_parquet('e5_data/cadets_df.parquet')
    df['timestamp'] = pd.to_datetime(df['timestamp'], format='mixed')

    df = df[(df['actor_type'] == 'SUBJECT_PROCESS') & df['object'].isin(tokens)]

    timestamps = [
        ('2019-05-16 09:20:32.093582942', '2019-05-16 09:36:08.903494477'),
        ('2019-05-16 09:36:08.903494477', '2019-05-16 09:51:22.110949680'),
        ('2019-05-16 09:51:22.110949680', '2019-05-16 10:06:29.403713371'),
        ('2019-05-16 10:06:29.403713371', '2019-05-16 10:21:47.983513184'),
        ('2019-05-16 20:32:27.570220441', '2019-05-16 20:48:38.072848659'),
        ('2019-05-16 21:19:00.930018779', '2019-05-16 21:34:46.231624861'),
        ('2019-05-16 21:34:46.231624861', '2019-05-16 21:49:46.992678639'),
        ('2019-05-16 21:49:46.992678639', '2019-05-16 22:06:14.950154813'),
        ('2019-05-16 22:06:14.950154813', '2019-05-16 22:21:40.662702391'),
        ('2019-05-16 22:21:40.662702391', '2019-05-16 22:36:45.602858389'),
        ('2019-05-16 22:36:45.602858389', '2019-05-16 22:51:51.220035024'),
        ('2019-05-16 22:51:51.220035024', '2019-05-16 23:07:16.890296254'),
        ('2019-05-16 23:07:16.890296254', '2019-05-16 23:22:54.052353000'),
        ('2019-05-17 10:02:11.321524261', '2019-05-17 10:17:26.881636687'),
        ('2019-05-17 10:17:26.881636687', '2019-05-17 10:32:38.131495470'),
        ('2019-05-17 10:32:38.131495470', '2019-05-17 10:48:02.091564015')
    ]

    # The column was converted to datetimes above, so it is used directly here
    # rather than being re-parsed twice per window.
    event_times = df['timestamp']

    # One boolean mask accumulated over all windows; only ids are returned, so
    # a row matching two adjacent windows does not need to be kept twice.
    in_any_window = np.zeros(len(df), dtype=bool)
    for start_time, end_time in timestamps:
        in_any_window |= ((event_times >= start_time) & (event_times <= end_time)).to_numpy()

    selected = df.loc[in_any_window]

    # Extract unique 'actorID' and 'objectID' values and combine them.
    unique_actorIDs = set(selected['actorID'].unique())
    unique_objectIDs = set(selected['objectID'].unique())

    return unique_actorIDs.union(unique_objectIDs)

In [119]:
# Ground-truth id set, passed to helper() by run_evaluation.
GT_mal = generate_groundtruth()

In [120]:
def load_data_test():
    """Build the evaluation graph from the events of the test time range.

    Reads 'e5_data/cadets_df.parquet', keeps rows with a timestamp between the
    two bounds below (both inclusive), a SUBJECT_PROCESS actor and an object
    type listed in `tokens`, and converts them with prepare_graph.

    Returns:
        tuple: ([docs, labels, edges, mapp, pids, idx_to_pid], df) where df is
        the filtered event frame that was passed to prepare_graph.
    """
    events = pd.read_parquet('e5_data/cadets_df.parquet')
    events['timestamp'] = pd.to_datetime(events['timestamp'], format='mixed')

    second_range_start = '2019-05-16 00:00:00'
    second_range_end = '2019-05-17 00:00:00'

    in_range = (events['timestamp'] >= second_range_start) & (events['timestamp'] <= second_range_end)
    is_process_actor = events['actor_type'] == 'SUBJECT_PROCESS'
    has_known_object = events['object'].isin(tokens)
    df = events[in_range & is_process_actor & has_known_object]

    docs,labels,edges,mapp,pids,idx_to_pid = prepare_graph(df)
    graph_parts = [docs,labels,edges,mapp,pids,idx_to_pid]
    return graph_parts, df

In [121]:
#data_mal = load_data_test()

In [122]:
# Pre-computed evaluation graph; run_evaluation unpacks it as
# (phrases, labels, edges, mapp, pids, idx_to_pid).
# NOTE(review): presumably a saved copy of load_data_test()'s first return value - confirm.
with open('e5_data/cadets_mal_proc.json', 'r') as file:
    data_mal =  json.load(file)

In [123]:
def run_evaluation(thresh):
    """Evaluate the saved global models on the test graph and report metrics.

    Every node starts out flagged. For each saved checkpoint, a node is
    un-flagged when that model predicts the node's own label with a normalised
    confidence of at least `thresh`. Nodes that remain flagged after all models
    are passed to helper() together with the ground truth.

    Checkpoints missing from Content_FL_Exp are skipped, so a model position
    that was never saved cannot clear flags using stale or randomly
    initialised weights.

    Args:
        thresh: confidence threshold applied to the normalised confidence score.

    Returns:
        The (TP, FP, FN, TN) tuple returned by helper().
    """
    global word2vec,tokens,GT_mal

    phrases,labels,edges,mapp,pids,idx_to_pid = data_mal

    model = GCN(30,len(tokens)).to(device)
    word2vec = load_word_model()

    nodes = np.array([infer(x) for x in phrases])

    all_ids = set(mapp)

    graph = Data(x=torch.tensor(nodes,dtype=torch.float).to(device),y=torch.tensor(labels,dtype=torch.long).to(device), edge_index=torch.tensor(edges,dtype=torch.long).to(device))

    # True = still flagged. Kept on the CPU because it is only used for indexing.
    flag = torch.ones(graph.num_nodes, dtype=torch.bool)

    available_checkpoints = set(os.listdir("Content_FL_Exp"))

    model.eval()
    # Inference only: no autograd graph is needed for any of the forward passes.
    with torch.no_grad():
        for m_n in range(num_of_ctg):
            checkpoint_name = f"target_e5_cadets_global{m_n}.pth"
            if checkpoint_name not in available_checkpoints:
                print(f"Skipping model {m_n}: Content_FL_Exp/{checkpoint_name} not found")
                continue
            model.load_state_dict(torch.load(f"Content_FL_Exp/{checkpoint_name}",map_location=torch.device('cpu')))

            out = model(graph.x, graph.edge_index)

            # Confidence = relative margin between the two highest class probabilities.
            ranked, indices = out.sort(dim=1,descending=True)
            conf = (ranked[:,0] - ranked[:,1]) / ranked[:,0]
            # NOTE(review): this divides by conf.max() rather than (max - min); it is
            # left unchanged so that thresholds keep their current meaning - confirm intent.
            conf = (conf - conf.min()) / conf.max()

            pred = indices[:,0]
            cond = (pred == graph.y) & (conf >= thresh)
            # Move the mask to the CPU before indexing the CPU-resident flag tensor;
            # indexing with a mask on another device raises an error.
            flag[cond.cpu()] = False

    index = utils.mask_to_index(flag).tolist()
    ids = set(mapp[x] for x in index)
    metrics = helper(ids,all_ids,GT_mal,edges,mapp)
    return metrics

In [124]:
run_evaluation(0.95)

Number of True Positives: 214430
Number of Fasle Positives: 113
Number of False Negatives: 19827
Precision: 0.9994732990589299
Recall: 0.9153621876827587
Fscore: 0.9555704099821747



(214430, 113, 19827, 831094)