In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import torch
from torch_geometric.data import Data
import os
import torch.nn.functional as F
import json 
import warnings
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
warnings.filterwarnings('ignore')
from torch_geometric.loader import NeighborLoader
import multiprocessing

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
%matplotlib inline

In [8]:
 # Four timestamps from 2019-05-16, stored as strings. `attack_list` is not
 # referenced by any other cell visible in this notebook.
 # NOTE(review): presumably the times of the attack events - confirm against the ground truth.
 attack_list=[
    '2019-05-16 09:20:32',
    '2019-05-16 09:36:08',
    '2019-05-16 09:51:22',
    '2019-05-16 10:06:29',
]

In [9]:
def train_test_split():
    """
    Load the CADETS event table and return the early (training) portion per host.

    Reads 'e5_data/cadets_df.parquet', parses the 'timestamp' column, keeps the
    rows whose timestamp is not later than the cut-off below and groups them
    by 'hostid'.

    return:
        dict: host id -> DataFrame holding that host's rows up to the cut-off.
    """
    events = pd.read_parquet('e5_data/cadets_df.parquet')
    events['timestamp'] = pd.to_datetime(events['timestamp'])

    # Everything up to (and including) this instant is used as training data.
    start_range_end = '2019-05-9 00:00:00'
    early_events = events.loc[events['timestamp'] <= start_range_end]

    # Iterating a GroupBy yields (host id, sub-frame) pairs.
    return dict(iter(early_events.groupby('hostid')))

In [10]:
# Per-host training frames (host id -> DataFrame); read by the configuration,
# Word2Vec-training and client-loading cells below.
hostdfs= train_test_split()

## Loading libraries and setting up working directory

In [11]:
'''
Importing some additional libraries
'''
from pprint import pprint
import gzip
from sklearn.manifold import TSNE
import json
import copy
import os

In [12]:
# 10 2 10
# NOTE(review): the line above looks like a record of earlier settings - confirm or remove.
# Number of GNN models per client (see init_gnns): define_categories builds
# num_of_ctg - 1 process categories and the last model is the "catch all" one.
num_of_ctg = 10
# Number of federated rounds executed by the training loop cell.
learning_rounds = 5
# Epochs each local model is trained for inside train_gnn_func.
epochs = 10
# One federated client per host id present in hostdfs.
hosts = list(hostdfs.keys())

## Defining functions for loading, cleaning and constructing features from the data

In [13]:
# Node types kept in the graph; the position in this list is the class label.
tokens = ['SUBJECT_PROCESS',
          'FILE_OBJECT_FILE',
          'NETFLOW']

def prepare_graph(df):
    '''
    This is the main featurizer. It constructs the graph for the cadets dataset.

    Every distinct value of 'actorID' / 'objectID' becomes a node and every row
    becomes a directed edge actor -> object. The input frame is NOT modified.

    Args:
        df (DataFrame): system events with the columns 'actorID', 'objectID',
            'actor_type', 'object', 'exec', 'action' and 'path'.

    return:
        features (list): per node, the list of tokens (exec, action and, when it
            is not the empty string, path) of every event the node takes part in.
            These are raw tokens, not embedded vectors.
        feat_labels (list): per node, the index of its type in `tokens`
            (NaN when the type is not listed there). If an id occurs both as
            actor and as object, the label derived from the object column wins.
        edge_index (list): [sources, targets] as positions in `mapp`.
        mapp (list): node ids; actors first (in order of first appearance),
            then the objects that were not already actors.
        all_procs (list): the distinct actor ids, i.e. the leading part of `mapp`.
        idx_to_proc (dict): position in `all_procs` -> actor id.
    '''
    token_index = {token: index for index, token in enumerate(tokens)}

    # Labels are computed on the side instead of being written into `df`, so a
    # filtered slice passed by the caller is neither mutated nor warned about.
    actor_ids = df['actorID'].tolist()
    object_ids = df['objectID'].tolist()
    actor_labels = df['actor_type'].map(token_index).tolist()
    object_labels = df['object'].map(token_index).tolist()

    # Insertion order of this dict defines the node numbering.
    nodes = {}
    for col in ('actorID', 'objectID'):
        for uid in df[col].unique():
            nodes.setdefault(uid, [])

    # Object-derived labels are applied last and therefore take precedence.
    labels = dict(zip(actor_ids, actor_labels))
    labels.update(zip(object_ids, object_labels))

    # Plain column iteration replaces DataFrame.iterrows(), which builds a
    # Series per row; the tokens are appended in the same order as before.
    columns = zip(actor_ids, object_ids,
                  df['exec'].tolist(), df['action'].tolist(), df['path'].tolist())
    for actor, obj, exe, action, path in columns:
        nodes[actor].extend([exe, action])
        nodes[obj].extend([exe, action])
        if path != '':
            nodes[actor].append(path)
            nodes[obj].append(path)

    mapp = list(nodes.keys())
    features = [nodes[node_id] for node_id in mapp]
    feat_labels = [labels[node_id] for node_id in mapp]

    index_map = {node_id: index for index, node_id in enumerate(mapp)}
    edge_index = [
        [index_map[src] for src in actor_ids],
        [index_map[dst] for dst in object_ids],
    ]

    all_procs = list(df['actorID'].unique())
    idx_to_proc = dict(enumerate(all_procs))

    return features, feat_labels, edge_index, mapp, all_procs, idx_to_proc

In [14]:
from torch_geometric.nn import GCNConv
from torch_geometric.nn import SAGEConv, GATConv
import torch.nn.functional as F
import torch.nn as nn

class GCN(torch.nn.Module):
    """
    Two-layer GraphSAGE node classifier (in_channel -> 32 -> out_channel).

    Both SAGEConv layers are created with normalize=True and are separated by
    a tanh non-linearity.
    """

    def __init__(self,in_channel,out_channel):
        super().__init__()
        self.conv1 = SAGEConv(in_channel, 32, normalize=True)
        self.conv2 = SAGEConv(32, out_channel, normalize=True)

    def encode(self, x: torch.Tensor, edge_index: torch.Tensor) -> torch.Tensor:
        """Return the output of the second convolution, before the softmax."""
        hidden = torch.tanh(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

    def forward(self, x: torch.Tensor, edge_index: torch.Tensor) -> torch.Tensor:
        """Return per-node class probabilities (softmax over dim 1 of `encode`)."""
        # NOTE(review): train_gnn_func feeds this softmax output to
        # CrossEntropyLoss - confirm that the extra normalisation is intended.
        return F.softmax(self.encode(x, edge_index), dim=1)

    def freeze_conv_layers(self):
        """Disable gradient computation for the parameters of both convolutions."""
        for layer in (self.conv1, self.conv2):
            for param in layer.parameters():
                param.requires_grad = False

In [15]:
def visualize(h, color):
    '''
    This function helps visualize the output of the model.

    Projects `h` to two dimensions with t-SNE and shows a scatter plot
    (axis ticks hidden) whose points are coloured by `color`.
    '''
    points = h.detach().cpu().numpy()
    embedded = TSNE(n_components=2).fit_transform(points)
    xs, ys = embedded[:, 0], embedded[:, 1]

    plt.figure(figsize=(10,10))
    plt.xticks([])
    plt.yticks([])

    plt.scatter(xs, ys, s=70, c=color, cmap="Set2")
    plt.show()

In [16]:
def combine_word2vec_models(models):
    """
    Merge several Word2Vec models into a single word -> vector dictionary.

    A word that is known to several models gets the arithmetic mean of all of
    its vectors; a word known to one model keeps that model's vector.
    (A running pairwise average `(old + new) / 2` would weight later models
    more heavily as soon as three or more models share a word, so sums and
    counts are accumulated instead.)

    Args:
        models (list): objects exposing `.wv.index_to_key` and `.wv[word]`
            (e.g. gensim Word2Vec models). May be empty.

    return:
        dict: word -> vector as a plain list of floats (JSON serialisable).
    """
    sums = {}
    counts = {}

    for model in models:
        vectors = model.wv
        for word in vectors.index_to_key:
            # float64 accumulation; converting float32 values is exact.
            vector = np.asarray(vectors[word], dtype=np.float64)
            if word in sums:
                sums[word] = sums[word] + vector
                counts[word] += 1
            else:
                sums[word] = vector
                counts[word] = 1

    return {word: (total / counts[word]).tolist() for word, total in sums.items()}

In [17]:
from gensim.models.callbacks import CallbackAny2Vec
import gensim
from gensim.models import Word2Vec
from multiprocessing import Pool
from itertools import compress
from tqdm import tqdm
import time

In [18]:
class EpochSaver(CallbackAny2Vec):
    '''
    Callback to save model after each epoch.

    The model is written to the same `filename` every time, so the file always
    holds the most recently completed epoch.
    '''

    def __init__(self,filename):
        self.filename = filename
        self.epoch = 0  # number of epochs saved so far

    def on_epoch_end(self, model):
        # Persist first; the counter only advances when the save succeeded.
        model.save(self.filename)
        self.epoch = self.epoch + 1

In [19]:
class EpochLogger(CallbackAny2Vec):
    '''
    Callback to log information about training.

    Prints one line when an epoch starts and one when it ends, numbered from 0.
    '''

    def __init__(self):
        self.epoch = 0  # index of the epoch currently being reported

    def on_epoch_begin(self, model):
        message = "Epoch #{} start".format(self.epoch)
        print(message)

    def on_epoch_end(self, model):
        message = "Epoch #{} end".format(self.epoch)
        print(message)
        self.epoch = self.epoch + 1

In [20]:
def train_word2vec_models():
    """
    Train one Word2Vec model per host on that host's node token lists.

    For every host in `hosts` the events are restricted to process actors and
    to object types listed in `tokens`, turned into per-node token lists with
    prepare_graph, and used as training sentences. EpochSaver writes the model
    to 'Content_FL_Exp/<host>.model' after every epoch.
    """
    # The saver writes at the end of the first epoch; make sure the target
    # directory exists before any expensive work is done.
    os.makedirs("Content_FL_Exp", exist_ok=True)

    for h in hosts:
        print("Running host:",h)
        logger = EpochLogger()
        saver = EpochSaver(f"Content_FL_Exp/{h}.model")

        df = hostdfs[h]
        df = df[(df['actor_type'] == 'SUBJECT_PROCESS') & df['object'].isin(tokens)]

        # Only the token lists (first return value) are needed here.
        phrases = prepare_graph(df)[0]
        # Training happens inside the constructor; the result is persisted by `saver`.
        Word2Vec(sentences=phrases, vector_size=30, window=5, min_count=1, workers=5,epochs=100,callbacks=[saver,logger])

In [53]:
# Load every host's locally trained Word2Vec model and merge them into one
# global vocabulary that is stored as JSON (word -> list of floats).
# The models are collected without binding the module-level name `word2vec`:
# `infer` reads that global as a plain dict, so leaving a gensim model in it
# breaks later cells when this cell is executed out of order.
word_models = [Word2Vec.load(f"Content_FL_Exp/{m}.model") for m in hosts]

global_word = combine_word2vec_models(word_models)

with open('Content_FL_Exp/e5_cadets_word2vec_global.json', 'w') as json_file:
    json.dump(global_word, json_file)

In [21]:
def load_word_model():
    """
    Read the merged vocabulary from 'Content_FL_Exp/e5_cadets_word2vec_global.json'.

    return:
        dict: word -> NumPy array (the JSON file stores each vector as a list).
    """
    with open('Content_FL_Exp/e5_cadets_word2vec_global.json', 'r') as handle:
        stored = json.load(handle)

    # Convert lists back to NumPy arrays
    vectors = {}
    for word, values in stored.items():
        vectors[word] = np.array(values)
    return vectors

In [22]:
'''
Defining the train and test function in this cell 
'''
from sklearn.utils import class_weight
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss

In [23]:
from collections import Counter
word2vec = load_word_model()

def infer(doc):
    """
    Embed a token list as a 30-dimensional vector using the global `word2vec` dict.

    Each distinct known token contributes its vector multiplied by its number
    of occurrences; the sum is divided by the number of distinct known tokens.
    NOTE(review): the divisor counts distinct tokens, not occurrences, so this
    is not a true weighted mean - kept unchanged so existing models stay valid.

    Args:
        doc (iterable): tokens of one node.

    return:
        numpy.ndarray: shape (30,). All zeros when `doc` is empty or none of
        its tokens is in the vocabulary (previously this divided by zero and
        produced a vector of NaNs).
    """
    emb = np.zeros(30)
    count = 0
    for k, v in Counter(doc).items():
        if k in word2vec:
            emb = emb + word2vec[k]*v
            count = count + 1
    if count == 0:
        # Nothing to average: return the zero vector instead of 0/0.
        return emb
    return emb / count

In [24]:
def init_gnns():
    """
    Create `num_of_ctg` freshly initialised GCN models on `device`.

    Every model takes 30 input features and has one output per entry of `tokens`.

    return:
        list: the models, in creation order.
    """
    return [GCN(30, len(tokens)).to(device) for _ in range(num_of_ctg)]

In [25]:
def define_categories(pids):
    """
    Split the distinct process ids into `num_of_ctg - 1` nearly equal categories.

    The ids are sorted before being split. Iterating a bare set of strings
    gives a different order in every interpreter session, which would assign
    processes to different categories than the ones the per-category
    checkpoints on disk were trained for.

    Args:
        pids (iterable): process ids, duplicates allowed.

    return:
        list: `num_of_ctg - 1` disjoint sets; the first `len % n` sets hold one
        extra element. An empty list when there is no category to fill
        (`num_of_ctg <= 1`).
    """
    n = num_of_ctg - 1
    if n <= 0:
        return []
    # key=str keeps the ordering well defined even for ids of mixed types.
    ctg = sorted(set(pids), key=str)
    k, m = divmod(len(ctg), n)
    return [set(ctg[i * k + min(i, m):(i + 1) * k + min(i + 1, m)]) for i in range(n)]

In [26]:
def map_pids_to_category_indices(pids, categories):
    """
    Look up, for every pid, the index of the first category that contains it.

    Args:
        pids (iterable): process ids.
        categories (list): collections of process ids.

    return:
        dict: pid -> category index; pids found in no category are left out.
    """
    assignment = {}

    for pid in pids:
        hit = next(
            (position for position, members in enumerate(categories) if pid in members),
            None,
        )
        if hit is not None:
            assignment[pid] = hit

    return assignment

In [27]:
from torch.nn import CrossEntropyLoss
from sklearn.utils import class_weight
import copy

# Untrained reference models: train_gnn_func deep-copies them for every client
# and server_aggregate deep-copies them as the targets of the averaging.
templates = init_gnns()

def train_gnn_func(nodes,labels,edges,mapp,pids,idx_to_pid):
    """
    Train one client's copy of every model in `templates` on the client's graph.

    Models 0 .. len(templates)-2 are "category" models: model i is trained with
    NeighborLoader on the process nodes assigned to category i plus the nodes
    reachable from them over exactly two outgoing edges. The last model is the
    "catch all" model and is trained full-batch on the whole graph.
    Before training, a model starts from the aggregated checkpoint
    'Content_FL_Exp/target_e5_cadets_global<i>.pth' when that file exists.

    Args:
        nodes (list): per-node feature vectors.
        labels (list): per-node class index.
        edges (list): [sources, targets] as positions in `mapp`.
        mapp (list): node ids, position = node index.
        pids (list): ids of the process nodes.
        idx_to_pid (dict): unused; kept for interface compatibility. The
            process id of node i is read from `mapp[i]`, which is the same
            value prepare_graph stores in this dict. The dict's keys are ints
            when built in memory and strings after a JSON round trip, so
            looking it up with `str(i)` failed for in-memory caches.

    return:
        list: the trained models in template order; None for a category model
        whose category has no process on this client.
    """
    pid_to_gnn_index = map_pids_to_category_indices(pids, categories)
    set_pids = set(pids)

    # Node indices of this client's processes, bucketed by category.
    train_splits = [[] for _ in range(len(categories))]
    for i, node_id in enumerate(mapp):
        if node_id in set_pids:
            train_splits[pid_to_gnn_index[node_id]].append(i)

    local_models = [copy.deepcopy(x) for x in templates]

    # The graph is identical for every model, so it is built once.
    # np.asarray avoids the slow list-of-arrays path of torch.tensor.
    graph = Data(
        x=torch.tensor(np.asarray(nodes), dtype=torch.float).to(device),
        y=torch.tensor(labels, dtype=torch.long).to(device),
        edge_index=torch.tensor(edges, dtype=torch.long).to(device),
    )

    def load_global_weights(model, index):
        # Warm-start from the last aggregated weights, if a checkpoint exists.
        path = os.path.join("Content_FL_Exp", f"target_e5_cadets_global{index}.pth")
        if os.path.exists(path):
            # map_location lets a checkpoint saved on GPU load on a CPU-only machine.
            model.load_state_dict(torch.load(path, map_location=device))

    def out_neighbors(node_ids):
        # Targets of all edges whose source is in `node_ids`, found with a
        # single pass over the edge list instead of one pass per node.
        edge_index = graph.edge_index
        selected = torch.zeros(graph.num_nodes, dtype=torch.bool, device=edge_index.device)
        selected[torch.as_tensor(node_ids, dtype=torch.long, device=edge_index.device)] = True
        return torch.unique(edge_index[1][selected[edge_index[0]]]).cpu()

    for i in range(len(local_models)-1):

        if len(train_splits[i]) == 0:
            local_models[i] = None
            continue

        model = local_models[i]
        load_global_weights(model, i)

        optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
        criterion = CrossEntropyLoss()

        # Seed nodes: the category's processes plus their two-hop successors.
        # NOTE(review): the one-hop successors themselves are not seeds (same
        # as before) - confirm that this is intended.
        mask = torch.zeros(graph.num_nodes, dtype=torch.bool)
        mask[train_splits[i]] = True
        mask[out_neighbors(out_neighbors(train_splits[i]))] = True
        seed_count = mask.sum().item()

        loader = NeighborLoader(graph, num_neighbors=[-1,-1], batch_size=5000,input_nodes=mask)

        for epoch in range(epochs):
            print(f'Training GNN Category {i} Model for Epoch {epoch}')

            total_loss = 0
            for subg in loader:
                model.train()
                optimizer.zero_grad()
                out = model(subg.x, subg.edge_index)
                loss = criterion(out, subg.y)
                loss.backward()
                optimizer.step()
                total_loss += loss.item() * subg.batch_size
            print("Loss: ", total_loss / seed_count, '\n')

    # Catch-all model: full-batch training on the entire client graph.
    catch_all = local_models[-1]
    # server_aggregate stores its average under the model's position in the
    # list; without this load every round restarted from the untrained template.
    load_global_weights(catch_all, len(local_models) - 1)
    optimizer = torch.optim.Adam(catch_all.parameters(), lr=0.01, weight_decay=5e-4)
    criterion = CrossEntropyLoss()

    for epoch in range(epochs):
        print(f'Training Catch all GNN Category Model for Epoch {epoch}')
        catch_all.train()
        optimizer.zero_grad()
        out = catch_all(graph.x, graph.edge_index)
        loss = criterion(out, graph.y)
        loss.backward()
        optimizer.step()
        print(f"Epoch: {epoch}, Loss: {loss.item()}")

    return local_models

In [28]:
procs_total = []    # process ids of all hosts, concatenated (duplicates possible)
data_cache = {}     # host id -> [docs, labels, edges, mapp, pids, idx_to_pid]
categories = None   # list of process-id sets, filled in by load_clients_data()

def load_clients_data():
    """
    Build every host's graph and derive the global process categories.

    For each host in `hosts` the events are restricted to process actors and
    to object types listed in `tokens`; the six values returned by
    prepare_graph are stored as a list in `data_cache[host]` and the host's
    process ids are appended to `procs_total`. Finally `categories` is
    computed from all collected process ids.
    """
    global categories, procs_total

    for name in hosts:
        host_events = hostdfs[name]
        host_events = host_events[host_events['actor_type'] == 'SUBJECT_PROCESS']
        host_events = host_events[host_events['object'].isin(tokens)]

        graph_parts = list(prepare_graph(host_events))
        data_cache[name] = graph_parts
        # Position 4 holds the host's process ids.
        procs_total = procs_total + graph_parts[4]

    categories = define_categories(procs_total)

In [29]:
def client_handling_loop(client_id):
    """
    Run local training for one client.

    Takes the client's cached graph from `data_cache`, embeds every node's
    token list with `infer` and passes the result to train_gnn_func.

    return:
        Whatever train_gnn_func returns for this client.
    """
    print(f"Running Setup on Client {client_id} \n")

    docs,labels,edges,mapp,pids,idx_to_pid = data_cache[client_id]
    nodes_feat = [infer(doc) for doc in docs]

    return train_gnn_func(nodes_feat,labels,edges,mapp,pids,idx_to_pid)

In [30]:
def server_aggregate(all_models):
    """
    Average the clients' models slot by slot and persist every averaged model.

    Args:
        all_models (list): entry l holds the clients' models for slot l;
            None marks a client that did not train that slot.

    return:
        list: deep copies of `templates`; slot l carries the unweighted mean
        of the clients' parameters when at least one client trained it (and is
        then saved to 'Content_FL_Exp/target_e5_cadets_global<l>.pth'),
        otherwise the template weights are left untouched.
    """
    global_models = copy.deepcopy(templates)

    for slot, candidates in enumerate(all_models):
        trained = [model for model in candidates if model is not None]
        if not trained:
            continue

        client_states = [model.state_dict() for model in trained]
        averaged = global_models[slot].state_dict()

        for key in averaged.keys():
            stacked = torch.stack([state[key] for state in client_states], 0)
            averaged[key] = stacked.mean(0)

        global_models[slot].load_state_dict(averaged)
        torch.save(global_models[slot].state_dict(), f"Content_FL_Exp/target_e5_cadets_global{slot}.pth")

    return global_models

In [31]:
import random
def perform_federated_learning(n_clients):
    """
    Run local training on every client, one after the other.

    Args:
        n_clients (iterable): the client ids to train (not a count).

    return:
        list: client_handling_loop's result for each client, in input order.
    """
    return [client_handling_loop(client) for client in n_clients]

In [23]:
# Delete the aggregated checkpoints of a previous run; train_gnn_func and
# run_evaluation load 'target_e5_cadets_global<i>.pth' whenever the file exists.
!rm Content_FL_Exp/target_e5_cadets*.pth

In [29]:
# Restore the per-host graph cache from disk instead of rebuilding it.
# After the JSON round trip the keys of each idx_to_pid dict are strings.
with open('Content_FL_Exp/e5_cadets_ensemble_ben.json', 'r') as f:
    data_cache = json.load(f)

# The second-to-last entry of every cache record holds the host's process ids.
proc_total = []
for x in hosts:
    proc_total.extend(data_cache[x][-2])

categories = define_categories(proc_total)

In [26]:
# Rebuilds data_cache and categories from hostdfs, replacing the entries of any
# host that was loaded from the JSON cache in the previous cell.
load_clients_data()

In [27]:
#with open("Content_FL_Exp/e5_cadets_ensemble_ben.json", 'w') as file:
#    json.dump(data_cache, file)

In [None]:
# One federated round = local training on every host, then server-side averaging.
for r in range(learning_rounds):
    print(f"Federated Learning Round Number: {r}\n")
    client_models = perform_federated_learning(hosts)
    # Transpose: one list per client -> one list per model slot.
    arranged_models = list(map(list, zip(*client_models)))
    global_models = server_aggregate(arranged_models)

## Evaluation of the trained GNN model starts here

In [32]:
'''
This function is used for constructing neighborhood around a given 
set of nodes for backwards or forward tracking
'''
from itertools import compress
from torch_geometric import utils

def construct_neighborhood(ids,mapp,edges,hops):
    """
    Collect the node ids reachable from `ids` within `hops` steps, ignoring
    edge direction.

    Step k looks at the nodes found in step k-1 (the start nodes for k = 1)
    and adds every node that shares an edge with one of them. The start nodes
    are only part of the result if a later step leads back to them.

    Args:
        ids (iterable): ids of the start nodes.
        mapp (list): node ids, position = node index.
        edges (list): [sources, targets] as positions in `mapp`.
        hops (int): number of steps; 0 or a negative value yields an empty set
            (the recursive version never terminated for negative values).

    return:
        set: ids of all nodes found in steps 1 .. hops.
    """
    reached = set()
    # A set gives constant-time membership tests whatever iterable was passed.
    frontier = set(ids)

    for _ in range(hops):
        if not frontier:
            # Nothing left to expand; further steps cannot add anything.
            break
        found = set()
        for src, dst in zip(edges[0], edges[1]):
            src_id, dst_id = mapp[src], mapp[dst]
            if src_id in frontier:
                found.add(dst_id)
            if dst_id in frontier:
                found.add(src_id)
        reached |= found
        frontier = found

    return reached

In [33]:
'''
This function logs the evaluation metrics.
'''

def helper(MP,all_pids,GP,edges,mapp):
    """
    Print the detection metrics of a set of alerts against the ground truth.

    Counting is neighbourhood-aware: a false positive within two hops of a
    ground-truth node is not counted, and a missed ground-truth node within
    two hops of a true positive is counted as detected.

    Args:
        MP (set): ids flagged by the model.
        all_pids (set): ids of all nodes under evaluation.
        GP (set): ground-truth malicious ids.
        edges (list): [sources, targets] as positions in `mapp`.
        mapp (list): node ids, position = node index.

    return:
        tuple: (TPL, FPL), the ids counted as true and as false positives.
    """
    def ratio(numerator, denominator):
        # An undefined ratio is reported as 0.0 instead of raising
        # ZeroDivisionError (e.g. precision when nothing was flagged).
        return numerator / denominator if denominator else 0.0

    TP = MP.intersection(GP)
    FP = MP - GP
    FN = GP - MP
    TN = all_pids - (GP | MP)

    two_hop_gp = construct_neighborhood(GP,mapp,edges,2)
    two_hop_tp = construct_neighborhood(TP,mapp,edges,2)
    FPL = FP - two_hop_gp
    TPL = TP.union(FN.intersection(two_hop_tp))
    FN = FN - two_hop_tp

    # From here on the names hold counts instead of sets.
    TP,FP,FN,TN = len(TPL),len(FPL),len(FN),len(TN)

    print(f"Number of True Positives: {TP}")
    print(f"Number of False Positives: {FP}")
    print(f"Number of False Negatives: {FN}")
    print(f"Number of True Negatives: {TN}\n")

    prec = ratio(TP, TP + FP)
    print(f"Precision: {prec}")

    rec = ratio(TP, TP + FN)
    print(f"Recall: {rec}")

    fscore = ratio(2*prec*rec, prec + rec)
    print(f"Fscore: {fscore}\n")

    return TPL,FPL

In [34]:
'''
Timestamps:

second_range_start = '2019-05-16 09:20:32'
second_range_end = '2019-05-16 10:06:29'
'''

"\nTimestamps:\n\nsecond_range_start = '2019-05-16 09:20:32'\nsecond_range_end = '2019-05-16 10:06:29'\n\n"

In [47]:
def load_data_test():
    """
    Build the evaluation graph from the events of the test time window.

    Reads 'e5_data/cadets_df.parquet', keeps the rows whose timestamp lies in
    the window below (both ends included), whose actor is a process and whose
    object type is listed in `tokens`, and runs prepare_graph on them.

    return:
        list: [docs, labels, edges, mapp, pids, idx_to_pid] as produced by
        prepare_graph.
    """
    events = pd.read_parquet('e5_data/cadets_df.parquet')
    events['timestamp'] = pd.to_datetime(events['timestamp'])

    second_range_start = '2019-05-16 07:00:00'
    second_range_end = '2019-05-16 13:00:00'

    keep = (
        (events['timestamp'] >= second_range_start)
        & (events['timestamp'] <= second_range_end)
        & (events['actor_type'] == 'SUBJECT_PROCESS')
        & events['object'].isin(tokens)
    )

    return list(prepare_graph(events[keep]))

In [None]:
# Evaluation graph, [docs, labels, edges, mapp, pids, idx_to_pid]; read by run_evaluation.
data_mal = load_data_test()

In [None]:
GT_mal = {'2D11355E-77BD-11E9-A28B-D4AE52C1DBD3', '77FF0EE8-CE75-A759-B5CE-26AD59A71AB0', 'B81E7548-56D4-3E5A-9456-26143A3E530D', 'AB27FA87-77C0-11E9-B41B-D4AE52C1DBD3', 'AC0CD001-BE10-6853-90BE-4F75536885A9', '0EF90AE6-77BC-11E9-B129-6C2B597E484C', 'C2A6F4AF-47B3-F65E-B347-BD835EF6776C', '91258993-B942-C35D-82B9-25794DC34401', '53B689C2-D5CE-7A5A-8ED5-B1B8DA7A0B7D', '986BCC27-77BD-11E9-A28B-D4AE52C1DBD3', 'AB21A999-77C0-11E9-B129-6C2B597E484C', '4A9B6AEE-317D-9751-BD31-DFBEA1978EB7', 'A47214EF-B795-315C-95B7-CE527C314504', '0563A9B1-77BE-11E9-A28B-D4AE52C1DBD3', 'D42392CE-7042-11E9-B129-6C2B597E484C', 'E4A97321-64C1-7E54-8164-2E76747EDD2C', '987063DE-77BD-11E9-A28B-D4AE52C1DBD3', '43614DAB-29AC-AC5B-AC29-4A2ABBAC0541', '587E67A7-39FE-6E5B-BE39-C7D97B6ECB13', '5DFD613C-77C1-11E9-B129-6C2B597E484C', 'D4502939-F856-7C59-96F8-2268B97CCB31', '10274AC1-C8F7-E455-B7C8-5354A5E47FF1', '234FCAB9-197A-9C53-BA19-6A19B39CBFF0', '908C1871-9556-355C-9695-1D084C35A52A', '6DDF309B-D209-8350-89D2-B97CB083C0F4', '5DF70258-77C1-11E9-A28B-D4AE52C1DBD3', '8B37ED08-DC01-7D55-81DC-2D0B857DAE7B', 'AC71B1EF-2455-5452-9524-9C5DA25489C0', '0EF735F0-77BC-11E9-B41B-D4AE52C1DBD3', 'B3325DB8-77BE-11E9-B129-6C2B597E484C', '10C43677-77C2-11E9-B41B-D4AE52C1DBD3', 'A6036179-7457-1859-9774-D307091888E3', '2BBD32B0-77BC-11E9-B129-6C2B597E484C', '5DFA99EE-77C1-11E9-A28B-D4AE52C1DBD3', '267A288C-77BE-11E9-B129-6C2B597E484C', '09823C55-578E-9450-8E57-7C7090940A81', '0EFE3642-77BC-11E9-A28B-D4AE52C1DBD3', 'F36EC5A4-EE55-8855-95EE-424395884C3D', '985E5B4E-77BD-11E9-B129-6C2B597E484C', '5DFE0DEF-77C1-11E9-B41B-D4AE52C1DBD3', '00CED438-7043-11E9-B41B-D4AE52C1DBD3', 'A5327AD1-EF4B-0959-8BEF-B5F71909321C', 'FC67BD6A-77BB-11E9-A28B-D4AE52C1DBD3', '2B92217F-77BC-11E9-B129-6C2B597E484C', '5DF3939C-77C1-11E9-A28B-D4AE52C1DBD3', '0EFB3F32-77BC-11E9-B129-6C2B597E484C', 'F410BE96-4C0C-765E-8C4C-6F0E2E760020', '986CFF59-77BD-11E9-A28B-D4AE52C1DBD3', '21C17819-77BF-11E9-B129-6C2B597E484C', 
'986C73C8-77BD-11E9-A28B-D4AE52C1DBD3', 'AB24D5E0-77C0-11E9-B41B-D4AE52C1DBD3', '5DFDAAE9-77C1-11E9-B129-6C2B597E484C', '21C45373-77BF-11E9-A28B-D4AE52C1DBD3', 'AB28A3F5-77C0-11E9-A28B-D4AE52C1DBD3', '556CE3B4-9A31-195B-B19A-0A5F7B1937EB', '2BD094D1-77C2-11E9-A28B-D4AE52C1DBD3', 'E5E2EEDE-77C1-11E9-A28B-D4AE52C1DBD3', '985FFE16-77BD-11E9-B41B-D4AE52C1DBD3', '92B1CBC0-77BE-11E9-A28B-D4AE52C1DBD3', '15D7DD48-70AF-7657-AF70-1CEF4776FAB6', '986108F0-77BD-11E9-B41B-D4AE52C1DBD3', '21CD453C-77BF-11E9-B41B-D4AE52C1DBD3', '2D119F64-77BD-11E9-B41B-D4AE52C1DBD3', '21CE3700-77BF-11E9-B41B-D4AE52C1DBD3', 'DE1C5D85-439B-AB59-9B43-B02329AB616A', '416AB837-8E42-6D58-828E-6976586D8E68', '41D6D923-C5FE-A353-BEC5-D091A3A30DC8', '70416923-0B7F-3258-BF0B-9FDAF8329E70', 'AB25235A-77C0-11E9-B129-6C2B597E484C', 'AB256F03-77C0-11E9-B129-6C2B597E484C', '547BF4B0-6304-7B53-8463-B496937BC4A2', '2B6A35DC-6526-E25B-A665-0F2D2BE24875', 'AB2605BD-77C0-11E9-B129-6C2B597E484C', 'E05AD5A8-E187-E35D-87E1-C0CEEDE3D310', 'B34969B0-77BE-11E9-B129-6C2B597E484C', 'A465C0C7-77BF-11E9-A28B-D4AE52C1DBD3', 'DC93405E-EB5D-E654-9DEB-281F44E63AD8', '7B2D29EB-215F-7757-9F21-CFCF9777740B', '21C62B05-77BF-11E9-A28B-D4AE52C1DBD3', '985EF130-77BD-11E9-B129-6C2B597E484C', 'AB2C8FE9-77C0-11E9-A28B-D4AE52C1DBD3', '9887A62B-6C91-765E-916C-3169BE7667E7', 'C45A3305-0123-3B58-A301-6A04E83B09DF', '98632A47-77BD-11E9-B41B-D4AE52C1DBD3', '21844ACD-77C0-11E9-A28B-D4AE52C1DBD3', '816E6A3D-1172-585C-B211-BC4D3C585A59', '21C3D47D-77BF-11E9-A28B-D4AE52C1DBD3', '0EFF1E29-77BC-11E9-B41B-D4AE52C1DBD3', 'F381167E-9787-A65F-8797-DAE93FA6C677', 'F5A81F19-1907-7454-8719-281FF474EA17', 'DFE18C61-77BD-11E9-A28B-D4AE52C1DBD3', 'AB24D07A-77C0-11E9-B41B-D4AE52C1DBD3', 'F852D5DF-77BF-11E9-B129-6C2B597E484C', 'DFE00498-77BD-11E9-B129-6C2B597E484C', '5DFCCFED-77C1-11E9-B129-6C2B597E484C', '0EF96C4A-77BC-11E9-B129-6C2B597E484C', '2D8BF073-6506-EC5A-8665-24DD9AEC5C56', '970A4F19-8EB0-BA55-B08E-91C775BA02B5', '7A40D143-77BC-11E9-A28B-D4AE52C1DBD3', 
'985E0ED2-77BD-11E9-B129-6C2B597E484C', '0A6725DF-089C-8051-9C08-E2F211807F26', 'B8FFD54B-E634-C156-B4E6-8D03E6C1084F', '21C541A6-77BF-11E9-A28B-D4AE52C1DBD3', 'EFCC799A-BA9A-D150-9ABA-287680D1CE20', '21BEDF21-77BF-11E9-A28B-D4AE52C1DBD3', '1B0F7AD2-36A6-725B-A636-41FD3B721FC0', 'AB2ADEF7-77C0-11E9-B41B-D4AE52C1DBD3', '5F5D4650-B585-1256-85B5-0F0B16120B00', 'AB2A69E7-77C0-11E9-B41B-D4AE52C1DBD3', 'FC94A382-77BB-11E9-A28B-D4AE52C1DBD3', '5DFA128D-77C1-11E9-A28B-D4AE52C1DBD3', '174172A3-D558-C15E-98D5-EF6F0EC1164B', 'AB29A847-77C0-11E9-A28B-D4AE52C1DBD3', '4E5BBF82-09B5-1F58-B509-9CED081F462C', '45FB0EE9-473E-665D-BE47-E6306D6603D7', '4D5BA965-77BE-11E9-B41B-D4AE52C1DBD3', 'B3944524-7042-11E9-A28B-D4AE52C1DBD3', 'DBE56166-8DF5-D156-B58D-41C346D11FAC', '45822348-77BF-11E9-A28B-D4AE52C1DBD3', '985F81E8-77BD-11E9-B41B-D4AE52C1DBD3', 'F8547062-77BF-11E9-B41B-D4AE52C1DBD3', '0A4A0810-7043-11E9-B41B-D4AE52C1DBD3', '0EFC5BC4-77BC-11E9-B41B-D4AE52C1DBD3', 'AB292EAE-77C0-11E9-A28B-D4AE52C1DBD3', '0EFAC465-77BC-11E9-A28B-D4AE52C1DBD3', '84507AF2-99C8-F358-8899-573038F30CFC', '4581D172-77BF-11E9-B129-6C2B597E484C', '985FD512-77BD-11E9-B129-6C2B597E484C', '986EF24F-77BD-11E9-A28B-D4AE52C1DBD3', '0EFA3DC9-77BC-11E9-B41B-D4AE52C1DBD3', 'AB21A56F-77C0-11E9-B129-6C2B597E484C', '52ACBCC7-77BE-11E9-B41B-D4AE52C1DBD3', '5DF55064-77C1-11E9-B41B-D4AE52C1DBD3', '9B800DA5-E9A3-F257-A3E9-9260F7F20BDD', '21C3962C-77BF-11E9-B129-6C2B597E484C', '23AE53AA-475A-775A-9A47-37F3AA772422', '21C4CB34-77BF-11E9-A28B-D4AE52C1DBD3', 'F852A9C5-77BF-11E9-A28B-D4AE52C1DBD3', '75C6B2C2-4B61-F450-A14B-A77FC0F45172', '60818B3D-77BF-11E9-A28B-D4AE52C1DBD3', '0EFA1068-77BC-11E9-B129-6C2B597E484C', 'AF168E23-7042-11E9-A28A-D4AE52C1DBD3', 'AFF42659-F08F-5F51-8FF0-EB8B315F0779', '0EFAA91F-77BC-11E9-B129-6C2B597E484C', '21C35697-77BF-11E9-A28B-D4AE52C1DBD3', '653D8CCE-2987-A95B-8729-8875CBA92F9C', 'E431EE7F-C5AD-0C57-ADC5-8AD2C70C2917', '70CC7497-3CAB-EC55-AB3C-2E99B5EC8C0B', '421A0178-300D-0D55-8D30-E335D50D999D', 
'986FE51C-77BD-11E9-A28B-D4AE52C1DBD3', '2D12BFA6-77BD-11E9-B129-6C2B597E484C', '21CEB140-77BF-11E9-B41B-D4AE52C1DBD3', '5DFB8F5B-77C1-11E9-A28B-D4AE52C1DBD3', 'D590CAB3-9386-DD5C-8693-F242CCDDEC62', '21C6A79E-77BF-11E9-A28B-D4AE52C1DBD3', '7A8124B7-1BFA-AB51-BA1B-D766A1AB156B', 'A03DDACC-F74D-D957-8DF7-ACFCD7D94951', '5DF7F6D5-77C1-11E9-B129-6C2B597E484C', 'BD6781D6-7042-11E9-A28B-D4AE52C1DBD3', '5DFD18FC-77C1-11E9-B129-6C2B597E484C', '9567DB9B-03C9-A35A-8903-11AF6AA3AC14', '21C6F74A-77BF-11E9-B41B-D4AE52C1DBD3', '985B2D9A-77BD-11E9-B129-6C2B597E484C', '985B3C2C-77BD-11E9-B41B-D4AE52C1DBD3', '49CA1749-5058-8959-9850-8F10398966E4', 'FDF0A4F7-767F-FC53-BF76-A987F3FCDCD8', '0EFB8C9A-77BC-11E9-B129-6C2B597E484C', '24A23B99-669B-1656-9B66-54CE3616CDA6', '5DF7FE5B-77C1-11E9-B129-6C2B597E484C', 'AF709E80-7042-11E9-A28A-D4AE52C1DBD3', '403C38D2-1E20-5058-A01E-5AD9B850311F', '0EFEA35F-77BC-11E9-B41B-D4AE52C1DBD3', '21C21C90-77BF-11E9-A28B-D4AE52C1DBD3', '0EF72D39-77BC-11E9-B129-6C2B597E484C', 'DFE1EFBC-77BD-11E9-B41B-D4AE52C1DBD3', 'AB292805-77C0-11E9-B41B-D4AE52C1DBD3', 'AB2C13C8-77C0-11E9-A28B-D4AE52C1DBD3', 'AB2C4819-77C0-11E9-B41B-D4AE52C1DBD3', 'FB9FD49A-C575-DA5B-B5C5-11966BDA15C8', 'A463D3C4-F4D6-2E5A-96F4-6B055A2EB379', '5DFBB0A5-77C1-11E9-B41B-D4AE52C1DBD3', '92B67F14-77BE-11E9-B129-6C2B597E484C', 'FC49A5EE-7157-6455-9771-A34D95641A12', '1BFC813E-DE27-E357-A7DE-CF04C7E3CE67', '5DF53893-77C1-11E9-B41B-D4AE52C1DBD3', '9F9AEA69-4556-F35D-9645-7D751DF345F6', '21C34859-77BF-11E9-B129-6C2B597E484C', '21C11727-77BF-11E9-B129-6C2B597E484C', 'A901490D-1FA2-AF50-A21F-868A70AFC6E2', '21A98C39-77C0-11E9-A28B-D4AE52C1DBD3', '98607586-77BD-11E9-B41B-D4AE52C1DBD3', 'AB29A774-77C0-11E9-B41B-D4AE52C1DBD3', '985D6D95-77BD-11E9-B129-6C2B597E484C', 'AB289469-77C0-11E9-B41B-D4AE52C1DBD3', 'E5BEEAAE-77C1-11E9-A28B-D4AE52C1DBD3', 'E5BE02A8-1A59-045B-991A-504E5B04464E', '0EFAF427-77BC-11E9-B129-6C2B597E484C', '986F6AC4-77BD-11E9-A28B-D4AE52C1DBD3', 'B343CA9F-7042-11E9-A28A-D4AE52C1DBD3', 
'C9553E40-617E-B258-BE61-2E6188B2F3F4', '20655ED4-DB2C-465F-ACDB-A5EF7F468B07', '0EFA5D10-77BC-11E9-B129-6C2B597E484C', '7E48A854-81F9-5754-B981-FDA0F457BA6C', '7A40FE16-77BC-11E9-B129-6C2B597E484C', 'AB2CBF15-77C0-11E9-B41B-D4AE52C1DBD3', '4A2960C4-31C3-4458-8331-C9F87844401D', 'AB280656-77C0-11E9-A28B-D4AE52C1DBD3', '985F8871-77BD-11E9-B129-6C2B597E484C', '9B3C4DF5-13DF-7D5E-9F13-76D88E7DC649', '0EFAE2B3-77BC-11E9-B41B-D4AE52C1DBD3', '5DFBECC3-77C1-11E9-B129-6C2B597E484C', '985EA601-77BD-11E9-B129-6C2B597E484C', 'AB248B9F-77C0-11E9-B129-6C2B597E484C', 'E5A588A2-77C1-11E9-A28B-D4AE52C1DBD3', 'FFF3003E-0E40-155C-800E-7E804C152486', '0EFD41A9-77BC-11E9-A28B-D4AE52C1DBD3', '72F1B0DA-370B-B850-8B37-097B90B864E6', '5DF52CB1-77C1-11E9-B41B-D4AE52C1DBD3', '0EFCC80A-77BC-11E9-A28B-D4AE52C1DBD3', '4F51B276-B332-8A5D-B2B3-1D7F0D8A022C', '21CB63B9-77BF-11E9-B41B-D4AE52C1DBD3', 'E5DBE605-77C1-11E9-A28B-D4AE52C1DBD3', '21C2FD9D-77BF-11E9-B129-6C2B597E484C', 'E1ACD559-0CD9-5357-990C-2891875394A8', '45FA8B1C-1628-D459-A816-202B49D415DC', '0EFDB6E8-77BC-11E9-B41B-D4AE52C1DBD3', '21CC55A7-77BF-11E9-B41B-D4AE52C1DBD3', 'D1042279-ADA7-1A51-A7AD-427D911A1470', '21C1CC76-77BF-11E9-B129-6C2B597E484C', 'AB26513D-77C0-11E9-B129-6C2B597E484C', '9B987587-DCB5-295E-B5DC-5F4B2E292E34', '44AA7080-44DD-EE57-9D44-F69A47EEBAD7', '0EFC4BAF-77BC-11E9-A28B-D4AE52C1DBD3', 'CF615640-7042-11E9-B129-6C2B597E484C', 'AB2389D4-77C0-11E9-B129-6C2B597E484C', 'AB22E209-77C0-11E9-A28B-D4AE52C1DBD3', '680AFF23-5240-8954-8052-DB7044890360', '3647624B-77C2-11E9-A28B-D4AE52C1DBD3', '0EF705DE-77BC-11E9-A28B-D4AE52C1DBD3', '227A4375-488A-255B-8A48-A64EDB250DD2', 'AB2AA0B7-77C0-11E9-A28B-D4AE52C1DBD3', '10C3C2DD-77C2-11E9-B129-6C2B597E484C', '281785AC-CC65-AD55-A5CC-8BDD75AD1003', '1633500B-D34B-8058-8BD3-ABB7D880915E', '5DFC0D40-77C1-11E9-A28B-D4AE52C1DBD3', 'CB396750-6738-2D5F-B867-B9D3BF2D87D5', 'BE5788E9-3A56-6556-963A-148E16650E31', '985EFAAF-77BD-11E9-B41B-D4AE52C1DBD3', 'EF7D402F-7605-E356-8576-F21B76E320D8', 
'5DFC895F-77C1-11E9-A28B-D4AE52C1DBD3', '5DFC9EFB-77C1-11E9-B41B-D4AE52C1DBD3', '7A41378F-77BC-11E9-B41B-D4AE52C1DBD3', '5DC8E37B-CBB9-165F-B9CB-DBD5BF16FA30', '63F6BE39-77C1-11E9-A28B-D4AE52C1DBD3', 'B207934D-DBF1-3650-B1DB-026F10367CF9', '5DFD0167-77C1-11E9-A28B-D4AE52C1DBD3', 'A9331079-AB10-8656-90AB-7E5E76860DF4', '268F6A76-77BE-11E9-B129-6C2B597E484C', 'B9173445-682B-F455-AB68-86E245F4B9A6', 'DD95BBDF-7042-11E9-B129-6C2B597E484C', '0EFCE9B4-77BC-11E9-B41B-D4AE52C1DBD3', '3316E533-7A07-0F52-877A-2B9E220F9C8A', '0EFBE4EB-77BC-11E9-B41B-D4AE52C1DBD3', '0EFBD976-77BC-11E9-B129-6C2B597E484C', 'B37118C7-77BE-11E9-B129-6C2B597E484C', 'B6206885-77BF-11E9-A28B-D4AE52C1DBD3', '5DFB398C-77C1-11E9-B41B-D4AE52C1DBD3', '985DC070-77BD-11E9-B129-6C2B597E484C', '373D02C0-A81B-7D54-9BA8-F181547DCBCF', 'AB24D6A5-77C0-11E9-B129-6C2B597E484C', '5DFB397B-77C1-11E9-B129-6C2B597E484C', '5DFDF5F1-77C1-11E9-B129-6C2B597E484C', '22CF080D-4455-C153-9544-1401B3C16DC0', 'F1ED1A55-BAC4-F55A-84BA-B2812AF5498E', '21C266C5-77BF-11E9-B129-6C2B597E484C', '98624082-77BD-11E9-B41B-D4AE52C1DBD3', '0EFDBD1E-77BC-11E9-A28B-D4AE52C1DBD3', '9861CC0C-77BD-11E9-B41B-D4AE52C1DBD3', '10C3C30B-77C2-11E9-A28B-D4AE52C1DBD3', '1C918A35-ECFB-9C53-BBEC-6D6AE39CCB44', '709926E8-26FC-495E-BC26-07EABE498DBF', '07AA22F3-28AA-EC54-AA28-A281C4EC6FB2', '023C8CA3-7390-4D57-9073-46DD874D304A', '21C21A68-77BF-11E9-B129-6C2B597E484C', '23E0B115-8D17-A852-978D-F09FF2A8117D', '985E4EFB-77BD-11E9-B41B-D4AE52C1DBD3', '2274BE80-2358-3F5A-9823-B56EDA3F8510', 'D44C2740-A6DD-2056-9DA6-2C2E86207843', '2BB6D486-77BC-11E9-B129-6C2B597E484C', '5DFD16D9-77C1-11E9-B41B-D4AE52C1DBD3', '5DF37D1E-77C1-11E9-A28B-D4AE52C1DBD3', 'A7C0B6CE-C649-5C51-89C6-9308515CF876', 'D6BFF2A5-44AC-4D5D-AC44-0959ED4DC0F8', '5DFE9463-77C1-11E9-B41B-D4AE52C1DBD3', '7BB898FC-A586-1E5C-86A5-A7054C1E3DC4', '4582AB63-77BF-11E9-B41B-D4AE52C1DBD3', '2BA551DA-77BC-11E9-B129-6C2B597E484C', '33B6DB1F-890B-215D-8B89-5E53ED2177B7', '5DFC2719-77C1-11E9-B41B-D4AE52C1DBD3', 
'A440DAF8-77BF-11E9-A28B-D4AE52C1DBD3', '733FB083-1430-055E-B014-D06BDE054066', '0EF9C04B-77BC-11E9-B129-6C2B597E484C', '985D0BFD-77BD-11E9-B129-6C2B597E484C', 'AB2B5A0A-77C0-11E9-B41B-D4AE52C1DBD3', 'AB2A1F52-77C0-11E9-A28B-D4AE52C1DBD3', '21BF32FC-77BF-11E9-B129-6C2B597E484C', '5DFC85CC-77C1-11E9-B129-6C2B597E484C', '3C654629-ACBD-2E5E-BDAC-AA8C0E2E30C4', '5CC42092-7FD6-7D51-967F-C031817DA246', '985F3C33-77BD-11E9-B129-6C2B597E484C', '08949D0F-898D-E959-8D89-325159E99399', 'AB2B966D-77C0-11E9-A28B-D4AE52C1DBD3', '0EFA207D-77BC-11E9-A28B-D4AE52C1DBD3', '0EFB511B-77BC-11E9-A28B-D4AE52C1DBD3', 'FE857CDC-79A9-4557-A979-B551074517C8', '8DAD393D-2C9E-7351-9E2C-63F73173EDAE', '01DDFF5B-77BE-11E9-A28B-D4AE52C1DBD3', '0EFBD181-77BC-11E9-A28B-D4AE52C1DBD3', '609A437D-77BF-11E9-A28B-D4AE52C1DBD3', 'AB2B1D5A-77C0-11E9-A28B-D4AE52C1DBD3', 'C8AB800D-7D40-6057-807D-918927602196', 'BC36EE8E-0BE3-DE50-A30B-67ADE0DECCE1', 'AB22ECD0-77C0-11E9-A28B-D4AE52C1DBD3', '0EFB69F5-77BC-11E9-B41B-D4AE52C1DBD3', '0E51AC0C-77BC-11E9-A28B-D4AE52C1DBD3', '5DFAB226-77C1-11E9-B41B-D4AE52C1DBD3', '21CAD601-77BF-11E9-B41B-D4AE52C1DBD3', 'C747C575-30AA-5D50-AA30-6528105D4D5D', '591B6D97-AB8A-C752-8AAB-31CA42C79B37', '0E8027DC-7002-155E-8270-8A3CAE153B5B', '0EFE2EA5-77BC-11E9-B41B-D4AE52C1DBD3', '92B0ACDA-77BE-11E9-B41B-D4AE52C1DBD3', '5DFB99D8-77C1-11E9-B129-6C2B597E484C', '5DFD7D71-77C1-11E9-A28B-D4AE52C1DBD3', 'B622B508-77BF-11E9-A28B-D4AE52C1DBD3', '5DF80D65-77C1-11E9-B129-6C2B597E484C', '98112028-D237-5157-B7D2-280CE751AE02', '21C3E304-77BF-11E9-B129-6C2B597E484C', 'AB23E8EF-77C0-11E9-B129-6C2B597E484C', '39E5C148-D4A1-CC58-A1D4-B07DD8CCBD49', 'AB243B67-77C0-11E9-B129-6C2B597E484C', '5DF36937-77C1-11E9-A28B-D4AE52C1DBD3', 'D8F507F7-4B0E-C952-8E4B-830F52C98AFF', '6D53A12C-D093-6C5C-93D0-7616BC6CD4D2', '21CA2827-77BF-11E9-B41B-D4AE52C1DBD3', '21CDC320-77BF-11E9-B41B-D4AE52C1DBD3', '21C5B6C3-77BF-11E9-A28B-D4AE52C1DBD3', '986DF89E-77BD-11E9-A28B-D4AE52C1DBD3', '5DF8740C-77C1-11E9-B41B-D4AE52C1DBD3', 
'5DFDFA89-77C1-11E9-A28B-D4AE52C1DBD3', '738B7C9F-D3EE-6A5F-AED3-FF407F6A7609', 'B1E1F342-C4A9-8950-A9C4-792D4089BF48', '986E76FE-77BD-11E9-A28B-D4AE52C1DBD3', '0EFEB375-77BC-11E9-A28B-D4AE52C1DBD3', '9862B4FE-77BD-11E9-B41B-D4AE52C1DBD3', '035DD774-4CD4-3F50-944C-D5BB803F6619', 'AB2BCFB0-77C0-11E9-B41B-D4AE52C1DBD3', '18744ECE-618C-3A55-8C61-BB5B453AB671', 'F08AD9C4-66CE-0C54-8E66-156F140C51CF', '5108317C-59BA-245A-BA59-DDB76A243440', '0E4F5AE8-77BC-11E9-A28B-D4AE52C1DBD3', '21C2B279-77BF-11E9-B129-6C2B597E484C', '9868A740-77BD-11E9-A28B-D4AE52C1DBD3', 'AFA4EC8C-5776-FD5D-B657-EE55DDFD7978', 'B36A68D2-77BE-11E9-B129-6C2B597E484C', '3C08BE34-5951-6850-9159-659CF0681915', '21C2CA92-77BF-11E9-A28B-D4AE52C1DBD3', '21CCCCE7-77BF-11E9-B41B-D4AE52C1DBD3', '32C189F2-77C2-11E9-A28B-D4AE52C1DBD3', '6A5473C0-DCA6-325F-A6DC-C237DF321E65'}

In [None]:
def run_evaluation(thresh):
    """
    Flag anomalous nodes in the evaluation graph and print the metrics.

    Every node starts out flagged. For each aggregated checkpoint
    'Content_FL_Exp/target_e5_cadets_global<i>.pth' that exists, the flag is
    cleared for nodes whose type the model predicts correctly with a
    confidence of at least `thresh`. The nodes still flagged after all models
    were applied are the alerts passed to `helper`.

    Args:
        thresh (float): minimum rescaled confidence for a correct prediction
            to clear a node.

    Side effects:
        Reloads the global `word2vec` dictionary from disk; prints the number
        of alerts and the metrics.
    """
    global word2vec

    phrases,labels,edges,mapp,pids,idx_to_pid = data_mal

    model = GCN(30,len(tokens)).to(device)
    word2vec = load_word_model()

    nodes = np.array([infer(x) for x in phrases])

    all_ids = set(mapp)

    graph = Data(x=torch.tensor(nodes,dtype=torch.float).to(device),y=torch.tensor(labels,dtype=torch.long).to(device), edge_index=torch.tensor(edges,dtype=torch.long).to(device))

    # Kept on the CPU; device-side masks are moved over before indexing.
    flag = torch.ones(graph.num_nodes, dtype=torch.bool)

    loaded = 0
    for m_n in range(num_of_ctg):
        checkpoint = os.path.join("Content_FL_Exp", f"target_e5_cadets_global{m_n}.pth")
        if not os.path.exists(checkpoint):
            # Without its checkpoint the model would still carry the previous
            # slot's weights (or random ones on the first pass), so skip it.
            continue
        model.load_state_dict(torch.load(checkpoint, map_location=device))
        loaded += 1

        model.eval()
        with torch.no_grad():  # inference only; no autograd graph is needed
            out = model(graph.x, graph.edge_index)

        probs, indices = out.sort(dim=1,descending=True)
        # Relative margin between the two most likely classes.
        conf = (probs[:,0] - probs[:,1]) / probs[:,0]
        # NOTE(review): this divides by conf.max(), not by (max - min), so it is
        # not a min-max scaling to [0, 1] - confirm before relying on `thresh`.
        conf = (conf - conf.min()) / conf.max()

        pred = indices[:,0]
        cond = (pred == graph.y) & (conf >= thresh)
        flag[cond.cpu()] = False

    if loaded == 0:
        print("Warning: no global checkpoint found in Content_FL_Exp; every node is flagged.")

    index = utils.mask_to_index(flag).tolist()
    ids = set([mapp[x] for x in index])
    print(len(ids))
    helper(set(ids),set(all_ids),GT_mal,edges,mapp)

In [None]:
# Threshold 0: the confidence condition inside run_evaluation is `conf >= thresh`.
run_evaluation(0)

In [None]:
import math

# Function to calculate Shannon Diversity Index for a set
def calculate_sdi(species_set):
    """
    Shannon Diversity Index of a collection whose members are equally abundant.

    Every member gets the proportion 1 / len(species_set) and the index is
    -sum(p * ln(p)) over all members. An empty collection gives 0.
    """
    total_species = len(species_set)
    if total_species <= 0:
        return 0

    # Assuming equal abundance for simplicity, each species has an equal proportion
    pi = 1 / total_species
    weighted_logs = sum(pi * math.log(pi) for _ in range(total_species))
    return -weighted_logs

# Calculate SDI for each category
sdi_per_category = [calculate_sdi(category) for category in categories]
print("SDI for each category:", sdi_per_category)

# Calculate overall SDI by combining all sets into one
combined_set = set().union(*categories)
overall_sdi = calculate_sdi(combined_set)
print("Overall SDI:", overall_sdi)