In [38]:
'''
Importing the required libraries here
'''
import os
os.chdir("/home/jovyan/aron_workdir/gestalt/code/Evaluation_Scripts")
exec(open("Imports.py").read())

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
%matplotlib inline
# Default path to the "Trustwatch" directory
default_directory = "/home/jovyan/Trustwatch"

# Prompt the user for a directory path, with the default if nothing is entered
# NOTE(review): input() blocks until answered, so an unattended
# "Restart & Run All" stops at this cell.
user_directory = input(f"Enter directory path (Press Enter to use default '{default_directory}'): ") or default_directory
print(user_directory)
# Check if we're already in the desired directory
# (plain string comparison: a trailing slash or a symlinked path is treated
# as a different directory and triggers the chdir branch)
if os.getcwd() == user_directory:
    print("Already in the specified directory.")
else:
    print(f"Changing to directory: {user_directory}")
    os.chdir(user_directory)

# Confirm the current directory
# All relative paths used later in this notebook are resolved against it.
print("Current directory:", os.getcwd())

Enter directory path (Press Enter to use default '/home/jovyan/Trustwatch'):  


/home/jovyan/Trustwatch
Changing to directory: /home/jovyan/Trustwatch
Current directory: /home/jovyan/Trustwatch


## Loading libraries and setting up working directory

In [39]:
# Number of GNN models built by init_gnns(); define_categories() splits the
# process ids into num_of_ctg - 1 groups, and train_gnn_func() trains the last
# model on the whole graph ("catch all").
num_of_ctg = 10
# Number of federated rounds executed by the training cell.
learning_rounds = 3
# Local training epochs per model inside train_gnn_func().
epochs = 10
# Client names; used as keys into the data caches.
hosts = ['cadets','theia','trace']
# When False, the checkpoint clean-up cell and the training cell are skipped.
TRAIN=False

## Defining functions for loading, cleaning and constructing features from the data

In [40]:
# Node types kept in the graph; the position in this list is the class label.
tokens = ['SUBJECT_PROCESS',
          'FILE_OBJECT_FILE',
          'NetFlowObject'
         ]


def prepare_graph(df):
    '''
    This is the main featurizer. It constructs the graph from an event table.

    Every row contributes one edge actor -> object. Each endpoint collects a
    "document" (list of strings) made of the row's `exec`, `action` and, when
    non-empty, `path` values; these documents are what gets embedded later.

    Args:
        df (DataFrame): one row per system event with the columns
            'actorID', 'objectID', 'action', 'exec', 'path', 'actor_type'
            and 'object'. 'actor_type' and 'object' must be entries of `tokens`.

    return:
        features (list): per-node document (list of strings), in order of first appearance
        feat_labels (list): per-node label, i.e. index of the node type in `tokens`
        edge_index (list): [sources, targets], two parallel lists of node indices
        mapp (list): node id for each node index
        all_procs (list): ids of all nodes that appear as an actor, in node order
        idx_to_proc (dict): node index -> node id, for actor nodes only

    Raises:
        KeyError: if a row's 'actor_type' or 'object' is not in `tokens`.
    '''
    type_to_label = {token: index for index, token in enumerate(tokens)}

    nodes = {}    # node id -> document; dict order == first-appearance order
    labels = {}   # node id -> label (last seen type wins)
    edges = []    # (actor id, object id) per event
    actors = {}   # ordered set of ids seen in the actor role

    columns = ('actorID', 'objectID', 'action', 'exec', 'path', 'actor_type', 'object')
    # Iterating the columns in lock-step avoids building a Series per row
    # (df.iloc[i]), which dominates the run time on large event tables.
    rows = zip(*(df[column] for column in columns)) if len(df) else ()

    for actorid, objectid, action, exec_name, path, actor_type, object_type in rows:
        for node_id, node_type in ((actorid, actor_type), (objectid, object_type)):
            document = nodes.setdefault(node_id, [])
            document.append(exec_name)
            document.append(action)
            if path != '':
                document.append(path)
            labels[node_id] = type_to_label[node_type]

        edges.append((actorid, objectid))
        actors[actorid] = actorid

    mapp = list(nodes)
    features = [nodes[node_id] for node_id in mapp]
    feat_labels = [labels[node_id] for node_id in mapp]
    index = {node_id: position for position, node_id in enumerate(mapp)}

    edge_index = [[index[src] for src, _ in edges],
                  [index[dst] for _, dst in edges]]

    idx_to_proc = {position: node_id
                   for position, node_id in enumerate(mapp) if node_id in actors}
    # Node order instead of set order, so the list is reproducible across runs.
    all_procs = [node_id for node_id in mapp if node_id in actors]

    return features,feat_labels,edge_index,mapp,all_procs,idx_to_proc

In [41]:
class GCN(torch.nn.Module):
    """Two-layer GraphSAGE node classifier.

    Args:
        in_channel: size of the input node feature vectors.
        out_channel: number of output classes.

    The hidden layer has 32 units; both layers are built with normalize=True.
    """

    def __init__(self,in_channel,out_channel):
        super().__init__()
        self.conv1 = SAGEConv(in_channel, 32, normalize=True)
        self.conv2 = SAGEConv(32, out_channel, normalize=True)

    def forward(self, x: torch.Tensor, edge_index: torch.Tensor) -> torch.Tensor:
        """Return per-node class probabilities (softmax over dim 1 of `encode`).

        NOTE(review): the training code in this notebook passes this output to
        CrossEntropyLoss, which applies log-softmax itself. Left unchanged so
        that saved checkpoints and tuned thresholds keep their meaning.
        """
        return F.softmax(self.encode(x, edge_index), dim=1)

    def encode(self, x: torch.Tensor, edge_index: torch.Tensor) -> torch.Tensor:
        """Return the un-normalised output of the second convolution."""
        # torch.tanh replaces the deprecated F.tanh; the values are identical.
        hidden = torch.tanh(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

    def freeze_conv_layers(self):
        """Disable gradient updates for all parameters of both convolutions."""
        for layer in (self.conv1, self.conv2):
            for param in layer.parameters():
                param.requires_grad = False

In [42]:
def visualize(h, color):
    '''
    This function helps visualize the output of the model.

    The rows of `h` are reduced to two dimensions with t-SNE and drawn as a
    scatter plot on a 10x10 figure without axis ticks.

    Args:
        h: tensor with one row per point (detached and moved to CPU here).
        color: per-point colour values forwarded to `plt.scatter(c=...)`.
    '''
    embedding = h.detach().cpu().numpy()
    projected = TSNE(n_components=2).fit_transform(embedding)
    xs, ys = projected[:, 0], projected[:, 1]

    plt.figure(figsize=(10,10))
    plt.xticks([])
    plt.yticks([])

    plt.scatter(xs, ys, s=70, c=color, cmap="Set2")
    plt.show()

## Adding semantic attributes from the raw cadets data

In [43]:
# Keys used by the CDM18 JSON records read below.
_CDM_EVENT = 'com.bbn.tc.schema.avro.cdm18.Event'
_CDM_UUID = 'com.bbn.tc.schema.avro.cdm18.UUID'
_MISSING = object()


def _dig(node, *keys, default=''):
    '''Follow `keys` through nested containers; return `default` if any step fails.'''
    for key in keys:
        try:
            node = node[key]
        except (KeyError, IndexError, TypeError):
            return default
    return node


def add_attributes(d,p):
    '''
    This function is used for attributing semantic information like process names,
    executable paths, file paths etc. using the raw event log.

    Args:
        d (DataFrame): events with at least the columns 'actorID', 'objectID',
            'action' and 'timestamp'.
        p (str): path of a JSON-lines file; only lines containing "EVENT" are parsed.

    return:
        DataFrame: inner join of `d` (cast to str) with the attributes found in
        the log on ('actorID', 'objectID', 'action', 'timestamp'), with the
        extra columns 'exec' and 'path', duplicates removed. Attributes that
        are absent from a record are stored as ''.

    An event with a second object (`predicateObject2`) yields two rows: one for
    the second object with `predicateObject2Path`, then one for the first object.
    '''
    # The context manager closes the log file; the original left it open.
    with open(p) as f:
        data = [json.loads(line) for line in f if "EVENT" in line]

    info = []
    for record in data:
        event = _dig(record, 'datum', _CDM_EVENT, default=None)

        action = _dig(event, 'type')
        actor = _dig(event, 'subject', _CDM_UUID)
        obj = _dig(event, 'predicateObject', _CDM_UUID)
        timestamp = _dig(event, 'timestampNanos')
        cmd = _dig(event, 'properties', 'map', 'exec')
        path = _dig(event, 'predicateObjectPath', 'string')
        path2 = _dig(event, 'predicateObject2Path', 'string')

        obj2 = _dig(event, 'predicateObject2', _CDM_UUID, default=_MISSING)
        if obj2 is not _MISSING:
            info.append({'actorID':actor,'objectID':obj2,'action':action,'timestamp':timestamp,'exec':cmd, 'path':path2})

        info.append({'actorID':actor,'objectID':obj,'action':action,'timestamp':timestamp,'exec':cmd, 'path':path})

    # Explicit columns keep the merge keys present even when no event was read,
    # so an empty log yields an empty result instead of a KeyError.
    columns = ['actorID', 'objectID', 'action', 'timestamp', 'exec', 'path']
    rdf = pd.DataFrame.from_records(info, columns=columns).astype(str)
    d = d.astype(str)

    return d.merge(rdf,how='inner',on=['actorID','objectID','action','timestamp']).drop_duplicates()

In [44]:
def combine_word2vec_models(models):
    """Merge several Word2Vec models into one model with a shared vocabulary.

    Args:
        models: non-empty sequence of trained Word2Vec models. The first one
            supplies the hyper-parameters (vector_size, window, min_count, sg)
            and the initial vocabulary/vectors.

    Returns:
        A new Word2Vec model. A word known to several models gets the running
        pairwise average of its vectors (so earlier models are weighted less
        than later ones when more than two models share it); a word known to
        only one model keeps that model's vector.
    """
    # Create an empty unified model
    unified_model = Word2Vec(vector_size=models[0].vector_size, window=models[0].window, min_count=models[0].min_count, sg=models[0].sg)

    # Initialize the vocabulary with the words from the first model
    # NOTE(review): every word occurs exactly once in this pseudo-corpus, so
    # this presumably only keeps all words when min_count <= 1 - confirm.
    unified_model.build_vocab([list(models[0].wv.index_to_key)])

    # Copy the vectors from the first model to the unified model for the initial vocabulary
    for word in unified_model.wv.index_to_key:
        unified_model.wv[word] = models[0].wv[word]

    # Iterate through the remaining models and add their unique words and average vectors for overlapping words
    for model in models[1:]:
        # Get the set of unique words in the current model's vocabulary
        unique_words = set(model.wv.index_to_key) - set(unified_model.wv.index_to_key)

        # Add the unique words to the unified model's vocabulary
        unified_model.build_vocab([list(unique_words)], update=True)

        # Iterate through the overlapping words and average their vectors
        # The intersection is taken after the vocabulary update, so it
        # presumably also covers the words just added; any average written for
        # those is replaced by the copy loop below.
        for word in set(model.wv.index_to_key).intersection(set(unified_model.wv.index_to_key)):
            unified_model.wv[word] = (unified_model.wv[word] + model.wv[word]) / 2.0

        # Copy the vectors for the unique words from the current model to the unified model
        for word in unique_words:
            unified_model.wv[word] = model.wv[word]

    return unified_model

In [45]:
#word_models = []
#for m in ['cadets','theia','trace']:
#    word2vec = Word2Vec.load(f"content/word2vec_{m}_E3.model")
#    word_models.append(word2vec)

#global_word = combine_word2vec_models(word_models)
#global_word.save("Content_FL_Exp/global_word2vec_E3.model")

#phrases,labels,edges,mapp = prepare_graph(df)
#word2vec = Word2Vec(sentences=phrases, vector_size=30, window=5, min_count=1, workers=8,epochs=300,callbacks=[saver,logger])

In [46]:
'''
Defining the train and test function in this cell 
'''

'\nDefining the train and test function in this cell \n'

In [47]:
'''
Encoding function for running word2vec inference
'''
from collections import Counter
# Shared embedding model, read through the module-level name `word2vec` by
# infer(); the path is relative to the working directory chosen in the first cell.
word2vec = Word2Vec.load("Content_FL_Exp/global_word2vec_E3.model")

def infer(doc, model=None):
    '''
    Embed a node document as the combination of its word vectors.

    Args:
        doc (list): tokens describing one node.
        model: Word2Vec-like object exposing `.wv`; defaults to the
            module-level `word2vec`.

    return:
        numpy array of length `model.wv.vector_size`. Each known token adds its
        vector multiplied by its number of occurrences; the sum is divided by
        the number of *distinct* known tokens. A document without any known
        token yields the zero vector (previously a 0/0 division produced NaNs).
    '''
    if model is None:
        model = word2vec
    vectors = model.wv

    emb = np.zeros(vectors.vector_size)
    matched = 0
    for token, freq in Counter(doc).items():
        if token in vectors:
            emb = emb + vectors[token] * freq
            matched += 1

    if matched == 0:
        return emb
    return emb / matched

In [48]:
def init_gnns():
    """Create `num_of_ctg` fresh GCN models on `device`.

    Each model takes 30 input features and has one output per entry of `tokens`.
    """
    model_count = num_of_ctg
    return [GCN(30, len(tokens)).to(device) for _ in range(model_count)]

In [49]:
def define_categories(pids, n_categories=None):
    """Split the distinct process ids into near-equal groups.

    Args:
        pids: iterable of process ids (duplicates are ignored).
        n_categories: number of groups; defaults to `num_of_ctg - 1`
            (the remaining model is trained on the whole graph).

    Returns:
        List of `n_categories` sets. Group sizes differ by at most one, the
        larger groups come first.

    Raises:
        ValueError: if the number of groups is smaller than 1.

    The ids are ordered by their string form before slicing, so the partition
    no longer depends on set iteration order (which varies between interpreter
    runs for strings).
    """
    if n_categories is None:
        n_categories = num_of_ctg - 1
    if n_categories < 1:
        raise ValueError("define_categories needs at least one category")

    unique_pids = sorted(set(pids), key=str)
    base, extra = divmod(len(unique_pids), n_categories)
    return [
        set(unique_pids[i * base + min(i, extra):(i + 1) * base + min(i + 1, extra)])
        for i in range(n_categories)
    ]

In [50]:
def map_pids_to_category_indices(pids, categories):
    """Map each pid to the index of the first category that contains it.

    Pids that belong to no category are left out of the result.
    """
    pid_to_category_index = {}

    for pid in pids:
        owner = next(
            (position for position, members in enumerate(categories) if pid in members),
            None,
        )
        if owner is not None:
            pid_to_category_index[pid] = owner

    return pid_to_category_index

In [51]:
# Untrained model set; copied for every client and every aggregation round.
templates = init_gnns()


def _out_neighbors(edge_index, nodes):
    """Return (as a CPU long tensor) the distinct targets of edges leaving `nodes`."""
    sources = torch.as_tensor(nodes, dtype=torch.long, device=edge_index.device)
    if sources.numel() == 0:
        return torch.empty(0, dtype=torch.long)
    leaving = torch.isin(edge_index[0], sources)
    return torch.unique(edge_index[1, leaving]).cpu()


def train_gnn_func(nodes,labels,edges,mapp,pids,idx_to_pid):
    """Train one local copy of every template model on a client's graph.

    Args:
        nodes: per-node feature vectors.
        labels: per-node class labels.
        edges: [sources, targets] node-index lists.
        mapp: node id for each node index.
        pids: process ids of this client.
        idx_to_pid: node index -> process id. Keys may be ints (as produced by
            prepare_graph) or their string form (after a JSON round trip).

    Returns:
        List with one entry per template: a trained model, or None for a
        category this client has no process in. The last entry is the
        "catch all" model trained on the full graph.

    Reads the module-level `categories`, `epochs`, `templates` and `device`.
    """
    pid_to_gnn_index = map_pids_to_category_indices(pids, categories)

    set_pids = set(pids)
    proc_index = [i for i in range(len(mapp)) if mapp[i] in set_pids]

    train_splits = [[] for _ in range(len(categories))]
    for i in proc_index:
        # JSON turns integer dict keys into strings; accept both forms.
        key = str(i) if str(i) in idx_to_pid else i
        pname = idx_to_pid[key]
        train_splits[pid_to_gnn_index[pname]].append(int(i))

    local_models = [copy.deepcopy(x) for x in templates]

    # The graph is identical for every category, so it is built once.
    graph = Data(x=torch.tensor(nodes,dtype=torch.float).to(device),y=torch.tensor(labels,dtype=torch.long).to(device), edge_index=torch.tensor(edges,dtype=torch.long).to(device))

    for i in range(len(local_models)-1):

        if len(train_splits[i]) == 0:
            local_models[i] = None
            continue

        # Continue from the latest global weights of this category, if any.
        checkpoint = f"Content_FL_Exp/target_e3_global{i}.pth"
        if os.path.exists(checkpoint):
            local_models[i].load_state_dict(torch.load(checkpoint, map_location=device))

        optimizer = torch.optim.Adam(local_models[i].parameters(), lr=0.01, weight_decay=5e-4)
        criterion = CrossEntropyLoss()

        # Seed nodes: the category's processes plus the nodes reached by two
        # consecutive outgoing edges (nodes reached by a single edge are only
        # included when they are also reached in two steps).
        mask = torch.zeros(graph.num_nodes, dtype=torch.bool)
        mask[train_splits[i]] = True

        one_hop_neighbors = _out_neighbors(graph.edge_index, train_splits[i])
        two_hop_neighbors = _out_neighbors(graph.edge_index, one_hop_neighbors)
        two_hop_neighbors = two_hop_neighbors[~mask[two_hop_neighbors]]
        mask[two_hop_neighbors] = True

        loader = NeighborLoader(graph, num_neighbors=[-1,-1], batch_size=5000,input_nodes=mask)

        for epoch in range(epochs):
            print(f'Training GNN Category {i} Model for Epoch {epoch}')

            total_loss = 0
            for subg in loader:
                local_models[i].train()
                optimizer.zero_grad()
                out = local_models[i](subg.x, subg.edge_index)
                loss = criterion(out, subg.y)
                loss.backward()
                optimizer.step()
                total_loss += loss.item() * subg.batch_size
            print("Loss: ", total_loss / mask.sum().item(), '\n')

    # Catch-all model: full-batch training on the entire graph.
    optimizer = torch.optim.Adam(local_models[-1].parameters(), lr=0.01, weight_decay=5e-4)
    criterion = CrossEntropyLoss()

    for epoch in range(epochs):
        print(f'Training Catch all GNN Category Model for Epoch {epoch}')
        local_models[-1].train()
        optimizer.zero_grad()
        out = local_models[-1](graph.x, graph.edge_index)
        loss = criterion(out, graph.y)
        loss.backward()
        optimizer.step()
        print(f"Epoch: {epoch}, Loss: {loss.item()}")

    return local_models

In [52]:
procs_total = []
data_cache = {}
categories = None

def load_clients_data():
    """Build the training graph of every client from the raw files.

    For each client the tab-separated event list is read, enriched with
    add_attributes(), restricted to process actors and to object types in
    `tokens`, and turned into a graph with prepare_graph().

    Side effects (module-level names):
        data_cache[name] = [docs, labels, edges, mapp, pids, idx_to_pid]
        procs_total      is extended with each client's process ids
        categories       is recomputed from procs_total
    """
    global data_cache,categories,procs_total

    # client name -> (event list, raw log used for attribute enrichment)
    sources = {
        'cadets': ('content/darpatc/cadets_train.txt', "content/ta1-cadets-e3-official.json.1"),
        'theia': ("content/darpatc/theia_train.txt", "content/ta1-theia-e3-official-1r.json"),
        'trace': ("content/darpatc/trace_train.txt", "content/ta1-trace-e3-official-1.json"),
    }

    for name, (train_file, attribute_file) in sources.items():
        # The context manager closes the file; the original left it open.
        with open(train_file) as f:
            data = f.read().split('\n')
        data = [line.split('\t') for line in data]

        df = pd.DataFrame (data, columns = ['actorID', 'actor_type','objectID','object','action','timestamp'])
        # Short rows (e.g. the empty last line) are padded with None and dropped here.
        df = df.dropna()
        # Timestamps are still strings at this point, so the order is lexicographic.
        df = df.sort_values(by='timestamp', ascending=True)
        df = add_attributes(df,attribute_file)
        df = df[df['actor_type'] == 'SUBJECT_PROCESS']
        df = df[df['object'].isin(tokens)]

        docs,labels,edges,mapp,pids,idx_to_pid = prepare_graph(df)
        data_cache[name] = [docs,labels,edges,mapp,pids,idx_to_pid]
        procs_total = procs_total + pids

    categories = define_categories(procs_total)

In [53]:
# Re-initialised here so this cell can be run without the sequential loader cell.
procs_total = []
data_cache = {}
categories = None

# client name -> (event list, raw log used for attribute enrichment)
_TRAIN_SOURCES = {
    'cadets': ('content/darpatc/cadets_train.txt', "content/ta1-cadets-e3-official.json.1"),
    'theia': ("content/darpatc/theia_train.txt", "content/ta1-theia-e3-official-1r.json"),
    'trace': ("content/darpatc/trace_train.txt", "content/ta1-trace-e3-official-1.json"),
}

def process_file(name):
    """Load one client's training data and build its graph.

    Args:
        name: 'cadets', 'theia' or 'trace'.

    Returns:
        (name, docs, labels, edges, mapp, pids, idx_to_pid), i.e. the client
        name followed by the six values returned by prepare_graph().

    Raises:
        ValueError: for an unknown client name (previously this surfaced as
            an UnboundLocalError on `train_file`).
    """
    try:
        train_file, attribute_file = _TRAIN_SOURCES[name]
    except KeyError:
        raise ValueError(f"Unknown client name: {name!r}") from None

    # Load and process data
    with open(train_file) as f:
        data = f.read().split('\n')
    data = [line.split('\t') for line in data]

    df = pd.DataFrame(data, columns=['actorID', 'actor_type', 'objectID', 'object', 'action', 'timestamp'])
    # Short rows (e.g. the empty last line) are padded with None and dropped here.
    df = df.dropna()
    df = df.sort_values(by='timestamp', ascending=True)
    df = add_attributes(df, attribute_file)
    df = df[df['actor_type'] == 'SUBJECT_PROCESS']
    df = df[df['object'].isin(tokens)]

    docs, labels, edges, mapp, pids, idx_to_pid = prepare_graph(df)

    return name, docs, labels, edges, mapp, pids, idx_to_pid

def load_clients_data_parallel(pool_size=100):
    """Parallel variant of the client loader: one process_file() call per client.

    Args:
        pool_size: upper bound on the number of worker processes. No more
            workers than clients are started, since extra ones would stay idle.

    Side effects (module-level names):
        data_cache[name] = [docs, labels, edges, mapp, pids, idx_to_pid]
        procs_total      is extended with each client's process ids
        categories       is recomputed from procs_total
    Prints the elapsed wall-clock time.
    """
    global data_cache, categories, procs_total

    client_names = ['cadets', 'theia', 'trace']
    start_time = time.time()

    # Parallel processing using multiple CPUs
    workers = max(1, min(pool_size, len(client_names)))
    with Pool(workers) as pool:
        results = pool.map(process_file, client_names)

    # Gather results
    for name, docs, labels, edges, mapp, pids, idx_to_pid in results:
        data_cache[name] = [docs, labels, edges, mapp, pids, idx_to_pid]
        procs_total += pids

    categories = define_categories(procs_total)

    end_time = time.time()
    print(f"Execution time: {end_time - start_time} seconds")

In [54]:
def client_handling_loop(client_id):
    """Embed a client's node documents and train its local models.

    Looks the client up in `data_cache`, turns every node document into a
    feature vector with infer(), and returns the result of train_gnn_func().
    """
    print(f"Running Setup on Client {client_id} \n")

    docs, labels, edges, mapp, pids, idx_to_pid = data_cache[client_id]
    nodes_feat = [infer(document) for document in docs]

    return train_gnn_func(nodes_feat, labels, edges, mapp, pids, idx_to_pid)

In [55]:
def server_aggregate(all_models):
    """Average the client models of every category and save the result.

    Args:
        all_models: one list per category, holding that category's model from
            each client (None where a client did not train it).

    Returns:
        A deep copy of `templates` in which every category with at least one
        client model carries the element-wise mean of the client parameters.
        Those categories are also written to
        "Content_FL_Exp/target_e3_global{index}.pth"; the others keep the
        template weights and are not saved.
    """
    global_models = copy.deepcopy(templates)

    for l, group in enumerate(all_models):
        current_models = [model for model in group if model is not None]
        if not current_models:
            continue

        # Materialise each client's state dict once instead of once per key.
        client_states = [model.state_dict() for model in current_models]
        global_dict = global_models[l].state_dict()

        for k in global_dict.keys():
            global_dict[k] = torch.stack([state[k] for state in client_states], 0).mean(0)

        global_models[l].load_state_dict(global_dict)
        torch.save(global_models[l].state_dict(), f"Content_FL_Exp/target_e3_global{l}.pth")

    return global_models

In [56]:
import random
def perform_federated_learning(n_clients):
    """Run local training on every client and collect the resulting model lists.

    Args:
        n_clients: iterable of client ids (despite the name, not a count).

    Returns:
        List with client_handling_loop(client) for each client, in input order.
    """
    return [client_handling_loop(client) for client in n_clients]

In [57]:
if TRAIN:
    !rm Content_FL_Exp/target_e3_*.pth

In [58]:
# Load the pre-computed training graphs instead of rebuilding them from the raw logs.
# NOTE: JSON stores dict keys as strings, so any integer-keyed mapping inside
# the cache comes back with string keys.
with open('Content_FL_Exp/e3_ensemble_ben.json', 'r') as f:
    data_cache = json.load(f)

# Collect the second-to-last entry of every client's record - presumably the
# process-id list, if the file mirrors the [docs, labels, edges, mapp, pids,
# idx_to_pid] layout used by the loader functions; verify against the writer.
# NOTE(review): this name is `proc_total`, while the loader functions use `procs_total`.
proc_total = []
for x in ['cadets','theia','trace']:
    proc_total = proc_total + data_cache[x][-2]
    
categories = define_categories(proc_total)

In [59]:
#load_clients_data()

In [60]:
# Federated training: skipped entirely unless TRAIN is True.
if TRAIN:
    for r in range(learning_rounds):
        print(f"Federated Learning Round Number: {r}\n")
        # One list of local models per client.
        client_models = perform_federated_learning(hosts)
        # Transpose client-major -> category-major: arranged_models[k] holds
        # the k-th model of every client.
        arranged_models =  [list(group) for group in zip(*client_models)]
        # Averages each group and writes the per-category checkpoints.
        global_models = server_aggregate(arranged_models)

## Evaluation of the trained GNN model starts here

In [61]:
def construct_neighborhood(ids,mapp,edges,hops):
    '''
    This function is used for constructing neighborhood around a given
    set of nodes for backwards or forward tracking

    Args:
        ids: node ids to start from.
        mapp: node id for each node index.
        edges: [sources, targets] node-index lists; direction is ignored.
        hops: number of expansion steps.

    return:
        set of node ids reachable from `ids` in 1..hops steps. Start ids are
        only included if they are reached again from a neighbour. Empty for
        hops <= 0 (a negative value used to recurse without end).
    '''
    reached = set()
    frontier = set(ids)   # set membership keeps each edge test O(1)

    for _ in range(hops):
        if not frontier:
            break
        neighbors = set()
        for src, dst in zip(edges[0], edges[1]):
            src_id, dst_id = mapp[src], mapp[dst]
            if src_id in frontier:
                neighbors.add(dst_id)
            if dst_id in frontier:
                neighbors.add(src_id)
        reached |= neighbors
        frontier = neighbors

    return reached

In [62]:
def helper(MP,all_pids,GP,edges,mapp):
    '''
    This function logs the evaluation metrics.

    Args:
        MP (set): ids flagged by the model.
        all_pids (set): all ids in the graph.
        GP (set): ground-truth malicious ids.
        edges, mapp: graph structure, used for the two-hop relaxation.

    return:
        (TPL, FPL): the relaxed true-positive and false-positive id sets.

    Relaxation: a false positive within two hops of a ground-truth id is not
    counted, and a missed ground-truth id within two hops of a true positive
    is counted as detected. Ratios with an empty denominator are reported as
    0.0 instead of raising ZeroDivisionError.
    '''
    def _ratio(numerator, denominator):
        return numerator / denominator if denominator else 0.0

    TP = MP.intersection(GP)
    FP = MP - GP
    FN = GP - MP

    two_hop_gp = construct_neighborhood(GP,mapp,edges,2)
    two_hop_tp = construct_neighborhood(TP,mapp,edges,2)
    FPL = FP - two_hop_gp
    TPL = TP.union(FN.intersection(two_hop_tp))
    FN = FN - two_hop_tp

    TP,FP,FN = len(TPL),len(FPL),len(FN)

    print(f"Number of True Positives: {TP}")
    print(f"Number of False Positives: {FP}")
    print(f"Number of False Negatives: {FN}")

    prec = _ratio(TP, TP + FP)
    print(f"Precision: {prec}")

    rec = _ratio(TP, TP + FN)
    print(f"Recall: {rec}")

    fscore = _ratio(2*prec*rec, prec + rec)
    print(f"Fscore: {fscore}\n")

    return TPL,FPL

In [63]:
data_cache_mal = {}
def load_data_test():
    """Build the evaluation graph of every client from the raw test files.

    Same pipeline as the training loader (read the event list, enrich it with
    add_attributes(), keep process actors and object types in `tokens`, call
    prepare_graph()), applied to the test files.

    Side effect:
        data_cache_mal[name] = [docs, labels, edges, mapp, pids, idx_to_pid]
    """
    # client name -> (event list, raw log used for attribute enrichment)
    sources = {
        'cadets': ('content/darpatc/cadets_test.txt', "content/ta1-cadets-e3-official-2.json"),
        'theia': ("content/darpatc/theia_test.txt", "content/ta1-theia-e3-official-6r.json.8"),
        'trace': ("content/darpatc/trace_test.txt", "content/ta1-trace-e3-official-1.json.4"),
    }

    for name, (test_file, attribute_file) in sources.items():
        # The context manager closes the file; the original left it open.
        with open(test_file) as f:
            data = f.read().split('\n')
        data = [line.split('\t') for line in data]

        df = pd.DataFrame (data, columns = ['actorID', 'actor_type','objectID','object','action','timestamp'])
        # Short rows (e.g. the empty last line) are padded with None and dropped here.
        df = df.dropna()
        df = df.sort_values(by='timestamp', ascending=True)

        df = add_attributes(df,attribute_file)

        df = df[df['actor_type'] == 'SUBJECT_PROCESS']
        df = df[df['object'].isin(tokens)]

        docs,labels,edges,mapp,pids,idx_to_pid = prepare_graph(df)
        data_cache_mal[name] = [docs,labels,edges,mapp,pids,idx_to_pid]

In [64]:
# Load the pre-computed evaluation graphs; this rebinds data_cache_mal,
# which run_evaluation() reads per dataset name.
with open('Content_FL_Exp/e3_ensemble_mal.json', 'r') as f:
    data_cache_mal = json.load(f)

In [65]:
#load_data_test()

In [66]:
def run_evaluation(data_name,thresh):
    """Evaluate the saved per-category models on one test dataset.

    A node stays flagged as anomalous unless at least one saved model predicts
    its true type with a normalised confidence of at least `thresh`.

    Args:
        data_name: key into `data_cache_mal`; "{data_name}.txt" in the working
            directory lists the ground-truth malicious ids, one per line.
        thresh: confidence threshold.

    Returns:
        The (TPL, FPL) id sets returned by helper(), which also prints the
        metrics. (The function used to return None.)

    Side effect: reloads the module-level `word2vec` model used by infer().
    """
    global word2vec

    phrases,labels,edges,mapp,pids,idx_to_pid = data_cache_mal[data_name]

    # Empty lines (e.g. from a trailing newline) are not ids; keeping them
    # would add a phantom entry to the ground truth and to the miss count.
    with open(f"{data_name}.txt") as gt_file:
        GT_mal = {line for line in gt_file.read().split("\n") if line}

    model = GCN(30,len(tokens)).to(device)
    word2vec = Word2Vec.load("Content_FL_Exp/global_word2vec_E3.model")

    nodes = np.array([infer(x) for x in phrases])

    all_ids = set(mapp)

    graph = Data(x=torch.tensor(nodes,dtype=torch.float).to(device),y=torch.tensor(labels,dtype=torch.long).to(device), edge_index=torch.tensor(edges,dtype=torch.long).to(device))

    # True = still considered anomalous.
    flag = torch.ones(graph.num_nodes, dtype=torch.bool)

    model.eval()
    for m_n in range(num_of_ctg):
        checkpoint = f"Content_FL_Exp/target_e3_global{m_n}.pth"
        # Without a checkpoint there is nothing to evaluate; running the model
        # anyway would reuse the previous (or randomly initialised) weights.
        if not os.path.exists(checkpoint):
            continue
        model.load_state_dict(torch.load(checkpoint,map_location=torch.device('cpu')))

        with torch.no_grad():
            out = model(graph.x, graph.edge_index)

        ranked, indices = out.sort(dim=1,descending=True)
        # Relative margin between the two most likely classes.
        conf = (ranked[:,0] - ranked[:,1]) / ranked[:,0]
        # NOTE(review): this divides by the maximum taken before the shift,
        # not by (max - min). Kept as is because the thresholds used in this
        # notebook were chosen against this scale.
        conf = (conf - conf.min()) / conf.max()

        pred = indices[:,0]
        cond = (pred == graph.y) & (conf >= thresh)
        # `flag` lives on the CPU, so the mask is moved there before indexing.
        flag[cond.cpu()] = False

    index = utils.mask_to_index(flag).tolist()
    ids = set(mapp[x] for x in index)
    metrics = helper(ids,all_ids,GT_mal,edges,mapp)
    return metrics

In [67]:
_ = run_evaluation('cadets',0.85)

Number of True Positives: 12846
Number of False Positives: 308
Number of False Negatives: 12
Precision: 0.9765850691804774
Recall: 0.999066728884741
Fscore: 0.987697985545133



In [68]:
_ = run_evaluation('theia',0.90)

Number of True Positives: 25311
Number of False Positives: 697
Number of False Negatives: 48
Precision: 0.9732005536757921
Recall: 0.9981071808825269
Fscore: 0.985496525006327



In [69]:
_ = run_evaluation('trace',0.8)

Number of True Positives: 67357
Number of False Positives: 3599
Number of False Negatives: 816
Precision: 0.9492784260668583
Recall: 0.9880304519384507
Fscore: 0.9682668602519963

