In [326]:
import os
import json
import pickle
import random
import logging
import numpy as np
import pandas as pd
from glob import glob
from tqdm.notebook import tqdm
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report

import dgl
import dgl.nn as dglnn
from dgl.nn import GraphConv, GATConv, SAGEConv

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW, lr_scheduler
from torch.utils.data import Dataset, DataLoader
from transformers import get_linear_schedule_with_warmup

os.environ['CUDA_VISIBLE_DEVICES'] = "0"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
logging.basicConfig(format='%(asctime)s | %(levelname)s | %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S') 

In [192]:
import re

def build_dictionary(file_path):
    with open(file_path, 'r') as file:
        next(file)
        # 使用正则表达式去除行末的数字
        dictionary = {re.sub(r'\s\d+$', '', line.strip()): index for index, line in enumerate(file)}
    return dictionary

In [299]:
with open("./relations.txt") as fp:
    relations = [r.strip() for r in fp.readlines()]
    
# with open("../data/3_openKE_2/label2id.txt", "r") as fp:
#     label2index = fp.load()
# with open("./index2label.pkl", "rb") as fp:
#     index2label = pickle.load(fp)

# Make dataset

In [301]:
def get_value(event):
    global type2attr

    srcUUID = event["srcNode"]["UUID"]
    srcType = event["srcNode"]["Type"]
    srcAttr = event["srcNode"][type2attr[srcType]]
    dstUUID = event["dstNode"]["UUID"] if event["dstNode"] != None else srcUUID
    dstType = event["dstNode"]["Type"] if event["dstNode"] != None else srcType
    dstAttr = event["dstNode"][type2attr[dstType]] if event["dstNode"] != None else srcAttr
    return srcUUID, srcAttr, dstUUID, dstAttr, event["relation"], event["label"]

In [302]:
def make_dataset(dataset):
    global node_ent2idx, edge_ent2idx, node_ent2emb, edge_ent2emb

    data_list = []
    for p in tqdm(dataset):
        with open(p) as fp:
            events = json.load(fp)
            
        nodes = set()
        edges = []
        relations = []
        labels = []
        uuid2res = {}
        for e in events:
            srcUUID, srcAttr, dstUUID, dstAttr, rel, label = get_value(e)

            uuid2res[srcUUID], uuid2res[dstUUID] = srcAttr, dstAttr
            nodes.add(srcUUID)
            nodes.add(dstUUID)
            edges.append((srcUUID, dstUUID))
            relations.append(edge_ent2idx[rel])
            labels.append(label2index[label])
        nodes = list(nodes)  
        node_feat = [torch.tensor(node_ent2emb[node_ent2idx[uuid2res[uuid]]], dtype=torch.float32) for uuid in nodes]
        edge_attr = [torch.tensor(edge_ent2emb[idx], dtype=torch.float32) for idx in relations]

        src = [nodes.index(src_uuid) for src_uuid, dst_uuid in edges]
        dst = [nodes.index(dst_uuid) for src_uuid, dst_uuid in edges]
        edge_index = torch.tensor([src, dst], dtype=torch.long)

        
        data_list.append({
            "labels": labels,
            "num_nodes": len(nodes),
            "node_feat": node_feat,
            "edge_attr": edge_attr,
            "edge_index": edge_index
        })
    return data_list         

### Directly change the entity to index and change the index into embedding
- need ent -> index
- need ent -> embedding

In [303]:
file_path = '../data/3_openKE_2/label2id.txt'  # 替換為您檔案的路徑
label2index = build_dictionary(file_path)

label2index

{'T1003.002_5a484b65c247675e3b7ada4ba648d376_I': 0,
 'T1113_316251ed-6a28-4013-812b-ddf5b5b007f8_I': 1,
 'T1105_c521e0a70b243a0cf9217907ca3c6d27_I': 2,
 'T1564.004_28862487a99f5f89bc0d68c87396c7e9_I': 3,
 'T1531_b25ae80dad74142fafb510e9c1949ace_B': 4,
 'T1562.002_6a8d25d65a7d481dc479f89c62af1e6a_B': 5,
 'T1047_6935e41353aa781bb723462d26114c44_B': 6,
 'T1562.004_8fe59e288f10a486dc8b44bc872019ff_I': 7,
 'T1105_0856c235a1d26113d4f2d92e39c9a9f8_B': 8,
 'T1069.001_a1f48fa3ddee658b29b414523c9a295b_B': 9,
 'T1518.001_33a24ff44719e6ac0614b58f8c9a7c72_B': 10,
 'T1482_6131397e-7765-424e-a594-3d7fb2d93a6a_B': 11,
 'T1059.001_ccdb8caf-c69e-424b-b930-551969450c57_B': 12,
 'T1201_38f6f0e50a6b196140ec40d3dc9cc9e6_B': 13,
 'T1016_a0676fe1-cd52-482e-8dde-349b73f9aa69_B': 14,
 'T1547.004_0856714c9810ac55b53e9964d02958a0_B': 15,
 'T1021.001_dd67068b052fa553ad4a0ac7d6a5ea89_B': 16,
 'T1552.002_3e5b04b8ee0a1a4950da8f35d95e65fc_B': 17,
 'T1547.001_163b023f43aba758d36f524d146cb8ea_I': 18,
 'T1057_f8de05d1741

In [304]:
index2label = {v: k for k, v in label2index.items()}

index2label

{0: 'T1003.002_5a484b65c247675e3b7ada4ba648d376_I',
 1: 'T1113_316251ed-6a28-4013-812b-ddf5b5b007f8_I',
 2: 'T1105_c521e0a70b243a0cf9217907ca3c6d27_I',
 3: 'T1564.004_28862487a99f5f89bc0d68c87396c7e9_I',
 4: 'T1531_b25ae80dad74142fafb510e9c1949ace_B',
 5: 'T1562.002_6a8d25d65a7d481dc479f89c62af1e6a_B',
 6: 'T1047_6935e41353aa781bb723462d26114c44_B',
 7: 'T1562.004_8fe59e288f10a486dc8b44bc872019ff_I',
 8: 'T1105_0856c235a1d26113d4f2d92e39c9a9f8_B',
 9: 'T1069.001_a1f48fa3ddee658b29b414523c9a295b_B',
 10: 'T1518.001_33a24ff44719e6ac0614b58f8c9a7c72_B',
 11: 'T1482_6131397e-7765-424e-a594-3d7fb2d93a6a_B',
 12: 'T1059.001_ccdb8caf-c69e-424b-b930-551969450c57_B',
 13: 'T1201_38f6f0e50a6b196140ec40d3dc9cc9e6_B',
 14: 'T1016_a0676fe1-cd52-482e-8dde-349b73f9aa69_B',
 15: 'T1547.004_0856714c9810ac55b53e9964d02958a0_B',
 16: 'T1021.001_dd67068b052fa553ad4a0ac7d6a5ea89_B',
 17: 'T1552.002_3e5b04b8ee0a1a4950da8f35d95e65fc_B',
 18: 'T1547.001_163b023f43aba758d36f524d146cb8ea_I',
 19: 'T1057_f8de05d

In [305]:
file_path = '../data/3_openKE_2/relation2id.txt'
edge_ent2idx = build_dictionary(file_path)

edge_ent2idx

{'Process Create': 0,
 'Process Start': 1,
 'CreateFile': 2,
 'SetBasicInformationFile': 3,
 'SetDispositionInformationEx': 4,
 'SetDispositionInformationFile': 5,
 'WriteFile': 6,
 'TCP Connect': 7,
 'TCP Send': 8,
 'UDP Send': 9,
 'TCP Disconnect': 10,
 'RegQueryKey': 11,
 'RegQueryValue': 12,
 'CloseFile': 13,
 'QueryAllInformationFile': 14,
 'QueryAttributeTagFile': 15,
 'QueryBasicInformationFile': 16,
 'QueryDirectory': 17,
 'QueryNetworkOpenInformationFile': 18,
 'ReadFile': 19,
 'TCP Receive': 20,
 'UDP Receive': 21,
 'RegCreateKey': 22,
 'RegSetValue': 23,
 'RegCloseKey': 24,
 'RegDeleteValue': 25,
 'RegOpenKey': 26}

In [306]:
file_path = '../data/3_openKE_2/entity2id.txt'
node_ent2id_txt = build_dictionary(file_path)

In [307]:
node_ent2id_txt['\x1f@028;0 2K5740 70 3@0=8FC A>B@C4=8:0<.exe']

KeyError: '\x1f@028;0 2K5740 70 3@0=8FC A>B@C4=8:0<.exe'

In [308]:
len(node_ent2id_txt)

816212

In [309]:
# node_ent2idx = build_dictionary('../data/3_openKE/entity2id.txt')

with open (f'../data/3_openKE_2/entity2id.pkl', 'rb') as fp:
    node_ent2idx = pickle.load(fp)

In [310]:
node_ent2idx['\x1f@028;0 2K5740 70 3@0=8FC A>B@C4=8:0<.exe']

663432

In [311]:
len(node_ent2idx)

816212

In [313]:
with open(f'./secureBERT_YR/node_vocab2index.pkl', 'rb') as fp:
    node_ent2idx_YR = pickle.load(fp)
with open(f'./secureBERT_YR/edge_vocab2index.pkl', 'rb') as fp:
    edge_ent2idx_YR = pickle.load(fp)
    
with open(f'./secureBERT_YR/nodes_ent2emb_256.pkl', 'rb') as fp:
    node_ent2emb_YR = pickle.load(fp)
with open(f'./secureBERT_YR/edges_ent2emb_16.pkl', 'rb') as fp:
    edge_ent2emb_YR = pickle.load(fp)

In [312]:
node_ent2idx

{'C:\\Users\\bmoore\\AppData\\Local\\Microsoft\\Edge\\User Data\\hyphen-data\\101.0.4906.0\\*': 0,
 'C:\\Users\\nicholas37\\AppData\\Local\\Packages\\Microsoft.BingWeather_8wekyb3d8bbwe\\LocalState\\.git': 1,
 'C:\\Users\\amanda90\\AppData\\Local\\Microsoft\\input\\en-GB': 2,
 'C:\\Users\\ohall\\AppData\\Local\\Microsoft\\OneDrive\\logs\\ListSync\\.git': 3,
 'C:\\Users\\ohall\\AppData\\Local\\Microsoft\\Edge\\User Data\\Web Notifications Deny List\\.git': 4,
 'C:\\Users\\amberhernandez\\AppData\\Local\\Microsoft\\InputPersonalization\\*': 5,
 'C:\\Users\\evansdaniel\\AppData\\Local\\Mozilla\\Firefox\\Profiles\\wfxwaomf.default-release\\cache2\\.git': 6,
 'C:\\Users\\nataliekent\\AppData\\Local\\Google\\Chrome\\User Data\\Default\\Extensions\\felcaaldnbdncclmgdcncolpebgiejap\\1.2_0\\_locales\\en_GB\\*': 7,
 'C:\\Users\\amanda90\\AppData\\Local\\Microsoft\\input\\fr-CI\\*': 8,
 'C:\\Users\\ricky35\\AppData\\Local\\Microsoft\\Edge\\User Data\\WidevineCdm\\4.10.2391.6\\.git': 9,
 'C:\\User

### Add the embedding

In [314]:
DIM = 50
embedding = "transE"
embedding = f'{embedding}_{DIM}'

# with open(f"../data/4_embedding/{embedding}.vec.json", "r") as f:
with open(f"../data/4_embedding_2/{embedding}.vec.json", "r") as f:
    tmp = json.load(f)

node_ent2emb = {idx:emb for idx, emb in tqdm(enumerate(tmp["ent_embeddings.weight"]))}
edge_ent2emb = {idx:emb for idx, emb in tqdm(enumerate(tmp["rel_embeddings.weight"]))}

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [315]:
# 将 node_ent2emb, edge_ent2emb 转换为 NumPy 数组
node_ent2emb = np.array([node_ent2emb[i] for i in range(len(node_ent2emb))], dtype=np.float32)
edge_ent2emb = np.array([edge_ent2emb[i] for i in range(len(edge_ent2emb))], dtype=np.float32)

In [316]:
node_ent2emb.shape

(816212, 50)

In [317]:
edge_ent2emb.shape

(27, 50)

In [318]:
type2attr = {
    "Process": "Cmdline", 
    "File": "Name", 
    "Registry": "Key", 
    "Network": "Dstaddress"
}

random.seed(42)
trainset, validset, testset = [], [], []
for ability in tqdm(os.listdir('../data/Raw_dataset/')):
    paths = glob(f'../data/Raw_dataset/{ability}/number_*/expanded_instance.json')
    random.shuffle(paths)
    trainset.extend(paths[:80])
    validset.extend(paths[80:90])
    testset.extend(paths[90:])
    
train_data = make_dataset(trainset)
valid_data = make_dataset(validset)
test_data = make_dataset(testset)

  0%|          | 0/167 [00:00<?, ?it/s]

  0%|          | 0/13360 [00:00<?, ?it/s]

  0%|          | 0/1670 [00:00<?, ?it/s]

  0%|          | 0/1670 [00:00<?, ?it/s]

# Make Torch dataset

In [319]:
class GraphDataset(Dataset):
    def __init__(self, data_list, device):
        self.data_list = data_list
        self.device = device

    def __len__(self):
        return len(self.data_list)
    
    def __getitem__(self, idx):
        data = self.data_list[idx]
        return data

def collate(samples):
    data_list = samples
    batched_graphs = []
    for data in data_list:
        g = dgl.graph((data["edge_index"][0], data["edge_index"][1]), num_nodes=data["num_nodes"])

        g.ndata['feat'] = torch.stack(data["node_feat"])
        g.edata['feat'] = torch.stack(data["edge_attr"])
        # print(data["labels"])
        g.edata['label'] = torch.tensor(data["labels"])  

        batched_graphs.append(g)
    
    return dgl.batch(batched_graphs)

In [321]:
train_GraphDataset = GraphDataset(train_data, device)
valid_GraphDataset = GraphDataset(valid_data, device)
test_GraphDataset = GraphDataset(test_data, device)

batch_size = 32

train_dataloader = DataLoader(train_GraphDataset, batch_size, shuffle=True, collate_fn=collate)
valid_dataloader = DataLoader(valid_GraphDataset, batch_size, shuffle=True, collate_fn=collate)
test_dataloader = DataLoader(test_GraphDataset, batch_size, shuffle=False, collate_fn=collate)

# Model

In [322]:
class GraphSAGE(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim):
        super(GraphSAGE, self).__init__()
        self.layer1 = dglnn.SAGEConv(in_dim, hidden_dim, 'pool')
        self.layer2 = dglnn.SAGEConv(hidden_dim, out_dim, 'pool')
        self.dropout = nn.Dropout(0.25)

    def forward(self, g, inputs):
        h = self.layer1(g, inputs)
        h = torch.relu(h)
        h = self.dropout(h)
        h = self.layer2(g, h)
        return h
    
class MLPPredictor(nn.Module):
    def __init__(self, out_feats, out_classes, edge_embedding_dim):
        super().__init__()
        self.W = nn.Linear(out_feats*2 + edge_embedding_dim, out_classes)

    def apply_edges(self, edges, edge_feat):
        h_u = edges.src['h']
        h_v = edges.dst['h']
        h_e = edge_feat
        score = self.W(torch.cat([h_u, h_v, h_e], 1))
        return {'score': score}

    def forward(self, graph, h, edge_feat):
        with graph.local_scope():
            graph.ndata['h'] = h
            # graph.apply_edges(self.apply_edges)
            graph.apply_edges(lambda edges: self.apply_edges(edges, edge_feat))
            return graph.edata['score']
        
class Model(nn.Module):
    def __init__(self, in_features, hidden_features, out_features, num_classes, edge_embedding_dim):
        super().__init__()
        self.sage = GraphSAGE(in_features, hidden_features, out_features)
        self.pred = MLPPredictor(out_features, num_classes, edge_embedding_dim)
      
    def forward(self, g, node_feat, edge_feat, return_logits=False):
        h = self.sage(g, node_feat)
        logits = self.pred(g, h, edge_feat)
        
        return logits

In [323]:
def same_seeds(seed = 42):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  
    np.random.seed(seed)  
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

def model_fn(batched_g, model, criterion, device, which_type='train'):
    """Forward a batch through the model."""
    batched_g = batched_g.to(device)
    
    labels = batched_g.edata['label'].to(device)    
    # logits = model(batched_g, batched_g.ndata['feat'].float())
    logits = model(batched_g, batched_g.ndata['feat'].float(), batched_g.edata['feat'].float())
    loss = criterion(logits, labels)

    output = torch.softmax(logits, dim=1)
    preds = output.argmax(1)
    # print(preds)
    
    accuracy = torch.mean((preds == labels).float())
        
    return loss, accuracy, preds

In [324]:
# for data in tqdm(train_dataloader):
#     print(data)
#     print()
#     print(data.edata['label'])
#     break

In [325]:
len(label2index)

274

In [286]:
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

In [327]:
print(device)

cuda


In [328]:
same_seeds(42)
model = Model(in_features=50, hidden_features=64, out_features=128, num_classes=len(label2index)-1, edge_embedding_dim = 50)
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-4)
scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=20, eta_min=0, last_epoch=- 1, verbose=False)
criterion = nn.CrossEntropyLoss()
model_save_path = "./model/GraphSAGE_transE_50"

best_val_loss = float('inf')

if not os.path.isdir(model_save_path):
    os.makedirs(model_save_path)

epochs = 200
best_val_loss = float('inf')
best_val_acc = float('-inf')
best_model_path = ""
for epoch in tqdm(range(epochs)):
    model.train()
    total_loss = 0.0
    total_accuracy = 0.0    
    for data in tqdm(train_dataloader):
        loss, accuracy, _ = model_fn(data, model, criterion, device, which_type='train')        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_accuracy += accuracy.item()

    scheduler.step()
    
    avg_loss = total_loss / len(train_dataloader)
    avg_accuracy = total_accuracy / len(train_dataloader)
    logging.info(f'Epoch {epoch} | Train Loss: {avg_loss:.4f} | Train Accuracy: {avg_accuracy:.4f}')
    
    # Validation Part
    model.eval()
    total_accuracy = 0.0
    total_loss = 0.0
    with torch.no_grad():
        for data in tqdm(valid_dataloader):
            loss, accuracy, _ = model_fn(data, model, criterion, device, which_type='validation')
            total_accuracy += accuracy.item()
            total_loss += loss.item()

    avg_accuracy = total_accuracy / len(valid_dataloader)
    current_loss = total_loss / len(valid_dataloader)
    if current_loss < best_val_loss and avg_accuracy > best_val_acc:
        best_val_loss = current_loss
        best_val_acc = avg_accuracy
        best_model_path = f'{model_save_path}/epoch_{epoch}_loss_{current_loss:.4f}_acc_{avg_accuracy:.4f}'
        print("Best Model Found!! ", best_model_path)
        
    logging.info(f'Validation Loss: {current_loss:.4f} | Validation Accuracy: {avg_accuracy:.4f}\n')    
    torch.save(model.state_dict(), f'{model_save_path}/epoch_{epoch}_loss_{current_loss:.4f}_acc_{avg_accuracy:.4f}')

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/418 [00:00<?, ?it/s]

DGLError: [15:29:31] /opt/dgl/src/runtime/cuda/cuda_device_api.cc:343: Check failed: e == cudaSuccess || e == cudaErrorCudartUnloading: CUDA: device-side assert triggered
Stack trace:
  [bt] (0) /media/Raid6_disk/bai/anaconda3/envs/DGL/lib/python3.10/site-packages/dgl/libdgl.so(+0x98b4e7) [0x7f3a29f3f4e7]
  [bt] (1) /media/Raid6_disk/bai/anaconda3/envs/DGL/lib/python3.10/site-packages/dgl/libdgl.so(dgl::runtime::CUDADeviceAPI::CopyDataFromTo(void const*, unsigned long, void*, unsigned long, unsigned long, DGLContext, DGLContext, DGLDataType)+0x82) [0x7f3a29f418a2]
  [bt] (2) /media/Raid6_disk/bai/anaconda3/envs/DGL/lib/python3.10/site-packages/dgl/libdgl.so(dgl::runtime::NDArray::CopyFromTo(DGLArray*, DGLArray*)+0x1ac) [0x7f3a29debdfc]
  [bt] (3) /media/Raid6_disk/bai/anaconda3/envs/DGL/lib/python3.10/site-packages/dgl/libdgl.so(dgl::runtime::NDArray::CopyTo(DGLContext const&) const+0xf9) [0x7f3a29e21d29]
  [bt] (4) /media/Raid6_disk/bai/anaconda3/envs/DGL/lib/python3.10/site-packages/dgl/libdgl.so(dgl::UnitGraph::CopyTo(std::shared_ptr<dgl::BaseHeteroGraph>, DGLContext const&)+0x1e7) [0x7f3a29f148f7]
  [bt] (5) /media/Raid6_disk/bai/anaconda3/envs/DGL/lib/python3.10/site-packages/dgl/libdgl.so(dgl::HeteroGraph::CopyTo(std::shared_ptr<dgl::BaseHeteroGraph>, DGLContext const&)+0xfa) [0x7f3a29e2daaa]
  [bt] (6) /media/Raid6_disk/bai/anaconda3/envs/DGL/lib/python3.10/site-packages/dgl/libdgl.so(+0x88a5f6) [0x7f3a29e3e5f6]
  [bt] (7) /media/Raid6_disk/bai/anaconda3/envs/DGL/lib/python3.10/site-packages/dgl/libdgl.so(DGLFuncCall+0x4c) [0x7f3a29dd5ebc]
  [bt] (8) /media/Raid6_disk/bai/anaconda3/envs/DGL/lib/python3.10/site-packages/dgl/_ffi/_cy3/core.cpython-310-x86_64-linux-gnu.so(+0x1b986) [0x7f3a1af67986]



In [None]:
best_model_path

In [None]:
# load the pretrained model
model.load_state_dict(torch.load(best_model_path))

model.to(device)
model.eval()

total = 0
correct = 0
true_labels = []
predicted_labels = []
with torch.no_grad():
    for data in test_dataloader:
        loss, accuracy, predicted = model_fn(data, model, criterion, device, which_type='test')
        labels = data.edata['label'].to(device)
        
        true_labels.extend(labels.cpu().numpy())
        predicted_labels.extend(predicted.cpu().numpy())
                
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

logging.info(f'Test Accuracy: {100 * correct / total:.4f} %\n\n\n')

In [None]:
report_data = classification_report(true_labels, predicted_labels, output_dict=True)
report_df = pd.DataFrame(report_data).transpose()

output_path = "./result/GraphSAGE_emb256"
if not os.path.isdir(output_path):
    os.makedirs(output_path)
    
report_df.reset_index(inplace=True, names='label')

label_list = []
for idx, row in report_df.iterrows():
    if row["label"].isdigit():
        row["label"] = index2label[int(row["label"])]
    label_list.append(row["label"])
report_df["label"] = label_list

report_df.to_csv(f'{output_path}/result.csv', index=False)

In [None]:
report_df