## Convert the original DARPA dataset (JSON files) into the format we want: (src, rel, dest, label)

In [None]:
from glob import glob
import linecache
import json
from tqdm import tqdm

#### Load the input files (DARPA)

In [None]:
# Collect the input JSON files in a stable (sorted) order.
file_path = sorted(glob('../data_new/DARPA/source_data/3.4/*.json'))

# Read every file line by line; each non-empty line is kept as one raw event
# string (it is parsed with json.loads() further down).
# Plain file iteration is used instead of linecache.updatecache(): that helper
# is undocumented, retains every file's line list in a module-level cache, and
# returns [] without raising when a file cannot be read or decoded, which would
# silently drop a whole input file.
all_events = []
for p in file_path:
    # utf-8-sig decodes plain UTF-8 and also drops a leading BOM if present.
    with open(p, encoding='utf-8-sig') as f:
        # Skip blank lines so json.loads() never receives empty input later.
        all_events.extend(line for line in f if line.strip())

# Sort the raw event strings lexicographically (if needed).
all_events = sorted(all_events)

#### Build the node (subject/object) and relation index mappings

In [None]:
# Distinct subject/object UUIDs and distinct relation values seen in the events.
nodes = set()
rels = set()

# Walk over all raw event lines and collect the fields needed for the mappings.
for raw_event in tqdm(all_events, desc="Processing events"):
    event = json.loads(raw_event)

    nodes.add(event["subj"]["uuid"])
    nodes.add(event["obj"]["uuid"])
    rels.add(event["relation"])

# Build the value -> integer index mappings.
# The values are sorted before numbering so that the assigned indices are the
# same on every run; iterating the sets directly yields an order that can change
# between interpreter runs because string hashing is randomized.
# NOTE(review): sorting assumes all UUIDs (and all relations) are mutually
# comparable, e.g. all strings -- confirm against the input data.
node2idx = {uuid: idx for idx, uuid in enumerate(tqdm(sorted(nodes), desc="Mapping nodes"))}
rel2idx = {rel: idx for idx, rel in enumerate(tqdm(sorted(rels), desc="Mapping relations"))}

#### Encode every event as (subject index, relation index, object index, label)

In [None]:
# Turn every raw event line into a tuple of
# (subject index, relation index, object index, label).
event_encode = []
for raw_event in tqdm(all_events, desc="Encoding events"):
    record = json.loads(raw_event)
    # Only the exact label "benign" maps to 0; any other label value maps to 1.
    if record["label"] == "benign":
        label = 0
    else:
        label = 1
    subj_idx = node2idx[record["subj"]["uuid"]]
    rel_idx = rel2idx[record["relation"]]
    obj_idx = node2idx[record["obj"]["uuid"]]
    event_encode.append((subj_idx, rel_idx, obj_idx, label))

# Show the results.
print(event_encode)
print(node2idx)

### Write the output text files
- node mapping  
- relation mapping  
- output file(in the format we want)

In [None]:

# Save the two mappings and the encoded events to text files.
# Every output line is space-separated and ends with a newline.

# One "<uuid> <index>" line per node.
with open('../data_new/DARPA/before_embedding/3.10/node2idx.txt', 'w') as node_file:
    node_file.writelines(f'{name} {index}\n' for name, index in node2idx.items())

# One "<relation> <index>" line per relation.
with open('../data_new/DARPA/before_embedding/3.10/rel2idx.txt', 'w') as rel_file:
    rel_file.writelines(f'{name} {index}\n' for name, index in rel2idx.items())

# One "<subject index> <relation index> <object index> <label>" line per event.
with open('../data_new/DARPA/before_embedding/3.10/event_encode.txt', 'w') as event_file:
    event_file.writelines(
        f'{row[0]} {row[1]} {row[2]} {row[3]}\n' for row in event_encode
    )


#### Count the attack events (label = 1)

In [None]:
# Count the encoded events whose last element (the label) equals 1.
label_1_count = 0
for encoded in event_encode:
    if encoded[-1] == 1:
        label_1_count += 1

# Show the result.
print("Number of events with label = 1:", label_1_count)


In [None]:
from glob import glob
import linecache
import json
from tqdm import tqdm

# Collect the test-set JSON files in a stable (sorted) order.
file_path = sorted(glob('../data_new/DARPA/source_data/test/*.json'))

# Read every file line by line; each non-empty line is kept as one raw event
# string. Plain file iteration replaces linecache.updatecache(), an undocumented
# helper that retains each file's lines in a module-level cache and returns []
# without raising when a file cannot be read or decoded.
all_events = []
for p in file_path:
    # utf-8-sig decodes plain UTF-8 and also drops a leading BOM if present.
    with open(p, encoding='utf-8-sig') as f:
        # Blank lines are skipped because they are not parseable JSON events.
        all_events.extend(line for line in f if line.strip())

# Sort the raw event strings lexicographically.
all_events = sorted(all_events)

# Distinct subject/object UUIDs and distinct relation values in the events.
nodes = set()
rels = set()

for raw_event in tqdm(all_events, desc="Processing events"):
    event = json.loads(raw_event)

    nodes.add(event["subj"]["uuid"])
    nodes.add(event["obj"]["uuid"])
    rels.add(event["relation"])

# Number the values in sorted order so the indices are identical on every run;
# raw set iteration order can change between interpreter runs because string
# hashing is randomized.
# NOTE(review): sorting assumes the collected values are mutually comparable
# (e.g. all strings) -- confirm against the input data.
node2idx = {uuid: idx for idx, uuid in enumerate(tqdm(sorted(nodes), desc="Mapping nodes"))}
rel2idx = {rel: idx for idx, rel in enumerate(tqdm(sorted(rels), desc="Mapping relations"))}

# Encode each raw event line as
# (subject index, relation index, object index, label).
event_encode = []
for line in tqdm(all_events, desc="Encoding events"):
    parsed = json.loads(line)
    # The label is 0 only for the exact value "benign", otherwise 1.
    label = 1 if parsed["label"] != "benign" else 0
    encoded = (
        node2idx[parsed["subj"]["uuid"]],
        rel2idx[parsed["relation"]],
        node2idx[parsed["obj"]["uuid"]],
        label,
    )
    event_encode.append(encoded)

# Show the encoded events and the node mapping.
print(event_encode)
print(node2idx)
