### This is the script to get the embedding on the synthesized dataset  
- ``Trans Family``: directly use the word embedding -> do not consider the relation  
- ``secureBERT``: would consider the relation and get the overall embedding -> node and relation embedding

#### The version with the PCA reduce dimension
- dimension = [DIM, 26]
- the original version

In [8]:
import os
import json
import numpy as np
from tqdm.notebook import tqdm
from sklearn.decomposition import PCA

# Load embedding function
def load_embedding(input_embedding_name, model, DIM):
    if model.startswith('trans'):
        with open(input_embedding_name) as f:
            data = json.load(f)

        # trans family wouldn't consider the relation embedding -> directly use the word embedding
        # so the dimension of the node and the edge would be the same
        ent_embeddings = np.array(data['ent_embeddings.weight'])
        rel_embeddings = np.array(data['rel_embeddings.weight'])
        return ent_embeddings, rel_embeddings
    
    elif model == 'secureBERT':
        ent_embeddings = np.empty((0, 768), dtype=np.float32)
        for filename in sorted(os.listdir(input_embedding_name)):
            print(filename)

            if not filename.startswith('embeddings_chunk'):
                continue

            embedding = np.load(f'{input_embedding_name}/{filename}')

            print(ent_embeddings.shape, embedding.shape)

            ent_embeddings = np.concatenate((ent_embeddings, embedding), axis=0)
            print(filename, ent_embeddings.shape)

        print(f'Reducing entity embedding to ({DIM},)')
        print(ent_embeddings.shape, '->', end=' ')
        
        pca = PCA(n_components=DIM)
        ent_embeddings = pca.fit_transform(ent_embeddings)
        print(ent_embeddings.shape)

        # secureBERT would consider the edge embedding -> input is relation.npy
        # dimension of the node -> depends on us
        # dimension of the edge -> 26 (since PCA)
        
        
        rel_embeddings = np.load(f'{input_embedding_name}/relation.npy')
        print(f'Reducing relation embedding to ({len(rel_embeddings)},)')
        print(rel_embeddings.shape, '->', end=' ')
        
        
        pca = PCA(n_components=len(rel_embeddings))
        rel_embeddings = pca.fit_transform(rel_embeddings)
        print(rel_embeddings.shape)
        return ent_embeddings, rel_embeddings
    else:
        print('Error!!')
        return None

#### The version of without PCA reduce dimension
- node dimension = 768
- edge dimension = 768

In [18]:
import os
import json
import numpy as np
from tqdm.notebook import tqdm

# Load embedding function
def load_embedding(input_embedding_name, model):
    if model.startswith('trans'):
        with open(input_embedding_name) as f:
            data = json.load(f)
        ent_embeddings = np.array(data['ent_embeddings.weight'])
        rel_embeddings = np.array(data['rel_embeddings.weight'])
        return ent_embeddings, rel_embeddings
    
    elif model == 'secureBERT':
        ent_embeddings = np.empty((0, 768), dtype=np.float32)
        for filename in sorted(os.listdir(input_embedding_name)):
            filepath = os.path.join(input_embedding_name, filename)
            if not os.path.isfile(filepath) or not filename.startswith('embeddings_chunk'):
                continue

            embedding = np.load(filepath)
            print(filename)
            print(ent_embeddings.shape, embedding.shape)
            ent_embeddings = np.concatenate((ent_embeddings, embedding), axis=0)
            print(filename, ent_embeddings.shape)

        # secureBERT would consider the edge embedding -> input is relation.npy
        # 直接加载关系嵌入，不进行 PCA 降维
        rel_embeddings = np.load(f'{input_embedding_name}/relation.npy')
        return ent_embeddings, rel_embeddings

    else:
        print('Error!!')
        return None

#### The version of PCA with node and without PCA with edge
- node dimension: depends on the model
- edge dimension: 768

In [6]:
import os
import json
import numpy as np
from tqdm.notebook import tqdm
from sklearn.decomposition import PCA

# Load embedding function
def load_embedding(input_embedding_name, model, DIM):
    if model.startswith('trans'):
        with open(input_embedding_name) as f:
            data = json.load(f)
        ent_embeddings = np.array(data['ent_embeddings.weight'])
        rel_embeddings = np.array(data['rel_embeddings.weight'])
        return ent_embeddings, rel_embeddings

    elif model == 'secureBERT':
        ent_embeddings = np.empty((0, 768), dtype=np.float32)
        for filename in sorted(os.listdir(input_embedding_name)):
            filepath = os.path.join(input_embedding_name, filename)
            if not os.path.isfile(filepath) or not filename.startswith('embeddings_chunk'):
                continue

            embedding = np.load(filepath)
            print(filename)
            print(ent_embeddings.shape, embedding.shape)
            ent_embeddings = np.concatenate((ent_embeddings, embedding), axis=0)
            print(filename, ent_embeddings.shape)
            
            ent_embeddings = np.concatenate((ent_embeddings, embedding), axis=0)

        # 对实体嵌入进行 PCA 降维
        print(f'Reducing entity embedding to ({DIM},)')
        pca = PCA(n_components=DIM)
        ent_embeddings = pca.fit_transform(ent_embeddings)
        print(f'Entity embeddings reduced: {ent_embeddings.shape}')

        # 直接加载关系嵌入，不进行 PCA 降维
        rel_embeddings = np.load(f'{input_embedding_name}/relation.npy')
        print(f'Relation embeddings: {rel_embeddings.shape}')

        return ent_embeddings, rel_embeddings

    else:
        print('Error!!')
        return None


- We have 165000 data here

### Main Function

- This is for synthesized dataset

In [10]:
data = [1,2,3]

print(1 in data)

True


In [17]:
import json
from tqdm.notebook import tqdm

weird_nodes = [189923, 829358, 270488, 405143, 829356, 829357]
input_filename = '../data/exp3/before_embedding/all_graph.jsonl'
output_filename = '../data/exp3/before_embedding/all_graph_modified(node_feat).jsonl'

with open(input_filename, 'r') as file, open(output_filename, 'w') as outfile:
    for line in tqdm(file):
        data = json.loads(line.strip())

#         data["node_feat"] = [node_id - 1 if node_id in weird_nodes else node_id for node_id in data["node_feat"]]
        data["node_feat"] = [node_id - 3 if node_id in weird_nodes else node_id for node_id in data["node_feat"]]

        json.dump(data, outfile)
        outfile.write('\n')

print(f"Modified data saved in '{output_filename}'")


0it [00:00, ?it/s]

Modified data saved in '../data/exp3/before_embedding/all_graph_modified(node_feat).jsonl'


In [18]:
import os
import json
import numpy as np
from tqdm.notebook import tqdm
from sklearn.decomposition import PCA

embedding_files = ["../data/4_embedding/synthesize/secureBERT"]
model = 'secureBERT'

# 输入文件列表
# input_filenames = ["../data_new/graph/benign/graph_benign.jsonl"]
# input_filenames = ["../data_new/graph/without_benign/graph_without_benign.jsonl"]
input_filenames = ["../data/exp3/before_embedding/all_graph_modified(node_feat).jsonl"]
# input_filenames = ["../data/exp3/before_embedding/test.jsonl"]

# for i in tqdm(range(3)):
#     DIM = 150 - 50*i
DIM = 50

for input_filename in tqdm(input_filenames):
    print("Start!")
    base, ext = os.path.splitext(input_filename)

    with open(input_filename, "r") as f:
        input_data = list(f)

    for embedding_file in tqdm(embedding_files):
        # output_filename = f"../data_new/exp3/graph/secureBERT_{DIM}_embedded.jsonl"
        output_filename = f"../data/exp3/after_embedding/secureBERT_{DIM}_embedded.jsonl"
#         output_filename = f"../data/exp3/secureBERT_{DIM}_embedded_test.jsonl"

        print(f"output file name: {output_filename}")

        with open(output_filename, "w") as out_file:
            model = embedding_file.split('/')[-1].split('_')[0]
            # ent_embeddings, rel_embeddings = load_embedding(embedding_file, model)
            ent_embeddings, rel_embeddings = load_embedding(embedding_file, model, DIM)


            for line, data in tqdm(zip(input_data, input_data)):
                data = json.loads(data.strip())
                
                data["node_feat"] = [ent_embeddings[node_id].tolist() if model == 'secureBERT' else ent_embeddings[node_id] for node_id in data["node_feat"]]

                data["edge_attr"] = [rel_embeddings[edge_id].tolist() for edge_id in data["edge_attr"]]

                # Convert the data back to a JSON string and write to the output file
                out_file.write(json.dumps(data) + '\n')
                
                
                
                
# ==========for handling the weird nodes (4 of them)========== 2nd modified
#                 new_node_feats = []
#                 weird_nodes = [189923, 829358, 270488, 405143]
        
#                 data["node_feat"] = []
#                 for node_id in data["node_feat"]:
#                     if node_id in weird_nodes and node_id > 0:
#                         embedding = ent_embeddings[node_id - 1]
#                     else:
#                         embedding = ent_embeddings[node_id]

#                     if model == 'secureBERT':
#                         embedding = embedding.tolist()

#                     data["node_feat"].append(embedding)
                
                

# ==========for handling the weird nodes (4 of them)========== 1st modified
# ==========if index error -> use the last one -> need to be fixed==========
#             for line, data in tqdm(zip(input_data, input_data)):
#                 data = json.loads(data.strip())

#                 # 处理节点特征
#                 data["node_feat"] = []
#                 for node_id in data["node_feat"]:
#                     if node_id < len(ent_embeddings):
#                         embedding = ent_embeddings[node_id]
#                     else:
#                         # 如果索引不存在，使用最接近的有效索引
#                         nearest_valid_index = min(node_id, len(ent_embeddings) - 1)
#                         embedding = ent_embeddings[nearest_valid_index]

#                     data["node_feat"].append(embedding.tolist() if model == 'secureBERT' else embedding)



#                 Replace node_feat and edge_attr with embeddings =====> original version


  0%|          | 0/1 [00:00<?, ?it/s]

Start!


  0%|          | 0/1 [00:00<?, ?it/s]

output file name: ../data/exp3/after_embedding/secureBERT_50_embedded.jsonl
embeddings_chunk_0.npy
(0, 768) (160000, 768)
embeddings_chunk_0.npy (160000, 768)
embeddings_chunk_1.npy
(160000, 768) (160000, 768)
embeddings_chunk_1.npy (320000, 768)
embeddings_chunk_2.npy
(320000, 768) (160000, 768)
embeddings_chunk_2.npy (480000, 768)
embeddings_chunk_3.npy
(480000, 768) (160000, 768)
embeddings_chunk_3.npy (640000, 768)
embeddings_chunk_4.npy
(640000, 768) (160000, 768)
embeddings_chunk_4.npy (800000, 768)
embeddings_chunk_5.npy
(800000, 768) (29355, 768)
embeddings_chunk_5.npy (829355, 768)
relation.npy
Reducing entity embedding to (50,)
(829355, 768) -> (829355, 50)
Reducing relation embedding to (27,)
(27, 768) -> (27, 27)


0it [00:00, ?it/s]

IndexError: index 829356 is out of bounds for axis 0 with size 829355

- This is for DARPA dataset

In [7]:
import os
import json
import numpy as np
from tqdm.notebook import tqdm
from sklearn.decomposition import PCA

embedding_files = ["/workdir/home/bai/Euni_HO_modified/data/4_embedding/synthesize/secureBERT"]
model = 'secureBERT'

# 输入文件列表
input_filenames = '/workdir/home/bai/Euni_HO_modified/data/3_openKE/synthesize/all_graph_data.jsonl'

# for i in tqdm(range(3)):
#     DIM = 150 - 50*i
DIM = 150

print("Start!")
# base, ext = os.path.splitext(input_filenames)

with open(input_filenames, "r") as f:
    input_data = list(f)

for embedding_file in tqdm(embedding_files):
    # output_filename = f"../data_new/exp3/graph/secureBERT_{DIM}_embedded.jsonl"
#         output_filename = f"../data_new/exp3/graph/secureBERT_{DIM}_embedded(edge768).jsonl"
    output_filename = f"/workdir/home/bai/Euni_HO_modified/data/training_data/secureBERT_{DIM}_embedded(edge768).jsonl"

    print(f"output file name: {output_filename}")

    with open(output_filename, "w") as out_file:
        model = embedding_file.split('/')[-1].split('_')[0]
        # ent_embeddings, rel_embeddings = load_embedding(embedding_file, model)
        ent_embeddings, rel_embeddings = load_embedding(embedding_file, model, DIM)
        # ...

        for line, data in tqdm(zip(input_data, input_data)):
            data = json.loads(data.strip())

            # Replace node_feat and edge_attr with embeddings
            data["node_feat"] = [ent_embeddings[node_id].tolist() if model == 'secureBERT' else ent_embeddings[node_id] for node_id in data["node_feat"]]
            data["edge_attr"] = [rel_embeddings[edge_id].tolist() for edge_id in data["edge_attr"]]

            # Convert the data back to a JSON string and write to the output file
            out_file.write(json.dumps(data) + '\n')
            
# for input_filename in tqdm(input_filenames):
#     print("Start!")
#     base, ext = os.path.splitext(input_filename)

#     with open(input_filename, "r") as f:
#         input_data = list(f)

#     for embedding_file in tqdm(embedding_files):
#         # output_filename = f"../data_new/exp3/graph/secureBERT_{DIM}_embedded.jsonl"
# #         output_filename = f"../data_new/exp3/graph/secureBERT_{DIM}_embedded(edge768).jsonl"
#         output_filename = f"/workdir/home/bai/Euni_HO_modified/data/training_data/secureBERT_{DIM}_embedded(edge768).jsonl"

#         print(f"output file name: {output_filename}")

#         with open(output_filename, "w") as out_file:
#             model = embedding_file.split('/')[-1].split('_')[0]
#             # ent_embeddings, rel_embeddings = load_embedding(embedding_file, model)
#             ent_embeddings, rel_embeddings = load_embedding(embedding_file, model, DIM)
#             # ...

#             for line, data in tqdm(zip(input_data, input_data)):
#                 data = json.loads(data.strip())

#                 # Replace node_feat and edge_attr with embeddings
#                 data["node_feat"] = [ent_embeddings[node_id].tolist() if model == 'secureBERT' else ent_embeddings[node_id] for node_id in data["node_feat"]]
#                 data["edge_attr"] = [rel_embeddings[edge_id].tolist() for edge_id in data["edge_attr"]]

#                 # Convert the data back to a JSON string and write to the output file
#                 out_file.write(json.dumps(data) + '\n')

Start!


  0%|          | 0/1 [00:00<?, ?it/s]

output file name: /workdir/home/bai/Euni_HO_modified/data/training_data/secureBERT_150_embedded(edge768).jsonl
embeddings_chunk_0.npy
(0, 768) (160000, 768)
embeddings_chunk_0.npy (160000, 768)
embeddings_chunk_1.npy
(320000, 768) (160000, 768)
embeddings_chunk_1.npy (480000, 768)
embeddings_chunk_2.npy
(640000, 768) (160000, 768)
embeddings_chunk_2.npy (800000, 768)
embeddings_chunk_3.npy
(960000, 768) (20281, 768)
embeddings_chunk_3.npy (980281, 768)
Reducing entity embedding to (150,)
Entity embeddings reduced: (1000562, 150)
Relation embeddings: (23, 768)


0it [00:00, ?it/s]

- Combine 2 jsonl files

In [15]:
# files = ['transE_50', 'transE_100', 'transE_150', 'transH_50', 'transH_100', 'transH_150', 'transR_50',
#          'secureBERT_250', 'secureBERT_150', 'secureBERT_100', 'secureBERT_50']
files = ['secureBERT_150', 'secureBERT_100', 'secureBERT_50']

for file in tqdm(files):
    file1 = f"../data_new/graph/benign/{file}_embedded.jsonl"
    data1 = []

    with open(file1, 'r') as f:
        for line in f:
            data1.append(json.loads(line))

    file2 = f"../data_new/graph/without_benign/{file}_embedded.jsonl"
    data2 = []

    with open(file2, 'r') as f:
        for line in f:
            data2.append(json.loads(line))

    combined_data = data1 + data2

    output_file = f"../data_new/graph/with_benign/{file}_embedded.jsonl"
    print(output_file)

    with open(output_file, 'w') as f:
        for item in combined_data:
            f.write(json.dumps(item) + '\n')


  0%|          | 0/3 [00:00<?, ?it/s]

../data_new/graph/with_benign/secureBERT_150_embedded.jsonl
../data_new/graph/with_benign/secureBERT_100_embedded.jsonl
../data_new/graph/with_benign/secureBERT_50_embedded.jsonl
