In [1]:
import os
import json
import numpy as np
from tqdm.notebook import tqdm
from sklearn.decomposition import PCA

# Load embedding function
def load_embedding(input_embedding_name, model):
    if model.startswith('trans'):
        with open(input_embedding_name) as f:
            data = json.load(f)
        ent_embeddings = np.array(data['ent_embeddings.weight'])
        rel_embeddings = np.array(data['rel_embeddings.weight'])
        return ent_embeddings, rel_embeddings
    
    elif model == 'secureBERT':
        ent_embeddings = np.empty((0, 768), dtype=np.float32)
        for filename in sorted(os.listdir(input_embedding_name)):
            print(filename)

            if not filename.startswith('embeddings_chunk'):
                continue

            embedding = np.load(f'{input_embedding_name}/{filename}')

            print(ent_embeddings.shape, embedding.shape)

            ent_embeddings = np.concatenate((ent_embeddings, embedding), axis=0)
            print(filename, ent_embeddings.shape)

        print(f'Reducing entity embedding to ({DIM},)')
        print(ent_embeddings.shape, '->', end=' ')
        
        pca = PCA(n_components=DIM)
        ent_embeddings = pca.fit_transform(ent_embeddings)
        print(ent_embeddings.shape)

        rel_embeddings = np.load(f'{input_embedding_name}/relation.npy')
        print(f'Reducing relation embedding to ({len(rel_embeddings)},)')
        print(rel_embeddings.shape, '->', end=' ')
        pca = PCA(n_components=len(rel_embeddings))
        rel_embeddings = pca.fit_transform(rel_embeddings)
        print(rel_embeddings.shape)
        return ent_embeddings, rel_embeddings
    else:
        print('Error!!')
        return None
    


In [2]:
embedding_files = ["../data_new/embedding/secureBERT"]
model = 'secureBERT'

# 输入文件列表
# input_filenames = ["../data_new/graph/benign/graph_benign.jsonl"]
# input_filenames = ["../data_new/graph/without_benign/graph_without_benign.jsonl"]
input_filenames = ["../data_new/exp3/graph/graph_exp3.jsonl"]

# for i in tqdm(range(3)):
#     DIM = 150 - 50*i
DIM = 150

for input_filename in tqdm(input_filenames):
    print("Start!")
    base, ext = os.path.splitext(input_filename)

    with open(input_filename, "r") as f:
        input_data = list(f)

    for embedding_file in tqdm(embedding_files):
        output_filename = f"../data_new/exp3/graph/secureBERT_{DIM}_embedded.jsonl"

        print(f"output file name: {output_filename}")

        with open(output_filename, "w") as out_file:
            model = embedding_file.split('/')[-1].split('_')[0]
            ent_embeddings, rel_embeddings = load_embedding(embedding_file, model)
            # ...

            for line, data in tqdm(zip(input_data, input_data)):
                data = json.loads(data.strip())

                # Replace node_feat and edge_attr with embeddings
                data["node_feat"] = [ent_embeddings[node_id].tolist() if model == 'secureBERT' else ent_embeddings[node_id] for node_id in data["node_feat"]]
                data["edge_attr"] = [rel_embeddings[edge_id].tolist() for edge_id in data["edge_attr"]]

                # Convert the data back to a JSON string and write to the output file
                out_file.write(json.dumps(data) + '\n')

  0%|          | 0/1 [00:00<?, ?it/s]

Start!


  0%|          | 0/1 [00:00<?, ?it/s]

output file name: ../data_new/exp3/graph/secureBERT_150_embedded.jsonl
.ipynb_checkpoints
embeddings_chunk_0.npy
(0, 768) (160000, 768)
embeddings_chunk_0.npy (160000, 768)
embeddings_chunk_1.npy
(160000, 768) (160000, 768)
embeddings_chunk_1.npy (320000, 768)
embeddings_chunk_2.npy
(320000, 768) (160000, 768)
embeddings_chunk_2.npy (480000, 768)
embeddings_chunk_3.npy
(480000, 768) (160000, 768)
embeddings_chunk_3.npy (640000, 768)
embeddings_chunk_4.npy
(640000, 768) (160000, 768)
embeddings_chunk_4.npy (800000, 768)
embeddings_chunk_5.npy
(800000, 768) (160000, 768)
embeddings_chunk_5.npy (960000, 768)
embeddings_chunk_6.npy
(960000, 768) (160000, 768)
embeddings_chunk_6.npy (1120000, 768)
embeddings_chunk_7.npy
(1120000, 768) (51204, 768)
embeddings_chunk_7.npy (1171204, 768)
relation.npy
Reducing entity embedding to (150,)
(1171204, 768) -> (1171204, 150)
Reducing relation embedding to (26,)
(26, 768) -> (26, 26)


0it [00:00, ?it/s]

- Combine 2 jsonl files

In [15]:
# files = ['transE_50', 'transE_100', 'transE_150', 'transH_50', 'transH_100', 'transH_150', 'transR_50',
#          'secureBERT_250', 'secureBERT_150', 'secureBERT_100', 'secureBERT_50']
files = ['secureBERT_150', 'secureBERT_100', 'secureBERT_50']

for file in tqdm(files):
    file1 = f"../data_new/graph/benign/{file}_embedded.jsonl"
    data1 = []

    with open(file1, 'r') as f:
        for line in f:
            data1.append(json.loads(line))

    file2 = f"../data_new/graph/without_benign/{file}_embedded.jsonl"
    data2 = []

    with open(file2, 'r') as f:
        for line in f:
            data2.append(json.loads(line))

    combined_data = data1 + data2

    output_file = f"../data_new/graph/with_benign/{file}_embedded.jsonl"
    print(output_file)

    with open(output_file, 'w') as f:
        for item in combined_data:
            f.write(json.dumps(item) + '\n')


  0%|          | 0/3 [00:00<?, ?it/s]

../data_new/graph/with_benign/secureBERT_150_embedded.jsonl
../data_new/graph/with_benign/secureBERT_100_embedded.jsonl
../data_new/graph/with_benign/secureBERT_50_embedded.jsonl
