In [1]:
import os
import json
import numpy as np
from tqdm.notebook import tqdm
from sklearn.decomposition import PCA

# Load embedding function
def load_embedding(input_embedding_name, model, dim=None):
    """Load entity and relation embeddings for ``model``.

    Parameters
    ----------
    input_embedding_name : str
        For models whose name starts with ``'trans'``: path to a JSON file
        holding the keys ``'ent_embeddings.weight'`` and
        ``'rel_embeddings.weight'``.
        For ``'secureBERT'``: path to a directory containing
        ``embeddings_chunk*`` files readable by ``np.load`` plus a
        ``relation.npy`` file.
    model : str
        Model identifier; selects which of the two loaders is used.
    dim : int, optional
        Number of PCA components kept for the secureBERT entity embeddings.
        When omitted, the module-level ``DIM`` is used, which is what this
        function always did before the parameter existed.

    Returns
    -------
    tuple of numpy.ndarray
        ``(ent_embeddings, rel_embeddings)``.

    Raises
    ------
    ValueError
        If ``model`` is neither ``'secureBERT'`` nor starts with ``'trans'``.
        Previously this case printed ``'Error!!'`` and returned ``None``,
        which made callers that unpack the result fail with an unrelated
        ``TypeError``.
    """
    if model.startswith('trans'):
        with open(input_embedding_name, encoding='utf-8') as f:
            data = json.load(f)
        ent_embeddings = np.array(data['ent_embeddings.weight'])
        rel_embeddings = np.array(data['rel_embeddings.weight'])
        return ent_embeddings, rel_embeddings

    if model == 'secureBERT':
        if dim is None:
            # Backward-compatible fallback to the notebook-level constant.
            dim = DIM

        # Natural sort so that e.g. "..._2" comes before "..._10". A plain
        # lexicographic sort would stack un-padded chunk numbers out of
        # order and silently misalign rows with the ids used to index them.
        chunk_names = sorted(
            (name for name in os.listdir(input_embedding_name)
             if name.startswith('embeddings_chunk')),
            key=_natural_sort_key,
        )

        # The empty float32 seed keeps the original contract: every chunk
        # must have 768 columns, and an empty directory yields a (0, 768)
        # array. Concatenating once avoids re-copying the growing array
        # for every chunk.
        chunks = [np.empty((0, 768), dtype=np.float32)]
        for filename in chunk_names:
            embedding = np.load(os.path.join(input_embedding_name, filename))
            print(filename, embedding.shape)
            chunks.append(embedding)
        ent_embeddings = np.concatenate(chunks, axis=0)

        print(f'Reducing entity embedding to ({dim},)')
        print(ent_embeddings.shape, '->', end=' ')
        ent_embeddings = PCA(n_components=dim).fit_transform(ent_embeddings)
        print(ent_embeddings.shape)

        rel_embeddings = np.load(os.path.join(input_embedding_name, 'relation.npy'))
        # NOTE(review): the number of components is set to the number of
        # relation rows, not to `dim` -- kept as in the original; confirm
        # that this is the intended target size.
        n_rel_components = len(rel_embeddings)
        print(f'Reducing relation embedding to ({n_rel_components},)')
        print(rel_embeddings.shape, '->', end=' ')
        rel_embeddings = PCA(n_components=n_rel_components).fit_transform(rel_embeddings)
        print(rel_embeddings.shape)
        return ent_embeddings, rel_embeddings

    raise ValueError(
        f"Unsupported embedding model {model!r}: expected 'secureBERT' "
        f"or a name starting with 'trans'"
    )


def _natural_sort_key(name):
    """Sort key that compares runs of digits in ``name`` numerically."""
    import re

    # re.split with a capturing group alternates text / digits / text / ...,
    # so odd positions are always digit runs and element types line up when
    # two keys are compared.
    return [int(token) if index % 2 else token
            for index, token in enumerate(re.split(r'(\d+)', name))]
    


In [2]:
embedding_files = ["../data_new/source_data/embedding/secureBERT"]
model = 'secureBERT'
DIM = 250  # read by load_embedding as the PCA target size for secureBERT entity embeddings

# List of input files
input_filenames = ["../data_new/graph/with_benign/graph_benign.jsonl"]

for input_filename in tqdm(input_filenames):
    print("Start!")
    # Only the extension of the input file is reused for the output name.
    _, ext = os.path.splitext(input_filename)

    with open(input_filename, "r", encoding="utf-8") as f:
        # Drop blank lines (e.g. a trailing newline) so json.loads does not
        # fail on an empty record.
        input_data = [line for line in f if line.strip()]

    for embedding_file in tqdm(embedding_files):
        # NOTE(review): the output path is derived from embedding_file only,
        # so with several input files each one would overwrite the previous
        # output -- confirm this is intended before adding more inputs.
        output_filename = f"{embedding_file.replace('.json', '_embedded').replace('.vec', '')}{ext}"
        print(f"output file name: {output_filename}")

        # The model name is the part of the last path component before the
        # first underscore.
        model = embedding_file.split('/')[-1].split('_')[0]

        # Load before opening the output so that a failed load does not
        # leave a truncated/empty output file behind.
        ent_embeddings, rel_embeddings = load_embedding(embedding_file, model)
        # ...

        with open(output_filename, "w", encoding="utf-8") as out_file:
            for line in tqdm(input_data):
                data = json.loads(line)

                # Replace node_feat and edge_attr with embeddings. Always
                # convert to plain lists: json.dumps cannot serialize numpy
                # arrays, whichever model produced them.
                data["node_feat"] = [ent_embeddings[node_id].tolist() for node_id in data["node_feat"]]
                data["edge_attr"] = [rel_embeddings[edge_id].tolist() for edge_id in data["edge_attr"]]

                # Convert the data back to a JSON string and write to the output file
                out_file.write(json.dumps(data) + '\n')

  0%|          | 0/1 [00:00<?, ?it/s]

Start!


FileNotFoundError: [Errno 2] No such file or directory: '../data_new/graph/graph_without_benign.jsonl'