In [1]:
import json
import os
import re

import numpy as np
from sklearn.decomposition import PCA
from tqdm.notebook import tqdm

# Load embedding function
def load_embedding(input_embedding_name, model, dim=None):
    """Load entity and relation embeddings for the given model.

    Two sources are supported:

    * ``model`` starting with ``'trans'``: ``input_embedding_name`` is a JSON
      file holding the keys ``'ent_embeddings.weight'`` and
      ``'rel_embeddings.weight'``; both are returned as NumPy arrays without
      any dimensionality reduction.
    * ``model == 'secureBERT'``: ``input_embedding_name`` is a directory with
      ``embeddings_chunk*.npy`` files (768 columns each) and ``relation.npy``.
      The chunks are stacked row-wise in natural (numeric) filename order and
      reduced with PCA to ``dim`` components; the relation matrix is reduced
      with PCA to ``len(rel_embeddings)`` components.

    Args:
        input_embedding_name: Path to the JSON file or the chunk directory.
        model: Model name selecting the loading strategy.
        dim: Number of PCA components for the entity embeddings. When ``None``
            the notebook-level global ``DIM`` is used (original behaviour).

    Returns:
        Tuple ``(ent_embeddings, rel_embeddings)`` of NumPy arrays.

    Raises:
        ValueError: If ``model`` is not a supported model name.
        FileNotFoundError: If the secureBERT directory has no chunk files.
    """
    if model.startswith('trans'):
        with open(input_embedding_name, encoding='utf-8') as f:
            data = json.load(f)
        ent_embeddings = np.array(data['ent_embeddings.weight'])
        rel_embeddings = np.array(data['rel_embeddings.weight'])
        return ent_embeddings, rel_embeddings

    if model == 'secureBERT':
        if dim is None:
            # Backward compatibility: earlier callers relied on a global DIM.
            dim = DIM

        def _natural_key(name):
            # re.split with a capturing group alternates text / digit runs, so
            # odd positions are always the numeric parts. Comparing them as
            # ints keeps 'chunk_10' after 'chunk_2'; a plain lexicographic sort
            # would swap them and misalign rows with node ids.
            parts = re.split(r'([0-9]+)', name)
            return [int(p) if i % 2 else p for i, p in enumerate(parts)]

        chunks = []
        total_rows = 0
        for filename in sorted(os.listdir(input_embedding_name), key=_natural_key):
            print(filename)

            if not filename.startswith('embeddings_chunk'):
                continue

            embedding = np.load(os.path.join(input_embedding_name, filename))

            print((total_rows, 768), embedding.shape)

            chunks.append(embedding)
            total_rows += embedding.shape[0]
            print(filename, (total_rows, 768))

        if not chunks:
            raise FileNotFoundError(
                f"No 'embeddings_chunk*' files found in {input_embedding_name!r}"
            )

        # Single concatenation instead of growing the array chunk by chunk
        # (which re-copies all previous rows every iteration). The empty
        # float32 seed keeps the original dtype promotion and 768-column check.
        seed = np.empty((0, 768), dtype=np.float32)
        ent_embeddings = np.concatenate([seed, *chunks], axis=0)
        del chunks  # release the per-chunk copies before running PCA

        print(f'Reducing entity embedding to ({dim},)')
        print(ent_embeddings.shape, '->', end=' ')
        pca = PCA(n_components=dim)
        ent_embeddings = pca.fit_transform(ent_embeddings)
        print(ent_embeddings.shape)

        rel_embeddings = np.load(os.path.join(input_embedding_name, 'relation.npy'))
        # NOTE(review): relations are reduced to len(rel_embeddings) components
        # (the number of rows), not to `dim` — confirm this asymmetry is intended.
        print(f'Reducing relation embedding to ({len(rel_embeddings)},)')
        print(rel_embeddings.shape, '->', end=' ')
        pca = PCA(n_components=len(rel_embeddings))
        rel_embeddings = pca.fit_transform(rel_embeddings)
        print(rel_embeddings.shape)
        return ent_embeddings, rel_embeddings

    # Fail loudly: returning None here only surfaced later as an opaque
    # "cannot unpack non-iterable NoneType" at the call site.
    raise ValueError(
        f"Unsupported embedding model {model!r}; expected 'secureBERT' or a name starting with 'trans'"
    )
    


In [2]:
# Embedding sources. The model name is derived from the last path component
# (text before the first '_'), e.g. ".../secureBERT" -> "secureBERT".
embedding_files = ["../data_new/source_data/embedding/secureBERT"]
model = 'secureBERT'
# Number of PCA components for entity embeddings; load_embedding reads this global.
DIM = 250

# List of input files
input_filenames = ["../data_new/graph/graph_without_benign.jsonl"]

for embedding_file in tqdm(embedding_files):
    model = embedding_file.split('/')[-1].split('_')[0]
    # Load once per embedding source, and before any output file is opened for
    # writing, so a loading failure cannot leave a truncated output behind and
    # the expensive load is not repeated for every input file.
    ent_embeddings, rel_embeddings = load_embedding(embedding_file, model)

    for input_filename in tqdm(input_filenames):
        print("Start!")
        _, ext = os.path.splitext(input_filename)

        with open(input_filename, "r", encoding="utf-8") as f:
            input_data = list(f)

        # NOTE(review): the output path depends only on the embedding file, so
        # several input files would overwrite each other's output — confirm
        # before adding more entries to input_filenames.
        output_filename = f"{embedding_file.replace('.json', '_embedded').replace('.vec', '')}{ext}"
        print(f"output file name: {output_filename}")

        with open(output_filename, "w", encoding="utf-8") as out_file:
            # Iterating the list directly gives tqdm a total to display.
            for raw_line in tqdm(input_data):
                raw_line = raw_line.strip()
                if not raw_line:
                    # Tolerate blank lines (e.g. a trailing newline) in the JSONL file.
                    continue
                data = json.loads(raw_line)

                # Replace node_feat and edge_attr ids with their embedding rows.
                # Always convert to plain lists: ndarray rows are not JSON
                # serialisable, whichever model produced them.
                data["node_feat"] = [ent_embeddings[node_id].tolist() for node_id in data["node_feat"]]
                data["edge_attr"] = [rel_embeddings[edge_id].tolist() for edge_id in data["edge_attr"]]

                # Convert the data back to a JSON string and write to the output file
                out_file.write(json.dumps(data) + '\n')

  0%|          | 0/1 [00:00<?, ?it/s]

Start!



  0%|          | 0/1 [00:00<?, ?it/s][A

../data_new/source_data/embedding/secureBERT.jsonl
.ipynb_checkpoints
embeddings_chunk_0.npy
(0, 768) (160000, 768)
embeddings_chunk_0.npy (160000, 768)
embeddings_chunk_1.npy
(160000, 768) (160000, 768)
embeddings_chunk_1.npy (320000, 768)
embeddings_chunk_2.npy
(320000, 768) (160000, 768)
embeddings_chunk_2.npy (480000, 768)
embeddings_chunk_3.npy
(480000, 768) (160000, 768)
embeddings_chunk_3.npy (640000, 768)
embeddings_chunk_4.npy
(640000, 768) (160000, 768)
embeddings_chunk_4.npy (800000, 768)
embeddings_chunk_5.npy
(800000, 768) (160000, 768)
embeddings_chunk_5.npy (960000, 768)
embeddings_chunk_6.npy
(960000, 768) (160000, 768)
embeddings_chunk_6.npy (1120000, 768)
embeddings_chunk_7.npy
(1120000, 768) (51204, 768)
embeddings_chunk_7.npy (1171204, 768)
relation.npy
Reducing entity embedding to (250,)
(1171204, 768) -> (1171204, 250)
Reducing relation embedding to (26,)
(26, 768) -> (26, 26)




0it [00:00, ?it/s][A[A

66it [00:00, 659.63it/s][A[A

182it [00:00, 919.11it/s][A[A

332it [00:00, 1177.87it/s][A[A

467it [00:00, 1242.25it/s][A[A

594it [00:00, 1250.73it/s][A[A

720it [00:00, 1227.23it/s][A[A

844it [00:00, 1227.94it/s][A[A

972it [00:00, 1242.59it/s][A[A

1117it [00:00, 1303.91it/s][A[A

1248it [00:01, 1261.03it/s][A[A

1378it [00:01, 1271.16it/s][A[A

1506it [00:01, 1236.35it/s][A[A

1632it [00:01, 1239.93it/s][A[A

1757it [00:01, 1223.80it/s][A[A

1899it [00:01, 1280.89it/s][A[A

2028it [00:01, 1141.36it/s][A[A

2146it [00:01, 762.22it/s] [A[A

2241it [00:02, 631.10it/s][A[A

2320it [00:02, 565.01it/s][A[A

2388it [00:02, 525.29it/s][A[A

2448it [00:02, 492.20it/s][A[A

2502it [00:02, 477.16it/s][A[A

2553it [00:02, 469.21it/s][A[A

2602it [00:03, 462.03it/s][A[A

2650it [00:03, 456.89it/s][A[A

2697it [00:03, 427.22it/s][A[A

2741it [00:03, 429.03it/s][A[A

2785it [00:03, 422.32it/s][A[A

2828it [00:03,

14448it [00:28, 918.64it/s][A[A

14542it [00:28, 861.59it/s][A[A

14632it [00:28, 871.15it/s][A[A

14721it [00:28, 849.57it/s][A[A

14818it [00:28, 880.36it/s][A[A

14907it [00:28, 869.43it/s][A[A

14995it [00:28, 857.73it/s][A[A

15082it [00:29, 544.50it/s][A[A

15151it [00:29, 446.50it/s][A[A

15208it [00:29, 393.04it/s][A[A

15257it [00:29, 369.79it/s][A[A

15300it [00:30, 347.24it/s][A[A

15339it [00:30, 335.79it/s][A[A

15375it [00:30, 320.06it/s][A[A

15409it [00:30, 310.63it/s][A[A

15441it [00:30, 297.19it/s][A[A

15472it [00:30, 290.98it/s][A[A

15502it [00:30, 285.59it/s][A[A

15531it [00:30, 286.04it/s][A[A

15560it [00:30, 283.91it/s][A[A

15589it [00:31, 283.37it/s][A[A

15619it [00:31, 286.32it/s][A[A

15648it [00:31, 286.04it/s][A[A

15678it [00:31, 288.41it/s][A[A

15707it [00:31, 274.95it/s][A[A

15735it [00:31, 267.31it/s][A[A

15763it [00:31, 269.17it/s][A[A

15791it [00:31, 270.39it/s][A[A

15819it [00:31, 268.

21675it [00:56, 633.64it/s][A[A

21744it [00:56, 648.38it/s][A[A

21811it [00:57, 654.51it/s][A[A

21877it [00:57, 654.97it/s][A[A

21945it [00:57, 660.31it/s][A[A

22018it [00:57, 662.48it/s][A[A

22145it [00:57, 839.29it/s][A[A

22274it [00:57, 970.37it/s][A[A

22418it [00:57, 1108.27it/s][A[A

22547it [00:57, 1160.22it/s][A[A

22674it [00:57, 1190.33it/s][A[A

22825it [00:57, 1284.83it/s][A[A

22968it [00:58, 1327.07it/s][A[A

23101it [00:58, 416.80it/s] [A[A

23199it [00:59, 278.37it/s][A[A

23272it [01:00, 229.46it/s][A[A

23327it [01:00, 204.96it/s][A[A

23370it [01:00, 184.33it/s][A[A

23404it [01:01, 176.25it/s][A[A

23432it [01:01, 170.25it/s][A[A

23456it [01:01, 161.96it/s][A[A

23477it [01:01, 153.01it/s][A[A

23495it [01:01, 151.72it/s][A[A

23512it [01:01, 147.20it/s][A[A

23528it [01:02, 135.05it/s][A[A

23543it [01:02, 134.12it/s][A[A

23557it [01:02, 133.05it/s][A[A

23571it [01:02, 124.90it/s][A[A

23586it [01:02

38687it [01:26, 574.99it/s][A[A

38748it [01:26, 582.35it/s][A[A

38807it [01:26, 570.68it/s][A[A

38865it [01:26, 563.69it/s][A[A

38922it [01:27, 551.38it/s][A[A

38979it [01:27, 554.14it/s][A[A

39078it [01:27, 678.28it/s][A[A

39209it [01:27, 862.24it/s][A[A

39339it [01:27, 990.52it/s][A[A

39461it [01:27, 1057.75it/s][A[A

39574it [01:27, 1078.48it/s][A[A

39690it [01:27, 1102.51it/s][A[A

39811it [01:27, 1123.67it/s][A[A

39924it [01:27, 1101.90it/s][A[A

40035it [01:28, 1066.90it/s][A[A

40164it [01:28, 1131.08it/s][A[A

40297it [01:28, 1187.96it/s][A[A

40432it [01:28, 1231.50it/s][A[A

40566it [01:28, 1261.85it/s][A[A

40719it [01:28, 1341.11it/s][A[A

40859it [01:28, 1358.09it/s][A[A

41012it [01:28, 1408.63it/s][A[A

41154it [01:28, 1367.05it/s][A[A

41294it [01:29, 1375.52it/s][A[A

41432it [01:29, 1358.22it/s][A[A

41570it [01:29, 1354.55it/s][A[A

41715it [01:29, 1381.36it/s][A[A

41858it [01:29, 1393.42it/s][A[A

4

51650it [01:54, 1147.30it/s][A[A

51800it [01:54, 1250.38it/s][A[A

51949it [01:55, 1320.68it/s][A[A

52087it [01:55, 1337.43it/s][A[A

52222it [01:55, 1290.29it/s][A[A

52352it [01:55, 1194.59it/s][A[A

52474it [01:55, 1183.41it/s][A[A

52610it [01:55, 1231.67it/s][A[A

52759it [01:55, 1304.98it/s][A[A

52906it [01:55, 1352.09it/s][A[A

53054it [01:55, 1389.19it/s][A[A

53207it [01:56, 1430.39it/s][A[A

53353it [01:56, 1437.34it/s][A[A

53498it [01:56, 1431.29it/s][A[A

53642it [01:56, 1388.42it/s][A[A

53782it [01:56, 1328.62it/s][A[A

53916it [01:56, 1307.43it/s][A[A

54048it [01:56, 1281.81it/s][A[A

54177it [01:56, 1144.74it/s][A[A

54295it [01:56, 1000.62it/s][A[A

54400it [01:57, 927.63it/s] [A[A

54497it [01:57, 805.60it/s][A[A

54582it [01:57, 735.74it/s][A[A

54659it [01:57, 697.10it/s][A[A

54731it [01:57, 658.90it/s][A[A

54798it [01:57, 639.07it/s][A[A

54863it [01:57, 626.44it/s][A[A

54926it [01:58, 609.38it/s][A[A


72258it [02:20, 1461.05it/s][A[A

72405it [02:20, 1457.44it/s][A[A

72557it [02:20, 1447.99it/s][A[A

72712it [02:20, 1475.71it/s][A[A

72866it [02:21, 1494.24it/s][A[A

73023it [02:21, 1513.92it/s][A[A

73175it [02:21, 1485.65it/s][A[A

73329it [02:21, 1499.75it/s][A[A

73486it [02:21, 1519.39it/s][A[A

73642it [02:21, 1530.87it/s][A[A

73798it [02:21, 1539.13it/s][A[A

73953it [02:21, 1469.93it/s][A[A

74101it [02:21, 1192.42it/s][A[A

74229it [02:22, 978.47it/s] [A[A

74339it [02:22, 884.76it/s][A[A

74436it [02:22, 789.47it/s][A[A

74522it [02:22, 744.58it/s][A[A

74601it [02:22, 707.49it/s][A[A

74675it [02:22, 689.72it/s][A[A

74746it [02:22, 685.69it/s][A[A

74816it [02:23, 665.37it/s][A[A

74884it [02:23, 649.13it/s][A[A

74950it [02:23, 648.63it/s][A[A

75025it [02:23, 675.41it/s][A[A

75120it [02:23, 751.58it/s][A[A

75210it [02:23, 793.51it/s][A[A

75305it [02:23, 838.42it/s][A[A

75400it [02:23, 870.16it/s][A[A

75494i

93149it [02:47, 741.99it/s][A[A

93226it [02:48, 657.33it/s][A[A

93295it [02:48, 620.21it/s][A[A

93360it [02:48, 615.09it/s][A[A

93423it [02:48, 605.15it/s][A[A

93485it [02:48, 593.63it/s][A[A

93545it [02:48, 587.43it/s][A[A

93605it [02:48, 582.93it/s][A[A

93664it [02:48, 573.30it/s][A[A

93722it [02:48, 562.51it/s][A[A

93779it [02:49, 559.72it/s][A[A

93838it [02:49, 564.60it/s][A[A

93898it [02:49, 568.24it/s][A[A

93955it [02:49, 562.20it/s][A[A

94035it [02:49, 630.07it/s][A[A

94157it [02:49, 794.88it/s][A[A

94311it [02:49, 1011.13it/s][A[A

94460it [02:49, 1144.32it/s][A[A

94609it [02:49, 1242.44it/s][A[A

94758it [02:49, 1300.27it/s][A[A

94895it [02:50, 1318.35it/s][A[A

95044it [02:50, 1368.68it/s][A[A

95194it [02:50, 1406.39it/s][A[A

95342it [02:50, 1427.24it/s][A[A

95493it [02:50, 1450.11it/s][A[A

95639it [02:50, 1409.29it/s][A[A

95781it [02:50, 1401.81it/s][A[A

95922it [02:50, 1382.83it/s][A[A

96061it 

110539it [03:14, 1019.24it/s][A[A

110644it [03:14, 952.97it/s] [A[A

110742it [03:15, 956.00it/s][A[A

110841it [03:15, 953.51it/s][A[A

110942it [03:15, 958.46it/s][A[A

111043it [03:15, 970.21it/s][A[A

111146it [03:15, 987.11it/s][A[A

111246it [03:15, 974.35it/s][A[A

111344it [03:15, 936.65it/s][A[A

111439it [03:15, 937.95it/s][A[A

111534it [03:15, 936.71it/s][A[A

111633it [03:15, 951.09it/s][A[A

111729it [03:16, 947.88it/s][A[A

111824it [03:16, 921.94it/s][A[A

111917it [03:16, 900.36it/s][A[A

112008it [03:16, 864.19it/s][A[A

112095it [03:16, 622.23it/s][A[A

112167it [03:16, 517.42it/s][A[A

112228it [03:17, 463.61it/s][A[A

112281it [03:17, 436.77it/s][A[A

112329it [03:17, 415.88it/s][A[A

112374it [03:17, 394.73it/s][A[A

112416it [03:17, 382.84it/s][A[A

112456it [03:17, 366.13it/s][A[A

112494it [03:17, 350.38it/s][A[A

112532it [03:17, 356.85it/s][A[A

112568it [03:18, 357.53it/s][A[A

112604it [03:18, 344.41it/

125644it [03:41, 255.61it/s][A[A

125673it [03:41, 263.78it/s][A[A

125700it [03:41, 253.37it/s][A[A

125729it [03:41, 261.55it/s][A[A

125758it [03:41, 268.40it/s][A[A

125785it [03:41, 267.98it/s][A[A

125812it [03:41, 264.86it/s][A[A

125839it [03:42, 262.58it/s][A[A

125866it [03:42, 234.08it/s][A[A

125890it [03:42, 222.92it/s][A[A

125914it [03:42, 225.79it/s][A[A

125939it [03:42, 229.64it/s][A[A

125967it [03:42, 241.49it/s][A[A

125996it [03:42, 254.13it/s][A[A

126069it [03:42, 390.27it/s][A[A

126146it [03:42, 499.58it/s][A[A

126226it [03:42, 586.35it/s][A[A

126304it [03:43, 641.02it/s][A[A

126379it [03:43, 673.13it/s][A[A

126447it [03:43, 659.24it/s][A[A

126514it [03:43, 660.47it/s][A[A

126595it [03:43, 698.56it/s][A[A

126683it [03:43, 750.88it/s][A[A

126759it [03:43, 717.70it/s][A[A

126835it [03:43, 729.24it/s][A[A

126916it [03:43, 751.09it/s][A[A

127000it [03:44, 776.47it/s][A[A

127078it [03:44, 763.70it/s]

129483it [04:25, 10.50it/s][A[A

129485it [04:25, 11.20it/s][A[A

129487it [04:25, 11.53it/s][A[A

129489it [04:25, 11.67it/s][A[A

129491it [04:25, 12.03it/s][A[A

129493it [04:25, 12.45it/s][A[A

129495it [04:26, 12.51it/s][A[A

129497it [04:26, 12.63it/s][A[A

129499it [04:26, 12.79it/s][A[A

129501it [04:26, 12.93it/s][A[A

129503it [04:26, 12.73it/s][A[A

129505it [04:26, 12.82it/s][A[A

129507it [04:27, 12.56it/s][A[A

129509it [04:27, 12.57it/s][A[A

129511it [04:27, 12.78it/s][A[A

129513it [04:27, 12.99it/s][A[A

129515it [04:27, 12.90it/s][A[A

129517it [04:27, 12.92it/s][A[A

129519it [04:27, 13.16it/s][A[A

129521it [04:28, 12.82it/s][A[A

129523it [04:28, 12.21it/s][A[A

129525it [04:28, 12.33it/s][A[A

129527it [04:28, 12.46it/s][A[A

129529it [04:28, 12.83it/s][A[A

129531it [04:29, 11.27it/s][A[A

129533it [04:29, 10.45it/s][A[A

129535it [04:29, 10.96it/s][A[A

129537it [04:29, 11.43it/s][A[A

129539it [04:29, 11.

129947it [05:02,  9.43it/s][A[A

129948it [05:02,  9.39it/s][A[A

129949it [05:02,  9.37it/s][A[A

129950it [05:03,  9.34it/s][A[A

129951it [05:03,  9.32it/s][A[A

129952it [05:03,  9.31it/s][A[A

129953it [05:03,  9.30it/s][A[A

129955it [05:03, 10.81it/s][A[A

129957it [05:03, 11.52it/s][A[A

129959it [05:03, 12.09it/s][A[A

129961it [05:03, 12.23it/s][A[A

129963it [05:04, 12.42it/s][A[A

129965it [05:04, 12.64it/s][A[A

129967it [05:04, 12.67it/s][A[A

129969it [05:04, 12.82it/s][A[A

129971it [05:04, 12.95it/s][A[A

129973it [05:04, 12.79it/s][A[A

129975it [05:05, 12.82it/s][A[A

129977it [05:05, 12.49it/s][A[A

129979it [05:05, 12.63it/s][A[A

129981it [05:05, 12.90it/s][A[A

129983it [05:05, 12.83it/s][A[A

129985it [05:05, 13.05it/s][A[A

129987it [05:05, 13.17it/s][A[A

129989it [05:06, 13.01it/s][A[A

129991it [05:06, 13.25it/s][A[A

129993it [05:06, 13.20it/s][A[A

129995it [05:06, 13.05it/s][A[A

129997it [05:06, 13.

142598it [05:31, 606.95it/s][A[A

142660it [05:31, 584.18it/s][A[A

142720it [05:31, 550.45it/s][A[A

142776it [05:31, 520.45it/s][A[A

142829it [05:31, 508.09it/s][A[A

142880it [05:31, 508.24it/s][A[A

142935it [05:32, 518.84it/s][A[A

142987it [05:32, 509.21it/s][A[A

143038it [05:32, 495.80it/s][A[A

143088it [05:32, 460.96it/s][A[A

143135it [05:32, 451.46it/s][A[A

143181it [05:32, 442.01it/s][A[A

143226it [05:32, 439.92it/s][A[A

143271it [05:32, 430.91it/s][A[A

143315it [05:32, 426.44it/s][A[A

143359it [05:32, 428.16it/s][A[A

143402it [05:33, 419.40it/s][A[A

143444it [05:33, 392.70it/s][A[A

143484it [05:33, 385.19it/s][A[A

143523it [05:33, 381.44it/s][A[A

143562it [05:33, 377.06it/s][A[A

143600it [05:33, 374.99it/s][A[A

143638it [05:33, 353.70it/s][A[A

143677it [05:33, 363.08it/s][A[A

143715it [05:33, 359.57it/s][A[A

143758it [05:34, 378.27it/s][A[A

143797it [05:34, 374.32it/s][A[A

143837it [05:34, 378.58it/s]

152100it [06:00, 268.33it/s][A[A

152157it [06:00, 349.72it/s][A[A

152218it [06:01, 423.86it/s][A[A

152279it [06:01, 467.52it/s][A[A

152338it [06:01, 500.58it/s][A[A

152399it [06:01, 528.71it/s][A[A

152461it [06:01, 548.97it/s][A[A

152524it [06:01, 570.53it/s][A[A

152587it [06:01, 585.94it/s][A[A

152648it [06:01, 589.18it/s][A[A

152708it [06:01, 581.01it/s][A[A

152767it [06:02, 578.60it/s][A[A

152830it [06:02, 591.02it/s][A[A

152891it [06:02, 596.11it/s][A[A

152951it [06:02, 583.16it/s][A[A

153010it [06:02, 488.45it/s][A[A

153062it [06:02, 295.40it/s][A[A

153103it [06:03, 233.74it/s][A[A

153136it [06:03, 204.43it/s][A[A

153163it [06:03, 187.07it/s][A[A

153186it [06:03, 175.26it/s][A[A

153207it [06:03, 166.10it/s][A[A

153226it [06:04, 158.21it/s][A[A

153243it [06:04, 152.86it/s][A[A

153259it [06:04, 145.08it/s][A[A

153274it [06:04, 143.64it/s][A[A

153289it [06:04, 141.37it/s][A[A

153304it [06:04, 138.85it/s]

158972it [06:30, 65.05it/s][A[A

158979it [06:31, 64.11it/s][A[A

158986it [06:31, 62.94it/s][A[A

158993it [06:31, 61.95it/s][A[A

159000it [06:31, 60.18it/s][A[A

159151it [06:31, 449.97it/s][A[A

159304it [06:31, 748.24it/s][A[A

159450it [06:31, 949.06it/s][A[A

159607it [06:31, 1126.69it/s][A[A

159762it [06:31, 1249.38it/s][A[A

159917it [06:31, 1335.33it/s][A[A

160053it [06:32, 1296.63it/s][A[A

160185it [06:32, 1234.97it/s][A[A

160321it [06:32, 1268.41it/s][A[A

160471it [06:32, 1333.51it/s][A[A

160606it [06:32, 1319.35it/s][A[A

160748it [06:32, 1348.09it/s][A[A

160902it [06:32, 1403.57it/s][A[A

161047it [06:32, 1414.86it/s][A[A

161189it [06:32, 1407.75it/s][A[A

161331it [06:33, 1366.24it/s][A[A

161469it [06:33, 1351.84it/s][A[A

161605it [06:33, 1344.61it/s][A[A

161740it [06:33, 1336.37it/s][A[A

161896it [06:33, 1401.55it/s][A[A

162037it [06:33, 723.60it/s] [A[A

162146it [06:34, 297.34it/s][A[A

162226it [06:35