In [23]:
from gensim.models import Word2Vec
import numpy as np
import json
import struct

In [24]:
# JSON file consumed by get_seqs(); its top-level 'data' key holds the list of items.
data_file = './data/vpn_data_small.json'
# JSON file holding the bin boundaries ('intervals') for 'packet_len' and 'time'.
bins_file = './bins/bins_small.json'
# Upper bound on the number of records get_seqs() decodes per item.
max_seq_len = 16
# Size in bytes of one record inside the decoded 'nprint' blob (stride used by get_seqs()).
NPRINT_LINE_LEN = 114

In [25]:
# Read the pre-computed bin boundaries used to discretise packet length and time.
with open(bins_file, 'r') as f_bin:
    bins_data = json.load(f_bin)

# Each feature keeps its list of [start, end] pairs under the 'intervals' key.
pkt_len_intervals, time_intervals = (
    bins_data[feature]['intervals'] for feature in ('packet_len', 'time')
)

In [26]:
def get_seqs(data_file_name, max_len=None, line_len=None,
             time_bins=None, pkt_len_bins=None):
    """Turn every item of a JSON data file into two sequences of bin indices.

    The file must contain a JSON object whose ``'data'`` key is a list of
    items, each carrying an ``'nprint'`` hex string.  The decoded bytes are
    walked in fixed-size records; the leading bytes of each record are
    unpacked with the native ``struct`` format ``"IIh"`` into
    ``(time_h, time_l, pkt_len)``.  The time value that gets binned is
    ``time_h + (time_l // 1e4) / 100``.

    Args:
        data_file_name: Path of the JSON file to read.
        max_len: Maximum number of records decoded per item.  Defaults to the
            notebook-level ``max_seq_len``.
        line_len: Record size in bytes.  Defaults to the notebook-level
            ``NPRINT_LINE_LEN``.
        time_bins: List of ``[start, end]`` pairs for the time value.
            Defaults to the notebook-level ``time_intervals``.
        pkt_len_bins: List of ``[start, end]`` pairs for the packet length.
            Defaults to the notebook-level ``pkt_len_intervals``.

    Returns:
        A dict with the keys ``'time'`` and ``'pkt_len'``.  Each maps to a
        list with one entry per item; an entry is the list of bin indices of
        that item's records.  An index is ``None`` when the value lies in no
        interval.
    """
    # Fall back to the notebook globals only when the caller gave nothing, so
    # the function can also be used (and tested) without that hidden state.
    if max_len is None:
        max_len = max_seq_len
    if line_len is None:
        line_len = NPRINT_LINE_LEN
    if time_bins is None:
        time_bins = time_intervals
    if pkt_len_bins is None:
        pkt_len_bins = pkt_len_intervals

    # Two unsigned ints followed by a short, native byte order and alignment.
    header = struct.Struct("IIh")

    def find_interval(value, intervals):
        """Return the index of the first interval containing value, else None."""
        for idx, (start, end) in enumerate(intervals):
            if start <= value <= end:
                return idx  # index of the interval the value falls into
        return None

    with open(data_file_name, 'r') as f:
        data = json.load(f)['data']

    data_dic = {'time': [], 'pkt_len': []}

    for item in data:
        im = bytes.fromhex(item['nprint'])

        seq_time = []
        seq_pkt_len = []

        # Bounding the range caps the sequence at max_len records.
        stop = min(len(im), max_len * line_len)
        for offset in range(0, stop, line_len):
            # A trailing fragment too short to hold a header is ignored
            # instead of raising struct.error.
            if len(im) - offset < header.size:
                break

            time_h, time_l, pkt_len = header.unpack_from(im, offset)
            # Floor time_l to units of 1e4, then scale by 1/100 before adding
            # it to time_h (presumably microseconds -> hundredths of a
            # second; confirm against the producer of the nprint data).
            timestamp = time_h + (time_l // 1e4) / 100

            seq_time.append(find_interval(timestamp, time_bins))
            seq_pkt_len.append(find_interval(pkt_len, pkt_len_bins))

        data_dic['time'].append(seq_time)
        data_dic['pkt_len'].append(seq_pkt_len)

    return data_dic

In [27]:
def get_wv_model(seqs, vector_size, window, min_count=1, sg=1):
    """Train a gensim Word2Vec model on sequences of tokens.

    Every element of every sequence is converted with ``str()`` first, so the
    resulting vocabulary is keyed by the string form of each element.

    Args:
        seqs: Iterable of sequences (e.g. lists of bin indices).
        vector_size: Passed through to ``Word2Vec``.
        window: Passed through to ``Word2Vec``.
        min_count: Passed through to ``Word2Vec``.
        sg: Passed through to ``Word2Vec``.

    Returns:
        The trained ``Word2Vec`` model.
    """
    sentences = [list(map(str, seq)) for seq in seqs]
    return Word2Vec(
        sentences=sentences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        sg=sg,
    )

In [28]:
# Bin every item of the data file into index sequences, grouped per feature.
sequences_dic = get_seqs(data_file)

# Train and persist one Word2Vec model per feature key of sequences_dic.
model_dic = {}
for key, seq in sequences_dic.items():
    trained = get_wv_model(seq, vector_size=8, window=5)
    model_dic[key] = trained
    trained.save(f"./wordvec/{key}.model")

In [29]:
# Preview the first sequences of every feature: bin ids, the intervals they
# stand for, and the embedding looked up for each id.
for key, seq in sequences_dic.items():
    print(key)
    intervals = time_intervals if key == 'time' else pkt_len_intervals
    # Slicing (rather than range(10)) cannot raise IndexError when a feature
    # has fewer than 10 sequences.
    for ids in seq[:10]:
        li = [model_dic[key].wv[str(num)].tolist() for num in ids]
        print(ids)
        # get_seqs() stores None for a value outside every interval; print
        # None for it instead of indexing the interval list with None.
        print([intervals[j] if j is not None else None for j in ids])
        print(li)

time
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[[0.0, 0.42], [0.0, 0.42], [0.0, 0.42], [0.0, 0.42], [0.0, 0.42], [0.0, 0.42], [0.0, 0.42], [0.0, 0.42], [0.0, 0.42], [0.0, 0.42], [0.0, 0.42], [0.0, 0.42], [0.0, 0.42], [0.0, 0.42], [0.0, 0.42], [0.0, 0.42]]
[[0.07093805074691772, 0.08542165905237198, 0.07086850702762604, 0.34168606996536255, -0.23805473744869232, -0.27636468410491943, 0.22778695821762085, 0.18342997133731842], [0.07093805074691772, 0.08542165905237198, 0.07086850702762604, 0.34168606996536255, -0.23805473744869232, -0.27636468410491943, 0.22778695821762085, 0.18342997133731842], [0.07093805074691772, 0.08542165905237198, 0.07086850702762604, 0.34168606996536255, -0.23805473744869232, -0.27636468410491943, 0.22778695821762085, 0.18342997133731842], [0.07093805074691772, 0.08542165905237198, 0.07086850702762604, 0.34168606996536255, -0.23805473744869232, -0.27636468410491943, 0.22778695821762085, 0.18342997133731842], [0.07093805074691772, 0.08542165905237198, 0.070

In [30]:
# NOTE: the name `set` shadows the built-in. Later cells read this variable,
# so the name is kept; the set comprehension below does not need the built-in.
set = {}

for key, seqs in sequences_dic.items():
    # Deduplicate by hashing instead of repeated `v not in list` scans, then
    # sort so every feature maps to an ordered list of its distinct ids.
    set[key] = sorted({v for seq in seqs for v in seq})


In [31]:
# Write the per-feature id lists held in `set` (a dict here, not the built-in)
# to disk. Serialising before the file is opened means a json.dumps failure
# cannot leave a truncated file behind.
json_str = json.dumps(set)
with open('data_set_small.json', 'w') as file:
    file.write(json_str)

In [32]:
# Embedding table per feature: entry k is the vector of the k-th id listed in
# set[key], looked up in that feature's model by the id's string form.
word_vec_metrics = {
    key: [model_dic[key].wv[str(i)].tolist() for i in seq]
    for key, seq in set.items()
}

# Serialise first, then write, so a failure cannot truncate an existing file.
json_str = json.dumps(word_vec_metrics)
with open('./wordvec/word_vec_small.json', 'w') as file:
    file.write(json_str)
        