# 读取完整训练数据集
# train_label = ['access-control', 'arithmetic', 'reentrancy', 'safe']


In [None]:
import pandas as pd

# Load the tab-separated training set and discard the 'Unnamed: 0' column
# (presumably an index that was saved along with the data -- confirm).
cur_df = pd.read_csv("train_dataset.csv", sep='\t').drop(columns=['Unnamed: 0'])

# Each 'swc_id' cell is stored as text; evaluate it back into a Python object.
# NOTE(review): eval() executes arbitrary expressions, so this is only safe
# for a trusted CSV. If the cells are plain literals, ast.literal_eval would
# be the safer parser -- confirm against the data before switching.
cur_df['swc_id'] = cur_df['swc_id'].apply(eval)

cur_df  # 67190 rows * 3 columns ['bytecode','swc_id','opcodes']

# 新增一列标签索引值


In [None]:
# train_df is an alias of cur_df (not a copy), so the new column shows up on
# both names. Append `.sample(n=10)` here to experiment on a small subset.
train_df = cur_df  # .sample(n=10)

# Placeholder text value for every row; the label-encoding step overwrites it.
train_df.loc[:, 'label_idx'] = '[]'

train_df

# 对标签进行编码，插入'label_idx'列


In [None]:
from sklearn import preprocessing

# Fit the encoder on the four known label names. LabelEncoder sorts its
# classes, so they map to the indices [0, 1, 2, 3] in the order listed here.
enc = preprocessing.LabelEncoder()
enc = enc.fit(['access-control', 'arithmetic', 'reentrancy', 'safe'])

# Convert every row's list of label names into a list of integer indices in a
# single pass. This replaces a row-by-row iterrows()/.loc loop that stored
# str(list) and then eval()'d it back: the result is the same column of
# Python int lists, without the per-row assignment cost and without relying
# on the index labels being unique.
train_df['label_idx'] = train_df['swc_id'].apply(
    lambda names: enc.transform(names).tolist()
)

train_df

# 预处理数据和标签


In [None]:
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Raw opcode strings and their label-index lists, in row order.
opcodes = train_df['opcodes'].tolist()
labels = train_df['label_idx'].tolist()

# num_words=None puts no cap on the vocabulary; an integer would keep only
# that many of the most frequent tokens.
tokenizer = Tokenizer(num_words=None)
tokenizer.fit_on_texts(opcodes)
sequences = tokenizer.texts_to_sequences(opcodes)  # token indices, subject to num_words

# Fixed sequence length = mean whitespace-separated token count per sample.
MAX_SEQUENCE_LENGTH = int(np.mean([len(op.split()) for op in opcodes]))
print(MAX_SEQUENCE_LENGTH)

# Shorter sequences are padded with 0 at the front; with pad_sequences'
# default settings longer ones are also cut from the front.
opcodes_idx = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Pin the binarizer to the four label indices produced by the LabelEncoder so
# the output always has one column per label, in index order, even when the
# current data (e.g. a small sample) happens to miss one of the classes.
mlb = MultiLabelBinarizer(classes=[0, 1, 2, 3])
labels_encoded = mlb.fit_transform(labels)

print(opcodes_idx.shape)
print(labels_encoded.shape)


# 训练base和分支模型


In [None]:
import time
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, GRU, BatchNormalization, Dropout, Dense
from tensorflow.keras.models import Model

# Wall-clock start; the elapsed seconds are printed at the end of this cell.
stime = time.time()

# Base (shared) network definition
def base_model(input_dim, embedding_dim, gru_units, vocab_size=None):
    """Build the shared trunk: Embedding -> GRU -> BatchNormalization -> Dropout.

    Args:
        input_dim: Number of token indices per input sample (the length of the
            Input layer). When ``vocab_size`` is omitted it is also used as the
            embedding vocabulary size, which is the original behavior.
        embedding_dim: Size of each embedding vector.
        gru_units: Number of units in the GRU layer.
        vocab_size: Optional number of rows in the embedding table. Sequence
            length and vocabulary size are independent quantities; pass the
            real vocabulary size (largest token index + 1) so that every token
            index has an embedding row. Defaults to ``input_dim``.

    Returns:
        A ``(inputs, features)`` tuple: the Input tensor and the trunk's output
        tensor after dropout.
    """
    if vocab_size is None:
        # Backward-compatible fallback: reuse input_dim as the vocabulary size.
        vocab_size = input_dim

    inputs = Input(shape=(input_dim,))
    embedding = Embedding(vocab_size, embedding_dim)(inputs)
    gru = GRU(gru_units)(embedding)
    bn = BatchNormalization()(gru)
    dropout = Dropout(0.5)(bn)
    return inputs, dropout

# Branch (per-label head) network definition
def branch_model(input_layer, units, num_classes):
    """Attach one output head to ``input_layer``.

    The head is Dense(relu) -> BatchNormalization -> Dropout(0.5) followed by
    a sigmoid Dense layer with ``num_classes`` outputs.

    Args:
        input_layer: Tensor the head is stacked on.
        units: Width of the hidden Dense layer.
        num_classes: Number of sigmoid outputs of the head.

    Returns:
        The head's output tensor.
    """
    hidden = Dense(units, activation='relu')(input_layer)
    normalized = BatchNormalization()(hidden)
    regularized = Dropout(0.5)(normalized)
    return Dense(num_classes, activation='sigmoid')(regularized)

# Assemble the full multi-output model
def build_model(input_dim, embedding_dim, gru_units, units_list, num_classes_list):
    """Create a model with one shared trunk and one output head per branch.

    Args:
        input_dim: Passed through to ``base_model``.
        embedding_dim: Passed through to ``base_model``.
        gru_units: Passed through to ``base_model``.
        units_list: Hidden width of each head, one entry per head.
        num_classes_list: Output size of each head, paired with ``units_list``
            by position (extra entries in the longer list are ignored by zip).

    Returns:
        A Keras ``Model`` mapping the shared input to the list of head outputs.
    """
    inputs, shared_features = base_model(input_dim, embedding_dim, gru_units)
    heads = [
        branch_model(shared_features, width, n_out)
        for width, n_out in zip(units_list, num_classes_list)
    ]
    return Model(inputs=inputs, outputs=heads)

# Build the training set
x_train = opcodes_idx
# One (n_samples, 1) target array per label column, each a binary task for
# its own output head. Slicing by the array's own shape avoids hard-coding the
# dataset size and label count, so this also works on a sampled subset.
y_train_list = [
    labels_encoded[:, col].reshape(-1, 1)
    for col in range(labels_encoded.shape[1])
]

# Model hyper-parameters
# NOTE(review): base_model uses this value for both the Input sequence length
# and the Embedding vocabulary size; it presumably should match
# opcodes_idx.shape[1] -- confirm against the printed MAX_SEQUENCE_LENGTH.
input_dim = 6208
embedding_dim = 1000
gru_units = 128
units_list = [128, 128, 128, 128]  # hidden units of each branch
num_classes_list = [1, 1, 1, 1]  # outputs per branch; each branch is a binary task

# Build the model
model = build_model(input_dim, embedding_dim, gru_units, units_list, num_classes_list)

# Compile: the same loss and metric are applied to every output head
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train
model.fit(x_train, y_train_list, batch_size=128, epochs=20)

# Save the model weights
model.save_weights('model_weights.h5')

# Save the model architecture as JSON
model_json = model.to_json()
with open('model_structure.json', 'w') as json_file:
    json_file.write(model_json)

# Elapsed wall-clock seconds since stime was recorded
print(time.time()-stime)