In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from gensim.models import word2vec  # 导入gensim包
from tensorflow.python.keras.models import Sequential
import re
from nltk.stem import WordNetLemmatizer

def clean_text(text):
    text = re.sub('[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    lemmatizer = WordNetLemmatizer()
    lem_words = [lemmatizer.lemmatize(w, pos='n') for w in words]
    stopwords = {}.fromkeys([line.rstrip() for line in open('F:\\PycharmProjects\\NVDproject\\nvdcve\\stopwords.txt')])
    eng_stopwords = set(stopwords)
    words = [w for w in lem_words if w not in eng_stopwords]
    return words

years = ['2020','2021','2022']
infix = ''
infix = str(years[0])
for i in range(1,len(years)):
    infix += '-'+str(years[i])

# cwe_min_count = 500
cwe_min_count = 700
infix+='_'+str(cwe_min_count)

vec_len = 100
# vec_len = 200
# vec_len = 300
min_count = 1
window_len = 5
dense_unit = 128
wv_model_path  = '..//..//models//wv//'+infix+"_"+str(vec_len)+"_"+str(min_count)+"_"+str(window_len)+'.pkl'
label_path = '..\\..\\data\\clean\\nvdcve-1.1-'+infix+'_labels.csv'
n=30

cwe_count = 18  # 2020,2021,2022_700

def get_label_one_hot(list):
    values = np.array(list)
    n_values = np.max(values) + 1
    return np.eye(n_values)[values]

from tensorflow.keras.layers import Conv1D,Conv2D, BatchNormalization, Activation, MaxPool1D,MaxPool2D, Dropout, Flatten, Dense, GRU
from tensorflow.keras import Model

class TextCNN(Model):
    def __init__(self):
        super(TextCNN, self).__init__()
        self.c1 = Conv2D(filters=12, kernel_size=(3, vec_len), padding='same')  # 卷积层
        self.b1 = BatchNormalization()  # BN层
        self.a1 = Activation('relu')  # 激活层
        self.p1 = MaxPool2D(pool_size=(2, 2), strides=2, padding='same')  # 池化层
        self.d1 = Dropout(0.2)  # dropout层

        self.flatten = Flatten()
        self.f1 = Dense(dense_unit, activation='relu')
        self.d2 = Dropout(0.2)
        self.f2 = Dense(cwe_count, activation='softmax')

    def call(self, x):
        x = self.c1(x)
        x = self.b1(x)
        x = self.a1(x)
        x = self.p1(x)
        x = self.d1(x)

        x = self.flatten(x)
        x = self.f1(x)
        x = self.d2(x)
        y = self.f2(x)
        return y




if __name__ == '__main__':
    wv_model = word2vec.Word2Vec.load(wv_model_path)
    dataset = pd.read_csv(label_path, header=None)
    dataset.columns = ['cve_id', 'cwe_id', 'descript', 'label']
    dataset['contents'] = dataset['descript'].apply(clean_text)
    train_dataset = []
    for line in dataset['contents']:
        length = len(line)
        if length > n:
            line = line[:n]
            word2vec_matrix = (wv_model.wv[line])
            train_dataset.append(word2vec_matrix)
        else:
            word2vec_matrix = (wv_model.wv[line])
            pad_length = n - length
            pad_matrix = np.zeros([pad_length, vec_len]) + 1e-10
            word2vec_matrix = np.concatenate([word2vec_matrix, pad_matrix], axis=0)
            train_dataset.append(word2vec_matrix)
    # train_dataset = np.expand_dims(train_dataset, 3)
    label_dataset = get_label_one_hot(dataset['label'])

In [2]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(train_dataset, label_dataset, test_size=0.2, random_state=217)
batch_size = 32
train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train)).batch(batch_size)

In [3]:
train_data

<BatchDataset shapes: ((None, 30, 100), (None, 18)), types: (tf.float64, tf.float64)>

In [15]:
# for i in x_test:
#     print(i.shape)
for i in x_train:
    print(i.shape)
print("x_train_len",len(train_data))
# for i in y_test:
#     print(i.shape)

(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)
(30, 100)


In [None]:
model = Sequential()
model.add(Conv1D(256, 3, padding='same', strides=1, activation='relu'))
model.add(MaxPool1D(pool_size=2))
model.add(GRU(256))
model.add(Dense(cwe_count,activation="softmax"))
model.compile(optimizer=tf.optimizers.Adam(1e-3),
                  loss=tf.losses.categorical_crossentropy, metrics=['accuracy'])
model.fit(train_data, epochs=10)
score = model.evaluate(np.array(x_test), y_test)
print('last score:', score)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10

In [39]:
print(y_test.shape)
print(np.shape(x_test))
model.evaluate(x_test,y_test)

(6834, 18)
(6834, 30, 100)


In [52]:
# score = model.evaluate(x_test, y_test)
# print('last score:', score)
x_test = np.array(x_test)
print(x_test[:2].shape)
print(model.predict(x_test[:2]))
score = model.evaluate(x_test,y_test)
print(score)

(2, 30, 100)
[[4.4170562e-08 1.8492882e-07 1.4880713e-06 1.7856941e-08 4.5001389e-08
  1.8838587e-07 5.9186033e-05 6.9560074e-06 8.3876337e-08 1.6891009e-08
  4.5630381e-07 2.6459647e-08 8.6402792e-08 9.9987423e-01 3.8792794e-05
  1.3154709e-06 2.6136324e-06 1.4269384e-05]
 [9.6406261e-08 7.2534234e-10 4.9477876e-08 3.3674036e-07 1.5675055e-06
  1.6289020e-08 4.8774386e-06 2.0043501e-06 2.6883624e-08 2.1965800e-09
  3.2655305e-07 1.2462021e-05 1.8913033e-09 3.9191826e-05 1.2439579e-05
  6.0230474e-09 9.9992263e-01 3.9611377e-06]]
[0.8508539199829102, 0.8160667419433594]
