In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from keras import layers
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.callbacks import TensorBoard
from sklearn import metrics

Using TensorFlow backend.
  from ._conv import register_converters as _register_converters


In [2]:
# Load the training and test splits from their CSV files.
df_train = pd.read_csv('./data/torpeda_train_test/train.csv')
df_test = pd.read_csv('./data/torpeda_train_test/test.csv')

# Split each frame into its request strings ('url') and class labels ('label').
url_train, label_train = df_train['url'], df_train['label']
url_test, label_test = df_test['url'], df_test['label']

# Print the first record of each split as a quick sanity check.
print("=================================================================================================")
print("训练集中第一条数据：")
print("url: %s" % url_train.iloc[0])
print("label: %s\n" % label_train.iloc[0])

print("测试集中第一条数据：")
print("url: %s" % url_test.iloc[0])
print("label: %s\n" % label_test.iloc[0])
print("=================================================================================================")

训练集中第一条数据：
url: POST /tienda1/publico/pagar.jsp?precio=85&B1=Pasar%2Bpor%2Bcaja%27%2C%270%27%2C%270%27%29%3Bwaitfor+delay+%270%3A0%3A15%27%3B--
label: SQLi

测试集中第一条数据：
url: GET /tienda1/publico/caracteristicas.jsp?id=d%27z%220
label: SQLi



In [3]:
# Distinct training labels, in the order produced by value_counts()
# (most frequent first).
labels_type = pd.Series(label_train).value_counts().index.tolist()
# Map every label name to its position in labels_type; this position is the
# class index used for the one-hot targets.
# NOTE(review): an earlier comment here mentioned using 'labels_index' instead
# of 'type'; no such name exists in this cell - confirm it is obsolete.
type_dict = {label: position for position, label in enumerate(labels_type)}
print("标签类型字典：")
print(type_dict)

标签类型字典：
{'SQLi': 0, 'anomalous': 1, 'normal': 2, 'XSS': 3, 'SSI': 4, 'BufferOverflow': 5, 'CRLFi': 6, 'XPath': 7, 'LDAPi': 8, 'FormatString': 9}


In [4]:
import json
import os

# Character-level tokenizer, fitted on the training URLs only so that nothing
# from the test split leaks into the vocabulary.
# lower=False is passed explicitly: URLs and attack payloads are case-sensitive,
# and newer Keras releases apply the default lower=True even when
# char_level=True, which would silently merge upper- and lower-case characters.
tokenizer = Tokenizer(filters='\t\n', char_level=True, lower=False)
tokenizer.fit_on_texts(url_train)

# word_index starts at 1; index 0 is left free for the padding value, hence
# the +1 when sizing the embedding input dimension.
vocab = tokenizer.word_index
num_words = len(vocab) + 1
print("字典的大小为%d" % num_words)
print("字典：")
print(vocab)

# Persist the vocabulary. The directory is created if missing, and the file is
# written as UTF-8 because ensure_ascii=False may emit non-ASCII characters
# that the platform default encoding cannot represent.
os.makedirs("./tokenizer", exist_ok=True)
with open("./tokenizer/vocab.json", 'w', encoding='utf-8') as f:
    json.dump(vocab, f, ensure_ascii=False)

字典的大小为98
字典：
{'%': 1, '2': 2, '0': 3, 'i': 4, '3': 5, 'r': 6, '=': 7, 'e': 8, 'o': 9, 'a': 10, '&': 11, 'c': 12, 'm': 13, '1': 14, '7': 15, 'd': 16, 'n': 17, 'C': 18, '5': 19, '6': 20, 't': 21, 'p': 22, 'L': 23, 's': 24, '9': 25, '8': 26, 'l': 27, '/': 28, ',': 29, '4': 30, 'N': 31, 'b': 32, 'U': 33, 'u': 34, 'O': 35, 'g': 36, '.': 37, 'S': 38, 'T': 39, 'E': 40, 'A': 41, 'j': 42, 'P': 43, ' ': 44, 'R': 45, 'w': 46, 'B': 47, '?': 48, 'F': 49, 'D': 50, 'v': 51, 'f': 52, 'I': 53, '+': 54, '-': 55, 'x': 56, 'z': 57, 'h': 58, 'H': 59, 'M': 60, '#': 61, 'y': 62, 'G': 63, ';': 64, 'K': 65, 'Z': 66, 'J': 67, 'V': 68, 'Y': 69, 'k': 70, 'X': 71, 'W': 72, 'Q': 73, '<': 74, '>': 75, 'q': 76, '"': 77, '_': 78, '@': 79, '*': 80, ':': 81, '(': 82, ')': 83, "'": 84, '!': 85, '[': 86, ']': 87, '{': 88, '}': 89, '`': 90, '\r': 91, '$': 92, '~': 93, '|': 94, '\\': 95, '^': 96, '\n': 97}


In [5]:
# Convert a label name into its one-hot representation.
def get_one_hot_value(s, label_to_index=None):
    """Return the one-hot encoding of label ``s`` as a list of 0/1 ints.

    Args:
        s: Label name to encode.
        label_to_index: Optional mapping from label name to class index.
            Defaults to the notebook-level ``type_dict``. The indices are
            assumed to be 0..len(mapping)-1, as produced when ``type_dict``
            is built.

    Returns:
        A list with one entry per label in the mapping, containing 1 at the
        index of ``s`` and 0 elsewhere.

    Raises:
        KeyError: If ``s`` is not a key of the mapping.
    """
    if label_to_index is None:
        label_to_index = type_dict
    hot_index = label_to_index[s]
    # The vector length follows the mapping instead of a hard-coded class
    # count, so the encoding stays correct if the label set changes.
    return [1 if i == hot_index else 0 for i in range(len(label_to_index))]

In [6]:
# Character count of every URL, per split.
url_train_lens = list(map(len, url_train))
url_test_lens = list(map(len, url_test))
# Print the 97th percentile of the lengths; it is used to pick the padded
# sequence length.
print(np.percentile(url_train_lens, 97))
print(np.percentile(url_test_lens, 97))

603.239999999998
604.8299999999981


In [7]:
# Fixed sequence length, chosen after inspecting the URL length distribution
# (the 97th percentile is about 600 characters).
max_len = 600
# Turn each URL into a sequence of character indices.
seq_train = tokenizer.texts_to_sequences(url_train)
seq_test = tokenizer.texts_to_sequences(url_test)
# Bring every sequence to exactly max_len entries. With the pad_sequences
# defaults, shorter sequences are zero-padded at the front.
# NOTE(review): the default truncating='pre' drops the *beginning* of URLs
# longer than max_len - confirm that keeping the tail is intended.
X_train = sequence.pad_sequences(seq_train, maxlen=max_len)
X_test = sequence.pad_sequences(seq_test, maxlen=max_len)
# One-hot encode the labels. The result is an explicit 2-D ndarray because
# Keras may interpret a plain Python list passed as `y` as a list of
# per-output target arrays rather than one (samples, classes) matrix.
Y_train = np.array([get_one_hot_value(l) for l in label_train])
Y_test = np.array([get_one_hot_value(l) for l in label_test])
print("=================================================================================================")
print("举例：")
print("向量化前：")
print("url: %s" % url_train[0])
print("label: %s\n" % label_train[0])
print("向量化后：")
print("url_vec: " + str(X_train[0]))
# .tolist() keeps the printed form identical to the previous list output.
print("label_one_hot: " + str(Y_train[0].tolist()))
print("=================================================================================================")

举例：
向量化前：
url: POST /tienda1/publico/pagar.jsp?precio=85&B1=Pasar%2Bpor%2Bcaja%27%2C%270%27%2C%270%27%29%3Bwaitfor+delay+%270%3A0%3A15%27%3B--
label: SQLi

向量化后：
url_vec: [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  

In [8]:
# TensorBoard callback: event files are written under ./logs, with
# embeddings_freq set to 1.
tb_callback = TensorBoard(
    log_dir='./logs',
    embeddings_freq=1,
)

In [9]:
# Build the network:
# char embedding -> Conv1D -> MaxPooling1D -> Conv1D -> GlobalMaxPooling1D -> softmax.
# The output width follows the label dictionary instead of a hard-coded 10,
# so it always matches the one-hot targets.
num_classes = len(type_dict)
model = Sequential()
model.add(layers.Embedding(num_words, 64, input_length=max_len))
model.add(layers.Conv1D(32, 7, activation='relu'))
model.add(layers.MaxPooling1D(5))
model.add(layers.Conv1D(32, 7, activation='relu'))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 600, 64)           6272      
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 594, 32)           14368     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 118, 32)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 112, 32)           7200      
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                330       
Total params: 28,170
Trainable params: 28,170
Non-trainable params: 0
_________________________________________________________________
None


In [10]:
# Train the model. The returned History object is kept so the per-epoch
# loss/metric values stay available, and so the cell no longer ends with a
# bare History repr.
# NOTE(review): validation_split takes the validation samples from the end of
# the arrays without shuffling first - confirm train.csv is not ordered by label.
history = model.fit(
    X_train,
    Y_train,
    validation_split=0.25,
    epochs=6,
    batch_size=128,
    callbacks=[tb_callback],
)

Train on 38919 samples, validate on 12974 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7f11b41f0da0>

In [11]:
# Evaluate the trained model on the test split. The displayed result is the
# loss followed by the metrics the model was compiled with.
model.evaluate(
    x=X_test,
    y=Y_test,
    batch_size=128,
)



[0.015338407208173194, 0.9948291366906474]

In [12]:
# Convert softmax probability vectors into label names.
def props_to_labels(props_matrix, label_names=None):
    """Map each probability vector to the name of its highest-scoring class.

    Args:
        props_matrix: Iterable of probability vectors, one per sample
            (for example the 2-D array returned by ``model.predict``).
        label_names: Optional sequence of label names indexed by class id.
            Defaults to the notebook-level ``labels_type``.

    Returns:
        A list with one label name per input vector; empty if
        ``props_matrix`` is empty.

    Raises:
        IndexError: If a vector has more entries than ``label_names`` and its
            argmax falls outside the name list.
    """
    if label_names is None:
        label_names = labels_type
    # np.argmax returns the first index of the maximum, so ties resolve to the
    # lowest class id.
    return [label_names[int(np.argmax(props_vector))] for props_vector in props_matrix]

In [13]:
# Predict class probabilities for the test set and map them to label names.
Y_test_pred = model.predict(X_test)
label_test_pred = props_to_labels(Y_test_pred)

# `labels` is passed by keyword everywhere: newer scikit-learn releases make it
# keyword-only, and the keyword form cannot bind to the wrong parameter.
print("混淆矩阵：")
print(metrics.confusion_matrix(label_test, label_test_pred, labels=labels_type))
# With micro averaging on single-label multiclass data, F1 and recall both
# equal the accuracy, which is why the three scores printed below coincide.
print("f1-score:")
print(metrics.f1_score(label_test, label_test_pred, labels=labels_type, average='micro'))
print("acc-score:")
# accuracy_score has no `labels` parameter; its third positional argument is
# `normalize`, so the label list that used to be passed here was being read as
# a truthy `normalize` flag. It is dropped.
print(metrics.accuracy_score(label_test, label_test_pred))
print("recall-score:")
print(metrics.recall_score(label_test, label_test_pred, labels=labels_type, average='micro'))
print("classification report:")
print(metrics.classification_report(label_test, label_test_pred, labels=labels_type))

混淆矩阵：
[[12904     0     0     0     0     0     0     0     0     0]
 [    0  4901    37     0     0     0     0     0     0     0]
 [    1    13  2495     0     0     0     0     0     0     0]
 [    1     2     0  1435     0     0     0     7     0     0]
 [    0     0     9     1   125     0     0     0     0     0]
 [    0     0     0     0     2   122     0     0     0     0]
 [    0     0     0     0     0     0    98     0     0     0]
 [   29     0     0     2     6     0     0    16     0     0]
 [    0     0     0     0     1     0     0     0    21     0]
 [    0     0     1     0     1     0     0     2     0     8]]
f1-score:
0.9948291366906474
acc-score:
0.9948291366906474
recall-score:
0.9948291366906474
classification report:
                precision    recall  f1-score   support

          SQLi       1.00      1.00      1.00     12904
     anomalous       1.00      0.99      0.99      4938
        normal       0.98      0.99      0.99      2509
           XSS       1.

In [14]:
import os
import pickle

# Create the output directories up front so that saving does not fail on a
# fresh checkout.
os.makedirs('./model', exist_ok=True)
os.makedirs('./tokenizer', exist_ok=True)

# Save the model in three forms: weights only, the complete model, and the
# architecture as JSON.
model.save_weights('./model/cnn_weights_nckh.h5')
model.save('./model/cnn_clf_nckh.h5')
with open('./model/cnn_clf_nckh.json', 'w') as f:
    f.write(model.to_json())

# Save the fitted tokenizer as a pickle so inference can reuse the exact same
# character-to-index mapping.
# NOTE: unpickling executes arbitrary code - only load this file from a
# trusted location.
with open('./tokenizer/tokenizer.pickle_nckh', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)