In [1]:
import sys
import os
import json
import pandas as pd
import numpy
import optparse
from keras.callbacks import TensorBoard
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from collections import OrderedDict

Using TensorFlow backend.


In [2]:
# Read good_url.csv into a single 'url' column and tag every row with label 0.
dataframe = pd.read_csv('good_url.csv', names=['url']).assign(label=0)
dataframe.head()

Unnamed: 0,url,label
0,/103886/,0
1,/rcanimal/,0
2,/458010b88d9ce/,0
3,/cclogovs/,0
4,/using-localization/,0


In [3]:
# Read bad_url.csv into a single 'url' column and tag every row with label 1.
dataframe1 = pd.read_csv('bad_url.csv', names=['url']).assign(label=1)
dataframe1.head()

Unnamed: 0,url,label
0,/top.php?stuff='uname >q36497765 #,1
1,/h21y8w52.nsf?<script>cross_site_scripting.nas...,1
2,"/ca000001.pl?action=showcart&hop=\""><script>al...",1
3,/scripts/edit_image.php?dn=1&userfile=/etc/pas...,1
4,/javascript/mta.exe,1


In [4]:
# Stack the label-0 and label-1 frames into one table.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# frame's own row labels are repeated, leaving duplicate index values.
dataset = pd.concat([dataframe, dataframe1], ignore_index=True)

In [5]:
# Shuffle all rows (frac=1 keeps every row) and drop down to a raw array of
# [url, label] rows. The fixed random_state makes the shuffle -- and therefore
# the positional train/test split done later -- reproducible across runs.
# NOTE: this rebinds `dataset` from a DataFrame to an ndarray, so the cell
# cannot be re-run on its own without first re-running the concat cell.
dataset = dataset.sample(frac=1, random_state=42).values

In [6]:
# Column 0 holds the URL text, column 1 the 0/1 label.
X = dataset[:, 0]
# `dataset` mixes text and numbers in one array, so a plain slice of the label
# column is not a numeric array. Cast explicitly so the targets handed to the
# model are float32 rather than generic Python objects.
Y = dataset[:, 1].astype('float32')

In [7]:
# Replace every entry that is a plain Python float with its string form, so
# that X contains only text (presumably these are NaN values from empty rows
# -- TODO confirm against the CSVs).
for idx, value in enumerate(X):
    if type(value) is float:
        X[idx] = str(value)

In [8]:
# Character-level tokenizer: each character is a token; only tab and newline
# are filtered out.
tokenizer = Tokenizer(char_level=True, filters='\t\n')
# Build the character index from all URLs in X.
tokenizer.fit_on_texts(X)

In [9]:
# Persist the tokenizer's token -> index mapping as JSON.
word_dict_file = 'build/word-dictionary.json'

# exist_ok=True creates the output directory if needed and is a no-op when it
# already exists, avoiding a separate check-then-create step.
os.makedirs(os.path.dirname(word_dict_file), exist_ok=True)

# ensure_ascii=False writes non-ASCII characters verbatim, so the file is
# opened as UTF-8 explicitly instead of relying on the platform default
# encoding (which may be unable to encode them).
with open(word_dict_file, 'w', encoding='utf-8') as outfile:
    json.dump(tokenizer.word_index, outfile, ensure_ascii=False)

In [10]:
# Vocabulary size used for the embedding layer: one more than the number of
# indexed tokens.
num_words = 1 + len(tokenizer.word_index)
# Encode every URL as a list of integer token indices (rebinds X).
X = tokenizer.texts_to_sequences(X)
# Show the first encoded URL.
X[0]

[1, 32, 2, 20, 2, 4, 8, 7, 6, 10, 5, 1, 14, 18, 6, 12, 6]

In [11]:
# Fixed sequence length used for padding and as the model's input length.
max_log_length = 1024
# The first 75% of the rows are used for training.
train_size = int(0.75 * len(dataset))

In [12]:
# Bring every encoded URL to a common length of max_log_length.
X_processed = sequence.pad_sequences(X, maxlen=max_log_length)
# Split the sample set positionally: the first train_size rows are used for
# training, the remainder for testing.
X_train = X_processed[:train_size]
X_test = X_processed[train_size:]
Y_train = Y[:train_size]
Y_test = Y[train_size:]

In [13]:
# Binary classifier over character sequences:
# embedding -> dropout -> LSTM -> dropout -> single sigmoid unit.
model = Sequential([
    Embedding(num_words, 32, input_length=max_log_length),
    Dropout(0.5),
    LSTM(64, recurrent_dropout=0.5),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [14]:
# Print the layer-by-layer structure and parameter counts of the model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1024, 32)          4576      
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024, 32)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 29,473
Trainable params: 29,473
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for 3 epochs in batches of 128, holding out 25% of the training rows
# for validation.
model.fit(
    X_train,
    Y_train,
    batch_size=128,
    epochs=3,
    validation_split=0.25,
)

Instructions for updating:
Use tf.cast instead.
Train on 754593 samples, validate on 251532 samples
Epoch 1/3
 80000/754593 [==>...........................] - ETA: 1:26:05 - loss: 0.0957 - acc: 0.9769

In [None]:
# Evaluate model
score, acc = model.evaluate(X_test, Y_test, verbose=1, batch_size=128)
print("Model Accuracy: {:0.2f}%".format(acc * 100))

In [None]:
# Persist the model three ways: weights only, the full model, and the
# architecture as JSON.
model.save_weights('securitai-lstm-weights.h5')
model.save('securitai-lstm-model.h5')
with open('securitai-lstm-model.json', 'w') as json_file:
    json_file.write(model.to_json())

In [None]:
# Testing: take the first 20000 rows of bad_url.csv as a single 'url' column
# and tag them all with label 1.
df_black = pd.read_csv('bad_url.csv', nrows=20000, names=['url']).assign(label=1)

In [None]:
# URLs as text for the tokenizer.
X_waf = df_black['url'].values.astype('str')
# Labels are targets for a sigmoid / binary_crossentropy model, so they must
# be numeric. Casting them to 'str' (as this cell previously did) hands the
# model string targets.
Y_waf = df_black['label'].values.astype('float32')

In [None]:
# Encode the test URLs with the already-fitted tokenizer.
X_sequences = tokenizer.texts_to_sequences(X_waf)
# Bring them to the same fixed length the model was built with.
# Note: this rebinds X_processed, replacing the padded training data.
X_processed = sequence.pad_sequences(
    sequences=X_sequences,
    maxlen=max_log_length,
)

In [None]:
# Score the model on the 20000-row label-1 sample and report accuracy as a
# percentage.
score, acc = model.evaluate(
    X_processed,
    Y_waf,
    batch_size=128,
    verbose=1,
)
print("Model Accuracy: {:0.2f}%".format(100 * acc))