In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from string import printable

from keras.models import  Model, model_from_json, load_model
from keras import regularizers
from keras.layers.core import Dense, Dropout, Activation
from keras.layers import *
from keras.preprocessing import sequence
from keras.optimizers import Adam
from pathlib import Path
import json

import warnings
# Suppress every Python warning from this point on.
# NOTE(review): this also hides deprecation warnings from the imported libraries;
# consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

# Data Preparation

In [None]:
# Path of the input CSV under the Kaggle read-only input directory.
DATA = '/kaggle/input/new-data-set-33/NEW_MERGE_DataSET_WITH_DROP.csv'
# Load the whole CSV into memory; later cells read its 'url' and 'label' columns.
df = pd.read_csv(DATA)
# Show the first rows as a quick sanity check of the loaded frame.
df.head()

In [None]:
# Keep only the two columns the model needs: the raw URL text and its label.
df1 = pd.DataFrame({'url': df['url'], 'label': df['label']})

In [None]:
max_len=75

# Character -> integer token table: position in string.printable plus one,
# which leaves 0 free for the padding value.
char_to_token = {ch: pos + 1 for pos, ch in enumerate(printable)}

# Encode every URL as a list of tokens, dropping characters outside string.printable.
url_int_tokens = [
    [char_to_token[ch] for ch in url if ch in char_to_token]
    for url in df1.url
]

# Pad/truncate each token list to exactly max_len entries.
X = sequence.pad_sequences(url_int_tokens, maxlen=max_len)
target = np.array(df1.label)
print('Matrix dimensions of X: ', X.shape, 'Vector dimension of target: ', target.shape)

In [None]:
# Fraction of rows whose label equals 1 (class balance check).
n_positive = len(df1.loc[df1['label'] == 1])
n_positive / len(df1['label'])

In [None]:
# `model_selection` was referenced without ever being imported, so this cell
# raised NameError on a fresh kernel. Import it from scikit-learn here.
from sklearn import model_selection

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, target_train, target_test = model_selection.train_test_split(
    X, target, test_size=0.2, random_state=42
)

# Model Preparation

In [None]:
def print_layers_dims(model):
    """Print each layer of ``model`` followed by its input and output shapes.

    Args:
        model: object exposing a ``layers`` sequence whose items have
            ``input_shape`` and ``output_shape`` attributes.
    """
    for layer in model.layers:
        print(layer)
        print('Input Shape: ', layer.input_shape, 'Output Shape: ', layer.output_shape)


def save_model(fileModelJSON, fileWeights, model=None):
    """Write a model's architecture (JSON) and weights to disk.

    Args:
        fileModelJSON: path of the file that receives the architecture. The
            string returned by ``model.to_json()`` is written with ``json.dump``,
            which is the format the companion ``load_model`` reads back.
        fileWeights: path handed to ``model.save_weights``.
        model: model to persist. When omitted, the notebook-level global named
            ``model`` is used, which keeps existing two-argument calls working.

    Raises:
        NameError: if ``model`` is omitted and no global ``model`` exists.
    """
    if model is None:
        # Backward-compatible fallback to the hidden global the original relied on.
        try:
            model = globals()['model']
        except KeyError:
            raise NameError("name 'model' is not defined") from None

    # Serialize first, so a failure here cannot leave the old file already deleted.
    json_string = model.to_json()

    if Path(fileModelJSON).is_file():
        os.remove(fileModelJSON)
    with open(fileModelJSON, 'w') as f:
        json.dump(json_string, f)

    # Remove any stale weights file before the model writes a new one.
    if Path(fileWeights).is_file():
        os.remove(fileWeights)
    model.save_weights(fileWeights)
    
def load_model(fileModelJSON,fileWeights):
    """Rebuild a model from an architecture JSON file and a weights file.

    The JSON file is expected to hold the architecture string written by
    ``save_model`` (read back with ``json.load`` and passed to ``model_from_json``).

    NOTE(review): this definition shadows the ``load_model`` imported from
    ``keras.models`` at the top of the notebook.

    Args:
        fileModelJSON: path of the architecture file.
        fileWeights: path passed to the rebuilt model's ``load_weights``.

    Returns:
        The reconstructed model with weights loaded.
    """
    with open(fileModelJSON, 'r') as f:
        architecture = json.load(f)
    restored = model_from_json(architecture)
    restored.load_weights(fileWeights)
    return restored

In [None]:
def lstm_conv(max_len=75, emb_dim=32, max_vocab_len=100, lstm_output_size=32, W_reg=regularizers.l2(1e-4)):
    """Build and compile the 1D-convolution + LSTM binary classifier.

    Layer order: int32 input -> Embedding -> Dropout(0.25) -> Convolution1D(256
    filters, kernel 5, 'same' padding) -> ReLU -> MaxPooling1D(4) -> Dropout(0.5)
    -> LSTM -> Dropout(0.5) -> Dense(1, sigmoid).

    Args:
        max_len: length of each input token sequence.
        emb_dim: size of the embedding vectors.
        max_vocab_len: ``input_dim`` of the Embedding layer.
        lstm_output_size: number of LSTM units.
        W_reg: regularizer applied to the embedding matrix.

    Returns:
        A Model compiled with Adam, binary cross-entropy loss and the accuracy metric.

    NOTE(review): the notebook encodes characters as ``printable.index(c) + 1``;
    ``string.printable`` has 100 characters, so the largest token is 100, which
    needs ``max_vocab_len`` of at least 101. Confirm before changing the default.
    """
    main_input = Input(shape=(max_len,), dtype='int32', name='main_input')

    # Layers are constructed in the order they are applied.
    stack = [
        Embedding(input_dim=max_vocab_len, output_dim=emb_dim, input_length=max_len,
                  embeddings_regularizer=W_reg),
        Dropout(0.25),
        Convolution1D(kernel_size=5, filters=256, padding='same'),
        ReLU(),
        MaxPooling1D(pool_size=4),
        Dropout(0.5),
        LSTM(lstm_output_size),
        Dropout(0.5),
        Dense(1, activation='sigmoid', name='output'),
    ]

    # Thread the input tensor through the stack, one layer at a time.
    tensor = main_input
    for layer in stack:
        tensor = layer(tensor)

    model = Model(inputs=[main_input], outputs=[tensor])
    model.compile(
        optimizer=Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model


In [None]:
epochs = 25
batch_size = 20

model = lstm_conv()
# Keras' Model.fit takes `batch_size` before `epochs` positionally, so the
# original positional call trained for 20 epochs with a batch size of 25.
# Keyword arguments make the intended 25 epochs / batch size 20 explicit.
# 2% of the training rows are held back for per-epoch validation.
history = model.fit(
    X_train,
    target_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.02,
)


In [None]:
import matplotlib.pyplot as plt

print(history.history.keys())

# One figure per metric. The 'val_*' series come from the validation_split
# passed to fit(), not from the held-out test set, so they are labelled
# 'validation' rather than 'test'.
for metric in ('accuracy', 'loss'):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title('model ' + metric)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

In [None]:
# Score on the held-out split only. Evaluating on the full X (as before) mixes
# in the rows the model was trained on and overstates accuracy, while the
# test split produced by train_test_split was never used.
loss, accuracy = model.evaluate(X_test, target_test, verbose=1)

print('\nFinal Test Accuracy', accuracy, '\n')
print_layers_dims(model)

In [None]:
# Print the layer-by-layer summary of the trained model.
model.summary()

In [None]:
DATA_HOME = 'data'
model_name = "1DConvLSTM"

# Plain string concatenation produced "data1DConvLSTM.json" in the working
# directory; join the parts properly so the files land inside DATA_HOME,
# creating that directory on first use.
os.makedirs(DATA_HOME, exist_ok=True)
model_json_path = os.path.join(DATA_HOME, model_name + ".json")
model_weights_path = os.path.join(DATA_HOME, model_name + ".h5")

# Round-trip the model through disk to confirm it can be restored.
save_model(model_json_path, model_weights_path)
model = load_model(model_json_path, model_weights_path)

In [None]:
# Inspect the parameters of the layer at index 1 (the layer applied right after
# the input in lstm_conv) and display the shape of its first weight array.
l_layers = model.layers
second_layer = l_layers[1]
weights = second_layer.get_weights()
weights[0].shape

In [None]:
# Two hand-picked URLs for a quick prediction demo; the variable names mark the
# first as the malicious example and the second as the benign one.
test_url_mal = "mizhibuluo.com/kpgv0nhtm"
test_url_benign = "ubuntulinux.org/server/hyperscale"

In [None]:
max_len=75

# Same encoding as used for the dataset: position in string.printable plus one,
# skipping characters that are not in string.printable.
char_to_token = {ch: pos + 1 for pos, ch in enumerate(printable)}
url_int_tokens = [
    [char_to_token[ch] for ch in url if ch in char_to_token]
    for url in (test_url_mal, test_url_benign)
]

# NOTE(review): this rebinds X, replacing the dataset-wide matrix built earlier;
# re-running earlier cells that read X after this one will see only these two rows.
X = sequence.pad_sequences(url_int_tokens, maxlen=max_len)

In [None]:
# Run the model on the encoded sample URLs, one row per batch; the result holds
# the sigmoid output for each row.
target_proba = model.predict(X, batch_size=1)


In [None]:
def print_result(proba):
    """Threshold a probability at 0.5 and return the class label.

    Despite its name, nothing is printed.

    Args:
        proba: a score to compare against 0.5.

    Returns:
        1 when ``proba`` is strictly greater than 0.5, otherwise 0.
    """
    return 1 if proba > 0.5 else 0

In [None]:
# Convert every predicted probability into a 0/1 label.
pred = [print_result(proba) for proba in target_proba]

In [None]:
# Show the 0/1 labels, in the same order as the sample URLs were encoded.
print(pred)