In [None]:
import pandas as pd
import os
import numpy as np
from keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Dense, Embedding, Dropout, GlobalAveragePooling1D, Flatten, SpatialDropout1D, Bidirectional
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Colab-only: mount Google Drive at /content/drive so files stored under
# MyDrive can be read through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the balanced domain-name dataset (columns: domain, label) from the
# mounted Drive and show its truncated text representation.
data = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/dga_dataset/onlydomainnames_balanceddataset.csv'
)
print(data)

                        domain  label
0        seraphinaaraminta.net      1
1            garrismasonry.com      0
2            tomatohome.com.tw      0
3                salespype.com      0
4              irmetalwork.com      0
...                        ...    ...
1999995       modirepishro.com      0
1999996    mfqbnlssnxnmiafy.eu      1
1999997       prettylisten.net      1
1999998       n0x2rc12zwdm.top      1
1999999           follo3me.com      0

[2000000 rows x 2 columns]


In [None]:
# Detach the target column from the frame in one step: `pop` removes 'label'
# from `data` in place and returns it; `to_frame().values` keeps the target as
# a 2-D (n_samples, 1) array.
y = data.pop('label').to_frame().values

In [None]:
# Hold out 20% of the domains for testing. The split is stratified on the
# label so both partitions keep the same class ratio, and seeded so it is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data["domain"],
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Pre-processing parameters shared by the tokenizer, the padding step and the
# embedding layers.
vocab_size = 500                    # upper bound on token ids kept by the tokenizer
oov_tok = '<OOV>'                   # placeholder for out-of-vocabulary tokens
max_len = 50                        # every sequence is padded / truncated to this length
trunc_type = padding_type = 'post'  # pad and truncate at the end of the sequence

In [None]:
# Tokenize at CHARACTER level.
# With word-level tokenization each domain label becomes a single "word": the
# recorded vocabulary was 1,475,298 entries for 1.6M training domains, so with
# num_words=500 nearly all distinct tokens fall outside the kept vocabulary and
# are mapped to the OOV id, leaving the models almost nothing to learn from.
# Character-level tokens yield a small vocabulary (well below vocab_size) and a
# sequence that actually encodes the spelling of each domain.
tokenizer = Tokenizer(num_words=vocab_size, char_level=True, oov_token=oov_tok)
# Build the character index from the training split only, so nothing from the
# test split influences the vocabulary.
tokenizer.fit_on_texts(X_train)

In [None]:
# Token -> integer-id mapping learned by fit_on_texts. Its length is the number
# of distinct tokens seen in the training data; it is not capped by num_words
# (that limit is applied only when texts are converted to sequences).
word_index = tokenizer.word_index
total_words = len(word_index)
total_words

1475298

In [None]:
# Encode the training domains as integer id sequences, then force every
# sequence to exactly max_len entries (padding / truncating at the end).
training_sequences = tokenizer.texts_to_sequences(X_train)
training_padded = pad_sequences(
    training_sequences,
    maxlen=max_len,
    padding=padding_type,
    truncating=trunc_type,
)

In [None]:
# Encode the test domains with the tokenizer fitted on the training split and
# apply the same fixed-length padding / truncation.
testing_sequences = tokenizer.texts_to_sequences(X_test)
testing_padded = pad_sequences(
    testing_sequences,
    maxlen=max_len,
    padding=padding_type,
    truncating=trunc_type,
)

In [None]:
# Sanity check: both tensors should be (n_samples, max_len).
for caption, tensor in (
    ('Shape of training tensor: ', training_padded),
    ('Shape of testing tensor: ', testing_padded),
):
    print(caption, tensor.shape)

Shape of training tensor:  (1600000, 50)
Shape of testing tensor:  (400000, 50)


## Model (Dense Model)

In [None]:
# Hyperparameters for the dense baseline.
# vocab_size and max_len are taken from the pre-processing parameters cell
# rather than re-declared here, so the embedding table cannot drift out of
# sync with the tokenizer's num_words.
embedding_dim = 16
drop_value = 0.2
n_dense = 24
# Define Dense Model Architecture:
# embedding -> average over the sequence axis -> hidden ReLU layer -> dropout
# -> single sigmoid unit for the binary label.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
model.add(GlobalAveragePooling1D())
# Use the declared n_dense instead of a second hard-coded 24.
model.add(Dense(n_dense, activation='relu'))
model.add(Dropout(drop_value))
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Compile the dense model defined in this section. (The cell previously
# referenced `model1`, which is only created later in the LSTM section and
# therefore raises NameError on a fresh kernel.)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the dense model (`model`, not the LSTM `model1`) for at most
# num_epochs epochs; EarlyStopping watches val_loss with patience=3.
# NOTE(review): the test split is used as validation data here and is also the
# split passed to evaluate() afterwards, so the reported test metrics are
# optimistically biased. Consider carving a validation set out of the
# training data instead.
num_epochs = 30
early_stop = EarlyStopping(monitor='val_loss', patience=3)
history = model.fit(
    training_padded,
    y_train,
    epochs=num_epochs,
    validation_data=(testing_padded, y_test),
    callbacks=[early_stop],
    verbose=2,
)

In [None]:
# Loss and accuracy of the dense model on the held-out test tensors.
model.evaluate(x=testing_padded, y=y_test)

## Model1 (Long Short Term Memory (LSTM))

In [None]:
# LSTM hyperparameters: number of recurrent units and the dropout rate used
# both on the embedded sequence and after the recurrent layer.
n_lstm = 128
drop_lstm = 0.2
# LSTM classifier: embedding -> spatial dropout -> LSTM returning only its
# final output -> dropout -> single sigmoid unit.
model1 = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    SpatialDropout1D(drop_lstm),
    LSTM(n_lstm, return_sequences=False),
    Dropout(drop_lstm),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Binary classification setup: cross-entropy loss, Adam optimizer, accuracy
# tracked as the metric.
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train the LSTM for up to num_epochs epochs; EarlyStopping watches val_loss
# with patience=2.
# NOTE(review): validation uses the test split, which is also the split
# evaluated afterwards. Confirm this is intended.
num_epochs = 30
early_stop = EarlyStopping(monitor='val_loss', patience=2)
history = model1.fit(
    training_padded,
    y_train,
    epochs=num_epochs,
    validation_data=(testing_padded, y_test),
    callbacks=[early_stop],
    verbose=2,
)

Epoch 1/30
50000/50000 - 3688s - loss: 0.6180 - accuracy: 0.6480 - val_loss: 0.4984 - val_accuracy: 0.7264 - 3688s/epoch - 74ms/step
Epoch 2/30


In [None]:
# Loss and accuracy of the LSTM model on the held-out test tensors.
model1.evaluate(x=testing_padded, y=y_test)