# DGA domain classifier (Keras-ONNX)


This tutorial is related to the DGA domain classifier tutorial that uses the H2O engine, but in this case Keras is used as the machine learning engine.

In Devo, it is necessary to convert the Keras model to ONNX format in order to put your model into production.

## Requirements

Table ``demo.ecommerce.data`` in Devo.

## Install

In [None]:
!pip install devo-sdk
!pip install devo-mlmodelmanager
!pip install tensorflow
!pip install tf2onnx
!pip install scikit-learn
!pip install numpy
!pip install pandas

## Imports

In [None]:
import os
import math
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import tf2onnx

from collections import Counter
from sklearn.preprocessing import LabelEncoder
from devo.api import Client, ClientConfig, SIMPLECOMPACT_TO_OBJ
from devo_ml.modelmanager import create_client_from_token, engines

## Setup

In [None]:
# Devo access token; fill in a valid one before running the notebook
DEVO_TOKEN = ""

# Devo query API endpoint, e.g. https://apiv2-us.devo.com/search/query/
DEVO_API_URL = ""

# Devo ML Model Manager endpoint, e.g. https://api-us.devo.com/mlmodelmanager/
DEVO_MLMM_URL = ""

# Devo domain to connect to, e.g. self
DOMAIN = ""

# Name under which the model is registered
MODEL_NAME = "dga_classifier_onnx"

# Human-readable description of the model
MODEL_DESCRIPTION = "DGA domain classifier (Keras-ONNX)"

# Local file the ONNX model is written to (derived from the model name)
MODEL_FILE = MODEL_NAME + ".onnx"

# Location of the CSV dataset used to build the model
DATASET_URL = (
    "https://devo-ml-models-public-demos.s3.eu-west-3.amazonaws.com"
    "/legit_dga/dataset.csv"
)

# Characters counted as vowels, in both lower and upper case
VOWELS = "aeiou" + "AEIOU"

In [None]:
# Fix the random seeds for reproducibility.
seed = 42

# Seed NumPy's global generator.
np.random.seed(seed)

# Seeding NumPy alone does not make the Keras training run repeatable:
# set_random_seed seeds Python's `random`, NumPy and the TensorFlow backend
# in one call (available from TensorFlow 2.7).
tf.keras.utils.set_random_seed(seed)

### Encoding the Output/Response Variable

In [None]:
# Load the semicolon-separated dataset. The separator must be passed by
# keyword: positional arguments after the path are deprecated in pandas 1.x
# and rejected with a TypeError from pandas 2.0 onwards.
domains = pd.read_csv(DATASET_URL, sep=';')

In [None]:
# Prepare data set
#     1. Domain length
#     2. Shannon entropy
#     3. Vowel proportion
#     4. Malicious flag

def entropy(s):
    """Return the Shannon entropy (base 2) of the symbols in ``s``.

    Each distinct symbol contributes ``-p * log2(p)``, where ``p`` is its
    relative frequency in ``s``. An empty input has no symbols to sum over
    and yields 0.
    """
    total = len(s)
    frequencies = (count / total for count in Counter(s).values())
    return -sum(p * math.log2(p) for p in frequencies)


# Keep only rows that have a subclass. The explicit copy makes the result an
# independent frame, so the column assignments below do not write into a
# slice of the original (which triggers pandas' SettingWithCopyWarning).
domains = domains[domains['subclass'].notna()].copy()

# Feature 1: number of characters in the domain.
domains['length'] = domains['domain'].str.len()

# Feature 2: Shannon entropy of the domain's characters.
domains['entropy'] = domains['domain'].apply(entropy)

# Feature 3: share of characters that are vowels. A single character-class
# regex counts all vowels in one pass instead of one pass per vowel.
domains['vowel_proportion'] = (
    domains['domain'].str.count(f'[{VOWELS}]') / domains['length']
)

# Label: True for every class other than 'legit'.
domains['malicious'] = domains['class'] != 'legit'

In [None]:
# Show the first rows of the prepared frame to check the new columns.
domains.head()

In [None]:
# Columns that are identifiers or labels rather than model inputs.
non_feature_columns = ['host', 'domain', 'class', 'subclass', 'malicious']

# Feature matrix: everything that is left once those columns are removed.
X = domains.drop(columns=non_feature_columns)

# Target vector: the boolean malicious flag.
Y = domains['malicious']

print("Shape of Input  features: {}".format(X.shape))
print("Shape of Output features: {}".format(Y.shape))

In [None]:
# Map the target labels to integer class ids.
lbl_clf = LabelEncoder()
lbl_clf.fit(Y)
Y_encoded = lbl_clf.transform(Y)

# The network ends in a softmax trained with categorical cross-entropy, so
# the integer ids are expanded into one-hot rows.
Y_final = tf.keras.utils.to_categorical(Y_encoded)

print("Therefore, our final shape of output feature will be {}".format(Y_final.shape))

## ML model

In [None]:
# Small fully-connected classifier: three regularised ReLU layers
# (10 -> 7 -> 5 units) followed by a 2-unit softmax output. Batch
# normalisation and dropout follow each of the first two hidden layers.
model = tf.keras.models.Sequential([
    # Input layer: takes 3 input values per sample.
    tf.keras.layers.Dense(
        10,
        input_dim=3,
        activation=tf.nn.relu,
        kernel_initializer='he_normal',
        kernel_regularizer=tf.keras.regularizers.l2(0.01),
    ),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(
        7,
        activation=tf.nn.relu,
        kernel_initializer='he_normal',
        kernel_regularizer=tf.keras.regularizers.l1_l2(l1=0.001, l2=0.001),
    ),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(
        5,
        activation=tf.nn.relu,
        kernel_initializer='he_normal',
        kernel_regularizer=tf.keras.regularizers.l1_l2(l1=0.001, l2=0.001),
    ),
    # Output layer: one softmax probability per class.
    tf.keras.layers.Dense(2, activation=tf.nn.softmax),
])



In [None]:
# Configure training: Adam optimiser, categorical cross-entropy loss
# (the targets are one-hot encoded) and accuracy as the reported metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 10 passes over the data in mini-batches of 7 samples.
model.fit(X, Y_final, batch_size=7, epochs=10)

## Transform to ONNX

In [None]:
# Convert the trained Keras model to ONNX (opset 13) and write it to
# MODEL_FILE. from_keras returns a (model_proto, external_tensor_storage)
# tuple, so unpack it to bind onnx_model to the ONNX model itself rather
# than to the tuple.
onnx_model, _ = tf2onnx.convert.from_keras(model, opset=13, output_path=MODEL_FILE)

## Register the model in Devo

In [None]:
# Create the ML Model Manager client from the configured endpoint and token.
mlmm = create_client_from_token(DEVO_MLMM_URL, DEVO_TOKEN)

In [None]:
# Register the ONNX file in Devo under MODEL_NAME.
# NOTE(review): force=True presumably replaces an existing model with the
# same name — confirm against the devo-mlmodelmanager documentation.
mlmm.add_model(
    MODEL_NAME,
    engines.ONNX,
    MODEL_FILE,
    description=MODEL_DESCRIPTION,
    force=True
)

## Classify DGA domains

In [None]:
# Use the mlevalmodel operator in the query to evaluate the registered model.
#
# The features sent to the model must match the ones it was trained on:
#   * three inputs, in training order: length, entropy, vowel_proportion
#   * vowel_proportion is the vowel count divided by the domain length,
#     not the raw vowel count
# Element 0 of the model output is the probability of the first class
# (not malicious), hence res > 0.5 maps to isMalicious = "false".

query = f'''from demo.ecommerce.data
  select split(referralUri, "/",2) as domain,
  float(length(domain)) as length,
  shannonentropy(domain) as entropy,
  float(countbyfilter(domain, "{VOWELS}")) / length as vowel_proportion,
  at(mlevalmodel("{DOMAIN}", "{MODEL_NAME}", [float4(length), float4(entropy), float4(vowel_proportion)]),0) as res,
  ifthenelse(res>0.5, "false", "true") as isMalicious
'''

In [None]:
# Response handling: compact JSON, streamed, with each row converted to an
# object by the SIMPLECOMPACT_TO_OBJ processor.
client_config = ClientConfig(
    response="json/simple/compact",
    stream=True,
    processor=SIMPLECOMPACT_TO_OBJ,
)

# Query API client authenticated with the Devo token.
api = Client(
    auth={"token": DEVO_TOKEN},
    address=DEVO_API_URL,
    config=client_config,
)

In [None]:
# Run the query over the last hour and print the verdict for each row.
last_hour = {'from': "now()-1*hour()"}
response = api.query(query=query, dates=last_hour)
for row in response:
    domain, verdict = row['domain'], row['isMalicious']
    print("domain: ", domain, "isMalicious:", verdict)
