# DGA domain scoring (GBM-ONNX)

This tutorial shows how to perform real-time `DGA` domain classification using
a machine learning model that outputs the classification probability
(`score`).

We will use the Gradient Boosting algorithm from the `scikit-learn` library to
create a model capable of detecting whether a domain is malicious. Then we will
convert the model to `ONNX` format so that the classification score can be
exposed as a model output. Finally, we will register the model in **ML Model
Manager** to make it available in the Devo Platform and use it through the Devo
query engine.

# Requirements

Access to the `demo.ecommerce.data` table in Devo.

## Install

In [None]:
!pip install \
    devo-sdk \
    devo-mlmodelmanager \
    numpy \
    onnx \
    onnxruntime \
    pandas \
    scikit-learn \
    skl2onnx

## Imports

In [None]:
import os
import math
import time
import numpy as np
import pandas as pd

from onnx import TensorProto
from onnx.defs import ONNX_ML_DOMAIN
from onnx.helper import make_node, make_tensor_value_info
from onnxruntime import InferenceSession
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from skl2onnx import convert_sklearn, to_onnx
from skl2onnx.common.data_types import FloatTensorType
from devo.api import Client, ClientConfig, SIMPLECOMPACT_TO_OBJ
from devo_ml.modelmanager import create_client_from_token, engines

## Setup

In [None]:
# A valid Devo access token (used for both the query API and ML Model Manager)
DEVO_TOKEN = '<your_token_here>'

# URL of the Devo query API, e.g. https://apiv2-us.devo.com/search/query/
DEVO_API_URL = '<devo_api_url_here>'

# URL of Devo ML Model Manager, e.g. https://api-us.devo.com/mlmodelmanager/
DEVO_MLMM_URL = '<devo_mlmm_url_here>'

# The Devo domain to connect to, e.g. self; it is passed to `mlevalmodel`
# in the scoring query
DOMAIN = '<your_domain_here>'

# The name under which the model is registered and referenced in the query
MODEL_NAME = 'dga_scoring'

# The description of the model
MODEL_DESCRIPTION = 'DGA domain label scoring'

# Local file where the ONNX model is stored (derived from MODEL_NAME)
MODEL_FILE = f'{MODEL_NAME}.onnx'

# The URL of the dataset used to build the model
DATASET_URL = "https://devo-ml-models-public-demos.s3.eu-west-3.amazonaws.com/legit_dga/dataset.csv"

# Random seed shared by the train/test split and the classifier so that
# results are reproducible
RANDOM_SEED = 42

## Build model

In [None]:
# Load the dataset; the CSV file uses ';' as the field separator
df = pd.read_csv(DATASET_URL, sep=';')
# Preview the first rows
df.head()

In [None]:
from collections import Counter


def entropy(text):
    """Return the Shannon entropy of ``text``, in bits per character.

    The entropy is ``-sum(p * log2(p))`` over the relative frequency ``p`` of
    every distinct character in ``text``.

    Args:
        text: The string to measure.

    Returns:
        The entropy as a non-negative float; ``0.0`` for an empty string or
        for a string made of a single repeated character.
    """
    total = len(text)
    if not total:
        # No characters means no uncertainty; also avoids dividing by zero.
        return 0.0

    # A single counting pass instead of one ``text.count`` scan per distinct
    # character.
    frequencies = (count / total for count in Counter(text).values())

    # Negating each term (rather than the final sum) keeps the result at
    # +0.0 instead of -0.0 when the text holds one distinct character.
    return sum(-p * math.log2(p) for p in frequencies)

In [None]:
# Prepare dataset
def _vowel_proportion(domain):
    """Return the fraction of characters in ``domain`` that are vowels.

    Matching is case-insensitive; the denominator is the length of the
    original (non-lowercased) string.
    """
    lowered = domain.lower()  # lowercase once instead of once per vowel
    return sum(lowered.count(vowel) for vowel in 'aeiou') / len(domain)


# Keep only the rows that have a subclass. The explicit copy matters: the
# column assignments below would otherwise write into a slice of the original
# frame, which makes pandas emit SettingWithCopyWarning.
df = df[df['subclass'].notna()].copy()

# Features derived from the domain string
df['length'] = df['domain'].apply(len)
df['vowel_proportion'] = df['domain'].apply(_vowel_proportion)
df['entropy'] = df['domain'].apply(entropy)

# Binary target: 0 for the 'legit' class, 1 for any other class
df['malicious'] = df['class'].apply(lambda label: int(label != 'legit'))
df.head()

In [None]:
# Feature matrix and label vector as NumPy arrays. The column order here is
# the order in which features must be fed to the model.
feature_columns = ['length', 'vowel_proportion', 'entropy']
X_data = df[feature_columns].to_numpy()
y_data = df['malicious'].to_numpy()

In [None]:
# Hold out part of the data for evaluation; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, random_state=RANDOM_SEED)

In [None]:
# Fit a gradient boosting classifier on the training split (seeded so that
# training is reproducible)
model = GradientBoostingClassifier(random_state=RANDOM_SEED).fit(X_train, y_train)

In [None]:
# Check how good the model is: predict the labels of the held-out test split
# and report the F1 score against the true labels
pred_test = model.predict(X_test)
score = f1_score(y_test, pred_test)
print(f'F1-Score: {score:.4f}')

## Transform into ONNX

In [None]:
# Transform to ONNX format; the training data (cast to float32) is only used
# to infer the input type and shape of the graph
onnx_model = to_onnx(
    model,
    X_train.astype(np.float32),
    target_opset=13,
)

# Remove all defined outputs, we will define them in the next steps
onnx_model.graph.ClearField('output')

# Remove the ZipMap node since it won't be necessary. The node is matched on
# its operator type rather than on its name: node names are generated by the
# converter, whereas the operator type is fixed.
for position, graph_node in enumerate(onnx_model.graph.node):
    if graph_node.op_type == 'ZipMap':
        # Safe to delete while iterating because we stop right away
        del onnx_model.graph.node[position]
        break

In [None]:
# Constant node that emits the integer 0 under the name 'output_pos'; it is
# the index consumed by the extractor node below. It must be appended before
# the node that reads it.
node = make_node(
    'Constant',
    inputs=[],
    outputs=['output_pos'],
    value_int=0,
)
onnx_model.graph.node.append(node)

# Extract the element at index 'output_pos' from the 'probabilities' tensor
# (presumably the per-class probabilities produced by the converted
# classifier -- confirm against the converted graph).
# NOTE(review): index 0 selects the first class; with the 0/1 'malicious'
# label encoding used for training that would be the probability of the
# non-malicious class -- confirm this is the intended score.
node = make_node(
    'ArrayFeatureExtractor',
    inputs=['probabilities', 'output_pos'],
    outputs=['output_probability_at'],
    domain=ONNX_ML_DOMAIN,
)
onnx_model.graph.node.append(node)

# Declare the extracted value as the single output of the graph: a float
# tensor with one column per input row
onnx_model.graph.output.append(
    make_tensor_value_info(
        name='output_probability_at',
        elem_type=TensorProto.FLOAT,
        shape=[-1, 1],
    )
)

In [None]:
# For debug purposes, check that the model works correctly

# Predict with ONNX model (inputs are cast to float32, the type the graph
# was converted with)
session = InferenceSession(onnx_model.SerializeToString())
input_name = session.get_inputs()[0].name
result = session.run(None, {input_name: X_test.astype(np.float32)})
onnx_scores = result[0].reshape(-1)

# Predict with model; column 0 mirrors the index 0 extracted in the ONNX graph
scores = model.predict_proba(X_test)[:, 0]

# Compare predictions: every score must agree within the tolerance
threshold = 1e-3
abs_diff = np.abs(scores - onnx_scores)
prediction_validation = (abs_diff < threshold).all()

# Surface the outcome; without this the check is computed but never seen
print(
    f'ONNX scores match scikit-learn scores (tolerance {threshold}): '
    f'{prediction_validation} (max abs diff: {abs_diff.max():.6f})'
)

In [None]:
# For debug purposes, display the scores produced by the ONNX model
# (flattened to one value per test row)
onnx_scores

In [None]:
# Persist the serialized ONNX model to MODEL_FILE (binary mode)
with open(MODEL_FILE, mode='wb') as model_fp:
    serialized_model = onnx_model.SerializeToString()
    model_fp.write(serialized_model)

## Register the model

In [None]:
# Create the Devo MLMM client (authenticated with the access token) and
# register the ONNX file saved above under MODEL_NAME.
# NOTE(review): force=True presumably replaces an already registered model
# with the same name -- confirm against the ML Model Manager client docs.
mlmm_client = create_client_from_token(DEVO_MLMM_URL, DEVO_TOKEN)
mlmm_client.add_model(
    MODEL_NAME,
    engines.ONNX,
    MODEL_FILE,
    description=MODEL_DESCRIPTION,
    force=True,
)

## Scoring domains

In [None]:
# Scoring query: take the domain part of `referralUri`, group by domain, and
# compute the three model features inside the query. They are passed to
# `mlevalmodel` in the same order as the training columns (length,
# vowel_proportion, entropy); `at(..., 0)` keeps the first element of the
# model output as `score`.
query = f'''from demo.ecommerce.data
select
    eventdate,
    split(referralUri, "/", 2) as domain
group by domain every -
select
    float4(length(domain)) as length,
    float4(shannonentropy(domain)) as entropy,
    float4(countbyfilter(domain, "aeiouAEIOU") / length) as vowel_proportion,
    at(mlevalmodel(
        "{DOMAIN}",
        "{MODEL_NAME}",
        [length, vowel_proportion, entropy]
    ), 0) as score
'''

In [None]:
# For debug purposes, display the query with DOMAIN and MODEL_NAME filled in
print(query)

In [None]:
# Build the Devo API client: token authentication, streamed responses in
# 'json/simple/compact' format, each row converted by SIMPLECOMPACT_TO_OBJ
client_config = ClientConfig(
    response='json/simple/compact',
    stream=True,
    processor=SIMPLECOMPACT_TO_OBJ,
)
api = Client(
    auth={'token': DEVO_TOKEN},
    address=DEVO_API_URL,
    config=client_config,
)

# Run the scoring query over the last hour
time_range = {'from': 'now() - 1 * hour()', 'to': 'now()'}
response = api.query(query=query, dates=time_range)

# Print each scored row as it arrives
for row in response:
    print(row)
