# Ecommerce Clustering (kmeans)

This tutorial shows how to perform a classification task based on unsupervised training using the kmeans algorithm.

The main goal of this tutorial is to show how to export your model to the ONNX format, which is supported by the Devo platform.

ONNX is an open format for representing machine learning models. Models from many frameworks, such as PyTorch, LIBSVM, Keras, MXNet and TensorFlow, can be exported to ONNX.

In this example we show how to create a model using KMeans from the scikit-learn library and export it to ONNX. Finally, the model is evaluated using the Devo query engine in order to classify the entity behaviour in the table demo.ecommerce.data.

# Requirements

Table *demo.ecommerce.data* in Devo.

# Install

In [None]:
!pip install devo-sdk
!pip install devo-mlmodelmanager
!pip install scikit-learn
!pip install onnx
!pip install numpy
!pip install pandas
!pip install skl2onnx

# Imports

In [None]:
import numpy as np
import pandas as pd
import onnx

from onnx import helper, TensorProto
from onnx.tools import update_model_dims
from sklearn.cluster import KMeans
from skl2onnx import convert_sklearn, to_onnx
from devo.api import Client, ClientConfig, JSON,  SIMPLECOMPACT_TO_OBJ
from devo_ml.modelmanager import create_client_from_token, engines

# Setup

In [None]:
# --- Connection settings: fill these in before running the notebook ---

# Devo access token; used for both the query API and the ML Model Manager
TOKEN = ''

# Devo query API endpoint, e.g. https://apiv2-us.devo.com/search/query/
DEVO_API_URL = ''

# Devo ML Model Manager endpoint, e.g. https://api-us.devo.com/mlmodelmanager/
DEVO_MLMM_URL = ''

# Domain to connect to, e.g. self (passed to mlevalmodel in the final query)
DOMAIN = ''

# --- Model metadata ---

# Name under which the model is registered and later evaluated
NAME = 'ecommerce_cluster'

# Description of the model
DESCRIPTION = 'Demo of ecommerce clustering'

# Local file where the exported ONNX model is stored
MODEL_FILE = NAME + '.onnx'

# ML model

### Query data from Devo

In [None]:
# Query client for the Devo API: token authentication, streamed responses in
# the 'json/simple/compact' format, handled by the SIMPLECOMPACT_TO_OBJ
# processor (rows are later read by column name).
client_config = ClientConfig(
    response='json/simple/compact',
    stream=True,
    processor=SIMPLECOMPACT_TO_OBJ,
)
api = Client(auth={'token': TOKEN}, address=DEVO_API_URL, config=client_config)

In [None]:
# Training query over demo.ecommerce.data.
# Rows with a non-null clientIpAddress are grouped every 8h by clientIpAddress;
# for each group the query returns the IP as a string plus five float4 features:
# the sizes of the collected hour / minute / second / userAgent values and the
# average of bytesTransferred. The feature order here is the order later passed
# to the model.
query = '''from demo.ecommerce.data where isnotnull(clientIpAddress)
select
    hour(eventdate) as hour,
    minute(eventdate) as minute,
    second(eventdate) as second,
    clientIpAddress,
    userAgent
group every 8h by clientIpAddress
select
    str(clientIpAddress) as sourceIp,
    float4(size(collectcompact(hour))) as unique_hours,
    float4(size(collectcompact(minute))) as unique_mins,
    float4(size(collectcompact(second))) as unique_seconds,
    float4(size(collectcompact(userAgent))) as unique_user_agents,
    float4(avg(bytesTransferred)) as bytestransferred
'''

In [None]:
# Run the training query over a bounded one-day window that ends one day
# before today()
training_window = {
    'from': 'today() - 2 * day()',
    'to': 'today() - 1 * day()',
}
response = api.query(query=query, dates=training_window)

In [None]:
# Load the query rows into a DataFrame; the column order follows the final
# select of the training query (IP first, then the five features)
column_names = [
    'sourceIp',
    'unique_hours',
    'unique_mins',
    'unique_seconds',
    'unique_user_agents',
    'bytestransferred',
]
raw_data = pd.DataFrame(response, columns=column_names)

In [None]:
# Preview the first rows of the downloaded data
raw_data.head()

In [None]:
# Summary statistics of the downloaded data
raw_data.describe()

### Create and train model

In [None]:
# Keep only the numeric columns as a NumPy array for training
# (sourceIp is expected to be dropped here as a string column)
numeric_columns = raw_data.select_dtypes(include=np.number)
train_data = numeric_columns.to_numpy()

In [None]:
# Fit a 3-cluster K-Means model on the training array; random_state is fixed
# so repeated runs on the same data use the same seed
kmeans_params = {
    'n_clusters': 3,
    'init': 'k-means++',
    'verbose': 0,
    'max_iter': 300,
    'random_state': 42,
}
model = KMeans(**kmeans_params).fit(train_data)

In [None]:
# Convert the fitted model to ONNX (opset 13), passing a float32 copy of the
# training data as the sample input
sample_input = train_data.astype(np.float32)
model_onnx = to_onnx(model, sample_input, target_opset=13)

# Remove both declared graph outputs, highest index first so the remaining
# index stays valid: scores (index 1), then label (index 0)
for output_index in (1, 0):
    model_onnx.graph.output.pop(output_index)

In [None]:
# Devo needs the final output to be float, so append a Cast node that turns
# the 'label' tensor into a float tensor named 'label_cast' ...
cast_node = helper.make_node(
    'Cast',
    inputs=['label'],
    outputs=['label_cast'],
    name='output_label_cast',
    to=TensorProto.FLOAT,
)
model_onnx.graph.node.append(cast_node)

# ... and declare 'label_cast' as a graph output
cast_output = helper.make_tensor_value_info(
    name='label_cast',
    elem_type=TensorProto.FLOAT,
    shape=[-1],
)
model_onnx.graph.output.append(cast_output)

In [None]:
# Give the output a second dimension so it is shaped (batch, item).
# This step is needed for sklearn's kmeans only; other algorithms such as
# linear regression do not require it.
model_onnx = onnx.compose.expand_out_dim(model_onnx, dim_idx=1)

# Declare the final shapes: X has 5 feature columns, label_cast has 1 column
input_dims = {'X': [-1, 5]}
output_dims = {'label_cast': [-1, 1]}
model_onnx = update_model_dims.update_inputs_outputs_dims(
    model_onnx, input_dims, output_dims
)

In [None]:
# Write the serialized ONNX model to MODEL_FILE in binary mode
with open(MODEL_FILE, 'wb') as model_file:
    onnx_bytes = model_onnx.SerializeToString()
    model_file.write(onnx_bytes)

# Register the model in Devo

In [None]:
# Connect to the Devo ML Model Manager with the access token and upload the
# ONNX file under NAME.
# NOTE(review): force=True presumably replaces an existing model with the same
# name; confirm against the ML Model Manager client documentation.
registration_options = {'description': DESCRIPTION, 'force': True}
mlmm_client = create_client_from_token(DEVO_MLMM_URL, TOKEN)
mlmm_client.add_model(NAME, engines.ONNX, MODEL_FILE, **registration_options)

# Use the model

In [None]:
# Inference query: same per-IP aggregation as the training query, plus a call
# to mlevalmodel with DOMAIN and NAME interpolated into the query text.
# The five features are passed in the same order used for training; element 0
# of the result is exposed as `label` and mapped to a `type` string
# ("IU" for 0.0, "AU" for 1.0, "MU" otherwise).
query = f'''from demo.ecommerce.data where isnotnull(clientIpAddress)
select
    hour(eventdate) as hour,
    minute(eventdate) as minute,
    second(eventdate) as second,
    clientIpAddress,
    userAgent
group every 8h by clientIpAddress
select
    str(clientIpAddress) as sourceIp,
    float4(size(collectcompact(hour))) as unique_hours,
    float4(size(collectcompact(minute))) as unique_mins,
    float4(size(collectcompact(second))) as unique_seconds,
    float4(size(collectcompact(userAgent))) as unique_user_agents,
    float4(avg(bytesTransferred)) as bytestransferred,
    at(mlevalmodel(
        "{DOMAIN}",
        "{NAME}",
        [unique_hours, unique_mins, unique_seconds, unique_user_agents, bytestransferred]), 0) as label,
    ifthenelse(label = 0.0, "IU", ifthenelse(label = 1.0, "AU", "MU")) as type
'''

In [None]:
# Run the inference query starting one day before today().
# NOTE(review): unlike the training query, no 'to' date is given here; confirm
# whether an open-ended (streaming) query is intended.
inference_window = {'from': 'today() - 1 * day()'}
response = api.query(query=query, dates=inference_window)

In [None]:
# Print the source IP and the assigned type for every returned row
for record in response:
    source_ip, assigned_type = record['sourceIp'], record['type']
    print("IP:", source_ip, "type", assigned_type)
