## Train Model

In [1]:
import numpy as np
import pandas as pd

from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import metrics
import tensorflow as tf

In [2]:
# --- Data -------------------------------------------------------------------
# JSON-lines file with the labelled training e-mails.
trainDataFileName = 's3://smle-experiments/datasets/phishing_email/splunk_train.json'
# Every message is encoded as exactly this many character codes.
maxLen = 128

# --- Model architecture -----------------------------------------------------
dim_embedding = 50    # width of the per-character embedding vector
num_LSTM_cell = 64    # number of units in the LSTM layer
dropout_rate = 0.25   # shared by the spatial, input and recurrent dropout

# --- Training loop ----------------------------------------------------------
batch_size = 4
epochs = 50           # upper bound; early stopping may end training sooner

In [3]:
# Text columns that are concatenated (in this order) to form one message.
TEXT_FIELDS = ['From', 'Subject', 'Content']


def encode_message(text, max_len):
    """Encode ``text`` as a fixed-length vector of printable-ASCII codes.

    Characters whose code point is outside 32..126 are dropped (not replaced),
    the remaining codes are written left-to-right, and the vector is padded on
    the right with 32 (the space character) up to ``max_len``.

    Parameters
    ----------
    text : str
        The message to encode.
    max_len : int
        Length of the returned vector; extra characters are truncated.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(max_len,)``.
    """
    encoded = np.full(max_len, 32)  # pre-filled with the space code = padding
    pos = 0
    for ch in text:
        if pos >= max_len:
            break  # vector is full; ignore the rest of the message
        code = ord(ch)
        if 32 <= code <= 126:
            encoded[pos] = code
            pos += 1
    return encoded


df = pd.read_json(trainDataFileName, lines=True)

# A missing field is read as NaN, and NaN + ' ' raises TypeError, so rows that
# lack any text field are dropped before concatenation.
# NOTE(review): the SPL2 inference query also filters events whose
# concatenated line is NULL, which presumably covers the same rows - confirm.
train_rows = df.dropna(subset=TEXT_FIELDS)

messages = (
    train_rows['From'].astype(str) + ' '
    + train_rows['Subject'].astype(str) + ' '
    + train_rows['Content'].astype(str)
)

# Accept the label both as the string 'True' and as a real boolean: comparing
# a boolean column against 'True' would otherwise label every row 0.
is_phishing = train_rows['isPhishing'].astype(str).str.strip().str.lower() == 'true'

xTrain = np.array([encode_message(message, maxLen) for message in messages])
# One float label per row, shaped (n_samples, 1) to match the Dense(1) output.
yTrain = np.asarray(is_phishing, dtype=float).reshape(-1, 1)



In [4]:
# Seed NumPy and TensorFlow so that weight initialisation, dropout masks and
# batch shuffling are as repeatable as the backend allows across re-runs.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Inputs are character codes in 32..126, so 128 embedding rows cover every index.
VOCAB_SIZE = 128

# Character-level binary classifier: embedding -> dropout -> LSTM -> sigmoid.
model = Sequential([
    Embedding(VOCAB_SIZE, dim_embedding, input_length=maxLen),
    SpatialDropout1D(dropout_rate),
    LSTM(num_LSTM_cell, dropout=dropout_rate, recurrent_dropout=dropout_rate),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss; without it the model keeps the weights from `patience` epochs
# after the best one, and that is the model that would get published.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=7,
    min_delta=0.00001,
    restore_best_weights=True,
)

# NOTE(review): validation_split holds out the LAST 20% of the samples, taken
# before shuffling. If the training file is ordered (e.g. by label) the
# validation set will not be representative - confirm the file order.
history = model.fit(
    xTrain,
    yTrain,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    callbacks=[early_stopping],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50


## Publish Model

In [5]:
import os
import smle
import warnings

warnings.filterwarnings('ignore')
from smle.context import Context
cwd = os.getcwd()

config = {  
    'username': '',
    
    'model_storage_type': 's3', 
    'model_storage_address': "s3.us-west-2.amazonaws.com",
    'model_storage_bucket': 'smle-experiments',
    'model_storage_access_key': '',
    'model_storage_secret_key': '',
    'model_storage_secure': False,
}
smle_context = Context(config)

%load_ext spl2_kernel

In [6]:
model_path = "models/phishing_email"
model_name = "phishing_email"

!rm -rf /tmp/{model_name}
sample_data = pd.read_csv('s3://smle-experiments/datasets/phishing_email/sample_file.csv')
smle_context.publish(model, model_name=model_name, path = model_path, sample= sample_data, input_names=["ml_in"], output_names=["ml_out"], tf2onnx=["--opset", "10"])

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: /tmp/phishing_email/assets


In [7]:
# Local directory under /tmp named after the model; print its metadata and
# ONNX input/output specs.
model_dir = f"/tmp/{model_name:s}"
smle_context.inspect(model_dir)

Metadata:
 {
    "modelName": "phishing_email",
    "inputFields": [
        {
            "name": "ml_in",
            "type": "floatTensor",
            "size": 128
        }
    ],
    "outputFields": [
        {
            "name": "ml_out",
            "type": "floatTensor",
            "size": 1
        }
    ],
    "smle": {
        "modelType": "tensorflow",
        "published": "2020-10-22T06:15:25.712925+00:00"
    }
}
ONNX model specs:
 {
    "Inputs": [
        {
            "name": "ml_in",
            "type": "tensor(float)",
            "shape": [
                "unk__152",
                128
            ]
        }
    ],
    "Outputs": [
        {
            "name": "ml_out",
            "type": "tensor(float)",
            "shape": [
                "unk__153",
                1
            ]
        }
    ]
}


## Apply the model for inference with SPL2

In [8]:
%%spl2_add_params from_python
# Parameter set named "phishing_email". The SPL2 query cell that is run with
# "-q phishing_email" refers to these two entries as model_path and model_name
# in its apply_model step.
phishing_email = dict(
    model_path = "s3://smle-experiments/models/phishing_email",
    model_name = "phishing_email",
)

In [9]:
%%spl2 -q phishing_email
// Score the held-out test e-mails with the published model and keep the ones
// classified as phishing.
| from read_json("s3://smle-experiments/datasets/phishing_email/splunk_test.json")
// Same field order as in training (From, Subject, Content), followed by a long
// run of spaces; presumably this guarantees that character positions 1..128
// always exist, mirroring the space padding used in training - confirm.
| eval eventLine=concat(From, " ", Subject, " ", Content, " ", "                                                                                                                                ")
// NOTE(review): presumably drops events where a field was missing and concat
// produced NULL - confirm the NULL semantics of concat.
| where eventLine IS NOT NULL
// Lookup from printable ASCII character to its code (32..126), the same range
// the training cell keeps. The doubled dollar-sign key is presumably an escape
// needed because the cell magic substitutes dollar-prefixed parameters - confirm.
| eval mapC = {" ":32,"!":33,"\"":34,"#":35,"$$":36,"%":37,"&":38,"'":39,"(":40,")":41,"*":42,"+":43,",":44,"-":45,".":46,"/":47,"0":48,"1":49,"2":50,"3":51,"4":52,"5":53,"6":54,"7":55,"8":56,"9":57,":":58,";":59,"<":60,"=":61,">":62,"?":63,"@":64,"A":65,"B":66,"C":67,"D":68,"E":69,"F":70,"G":71,"H":72,"I":73,"J":74,"K":75,"L":76,"M":77,"N":78,"O":79,"P":80,"Q":81,"R":82,"S":83,"T":84,"U":85,"V":86,"W":87,"X":88,"Y":89,"Z":90,"[":91,"\\":92,"]":93,"^":94,"_":95,"`":96,"a":97,"b":98,"c":99,"d":100,"e":101,"f":102,"g":103,"h":104,"i":105,"j":106,"k":107,"l":108,"m":109,"n":110,"o":111,"p":112,"q":113,"r":114,"s":115,"t":116,"u":117,"v":118,"w":119,"x":120,"y":121,"z":122,"{":123,"|":124,"}":125,"~":126}
// Build the 128-element model input: for each position i in 1..128, look up the
// code of the i-th character and cast it to float.
// NOTE(review): training skips characters outside 32..126 and packs the rest
// together, whereas here an unmapped character presumably yields a NULL at its
// position - confirm that both encodings agree for such input.
| eval ml_in = for_each(
        iterator(mvrange(1,129), "i"),
        cast(map_get(mapC, substr(eventLine, i, 1)), "float") )
// Run the published model; path and name come from the phishing_email params.
| apply_model connection_id="" path="$model_path" name="$model_name"
// ml_out holds a single value per event; use it as the phishing probability.
| eval probability = mvindex(ml_out, 0) 
| where probability > 0.5
// Both time fields are taken from the Date field; entities and body are
// placeholders ("TBD").
| eval start_time = Date, end_time = Date, entities = "TBD", body = "TBD"
| select probability, body, entities, start_time, end_time
;

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))

 Finished.                     

Unnamed: 0,end_time,start_time,body,entities,probability
0,"Thu, 14 Mar 2002 14:34:48 -0800 (PST)","Thu, 14 Mar 2002 14:34:48 -0800 (PST)",TBD,TBD,0.999014
1,"Mon, 21 May 2001 03:27:00 -0700 (PDT)","Mon, 21 May 2001 03:27:00 -0700 (PDT)",TBD,TBD,0.999733
2,"Fri, 4 May 2001 14:48:00 -0700 (PDT)","Fri, 4 May 2001 14:48:00 -0700 (PDT)",TBD,TBD,0.999192
3,"Thu, 1 Nov 2018 17:20:06 +0000","Thu, 1 Nov 2018 17:20:06 +0000",TBD,TBD,0.998415
4,"Wed, 23 Aug 2000 05:12:00 -0700 (PDT)","Wed, 23 Aug 2000 05:12:00 -0700 (PDT)",TBD,TBD,0.999215
...,...,...,...,...,...
218,"Wednesday, 9 May 2018 at 07:34","Wednesday, 9 May 2018 at 07:34",TBD,TBD,0.999302
219,"Mon, 25 Jun 2018 21:42:28 +0200","Mon, 25 Jun 2018 21:42:28 +0200",TBD,TBD,0.999290
220,,,TBD,TBD,0.987345
221,"Fri, 11 May 2001 03:38:00 -0700 (PDT)","Fri, 11 May 2001 03:38:00 -0700 (PDT)",TBD,TBD,0.999448





<spl2_kernel.spl2_runner.SPL2Job at 0x7f0839efe410>

### Versions of SMLE, TensorFlow, and tf2onnx used

In [10]:
!pip show smle tensorflow tf2onnx

Name: smle
Version: 0.2.12
Summary: Splunk Machine Learning Environment
Home-page: https://cd.splunkdev.com/ML/smle
Author: SplunkMLTeam
Author-email: mleng@splunk.com
License: Splunk Software License
Location: /opt/conda/lib/python3.7/site-packages
Requires: scipy, onnx, minio, packaging, toml, arrow, pandas, onnxruntime, python-json-logger, numpy, boto3
Required-by: 
---
Name: tensorflow
Version: 2.3.0
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: /opt/conda/lib/python3.7/site-packages
Requires: termcolor, keras-preprocessing, six, protobuf, google-pasta, astunparse, tensorflow-estimator, scipy, numpy, gast, opt-einsum, tensorboard, absl-py, wrapt, h5py, wheel, grpcio
Required-by: 
---
Name: tf2onnx
Version: 1.6.3
Summary: Tensorflow to ONNX converter
Home-page: https://github.com/onnx/tensorflow-onnx
Author: onnx@microsoft.co