# CLX Cheat Sheets sample code

(c) 2020 NVIDIA, Blazing SQL

Distributed under Apache License 2.0

In [1]:
import cudf
import dask_cudf
import s3fs
from os import path

from clx.analytics.cybert import Cybert

---
# CyBERT
---

## Model

In [2]:
CLX_S3_BASE_PATH = 'rapidsai-data/cyber/clx'
HF_S3_BASE_PATH = 'models.huggingface.co/bert/raykallen/cybert_apache_parser'

MODEL_DIR = '../models/CyBERT'
DATA_DIR = '../data'
CONFIG_FILENAME = 'config.json'
MODEL_FILENAME = 'pytorch_model.bin'
APACHE_SAMPLE_CSV = 'apache_sample_1k.csv'

In [3]:
if not path.exists(f'{MODEL_DIR}/{MODEL_FILENAME}'):
    fs = s3fs.S3FileSystem(anon=True)
    fs.get(
          f'{HF_S3_BASE_PATH}/{MODEL_FILENAME}'
        , f'{MODEL_DIR}/{MODEL_FILENAME}'
    )

if not path.exists(f'{MODEL_DIR}/{CONFIG_FILENAME}'):
    fs = s3fs.S3FileSystem(anon=True)
    fs.get(
          f'{HF_S3_BASE_PATH}/{CONFIG_FILENAME}'
        , f'{MODEL_DIR}/{CONFIG_FILENAME}'
    )
    
if not path.exists(APACHE_SAMPLE_CSV):
    fs = s3fs.S3FileSystem(anon=True)
    fs.get(
        f'{CLX_S3_BASE_PATH}/{APACHE_SAMPLE_CSV}'
        , f'{DATA_DIR}/{APACHE_SAMPLE_CSV}')

#### clx.analytics.cybert.Cybert.load_model()

In [4]:
cybert = Cybert()
cybert.load_model(
    f'{MODEL_DIR}/{MODEL_FILENAME}'
    , f'{MODEL_DIR}/{CONFIG_FILENAME}'
)

#### clx.analytics.cybert.Cybert.inference()

In [5]:
logs_df = cudf.read_csv(f'{DATA_DIR}/{APACHE_SAMPLE_CSV}')
parsed_df, confidence_df = cybert.inference(logs_df["raw"])

In [6]:
parsed_df.head()

Unnamed: 0,time_received,error_level,error_message,remote_host,other,request_method,request_url,request_http_ver,status,response_bytes_clf,request_header_user_agent,request_header_referer,X
0,[Sun Dec 04 20:22:49 2005],[notice],workerEnv.init () ok/etc/httpd/conf/workers2 .,,,,,,,,,,
1,[01/Sep/2019:03:28:00 +0200],,,193.106.31.130,---,POST,/administrator/index.php,HTTP/1.0,200.0,4481.0,Mozilla/4.0 (compatible.MSIE...; Windows NT...),,
2,[29/Sep/2019:19:41:25 +0200],,,100.1.14.108,---,GET,/components/com.users/dispacher.php,HTTP/1.1,404.0,240.0,python-requests/2.22.0,,
3,[06/Nov/2019:03:15:15 +0100],,,13.84.43.203,---,GET,//administrator/index.php,HTTP/1.1,200.0,4270.0,Mozilla/5.0 (Windows NT 10.0.Win64.x64.rv:65.0...,,
4,[18/Feb/2016:12:38:21 +0100],,,90.188.40.9,---,GET,/administrator/,HTTP/1.1,200.0,4263.0,Mozilla/5.0 (Windows NT.. 1) AppleWebKit/537.3...,,


In [7]:
confidence_df.head()

Unnamed: 0,time_received,error_level,error_message,remote_host,other,request_method,request_url,request_http_ver,status,response_bytes_clf,request_header_user_agent,request_header_referer
0,0.999948,0.99959,0.999615,,,,,,,,,
1,0.999973,,,0.999966,0.999914,0.999903,0.999774,0.999946,0.999914,0.999932,0.999905,
2,0.999973,,,0.999964,0.999892,0.999912,0.999836,0.999945,0.99992,0.999927,0.999888,
3,0.999973,,,0.999963,0.999904,0.999903,0.999735,0.999945,0.9999,0.999925,0.99991,
4,0.999974,,,0.999956,0.999904,0.999911,0.999841,0.999944,0.999892,0.999928,0.999872,


#### clx.analytics.cybert.Cybert.preprocess()

In [8]:
logs_df = cudf.read_csv(f'{DATA_DIR}/{APACHE_SAMPLE_CSV}')
input_ids, attention_masks, meta = cybert.preprocess(logs_df["raw"])

In [9]:
input_ids

tensor([[  164,  3477, 13063,  ...,     0,     0,     0],
        [21781,   119,  9920,  ...,     0,     0,     0],
        [ 1620,   119,   122,  ...,     0,     0,     0],
        ...,
        [ 1620,   119,   122,  ...,     0,     0,     0],
        [21801,   119,  1851,  ...,     0,     0,     0],
        [ 1620,   119,   122,  ...,     0,     0,     0]], device='cuda:0')

In [10]:
attention_masks

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')

In [11]:
meta

array([[  0,   0,  36],
       [  1,   0,  69],
       [  2,   0,  61],
       ...,
       [997,   0,  78],
       [998,   0,  48],
       [999,   0,  74]], dtype=uint32)

# DGA Detector

## Model

In [12]:
import os
import wget
import time
import cudf
import torch
import shutil
import zipfile
import numpy as np
from datetime import datetime
from sklearn.metrics import accuracy_score, average_precision_score
from clx.analytics.dga_dataset import DGADataset
from clx.analytics.dga_detector import DGADetector
from cuml.preprocessing.model_selection import train_test_split
from clx.utils.data.dataloader import DataLoader

In [13]:
dga = {
    "source": "DGA",
    "url": "https://data.netlab.360.com/feeds/dga/dga.txt",
    "compression": None,
    "storage_path": "../data/dga_feed",
}
benign = {
    "source": "Benign",
    "url": "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip",
    "compression": "zip",
    "storage_path": "../data/top-1m",
}

In [14]:
def unpack(compression_type, filepath, output_dir):
     if compression_type == 'zip':
        with zipfile.ZipFile(filepath, 'r') as f:
            f.extractall(output_dir)
        os.remove(filepath)

In [15]:
def download_file(f):
    output_dir = f['storage_path']
    filepath = f'{output_dir}/{f["url"].split("/")[-1]}'
    
    if not os.path.exists(filepath):
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        print(f'Downloading {f["url"]}...')
        filepath = wget.download(f['url'], out=output_dir)
        
        print(f'Unpacking {filepath}')
        unpack(f['compression'], filepath, output_dir)
    print(f'{f["source"]} data is stored to location {output_dir}')

In [16]:
download_file(dga)
download_file(benign)

DGA data is stored to location ../data/dga_feed
Downloading http://s3.amazonaws.com/alexa-static/top-1m.csv.zip...
Unpacking ../data/top-1m/top-1m.csv.zip
Benign data is stored to location ../data/top-1m


In [17]:
def load_input_data(dga, benign):
    dga_df = cudf.read_csv(
        dga['storage_path'] + '/*'
        , names=['generator', 'domain', 'dt_from', 'dt_to']
        , usecols=['domain']
        , skiprows=18
        , delimiter='\t'
    )
    dga_df['type'] = 0
    
    benign_df = cudf.read_csv(
        benign['storage_path'] + '/*'
        , names=["line_num","domain"]
        , usecols=['domain']
    )
    benign_df['type'] = 1
    
    input_df = cudf.concat([benign_df, dga_df], ignore_index=True)
    return input_df

def create_df(domain_df, type_series):
    df = cudf.DataFrame()
    df['domain'] = domain_df['domain'].reset_index(drop=True)
    df['type'] = type_series.reset_index(drop=True)
    return df

def create_dir(dir_path):
    print("Verify if directory `%s` already exists." % (dir_path))
    if not os.path.exists(dir_path):
        print("Directory `%s` does not exist." % (dir_path))
        print("Creating directory `%s` to store trained models." % (dir_path))
        os.makedirs(dir_path)
        
def cleanup_cache():
    # release memory.
    torch.cuda.empty_cache()

In [18]:
input_df = load_input_data(dga, benign)

(
    domain_train
    , domain_test
    , type_train
    , type_test
) = train_test_split(input_df, 'type', train_size=0.7)

train_df = domain_train['domain'].reset_index(drop=True)
train_labels = type_train.reset_index(drop=True)

test_df = create_df(domain_test, type_test)

#### clx.analytics.dga_detector.DGADetector.init_model()

In [19]:
LR = 0.001
N_LAYERS = 3
CHAR_VOCAB = 128
HIDDEN_SIZE = 100
N_DOMAIN_TYPE = 2

In [20]:
dd = DGADetector(lr=LR)
dd.init_model(
      n_layers=N_LAYERS
    , char_vocab=CHAR_VOCAB
    , hidden_size=HIDDEN_SIZE
    , n_domain_type=N_DOMAIN_TYPE
)

#### clx.analytics.dga_detector.DGADetector.train_model()	Yes

In [21]:
batch_size = 10000
train_dataset = {'features': train_df, 'labels': train_labels}
test_dataset = DataLoader(DGADataset(test_df), batch_size)

In [22]:
def train_and_eval(dd, train_dataset, test_dataset, epoch, model_dir):
    print("Initiating model training")
    create_dir(model_dir)
    max_accuracy = 0
    prev_model_file_path = ""
    for i in range(1, epoch + 1):
        print("---------")
        print("Epoch: %s" % (i))
        print("---------")
        dd.train_model(train_dataset['features'], train_dataset['labels'])
        accuracy = dd.evaluate_model(test_dataset)
        now = datetime.now()
        output_filepath = (
            model_dir
            + "/"
            + "rnn_classifier_{}.pth".format(now.strftime("%Y-%m-%d_%H_%M_%S"))
        )
        if accuracy > max_accuracy:
            dd.save_model(output_filepath)
            max_accuracy = accuracy
            if prev_model_file_path:
                os.remove(prev_model_file_path)
            prev_model_file_path = output_filepath
            
    print("Model with highest accuracy (%s) is stored to location %s" % (max_accuracy, prev_model_file_path))
    return prev_model_file_path

In [23]:
%%time
epoch = 2
model_dir='../models/DGA_Detector'
model_filepath = train_and_eval(dd, train_dataset, test_dataset, epoch, model_dir)
cleanup_cache()

Initiating model training
Verify if directory `../models/DGA_Detector` already exists.
---------
Epoch: 1
---------


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]



Epoch:  20%|██        | 1/5 [00:30<02:02, 30.57s/it]

Test set: Accuracy: 290513/393589 (0.7381125996915564)



Epoch:  40%|████      | 2/5 [01:01<01:31, 30.53s/it]

Test set: Accuracy: 320549/393589 (0.8144257080355396)



Epoch:  60%|██████    | 3/5 [01:31<01:01, 30.54s/it]

Test set: Accuracy: 351367/393589 (0.8927256605240492)



Epoch:  80%|████████  | 4/5 [02:02<00:30, 30.58s/it]

Test set: Accuracy: 365812/393589 (0.929426381326714)



Epoch: 100%|██████████| 5/5 [02:32<00:00, 30.57s/it]

Test set: Accuracy: 374521/393589 (0.9515535241076352)






Test set: Accuracy: 533862/562270 (0.949476230280826)

---------
Epoch: 2
---------


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]



Epoch:  20%|██        | 1/5 [00:30<02:02, 30.61s/it]

Test set: Accuracy: 366785/393589 (0.931898503261016)



Epoch:  40%|████      | 2/5 [01:01<01:31, 30.57s/it]

Test set: Accuracy: 375728/393589 (0.9546201748524477)



Epoch:  60%|██████    | 3/5 [01:31<01:01, 30.59s/it]

Test set: Accuracy: 382510/393589 (0.9718513474716011)



Epoch:  80%|████████  | 4/5 [02:02<00:30, 30.59s/it]

Test set: Accuracy: 383916/393589 (0.9754236017774887)



Epoch: 100%|██████████| 5/5 [02:32<00:00, 30.59s/it]

Test set: Accuracy: 385439/393589 (0.9792931204886316)






Test set: Accuracy: 550491/562270 (0.9790509897380262)

Model with highest accuracy (0.9790509897380262) is stored to location ../models/DGA_Detector/rnn_classifier_2021-02-22_21_10_39.pth
CPU times: user 19min 1s, sys: 17.4 s, total: 19min 19s
Wall time: 5min 13s


#### clx.analytics.dga_detector.DGADetector.evaluate_model()

In [24]:
accuracy = dd.evaluate_model(DataLoader(DGADataset(test_df), 10000))

Test set: Accuracy: 550491/562270 (0.9790509897380262)



#### clx.analytics.dga_detector.DGADetector.predict()

In [25]:
dd = DGADetector()
dd.load_model('../models/DGA_Detector/rnn_classifier_2021-02-22_20_54_32.pth')

pred_results = []
true_results = []
for partition in test_dataset.get_chunks():
    pred_results.append(list(dd.predict(partition['domain']).values_host))
    true_results.append(list(partition['type'].values_host))
pred_results = np.concatenate(pred_results)
true_results = np.concatenate(true_results)
accuracy_score = accuracy_score(pred_results, true_results)
print('Model accuracy: %s'%(accuracy_score))
cleanup_cache()

Model accuracy: 0.989625980400875


# Phishing Detector

## Model

In [26]:
import cudf;
from cuml.preprocessing.model_selection import train_test_split
from clx.analytics.sequence_classifier import SequenceClassifier
import s3fs;
from os import path

In [27]:
DATA_DIR = '../data/phishing'
CLAIR_TSV = "Phishing_Dataset_Clair_Collection.tsv"
SPAM_TSV = "spam_assassin_spam_200_20021010.tsv"
EASY_HAM_TSV = "spam_assassin_easyham_200_20021010.tsv"
HARD_HAM_TSV = "spam_assassin_hardham_200_20021010.tsv"
ENRON_TSV = "enron_10000.tsv"

S3_BASE_PATH = "rapidsai-data/cyber/clx"

In [28]:
def maybe_download(f, output_dir):
    if not path.exists(f'{output_dir}/{f}'):
        print(f'Downloading: {f}')
        fs = s3fs.S3FileSystem(anon=True)
        fs.get(S3_BASE_PATH + "/" + f, f'{output_dir}/{f}')
        
def read_dataset(f, data_dir):
    maybe_download(f, data_dir)
    return cudf.read_csv(
        f'{data_dir}/{f}'
        , delimiter='\t'
        , header=None
        , names=['label', 'email']
    )

In [29]:
dfclair = read_dataset(CLAIR_TSV, DATA_DIR)
dfspam = read_dataset(SPAM_TSV, DATA_DIR)
dfeasyham = read_dataset(EASY_HAM_TSV, DATA_DIR)
dfhardham = read_dataset(HARD_HAM_TSV, DATA_DIR)
dfenron = read_dataset(ENRON_TSV, DATA_DIR)

#### clx.analytics.phishing_detector.PhishingDetector.init_model()

In [30]:
phish_detect = SequenceClassifier()
phish_detect.init_model(model_or_path='bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

#### clx.analytics.phishing_detector.PhishingDetector.train_model()

In [31]:
df_all = cudf.concat([
    dfclair
    , dfspam
    , dfeasyham
    , dfhardham
    , dfenron
])

(
    X_train
    , X_test
    , y_train
    , y_test
) = train_test_split(df_all, 'label', train_size=0.8)

phish_detect.train_model(X_train, y_train, epochs=1)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Train loss: 0.06824535569360499


Epoch: 100%|██████████| 1/1 [02:06<00:00, 126.05s/it]

Validation Accuracy: 0.9948326771653543





#### clx.analytics.phishing_detector.PhishingDetector.evaluate_model()

In [32]:
phish_detect.evaluate_model(X_test['email'], y_test)

0.9944510503369005

#### clx.analytics.phishing_detector.PhishingDetector.save_model()

In [33]:
phish_detect.save_model('../models/phishing')

#### clx.analytics.phishing_detector.PhishingDetector.predict()

In [34]:
phish_detect_trained = SequenceClassifier()
phish_detect_trained.init_model(model_or_path='../models/phishing')

phish_detect_trained.predict(X_test['email'])

(0     0
 1     0
 2     1
 3     1
 4     0
      ..
 17    0
 18    1
 19    0
 20    0
 21    0
 Length: 5046, dtype: uint8,
 0     0.016377
 1     0.122532
 2     0.983655
 3     0.983209
 4     0.025541
         ...   
 17    0.015206
 18    0.984037
 19    0.014542
 20    0.062153
 21    0.014792
 Length: 5046, dtype: float32)