In [9]:
# coding:utf-8
import warnings
warnings.filterwarnings('ignore')
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import json
#j = json.loads('{"one" : "1", "two" : "2", "three" : "3"}')
import time

from tldextract import TLDExtract
extract = TLDExtract(suffix_list_urls=None)

from keras.models import load_model
from keras import regularizers
from keras import optimizers
from keras.models import Sequential

from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.layers import Bidirectional, Conv1D, MaxPool1D, Flatten

from sklearn.cross_validation import train_test_split

from keras.utils import to_categorical

import numpy as np
import pickle

In [10]:
SRC_IP_IDX = 3-1               
DST_IP_IDX = 4-1               
SRC_PORT_IDX = 5-1             
DST_PORT_IDX = 6-1             
PROTOCOL_IDX = 7-1             
DNS_QUERY_NAME_IDX = 55-1 # domain
DNS_REQUEST_TYPE = 56-1
DNS_DOMAIN_TTL = 59-1
DNS_REPLY_IPV4IP = 60-1        
DNS_REPLY_IPV6IP = 61-1        
DNS_REPLY_RRTYPE = 62-1        
DNS_REQUEST_LEN  = 88-1        
DNS_REPLY_LENGTH = 90-1

def iterbrowse(path):
    for home, dirs, files in os.walk(path):
        for filename in files:
            yield os.path.join(home, filename)
            
def extract_domain(domain):
    suffix = {'.com','.la','.io', '.co', '.cn','.info', '.net',
              '.org','.me', '.mobi', '.us', '.biz', '.xxx', '.ca',
              '.co.jp', '.com.cn', '.net.cn', '.org.cn', '.mx','.tv',
              '.ws', '.ag', '.com.ag', '.net.ag', '.org.ag','.am',
              '.asia', '.at', '.be', '.com.br', '.net.br', '.name', 
              '.live', '.news', '.bz', '.tech', '.pub', '.wang', 
              '.space', '.top', '.xin', '.social', '.date', '.site', 
              '.red', '.studio', '.link', '.online', '.help', '.kr', 
              '.club', '.com.bz', '.net.bz', '.cc', '.band', '.market',
              '.com.co', '.net.co', '.nom.co', '.lawyer', '.de', '.es',
              '.com.es', '.nom.es', '.org.es', '.eu', '.wiki', 
              '.design', '.software', '.fm', '.fr', '.gs', '.in', 
              '.co.in', '.firm.in', '.gen.in', '.ind.in', '.net.in', 
              '.org.in', '.it', '.jobs', '.jp', '.ms', '.com.mx', '.nl',
              '.nu','.co.nz','.net.nz', '.org.nz', '.se', '.tc', '.tk',
              '.tw', '.com.tw', '.idv.tw', '.org.tw', '.hk', '.co.uk',
              '.me.uk', '.org.uk', '.vg','.in-addr.arpa'}

    domain = domain.lower()
    names = domain.split(".")
    if len(names) >= 3:
        if ("."+".".join(names[-2:])) in suffix:
            return ".".join(names[-3:]), ".".join(names[:-3])
        elif ("."+names[-1]) in suffix:
            return ".".join(names[-2:]), ".".join(names[:-2])
    #print ("New domain suffix found. Use tld extract domain...")

    pos = domain.rfind("/")
    if pos >= 0: # maybe subdomain contains /, for dns tunnel tool
        ext = extract(domain[pos+1:])
        subdomain = domain[:pos+1] + ext.subdomain
    else:
        ext = extract(domain)
        subdomain = ext.subdomain
    if ext.suffix:
        mdomain = ext.domain + "." + ext.suffix
    else:
        mdomain = ext.domain
    return mdomain, subdomain


def filter_metadata_dns(data):
    if(len(data) < 91):
        return False

    protol  = data[PROTOCOL_IDX]
    dstport = data[DST_PORT_IDX]
    dstip   = data[DST_IP_IDX]
    qname   = data[DNS_QUERY_NAME_IDX]

    if '' == qname or '' == dstip:
        return False
    if '17' == protol and ('53' == dstport):
        return True
    return False


def metadata2_domain_data(log): 
    data = log.split('^')
    if not filter_metadata_dns(data):
        return None, None
    domain = data[DNS_QUERY_NAME_IDX]
    mdomain, subdomain = extract_domain(domain)
    return (mdomain, subdomain)


def get_local_data(tag="labeled"):
    data_path = "./sample_data"
    black_data, white_data = [], []    
    for dir_name in ("black", "cdn", "white"):
        dir_path = "%s/%s_%s" % (data_path, tag, dir_name)

        for path in iterbrowse(dir_path):
            print( path)
            with open(path) as f:
                for line in f:
                    mdomain, subdomain = metadata2_domain_data(line)
                    if subdomain is not None:
                        if "white" in path:
                            white_data.append(subdomain)
                        elif "cdn" in path:
                            white_data.append(subdomain)
                        elif "black" in path and "pcap" in path:
                            black_data.append(subdomain)
                        else:
                            pass
                            #print ("pass path:", path)
                    #else:
                    #    print ("unknown line:", line, " in file:", path)
    return black_data, white_data


class LABEL(object):
    white = 0
    cdn = 0
    black = 1

def pad_sequences(X, maxlen, value=0):
    S=[]
    for x in X:
        xlen = len(x)
        if xlen < maxlen:
            x.extend([value]*(maxlen-xlen))
        else:
            x = x[:maxlen]
        S.append(x)
    return S

def get_data():
    black_x, white_x = get_local_data()
    black_y, white_y = [LABEL.black]*len(black_x),[LABEL.white]*len(white_x)

    X = black_x + white_x
    labels = black_y + white_y

    # Generate a dictionary of valid characters
    valid_chars = {x:idx+1 for idx, x in enumerate(set(''.join(X)))}

    max_features = len(valid_chars) + 1
    print("max_features:", max_features)
    maxlen = np.max([len(x) for x in X])
    print("max_len:", maxlen)
    maxlen = min(maxlen, 256)

    # Convert characters to int and pad
    X = [[valid_chars[y] for y in x] for x in X]
    X = pad_sequences(X, maxlen=maxlen, value=0.)

    # Convert labels to 0-1
    Y = labels
    
    volcab_file = "volcab.pkl"
    output = open(volcab_file, 'wb') 
    # Pickle dictionary using protocol 0.
    data = {"valid_chars": valid_chars,
            "max_len": maxlen, 
            "volcab_size": max_features}
    pickle.dump(data, output)
    output.close()

    return X, Y, maxlen, max_features

def build_model_BiRNN(max_len, volcab_size):
    """Build Bi-RNN model"""
    model = Sequential()
    model.add(Embedding(input_dim=volcab_size,
                        output_dim=64,
                        input_length=max_len))
    model.add(Bidirectional(GRU(16)))
    
    model.add(Dense(16, activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    return model


def get_cnn_model(max_len, volcab_size):
    """Build CNN model"""
    model = Sequential()
    model.add(Embedding(input_dim=volcab_size,
                        output_dim=64,
                        input_length=max_len))
    model.add(Conv1D(128,
                     3,
                     padding='valid',
                     activation="relu", 
                     kernel_regularizer=regularizers.l2(0.01),
                     activity_regularizer=regularizers.l1(0.01)))
    model.add(MaxPool1D(2))
    model.add(Conv1D(128,
                     4,
                     padding='valid',
                     activation="relu", 
                     kernel_regularizer=regularizers.l2(0.01),
                     activity_regularizer=regularizers.l1(0.01)))
    model.add(MaxPool1D(2))
    model.add(Conv1D(128,
                     5,
                     padding='valid',
                     activation="relu", 
                     kernel_regularizer=regularizers.l2(0.01),
                     activity_regularizer=regularizers.l1(0.01)))
    model.add(MaxPool1D(2))
    model.add(Flatten())
    model.add(Dropout(0.5))
    model.add(Dense(16,activation="relu"))
    model.add(Dense(1,activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])
    return model

def run():
    X, Y, max_len, volcab_size = get_data()

    print( "X len:", len(X), "Y len:", len(Y))
    trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.2, 
                                                    random_state=42)
    print( trainX[:1])
    print( trainY[:1])
    print( testX[-1:])
    print( testY[-1:])

    model = get_cnn_model(max_len, volcab_size)
    model.fit(trainX, trainY, 
              validation_data=(testX, testY), 
              verbose=1,
              batch_size=32)

    
    filename = 'finalized_model.keras'
    model.save(filename)

    model.load(filename)
    print( "Just review 3 sample data test result:")
    result = model.predict(testX[0:3])
    print( result)



In [240]:
X, Y, max_len, volcab_size = get_data()

./sample_data/labeled_black/dnscat2_when_exec_command_rm_file.txt
./sample_data/labeled_black/ozyman_idle3.pcap.txt
./sample_data/labeled_black/download_dnscat2_file12.pcap.txt
./sample_data/labeled_black/iodine_direct_ssh4_base32.pcap.txt
./sample_data/labeled_black/iodine_direct_ssh6_base128.pcap.txt
./sample_data/labeled_black/dns2tcp_sendfile9.pcap.txt
./sample_data/labeled_black/iodine_direct_scp17_base128.pcap.txt
./sample_data/labeled_black/dnscat2_when_idle.txt
./sample_data/labeled_black/iodine_direct_ssh9_base32_again.pcap.txt
./sample_data/labeled_black/nbtoo_dnscat_file7.pcap.txt
./sample_data/labeled_black/iodine_idle_direct_idle44.pcap.txt
./sample_data/labeled_black/download_dnscat2_file13.pcap.txt
./sample_data/labeled_black/dnscapy_scp2.pcap.txt
./sample_data/labeled_black/dns2tcp_cmd.pcap.txt
./sample_data/labeled_black/dnscapy_scp.pcap.txt
./sample_data/labeled_black/tcp-over-dns-idle.pcap.txt
./sample_data/labeled_black/iodine_direct_ssh6_base64u.pcap.txt
./sample_d

In [241]:
print( "X len:", len(X), "Y len:", len(Y))
trainX, test_X, trainY, test_Y = train_test_split(X, Y, test_size=0.2, 
                                                    random_state=42)

trainX, testX, trainY, testY = train_test_split(trainX, trainY, test_size=0.2, 
                                                    random_state=42)

X len: 4439 Y len: 4439


In [242]:
trainX=np.mat(trainX)
testX=np.mat(testX)

trainY=np.mat(trainY).flatten().T
testY=np.mat(testY).flatten().T


In [243]:
model = get_cnn_model(max_len, volcab_size)

model = build_model_BiRNN(max_len, volcab_size)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_20 (Embedding)     (None, 256, 64)           2752      
_________________________________________________________________
bidirectional_2 (Bidirection (None, 32)                7776      
_________________________________________________________________
dense_39 (Dense)             (None, 16)                528       
_________________________________________________________________
dropout_20 (Dropout)         (None, 16)                0         
_________________________________________________________________
dense_40 (Dense)             (None, 1)                 17        
_________________________________________________________________
activation_2 (Activation)    (None, 1)                 0         
Total params: 11,073
Trainable params: 11,073
Non-trainable params: 0
_________________________________________________________________


In [254]:
model.fit(trainX, trainY, 
          validation_data=(testX, testY), 
          verbose=1,
          batch_size=3551,
          epochs=20)


Train on 3551 samples, validate on 888 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f36887872e8>

In [257]:
timestamp = time.strftime("%Y%m%d-%H%M%S", time.localtime(time.time()))

model.save("./models/BiLST-"+timestamp+".module")
model.save("./models/BiLST-final.module")

print("Just review 2 sample data test result:")

result = model.predict_classes(testX[0:2])
print(result)

Just review 2 sample data test result:
[[0]
 [0]]


In [25]:
from keras.models import load_model

def get_predict_data():
    data_path = "./xshell_data"
    black_data = []
    for path in iterbrowse(data_path):
        with open(path) as f:
            for line in f:
                mdomain, subdomain = metadata2_domain_data(line)
                if subdomain is not None:
                    black_data.append(subdomain)
    return black_data


org_X = []

def get_xshell_data():
    global org_X
    org_X = get_predict_data()
    labels = [LABEL.black]*len(org_X)

    volcab_file = "volcab.pkl"
    assert os.path.exists(volcab_file)
    pkl_file = open(volcab_file, 'rb')
    data = pickle.load(pkl_file)
    valid_chars, maxlen, max_features = data["valid_chars"], data["max_len"], data["volcab_size"]

    # Convert characters to int and pad
    X = [[valid_chars[y] if y in valid_chars else 0 for y in x] for x in org_X]
    X = pad_sequences(X, maxlen=maxlen, value=0.)

    # Convert labels to 0-1
    Y = labels
    return X, Y, maxlen, max_features


def run():
    testX, testY, max_len, volcab_size = get_xshell_data()
    print( "X len:", len(testX), "Y len:", len(testY))
    print( testX[-1:])
    print( testY[-1:])

    filename = 'finalized_model.tflearn'
    model = load_model("./models/BiLST-final.module")

    predictions = model.predict(testX)
    
    cnt = 0
    global org_X
    for i,p in enumerate(predictions):
        #if abs(p[2]-testY[i][2]) < 0.1:
        if p[2]>p[1] and p[1]>p[0]:
            cnt += 1
        else:
            print( "found data not detected:")
            print( "original subdomain:", org_X[i])
            print( "prediction compare:", p, testY[i])
    print( "Dectected cnt:", cnt, "total:", len(predictions))
    print( "Dectect Rate is:", cnt/(len(predictions)+.0))

In [26]:
testX, testY, max_len, volcab_size = get_xshell_data()
print( "X len:", len(testX), "Y len:", len(testY))
print( testX[-1:])
print( testY[-1:])
testX = np.mat(testX)
testY = np.mat(testY)
print(type(testX))
print(type(testY))



X len: 1180 Y len: 1180
[[39, 17, 16, 17, 16, 42, 12, 18, 18, 9, 26, 2, 25, 41, 42, 17, 12, 16, 5, 42, 35, 14, 25, 25, 15, 2, 33, 42, 37, 17, 16, 14, 25, 5, 35, 4, 41, 11, 35, 17, 35, 25, 41, 18, 16, 5, 41, 32, 39, 19, 25, 9, 2, 41, 37, 10, 37, 14, 22, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.

In [30]:
model = load_model("./models/BiLST-final.module")

In [31]:
predictions = model.predict_classes(testX)

In [32]:
cnt = 0

for i,p in enumerate(predictions):
    #print(i,p)
    #if abs(p[2]-testY[i][2]) < 0.1:
    if p[0]==1:
        cnt += 1
    else:
        continue
        print( "found data not detected:")
        print( "original subdomain:", org_X[i])
        #print( "prediction compare:", p[0], testY[i])
print( "Dectected cnt:", cnt, "total:", len(predictions))
print( "Dectect Rate is:", cnt/(len(predictions)+.0))

Dectected cnt: 910 total: 1180
Dectect Rate is: 0.7711864406779662


In [33]:
from kafka import KafkaConsumer


#model = load_model("./models/BiLST-20180928-183836.module")

volcab_file = "volcab.pkl"
pkl_file = open(volcab_file, 'rb')
data = pickle.load(pkl_file)
valid_chars, maxlen, max_features = data["valid_chars"], data["max_len"], data["volcab_size"]


In [None]:
consumer = KafkaConsumer('dns')
for message in consumer:
    # message value and key are raw bytes -- decode if necessary!
    # e.g., for unicode: `message.value.decode('utf-8')`
    dns_log = json.loads(message.value.decode('utf-8'))
    dns_query = dns_log["query"]
    
    org_X = extract_domain(dns_query)
    X = [[valid_chars[y] if y in valid_chars else 0 for y in x] for x in org_X]
    X = pad_sequences(X, maxlen=maxlen, value=0.)
    
    X = np.mat(X)
    
    rs = model.predict_classes(X)[0][0]
    
    print(rs, dns_query)

In [9]:
a = [12,13,4,5]
b = [1,22,345,4]
c = a + b
print(c)
a.pop()
print(c)

[12, 13, 4, 5, 1, 22, 345, 4]
[12, 13, 4, 5, 1, 22, 345, 4]


In [14]:
print(pad_sequences([1,4,5],10))

[1, 4, 5, 0, 0, 0, 0, 0, 0, 0]
