In [1]:
import numpy as np
import pymongo
#from sklearn import neural_network
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, learning_curve
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.tree import DecisionTreeClassifier
import sys
import math
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC

# Load data

In [2]:
client = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = client["flowdb"]

In [3]:
p_train_coll = mydb["testP_agg"]
p_test_coll = mydb["trainP_agg"]
b_train_coll = mydb["trainB_agg"]
b_test_coll = mydb["testB_agg"]

p_train = []
p_test = []
b_train = []
b_test = []
p_train_l = []
p_test_l = []
b_train_l = []
b_test_l = []

In [4]:
def getMD(doc):
    tmp = []
    if doc['sp'][0] == None:
        tmp.append(-1)
    else:
        tmp.append(doc['sp'][0])
        
    if doc['dp'][0] == None:
        tmp.append(-1)
    else:
        tmp.append(doc['dp'][0])
    #tmp.append(doc['sp'][0])
    #tmp.append(doc['dp'][0])
    if doc['Source ASN'] == None:
        tmp.append(-1)
    else:
        tmp.append(doc['Source ASN'])
    if doc['Dest ASN'] == None:
        tmp.append(-1)
    else:
        tmp.append(doc['Dest ASN'])
    tmp.append(doc['bytes_in'])
    tmp.append(doc['bytes_out'])
    tmp.append(doc['total_bytes'])
    tmp.append(doc['num_pkts_in'])
    tmp.append(doc['num_pkts_out'])
    tmp.append(doc['total_num_pkts'])
    tmp.append(doc['# of flows'])
    for i in range(256):
        if (doc['byte_dist'][i] != None):
            try:
                tmp.append(doc['byte_dist'][i]/doc['total_bytes'])
            except:
                tmp.append(0)
        else:
            tmp.append(0)
    return tmp

In [5]:
def getIFTBuckets(doc):
    tmp = []
    ifts = doc['ift']
    length = len(ifts)
    tmp.append(length)
    #tmp.append(np.mean(ifts))
    num = 15
    buckets = [0] * num
    sum = 0
    avg_ift = 0
    for ift in ifts:
        if ift > 0:
           logift = math.log(ift, 2)
        if ift == 0:
            buckets[0] += 1
        elif logift <= 0:
            buckets[0] += 1
            sum += ift
        elif logift >= 14:
            buckets[14] += 1
            sum += ift
        else:
            buckets[int(math.floor(logift))] += 1
            sum += ift
    
    if length > 1:
        avg_ift = sum / length
    tmp.append(avg_ift)
    tmp.extend(buckets)
    return tmp

In [6]:
def getIndividualFL(doc):
    numRows = 25
    binSize = 200
    transMat = np.zeros((numRows, numRows))
    if len(doc['byte_array']) == 0:
        return list(transMat.flatten())
    elif len(doc['byte_array']) == 1:
        curFlowSize = min(int(doc['byte_array'][0] / binSize), numRows - 1)
        transMat[curFlowSize, curFlowSize] = 1
        return list(transMat.flatten())

    for i in range(1, len(doc['byte_array'])):
        prevFlowSize = min(int(doc['byte_array'][i-1] / binSize), numRows - 1)
        curFlowSize = min(int(doc['byte_array'][i] / binSize), numRows - 1)
        transMat[prevFlowSize, curFlowSize] += 1

    for i in range(numRows):
        if float(np.sum(transMat[i:i+1])) != 0:
            transMat[i:i+1] = transMat[i:i+1] / float(np.sum(transMat[i:i + 1]))
    return list(transMat.flatten())

### set up training and testing data arrays

In [7]:
for doc in p_train_coll.find():
    tmpd = []
    tmp = getMD(doc)
    tmpift = getIFTBuckets(doc)
    #tmpind_fl = getIndividualFL(doc)
    
    
    tmpd.extend(tmp)
    tmpd.extend(tmpift)
    #tmpd.extend(tmpind_fl)
    p_train.append(tmpd)
    p_train_l.append(1)
    

In [8]:
for doc in p_test_coll.find():
    tmpd = []
    tmp = getMD(doc)
    tmpift = getIFTBuckets(doc)
    #tmpind_fl = getIndividualFL(doc)
    
    
    tmpd.extend(tmp)
    tmpd.extend(tmpift)
    #tmpd.extend(tmpind_fl)
    p_test.append(tmpd)
    p_test_l.append(1)
    

In [9]:
for doc in b_train_coll.find():
    tmpd = []
    tmp = getMD(doc)
    tmpift = getIFTBuckets(doc)
    #tmpind_fl = getIndividualFL(doc)
    
    
    tmpd.extend(tmp)
    tmpd.extend(tmpift)
    #tmpd.extend(tmpind_fl)
    b_train.append(tmpd)
    b_train_l.append(0)
    

In [10]:
for doc in b_test_coll.find():
    tmpd = []
    tmp = getMD(doc)
    tmpift = getIFTBuckets(doc)
    #tmpind_fl = getIndividualFL(doc)
    
    
    tmpd.extend(tmp)
    tmpd.extend(tmpift)
    #tmpd.extend(tmpind_fl)
    b_test.append(tmpd)
    b_test_l.append(0)
    

In [11]:
training_set = []
training_set.extend(b_train)
training_set.extend(p_train)
training_labels = []
training_labels.extend(b_train_l)
training_labels.extend(p_train_l)
training_set = np.array(training_set)
training_labels = np.array(training_labels)
b_train = np.array(b_train)
p_train = np.array(p_train)
b_train_l = np.array(b_train_l)
p_train_l = np.array(p_train_l)

In [12]:
test_set = []
test_labels = []
test_set.extend(b_test)
test_set.extend(p_test)
test_labels.extend(b_test_l)
test_labels.extend(p_test_l)
test_set = np.array(test_set)
test_labels = np.array(test_labels)
b_test = np.array(b_test)
p_test = np.array(p_test)
b_test_l = np.array(b_test_l)
p_test_l = np.array(p_test_l)

In [13]:
neural_net = MLPClassifier(learning_rate_init=0.01)
decision_tree = DecisionTreeClassifier(criterion='entropy')
#params = {'learning_rate_init': [0.001, 0.01, 0.1, 1]}
#nn = GridSearchCV(MLPClassifier(), params, scoring=make_scorer(accuracy_score),cv=10, n_jobs=-1)

In [14]:
#nn.fit(training_set, training_labels)
#print nn.cv_results_
print (training_set.shape, training_labels.shape)
smote = SMOTE(1.0)
#smotenc = SMOTENC(1.0)
rs_x, rs_y = smote.fit_sample(training_set, training_labels)
#nc_x, nc_y = smotenc.fit_sample(training_set, training_labels)
print(rs_x.shape, rs_y.shape)
#print(nc_x.shape, nc_y.shape)

((2369, 284), (2369,))
((4318, 284), (4318,))


In [15]:
train_size, train_scores, test_scores = learning_curve(neural_net, training_set, training_labels, cv=5, train_sizes=np.linspace(0.1,1,10), 
                                                       scoring=make_scorer(accuracy_score), n_jobs=-1)

In [16]:
print (train_size)
print (np.mean(train_scores))
print (np.mean(test_scores))

[ 189  379  568  758  947 1137 1326 1516 1705 1895]
0.9889920844327176
0.9062068134985415


In [17]:
neural_net.fit(training_set, training_labels)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.01, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [18]:
print('scores on full test set, just benign, and just pirate, without increasing piracy sample set')
print(neural_net.score(test_set, test_labels))
print(neural_net.score(b_test, b_test_l))
print(neural_net.score(p_test, p_test_l))

0.6905537459283387
0.9904761904761905
0.041237113402061855


In [19]:
train_size, train_scores, test_scores = learning_curve(decision_tree, training_set, training_labels, cv=5, train_sizes=np.linspace(0.1,1,10), 
                                                       scoring=make_scorer(accuracy_score), n_jobs=-1)
print (train_size)
print (np.mean(train_scores))
print (np.mean(test_scores))
train_size, train_scores, test_scores = learning_curve(decision_tree, training_set, training_labels, cv=5, train_sizes=np.linspace(0.1,1,10), 
                                                       scoring=make_scorer(accuracy_score), n_jobs=-1)
print (train_size)
print (np.mean(train_scores))
print (np.mean(test_scores))
decision_tree.fit(training_set, training_labels)

[ 189  379  568  758  947 1137 1326 1516 1705 1895]
0.9999683377308708
0.9075919929349425
[ 189  379  568  758  947 1137 1326 1516 1705 1895]
0.9999683377308708
0.9082695069624713


DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [20]:
print('scores on full test set, just benign, and just pirate, without increasing piracy sample set')
print(decision_tree.score(test_set, test_labels))
print(decision_tree.score(b_test, b_test_l))
print(decision_tree.score(p_test, p_test_l))

0.6384364820846905
0.6904761904761905
0.5257731958762887


In [21]:
neural_net.fit(rs_x, rs_y)
decision_tree.fit(rs_x, rs_y)

print('scores on full test set, just benign, and just pirate')
print(neural_net.score(test_set, test_labels))
print(neural_net.score(b_test, b_test_l))
print(neural_net.score(p_test, p_test_l))
print('scores on full test set, just benign, and just pirate')
print(decision_tree.score(test_set, test_labels))
print(decision_tree.score(b_test, b_test_l))
print(decision_tree.score(p_test, p_test_l))

0.6710097719869706
0.7571428571428571
0.4845360824742268
0.6319218241042345
0.6666666666666666
0.5567010309278351
