In [2]:
from keras.models import Sequential, load_model, save_model
from keras.layers import Dense, Dropout, LeakyReLU
from keras.optimizers import *
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Fractions of the data reserved for validation / test.
# NOTE(review): test_part is not referenced by any cell visible here.
validation_part, test_part = 0.2, 0.2
# Number of training epochs and samples per mini-batch.
epoches_num = 200
batch_size = 64

# Seed numpy's global RNG. This must be a *call*: assigning to the attribute
# (np.random.seed = 99) replaces the seeding function with an int, seeds
# nothing, and breaks any later np.random.seed(...) call.
np.random.seed(99)
# Label strings for the negative (index 0) and positive (index 1) class.
ZERO_LABEL, POS_LABEL = 'BENIGN', 'PortScan'

Using Theano backend.


In [3]:
def shuffle_data(x, y):
    '''Shuffle x and y in place along axis 0 using one shared permutation.

    x, y: numpy arrays with the same length along the first axis; row i of
    x stays paired with row i of y after the shuffle.
    Returns None (both arrays are modified in place).
    '''
    # np.random.shuffle works in place and returns None, so it cannot be used
    # to build an index array; np.random.permutation returns the shuffled
    # indices directly.
    indices = np.random.permutation(x.shape[0])
    # Fancy indexing produces copies, so writing back through [:] is safe.
    x[:], y[:] = x[indices], y[indices]

def scale(data):
    '''Clean a numeric numpy array (nan, inf, zeros) and standardize it.

    Steps, in order:
      1. remember which rows held a non-zero entry and where the infs were;
      2. replace nan with 0 via np.nan_to_num (works on a copy);
      3. overwrite former inf positions with the max of the remaining values;
      4. overwrite every 0 with the mean over the remembered non-zero rows
         (or leave 0 if there were none);
      5. return StandardScaler().fit_transform(...) / 2 + 0.5.
    '''
    # Both masks are taken from the raw input, before any cleaning.
    nonzero_rows = np.nonzero(data)[0]
    inf_mask = np.isinf(data)

    cleaned = np.nan_to_num(data)
    cleaned[inf_mask] = cleaned[~inf_mask].max()

    if nonzero_rows.size == 0:
        zero_fill = 0
    else:
        zero_fill = cleaned[nonzero_rows].mean()
    cleaned[cleaned == 0] = zero_fill

    standardized = StandardScaler().fit_transform(cleaned)
    return standardized / 2 + 0.5

def gen(train=True, data=None):
    '''Endless mini-batch generator for Keras fit_generator.

    train: if True yield batches from the training split, otherwise from the
           validation split.
    data:  optional (X_train, y_train, X_val, y_val) tuple; when None the
           notebook-level variables of those names are used.

    Yields (x_batch, y_batch) slices of batch_size rows. The selected arrays
    are reshuffled in place (via shuffle_data) at the start of every pass and
    a trailing partial batch is dropped.
    '''
    if data is None:
        # Only *read* the notebook-level split. The previous version declared
        # these names `global`, which applies to the whole function, so
        # passing `data` silently overwrote the notebook-level arrays.
        data = (X_train, y_train, X_val, y_val)
    x_tr, y_tr, x_va, y_va = data
    x, y = (x_tr, y_tr) if train else (x_va, y_va)
    while True:
        shuffle_data(x, y)
        for i in range(x.shape[0] // batch_size):
            lo, hi = i * batch_size, (i + 1) * batch_size
            yield x[lo:hi], y[lo:hi]

In [4]:
def str_arr_to_one_hot(arr, max_one_hot_len=200):
    '''Label-encode arr and one-hot it when the number of classes is small.

    Returns (True, one_hot_matrix) when the number of distinct encoded values
    is below max_one_hot_len, otherwise (False, number_of_distinct_values).
    '''
    encoded = LabelEncoder().fit_transform(arr)
    n_classes = len(set(encoded))
    if n_classes < max_one_hot_len:
        return True, to_categorical(encoded)
    return False, n_classes

def data_to_one_hot(data):
    '''Turn every column of a DataFrame into numeric features.

    String columns (except 'Flow Bytes/s') are one-hot encoded through
    str_arr_to_one_hot; columns it rejects are skipped and reported.
    All other columns, and 'Flow Bytes/s', are cast to float64 and passed
    through scale as a single-column array.

    Returns the per-column results concatenated along axis 1.
    '''
    ans, not_used = [], []
    for column in data.columns:
        # .values replaces .as_matrix(), which was deprecated and then removed
        # in pandas 1.0; it returns the same underlying numpy array.
        data_arr = data[column].values
        if isinstance(data_arr[0], str) and column != 'Flow Bytes/s':
            is_used, data_arr = str_arr_to_one_hot(data_arr)
            if not is_used:
                # On rejection data_arr holds the distinct-value count.
                not_used.append([column, data_arr])
                continue
        else:
            data_arr = scale(data_arr.astype(np.float64).reshape(-1, 1))
        ans.append(data_arr)
    if not_used:
        print("Didn't use columns:")
        print(not_used)
    return np.concatenate(ans, axis=1)

def load_data(filename):
    '''Returns (x, y) if labels are in the file, else (x, IDnums)

    filename: CSV file with an 'IDNum' column (used as the index) and an
              optional 'Label' column.
    x is the output of data_to_one_hot over every non-label column.
    y is the one-hot label matrix, column-reversed if needed so that column 0
    corresponds to ZERO_LABEL and column 1 to POS_LABEL.
    '''
    data = pd.read_csv(filename, index_col='IDNum')
    x_one_hot = data_to_one_hot(data[ [c for c in data.columns if c != 'Label'] ])
    if 'Label' in data.columns:
        y_one_hot = data_to_one_hot(pd.DataFrame(data['Label']))
        # .iloc[0] replaces .as_matrix()[0]; as_matrix was removed in pandas 1.0.
        fir_elem_lab = data['Label'].iloc[0]
        # Use the first row as a probe: if its hot column disagrees with the
        # expected position for its label, the column order is reversed.
        if (fir_elem_lab == ZERO_LABEL and y_one_hot[0, 1] == 1) or \
                (fir_elem_lab == POS_LABEL and y_one_hot[0, 0] == 1):
            y_one_hot = y_one_hot[:, ::-1]
        return x_one_hot, y_one_hot
    return x_one_hot, np.array(data.index)

In [4]:
# Load the labelled training file and shuffle features/labels together.
X, Y = load_data('FWHA-PortScan-Train.csv')
shuffle_data(X, Y)

# The first `val_part` rows become the validation split, the rest is training.
val_part = int(validation_part * X.shape[0])
X_val, y_val = X[:val_part], Y[:val_part]
X_train, y_train = X[val_part:], Y[val_part:]

Didn't use columns:
[['Flow ID', 168626], ['Source IP', 3425], ['Destination IP', 4244], ['Flow Packets/s', 56389]]


In [5]:
def _build_mlp(input_dim):
    '''Build the MLP: three Dense -> Dropout -> LeakyReLU stages and a 2-way softmax.'''
    net = Sequential()
    # (units, dropout rate) for each hidden stage, in order.
    hidden_spec = ((2000, 0.4), (1000, 0.4), (500, 0.5))
    for position, (units, drop_rate) in enumerate(hidden_spec):
        if position == 0:
            # Only the first layer needs to be told the input width.
            net.add(Dense(units, input_dim=input_dim))
        else:
            net.add(Dense(units))
        net.add(Dropout(drop_rate))
        net.add(LeakyReLU())
    net.add(Dense(2, activation='softmax'))
    return net

model = _build_mlp(X.shape[1])

# Alternative optimizers (disabled):
#opt = Adadelta(lr=0.01)
#opt = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
# NOTE(review): nesterov=True is set while momentum is left at its default -
# confirm whether a non-zero momentum was intended.
opt = SGD(lr=0.0001, nesterov=True)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
#model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 2000)              460000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 2000)              0         
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 2000)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 1000)              2001000   
_________________________________________________________________
dropout_2 (Dropout)          (None, 1000)              0         
_________________________________________________________________
leaky_re_lu_2 (LeakyReLU)    (None, 1000)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 500)               500500    
__________

In [6]:
# Whole batches per pass over each split (gen drops the trailing partial batch).
train_steps = X_train.shape[0] // batch_size
val_steps = X_val.shape[0] // batch_size

# Write a checkpoint file only when the monitored quantity improves.
checkpoint = ModelCheckpoint("compet_models_minor/mlp_{epoch:d}-{acc:.2f}.hdf5",
                             save_best_only=True)

model.fit_generator(gen(),
                    steps_per_epoch=train_steps,
                    epochs=epoches_num,
                    validation_data=gen(train=False),
                    validation_steps=val_steps,
                    callbacks=[checkpoint])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200


Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 79/200
Epoch 80/200
Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epo

Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200

StopIteration: 

In [5]:
# The test file has no 'Label' column, so load_data returns the IDNum index
# as its second value.
X_test, IDnums = load_data('FWHA-PortScan-Test.csv')

# NOTE(review): the training cell writes checkpoints under
# "compet_models_minor/", while this loads from the working directory -
# confirm the file location.
model = load_model('mlp_198-1.00.hdf5')

# Predicted class index per row = position of the largest softmax output.
class_probs = model.predict(X_test)
y_test = class_probs.argmax(axis=1)

Didn't use columns:
[['Flow ID', 229683], ['Source IP', 3667], ['Destination IP', 4401], ['Flow Packets/s', 68847]]


In [6]:
# Map class index 0 -> ZERO_LABEL and 1 -> POS_LABEL, then pair each label
# with its IDNum as one (IDNum, Label) row.
label_lookup = np.array([ZERO_LABEL, POS_LABEL])
answers = np.vstack((IDnums, label_lookup[y_test])).T


In [7]:
# Show the assembled (IDNum, Label) rows as this cell's output.
answers

array([['170367', 'BENIGN'],
       ['170368', 'BENIGN'],
       ['170369', 'BENIGN'],
       ..., 
       ['456831', 'BENIGN'],
       ['456832', 'BENIGN'],
       ['456833', 'BENIGN']], 
      dtype='<U21')

In [None]:
# Write the predictions as a two-column CSV; comments='' keeps the header
# line free of the default '# ' prefix.
np.savetxt(
    'answers_2.csv',
    answers,
    fmt='%s',
    delimiter=',',
    header='IDNum,Label',
    comments='',
)