In [107]:
## import libraries
import numpy as np
np.random.seed(123)

import pandas as pd
import subprocess
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import mean_absolute_error,matthews_corrcoef,classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import KFold
from sklearn.utils import shuffle
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.core import Activation
from keras.layers.advanced_activations import PReLU
from keras.callbacks import CSVLogger,EarlyStopping, ModelCheckpoint

In [2]:
## Batch generators ##################################################################################################################################

def batch_generator(X, y, batch_size, shuffle):
    #chenglong code for fiting from generator (https://www.kaggle.com/c/talkingdata-mobile-user-demographics/forums/t/22567/neural-network-for-sparse-matrices)
    number_of_batches = np.ceil(X.shape[0]/batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])
    if shuffle:
        np.random.shuffle(sample_index)
    while True:
        batch_index = sample_index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X[batch_index,:].toarray()
        y_batch = y[batch_index]
        counter += 1
        yield X_batch, y_batch
        if (counter == number_of_batches):
            if shuffle:
                np.random.shuffle(sample_index)
            counter = 0

def batch_generatorp(X, batch_size, shuffle):
    number_of_batches = X.shape[0] / np.ceil(X.shape[0]/batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X[batch_index, :].toarray()
        counter += 1
        yield X_batch
        if (counter == number_of_batches):
            counter = 0


## Read Data

In [3]:
Train = pd.read_csv('Bidirectional_Botnet_Training_Final_Flow_Based_Features.csv')
Test = pd.read_csv('Bidirectional_Botnet_Test_Final_Flow_Based_Features.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
Train.head(3)

Unnamed: 0,FlowNo.,Info,udp_Length,Destination,APL,AvgPktPerSec,Destination Port,Answer RRs,TCP Segment Len,Differentiated Services Field,...,NumPackets,Source Port,StdDevLen,Next sequence number,SameLenPktRatio,FPL,Duration,Time to live,Sequence number,NPEx
0,211023,80 > 4546 [ACK] Seq=4700 Ack=120 Win=5840 Len=0,,192.168.2.109,60.0,0.0,4546,,0.0,0x00,...,1,80,0.0,,1.0,60,0.0,41,4700,1
1,442699,2301 > 80 [ACK] Seq=435 Ack=8199 Win=16349 L...,,74.55.1.4,60.0,0.144511,80,,0.0,0x00,...,2,2301,0.0,,0.5,60,13.8398,128,435,2
2,79732,4683 > 80 [SYN] Seq=0 Win=16384 Len=0 MSS=14...,,125.6.164.43,61.0,9.5004,80,,0.0,0x00,...,4,4683,1.0,,0.5,62,0.421035,128,0,4


In [5]:
list(Train.columns)

['FlowNo.',
 'Info',
 'udp_Length',
 'Destination',
 'APL',
 'AvgPktPerSec',
 'Destination Port',
 'Answer RRs',
 'TCP Segment Len',
 'Differentiated Services Field',
 'IOPR',
 'Source',
 'Length',
 'Time',
 'IAT',
 'reconnects',
 'tcp_Flags',
 'Protocols in frame',
 'No.',
 'Protocol',
 'BytesEx',
 'isNull',
 'BitsPerSec',
 'isBot',
 'IP_Flags',
 'NumPackets',
 'Source Port',
 'StdDevLen',
 'Next sequence number',
 'SameLenPktRatio',
 'FPL',
 'Duration',
 'Time to live',
 'Sequence number',
 'NPEx']

In [6]:
Train['Time to live'].head(10)

0         41
1        128
2        128
3        128
4        128
5    128,239
6        128
7        128
8        128
9        128
Name: Time to live, dtype: object

In [7]:
features = ['FlowNo.',
 'APL',
 'AvgPktPerSec',
# 'Destination Port',
 'IOPR',
 'Length',
 'IAT',
 'Protocol',
 'BytesEx',
 'BitsPerSec',
 'NumPackets',
# 'Source Port',
 'StdDevLen',
 'SameLenPktRatio',
 'FPL',
 'Duration',
 'NPEx']

In [8]:
target = ['isBot']

In [9]:
for x in features:
    print x,type(Train[x][0]),type(Test[x][0])

FlowNo. <type 'numpy.int64'> <type 'numpy.int64'>
APL <type 'numpy.float64'> <type 'numpy.float64'>
AvgPktPerSec <type 'numpy.float64'> <type 'numpy.float64'>
IOPR <type 'numpy.float64'> <type 'numpy.int64'>
Length <type 'numpy.int64'> <type 'numpy.int64'>
IAT <type 'numpy.float64'> <type 'numpy.float64'>
Protocol <type 'str'> <type 'str'>
BytesEx <type 'numpy.int64'> <type 'numpy.int64'>
BitsPerSec <type 'numpy.float64'> <type 'numpy.float64'>
NumPackets <type 'numpy.int64'> <type 'numpy.int64'>
StdDevLen <type 'numpy.float64'> <type 'numpy.float64'>
SameLenPktRatio <type 'numpy.float64'> <type 'numpy.float64'>
FPL <type 'numpy.int64'> <type 'numpy.int64'>
Duration <type 'numpy.float64'> <type 'numpy.float64'>
NPEx <type 'numpy.int64'> <type 'numpy.int64'>


In [10]:
id_Train = Train['FlowNo.']
id_Test = Test['FlowNo.']

In [11]:
ntrain = Train.shape[0]

In [12]:
Tr_Te = pd.concat((Train,Test),axis=0)

In [15]:
num_features = ['FlowNo.',
 'APL',
 'AvgPktPerSec',
# 'Destination Port',
 'IOPR',
 'Length',
 'IAT',
 'BytesEx',
 'BitsPerSec',
 'NumPackets',
# 'Source Port',
 'StdDevLen',
 'SameLenPktRatio',
 'FPL',
 'Duration',
 'NPEx']
cat_features = ['Protocol']

In [16]:
X = []

In [18]:
for x in cat_features:
    temp = pd.get_dummies(Tr_Te[x].astype('category'))
    X.append(temp)

In [19]:
scaler = StandardScaler()
tmp = scaler.fit_transform(Tr_Te[num_features])
X.append(tmp)

In [20]:
Y = Tr_Te['isBot']

In [21]:
del(Tr_Te,Train,Test)

In [29]:
cpX = X[0]
for i in range(1,len(X)):
    cpX = np.hstack((cpX,X[i]))
    
X = cpX
print X.shape


(859478, 121)


In [31]:
X_train = X[:ntrain,:]
X_test = X[ntrain:,:]
Y_train = Y[:ntrain]
Y_test = Y[ntrain:]

In [32]:
del(X,cpX)

In [33]:
X_train

array([[ 0.        ,  0.        ,  0.        , ..., -0.30760879,
        -0.00177444, -0.01383582],
       [ 0.        ,  0.        ,  0.        , ..., -0.30760879,
        -0.00176461, -0.01130548],
       [ 0.        ,  0.        ,  0.        , ..., -0.29266384,
        -0.00177414, -0.0062448 ],
       ..., 
       [ 0.        ,  0.        ,  0.        , ...,  0.1332672 ,
        -0.00177413, -0.01636615],
       [ 0.        ,  0.        ,  0.        , ...,  0.02865256,
        -0.00177444, -0.01636615],
       [ 0.        ,  0.        ,  0.        , ...,  0.3051341 ,
        -0.00176804, -0.01636615]])

In [34]:
Y_train

0         1
1         0
2         1
3         0
4         1
5         0
6         1
7         1
8         0
9         0
10        0
11        1
12        0
13        0
14        1
15        1
16        1
17        1
18        1
19        0
20        0
21        0
22        0
23        0
24        0
25        1
26        1
27        1
28        1
29        1
         ..
504014    0
504015    0
504016    1
504017    1
504018    0
504019    1
504020    0
504021    1
504022    1
504023    1
504024    1
504025    1
504026    0
504027    1
504028    1
504029    0
504030    1
504031    1
504032    0
504033    1
504034    1
504035    0
504036    0
504037    0
504038    0
504039    0
504040    1
504041    1
504042    1
504043    0
Name: isBot, dtype: int64

In [35]:
print len(X_train),len(Y_train)
print len(X_test),len(Y_test)

504044 504044
355434 355434


In [89]:
from keras import backend as K
def matthews_correlation(y_true, y_pred):
    '''Calculates the Matthews correlation coefficient measure for quality
    of binary classification problems.
    '''
    y_pred_pos = K.round(K.clip(y_pred, 0, 1))
    y_pred_neg = 1 - y_pred_pos

    y_pos = K.round(K.clip(y_true, 0, 1))
    y_neg = 1 - y_pos

    tp = K.sum(y_pos * y_pred_pos)
    tn = K.sum(y_neg * y_pred_neg)

    fp = K.sum(y_neg * y_pred_pos)
    fn = K.sum(y_pos * y_pred_neg)

    numerator = (tp * tn - fp * fn)
    denominator = K.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

    return numerator / (denominator + K.epsilon())


In [98]:
def nn_model():
    model = Sequential()
    
    model.add(Dense(100, input_dim = X_train.shape[1], init = 'he_normal'))
    model.add(Activation('sigmoid'))
    model.add(BatchNormalization())
    model.add(Dropout(0.4))
        
    model.add(Dense(50, init = 'he_normal'))
    model.add(Activation('sigmoid'))
    model.add(BatchNormalization())    
    model.add(Dropout(0.2))
    
    #model.add(Dense(50, init = 'he_normal'))
    #model.add(PReLU())
    #model.add(BatchNormalization())    
    #model.add(Dropout(0.2))
    
    model.add(Dense(1, init = 'he_normal'))
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam',metrics=['accuracy','fbeta_score','matthews_correlation'])
    return(model)

In [144]:
model = nn_model()

In [145]:
csv_logger = CSVLogger('DL/log.txt')
earlyStopping = EarlyStopping(monitor='val_acc', patience=4, verbose=2, mode='min')

In [146]:
model.fit(X_train,Y_train,nb_epoch=50,batch_size=128,callbacks=[csv_logger,earlyStopping],validation_data=(X_test,Y_test),verbose=1)

Train on 504044 samples, validate on 355434 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50


<keras.callbacks.History at 0x7f3c36f06350>

In [147]:
y_pred = model.predict_classes(X_train)
y_pred = np.reshape(y_pred,(y_pred.shape[0]))



In [148]:
true_pred = np.array(Y_train)

In [149]:
def print_metr(y_pred,y_true):
    print '\n',classification_report(y_pred,y_true)

In [150]:
print_metr(y_pred,true_pred)


             precision    recall  f1-score   support

          0       0.72      0.66      0.69    241921
          1       0.71      0.76      0.73    262123

avg / total       0.71      0.71      0.71    504044



In [151]:
pred_test = model.predict_classes(X_test)
true_test = np.reshape(Y_test,(Y_test.shape[0]))
print_metr(pred_test,true_test)


             precision    recall  f1-score   support

          0       0.50      0.59      0.54    103315
          1       0.82      0.76      0.79    252119

avg / total       0.73      0.71      0.72    355434



In [108]:
confusion_matrix(pred_test,true_test)

array([[ 75458,  70628],
       [ 46304, 163044]])

In [153]:
sum(Y_test)

233672

In [154]:
len(Y_test)-sum(Y_test)

121762

In [155]:
print sum(Y_train),len(Y_train)-sum(Y_train)

281275 222769


In [1]:
import graphlab as gl

In [2]:
SF = gl.SFrame.read_csv('ISCX_Botnet-Training.csv',verbose=False)

This non-commercial license of GraphLab Create for academic use is assigned to hmishra2250@gmail.com and will expire on September 18, 2017.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1480102912.log


In [3]:
SF.rename({'DestInation Port':'Destination Port'})

No.,Time,Source,Destination,Protocol,Length,Info,Destination Port
1,0.0,CiscoInc_db:19:c3,Broadcast,ARP,60,Who has 147.32.84.165? Tell 147.32.84.1 ...,
2,8.982709,CiscoInc_db:19:c3,Broadcast,ARP,60,Who has 147.32.84.165? Tell 147.32.84.1 ...,
3,50.099564,CiscoInc_db:19:c3,Broadcast,ARP,60,Who has 147.32.84.165? Tell 147.32.84.1 ...,
4,50.369266,54:52:00:00:00:01,Broadcast,ARP,60,Who has 147.32.84.165? Tell 147.32.84.85 ...,
5,51.369054,54:52:00:00:00:01,Broadcast,ARP,60,Who has 147.32.84.165? Tell 147.32.84.85 ...,
6,52.369688,54:52:00:00:00:01,Broadcast,ARP,60,Who has 147.32.84.165? Tell 147.32.84.85 ...,
7,53.08684,CiscoInc_db:19:c3,Broadcast,ARP,60,Who has 147.32.84.165? Tell 147.32.84.1 ...,
8,59.086131,CiscoInc_db:19:c3,Broadcast,ARP,60,Who has 147.32.84.165? Tell 147.32.84.1 ...,
9,160.084662,CadmusCo_b5:b7:19,Broadcast,ARP,60,Gratuitous ARP for 147.32.84.165 (Request) ...,
10,160.084668,CadmusCo_b5:b7:19,Broadcast,ARP,60,Gratuitous ARP for 147.32.84.165 (Request) ...,

Source Port,Protocols in frame,Differentiated Services Field ...,IP_Flags,udp_Length,TCP Segment Len,Sequence number
,eth:ethertype:arp,,,,,
,eth:ethertype:arp,,,,,
,eth:ethertype:arp,,,,,
,eth:ethertype:arp,,,,,
,eth:ethertype:arp,,,,,
,eth:ethertype:arp,,,,,
,eth:ethertype:arp,,,,,
,eth:ethertype:arp,,,,,
,eth:ethertype:arp,,,,,
,eth:ethertype:arp,,,,,

tcp_Flags,Answer RRs,Source GeoIP Country,Source GeoIP Latitude,Source GeoIP Longitude,Destination GeoIP Latitude ...,Destination GeoIP Longitude ...
,,,,,,
,,,,,,
,,,,,,
,,,,,,
,,,,,,
,,,,,,
,,,,,,
,,,,,,
,,,,,,
,,,,,,

Destination GeoIP Country,Time to live,Next sequence number
,,
,,
,,
,,
,,
,,
,,
,,
,,
,,


In [4]:
SF = SF[(SF['Source Port']!='')&(SF['Destination Port']!='')]

In [5]:
len(SF)

9360525

In [6]:
def flow_id(x):
    if x['Source']>x['Destination']:
        return x['Source']+'-'+x['Destination']+'-'+str(x['Source Port'])+'-'+str(x['Destination Port'])+'-'+x['Protocol']
    else:
        return x['Destination']+'-'+x['Source']+'-'+str(x['Destination Port'])+'-'+str(x['Source Port'])+'-'+x['Protocol']
SF['UFid'] = SF.apply(lambda x:flow_id(x))

In [7]:
print len(SF['UFid'].unique())

300921


In [8]:
SF['isFwd'] = SF.apply(lambda x:1 if x['Source']>x['Destination'] else 0)
SF['isBck'] = SF.apply(lambda x:1 if x['Source']<=x['Destination'] else 0)

In [9]:
temp = SF.groupby(['UFid'],{
        'Forward':gl.aggregate.SUM('isFwd'),
        'Backward':gl.aggregate.SUM('isBck')
    })

In [10]:
print len(temp)

300921


In [11]:
len(temp[(temp['Forward']>=1)&(temp['Backward']>=1)])

218095

In [15]:
temp2 = temp[(temp['Forward']>=1)&(temp['Backward']>=1)]

In [16]:
prots = temp2['UFid'].apply(lambda x:x.split('-')[-1])

In [17]:
prots

dtype: str
Rows: ?
['LANMAN', 'DNS', 'TCP', 'HTTP', 'TCP', 'UDP', 'POP', 'DNS', 'HTTP', 'DNS', 'HTTP', 'HTTP', 'DNS', 'DNS', 'TCP', 'HTTP', 'TCP', 'UDP', 'TCP', 'DNS', 'TCP', 'TCP', 'DNS', 'HTTP', 'HTTP', 'HTTP', 'TCP', 'UDP', 'DNS', 'TCP', 'TCP', 'HTTP', 'TCP', 'TCP', 'TCP', 'TCP', 'TCP', 'DNS', 'TCP', 'HTTP', 'LANMAN', 'TCP', 'HTTP', 'DNS', 'HTTP', 'HTTP', 'TCP', 'HTTP', 'TCP', 'TCP', 'HTTP', 'TCP', 'SSHv2', 'TCP', 'DNS', 'TCP', 'IMAP', 'TCP', 'HTTP', 'TCP', 'HTTP', 'DNS', 'HTTP', 'IMAP', 'DNS', 'TCP', 'TCP', 'HTTP', 'POP', 'SMTP', 'HTTP', 'TCP', 'TCP', 'eDonkey', 'TCP', 'DNS', 'TCP', 'TCP', 'TCP', 'TCP', 'TLSv1', 'TCP', 'HTTP', 'TCP', 'DNS', 'HTTP', 'TCP', 'DNS', 'DNS', 'DNS', 'TCP', 'DNS', 'UDP', 'TCP', 'HTTP', 'HTTP', 'TLSv1', 'DNS', 'TCP', 'HTTP', ... ]

In [19]:
for x in set(prots):
    print x,sum(prots==x)

ECHO 1
HTTP 46278
SSLv3 2
POP 2099
TLSv1 2943
SSHv2 2513
ATH 1
eDonkey 176
NBSS 1545
SMB 1545
EGD 1
LLMNR 1
BitTorrent 32
TCP 97648
NBNS 3
LLC 2
ANSI C12.22 1
QUAKEWORLD 2
FTP 232
RDP 1812
COTP 1811
ESP 1
TP 1
SMTP 4906
Gnutella 2
DCERPC 1
PKTC 2
H.225.0 1
DNS 44795
VNC 1
GVSP 8
IRC 39
IMAP 623
UDP 7246
RTCP 2
RSIP 1
MSNMS 7
Pathport 2
LANMAN 1544
QUAKE3 8
Socks 1
SSL 3
VICP 1
SSH 22
QUIC 6
NTP 11
BAT_VIS 1
MiNT 1
TPKT 211
