###Problem Description
Our task is to examine data collected from a network trace and determine whether the network traffic indicates an intrusion or attack or whether it indicates normal network activity.

First we start by reading in the subset of data provided and loading it into a Padas dataframe and define the columns to make it easier to work with.

In [153]:
import os
import pandas as pd
import numpy as np
from scipy.stats import zscore
from collections.abc import Sequence
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import optimizers

df = pd.read_csv('network_intrusion_data.csv', na_values=['NA','?'])
df.columns = [
'duration',
'protocol_type',
'service',
'flag',
'src_bytes',
'dst_bytes',
'land',
'wrong_fragment',
'urgent',
'hot',
'num_failed_logins',
'logged_in',
'num_compromised',
'root_shell',
'su_attempted',
'num_root',
'num_file_creations',
'num_shells',
'num_access_files',
'num_outbound_cmds',
'is_host_login',
'is_guest_login',
'count',
'srv_count',
'serror_rate',
'srv_serror_rate',
'rerror_rate',
'srv_rerror_rate',
'same_srv_rate',
'diff_srv_rate',
'srv_diff_host_rate',
'dst_host_count',
'dst_host_srv_count',
'dst_host_same_srv_rate',
'dst_host_diff_srv_rate',
'dst_host_same_src_port_rate',
'dst_host_srv_diff_host_rate',
'dst_host_serror_rate',
'dst_host_srv_serror_rate',
'dst_host_rerror_rate',
'dst_host_srv_rerror_rate',
'outcome'
]
print(df.shape)
df.head()

(494020, 42)


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,outcome
0,0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,19,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,235,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,29,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,219,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,39,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,217,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,49,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,6,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,59,59,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.


We can already see that there is a good chance some of these rows may contain duplicates (see row 3 and 4 - they're nearly identical), so we want to drop those so as not to heavily weight duplicated data.

Next we want to find columns that don't contain any useful information so we can remove them from dataframe. In particular, let's focus on columns where all values are indentical.

In [154]:
### Drop any duplicate rows
df.drop_duplicates(inplace=True)

### Drop any columns that are idenitcal and therefore offer no valuable information
for (colName, colData) in df.iteritems():
  dummy_val = df.iloc[0][colName]
  if ((colData.values == dummy_val).all()):
    print("Dropping Column: %s" % colName)
    df.drop([colName], axis=1, inplace=True)
print(df.shape)
df.head()

Dropping Column: num_outbound_cmds
Dropping Column: is_host_login
(145585, 40)


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,outcome
0,0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,19,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,235,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,29,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,219,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,6,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,39,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,217,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,6,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,49,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,6,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,59,59,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.


Next we want to label encode the 'outcome' column to be able to distinguish between normal traffic and an intrusion attemp or attack. We put all of the possible outcomes into a list and use that list to create an new list of numeric (binary) outcomes to add to our dataframe.

In [155]:
outcomes = ['normal.',
            'back.',
            'buffer_overflow.',
            'ftp_write.',
            'guess_passwd.',
            'imap.',
            'ipsweep.',
            'land.',
            'loadmodule.',
            'multihop.',
            'neptune.',
            'nmap.',
            'perl.',
            'phf.',
            'pod.',
            'portsweep.',
            'rootkit.',
            'satan.',
            'smurf.',
            'spy.',
            'teardrop.',
            'warezclient.',
            'warezmaster.']

s = df['outcome']
s = s.replace(outcomes,[0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1])

df['outcome'] = s

### Look for any values that aren't a 1 or a 0, so we can check to make sure 
### everything was converted.
for x in df['outcome']:
  if (x != 0 and x != 1):
    print(x)

df

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,outcome
0,0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.0,0.0,1.0,0.0,0.00,19,19,1.0,0.0,0.05,0.00,0.00,0.00,0.0,0.0,0
1,0,tcp,http,SF,235,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.0,0.0,1.0,0.0,0.00,29,29,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,0
2,0,tcp,http,SF,219,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,6,6,0.00,0.00,0.0,0.0,1.0,0.0,0.00,39,39,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,0
3,0,tcp,http,SF,217,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,6,6,0.00,0.00,0.0,0.0,1.0,0.0,0.00,49,49,1.0,0.0,0.02,0.00,0.00,0.00,0.0,0.0,0
4,0,tcp,http,SF,217,2032,0,0,0,0,0,1,0,0,0,0,0,0,0,0,6,6,0.00,0.00,0.0,0.0,1.0,0.0,0.00,59,59,1.0,0.0,0.02,0.00,0.00,0.00,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494015,0,tcp,http,SF,310,1881,0,0,0,0,0,1,0,0,0,0,0,0,0,0,4,5,0.00,0.00,0.0,0.0,1.0,0.0,0.40,86,255,1.0,0.0,0.01,0.05,0.00,0.01,0.0,0.0,0
494016,0,tcp,http,SF,282,2286,0,0,0,0,0,1,0,0,0,0,0,0,0,0,6,6,0.00,0.00,0.0,0.0,1.0,0.0,0.00,6,255,1.0,0.0,0.17,0.05,0.00,0.01,0.0,0.0,0
494017,0,tcp,http,SF,203,1200,0,0,0,0,0,1,0,0,0,0,0,0,0,0,6,18,0.17,0.11,0.0,0.0,1.0,0.0,0.17,16,255,1.0,0.0,0.06,0.05,0.06,0.01,0.0,0.0,0
494018,0,tcp,http,SF,291,1200,0,0,0,0,0,1,0,0,0,0,0,0,0,0,6,12,0.00,0.00,0.0,0.0,1.0,0.0,0.17,26,255,1.0,0.0,0.04,0.05,0.04,0.01,0.0,0.0,0


Next, we need to investigate to see if any of the columns contain missing data. We will use isnull() and any() to find any missing or null values in the dataframe. (none exist after inspection)

In [156]:
df.isnull().any()

duration                       False
protocol_type                  False
service                        False
flag                           False
src_bytes                      False
dst_bytes                      False
land                           False
wrong_fragment                 False
urgent                         False
hot                            False
num_failed_logins              False
logged_in                      False
num_compromised                False
root_shell                     False
su_attempted                   False
num_root                       False
num_file_creations             False
num_shells                     False
num_access_files               False
is_guest_login                 False
count                          False
srv_count                      False
serror_rate                    False
srv_serror_rate                False
rerror_rate                    False
srv_rerror_rate                False
same_srv_rate                  False
d

Next we'd like to discover which columns need to be normalized. We need to define which columns are discrete, which columns are continous and discover if the values of the continous columns are between 0 and 1.

We found this site to be useful in determining the columns that were discrete or continous: https://kdd.ics.uci.edu/databases/kddcup99/task.html

(this information is also included in the pdf provided)


|feature name|description|type|
|---|---|---|
|duration| length (number of seconds) of the connection|continuous
|protocol_type| 	type of the protocol, e.g. tcp, udp, etc.| 	discrete
|service| 	network service on the destination, e.g., http, telnet, etc.| 	discrete
|src_bytes| 	number of data bytes from source to destination| 	continuous
|dst_bytes| 	number of data bytes from destination to source| 	continuous
|flag| 	normal or error status of the connection| 	discrete 
|land| 	1 if connection is from/to the same host/port; 0 otherwise| 	discrete
|wrong_fragment| 	number of ``wrong'' fragments| 	continuous
|urgent| 	number of urgent packets| 	continuous
 
Table 1: Basic features of individual TCP connections.
 
|feature name|	description| 	type
|---|---|---|
|hot| 	number of ``hot'' indicators	|continuous
|num_failed_logins| 	number of failed login attempts 	|continuous
|logged_in| 	1 if successfully logged in; 0 otherwise 	|discrete
|num_compromised| 	number of ``compromised'' conditions 	|continuous
|root_shell| 	1 if root shell is obtained; 0 otherwise 	|discrete
|su_attempted| 	1 if ``su root'' command attempted; 0 otherwise 	|discrete
|num_root| 	number of ``root'' accesses 	|continuous
|num_file_creations| 	number of file creation operations 	|continuous
|num_shells| 	number of shell prompts 	|continuous
|num_access_files| 	number of operations on access control files 	|continuous
|num_outbound_cmds|	number of outbound commands in an ftp session 	|continuous
|is_hot_login| 	1 if the login belongs to the ``hot'' list; 0 otherwise 	|discrete
|is_guest_login| 	1 if the login is a ``guest''login; 0 otherwise 	|discrete
 
Table 2: Content features within a connection suggested by domain knowledge.
 
|feature name	|description 	|type
|---|---|---|
|count 	|number of connections to the same host as the current connection in the past two seconds 	|continuous
||Note: The following  features refer to these same-host connections.
|serror_rate 	|% of connections that have ``SYN'' errors 	|continuous
|rerror_rate 	|% of connections that have ``REJ'' errors 	|continuous
|same_srv_rate 	|% of connections to the same service 	|continuous
|diff_srv_rate 	|% of connections to different services 	|continuous
|srv_count 	|number of connections to the same service as the current connection in the past two seconds 	|continuous
||Note: The following features refer to these same-service connections.	
|srv_serror_rate 	|% of connections that have ``SYN'' errors 	|continuous
|srv_rerror_rate 	|% of connections that have ``REJ'' errors 	|continuous
|srv_diff_host_rate 	|% of connections to different hosts 	|continuous 

After doing some research, we decided that the following columns were unnecessary and would not affect our results:
- duration
- src_bytes
- dst_bytes
- urgent

In [157]:
df.drop(columns=['duration','src_bytes','dst_bytes','urgent'], inplace=True)

df.head()

Unnamed: 0,protocol_type,service,flag,land,wrong_fragment,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,outcome
0,tcp,http,SF,0,0,0,0,1,0,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,19,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0
1,tcp,http,SF,0,0,0,0,1,0,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,29,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0
2,tcp,http,SF,0,0,0,0,1,0,0,0,0,0,0,0,0,6,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,39,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0
3,tcp,http,SF,0,0,0,0,1,0,0,0,0,0,0,0,0,6,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,49,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0
4,tcp,http,SF,0,0,0,0,1,0,0,0,0,0,0,0,0,6,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,59,59,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0


Alright, we are almost done with pre-processing. Now all that is left to do is one hot encode all of our categorical columns (protocol, outcome,  service, and flag) and normalize our numeric columns (count, srv_count, dst_host_count,dst_host_srv_count.

In [158]:
# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

encode_text_dummy(df,'protocol_type')
encode_text_dummy(df, 'service')
encode_text_dummy(df, 'flag')
df.head()

Unnamed: 0,land,wrong_fragment,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,outcome,protocol_type-icmp,protocol_type-tcp,protocol_type-udp,service-IRC,service-X11,service-Z39_50,service-auth,...,service-nnsp,service-nntp,service-ntp_u,service-other,service-pm_dump,service-pop_2,service-pop_3,service-printer,service-private,service-red_i,service-remote_job,service-rje,service-shell,service-smtp,service-sql_net,service-ssh,service-sunrpc,service-supdup,service-systat,service-telnet,service-tftp_u,service-tim_i,service-time,service-urh_i,service-urp_i,service-uucp,service-uucp_path,service-vmnet,service-whois,flag-OTH,flag-REJ,flag-RSTO,flag-RSTOS0,flag-RSTR,flag-S0,flag-S1,flag-S2,flag-S3,flag-SF,flag-SH
0,0,0,0,0,1,0,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,19,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,1,0,0,0,0,0,0,0,0,8,8,0.0,0.0,0.0,0.0,1.0,0.0,0.0,29,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,6,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,39,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,6,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,49,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,6,6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,59,59,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [0]:
# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low

In [160]:
for col in ['count', 'srv_count', 'dst_host_count','dst_host_srv_count']:
  encode_numeric_range(df, col)

df.head()

Unnamed: 0,land,wrong_fragment,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,outcome,protocol_type-icmp,protocol_type-tcp,protocol_type-udp,service-IRC,service-X11,service-Z39_50,service-auth,...,service-nnsp,service-nntp,service-ntp_u,service-other,service-pm_dump,service-pop_2,service-pop_3,service-printer,service-private,service-red_i,service-remote_job,service-rje,service-shell,service-smtp,service-sql_net,service-ssh,service-sunrpc,service-supdup,service-systat,service-telnet,service-tftp_u,service-tim_i,service-time,service-urh_i,service-urp_i,service-uucp,service-uucp_path,service-vmnet,service-whois,flag-OTH,flag-REJ,flag-RSTO,flag-RSTOS0,flag-RSTR,flag-S0,flag-S1,flag-S2,flag-S3,flag-SF,flag-SH
0,0,0,0,0,1,0,0,0,0,0,0,0,0,-0.968689,-0.968689,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.85098,-0.85098,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,1,0,0,0,0,0,0,0,0,-0.968689,-0.968689,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.772549,-0.772549,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,-0.976517,-0.976517,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.694118,-0.694118,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,-0.976517,-0.976517,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.615686,-0.615686,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,-0.976517,-0.976517,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.537255,-0.537255,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


Now all of our data is ready for processing! Lets begin by setting splitting our data into training and testing sets.


In [0]:
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column. 
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    else:
        # Regression
        return df[result].values.astype(np.float32), df[target].values.astype(np.float32)

In [162]:
x,y = to_xy(df, 'outcome')
print(df.shape)
print(x.shape)
print(y.shape)

#y = np.delete(y, 0, 1)
print(y.shape)
print(y[0])

(145585, 113)
(145585, 112)
(145585, 2)
(145585, 2)
[1. 0.]


The y set has 2 values in each record. isAnAttack, and isNotAnAttack.

Now we must split the data in to training and testing data.

In [0]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [164]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)


(109188, 112)
(109188, 2)
(36397, 112)
(36397, 2)


Alright, now that our data is properly split up, lets make a model!

In [166]:
model = Sequential()
model.add(Dense(10, input_dim=x.shape[1], activation='relu'))
#model.add(Dense(10, activation='tanh'))   
model.add(Dense(y.shape[1], activation='relu'))

adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
model.compile(loss='mean_squared_error', optimizer=adam)

monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')

# batch_size: Integer or None. Number of samples per gradient update. If unspecified, batch_size will default to 32.
model.fit(x_train,y_train,validation_data=(x_test,y_test), batch_size= 128, callbacks=[monitor],verbose=2,epochs=1000)

# Predict and measure RMSE
pred = model.predict(x_test)
score = np.sqrt(metrics.mean_squared_error(pred,y_test))
print("Score (RMSE): {}".format(score))


Train on 109188 samples, validate on 36397 samples
Epoch 1/1000
109188/109188 - 2s - loss: 0.1259 - val_loss: 0.0115
Epoch 2/1000
109188/109188 - 2s - loss: 0.0102 - val_loss: 0.0092
Epoch 3/1000
109188/109188 - 1s - loss: 0.0103 - val_loss: 0.0082
Epoch 4/1000
109188/109188 - 2s - loss: 0.0080 - val_loss: 0.0076
Epoch 5/1000
109188/109188 - 2s - loss: 0.0074 - val_loss: 0.0071
Epoch 6/1000
109188/109188 - 2s - loss: 0.0071 - val_loss: 0.0069
Epoch 7/1000
109188/109188 - 2s - loss: 0.0069 - val_loss: 0.0068
Epoch 8/1000
109188/109188 - 2s - loss: 0.0067 - val_loss: 0.0066
Epoch 9/1000
109188/109188 - 2s - loss: 0.0064 - val_loss: 0.0070
Epoch 10/1000
109188/109188 - 2s - loss: 0.0063 - val_loss: 0.0062
Epoch 00010: early stopping
Score (RMSE): 0.07851419597864151


In [167]:
pred = model.predict(x_test)
pred

array([[1.0082037 , 0.        ],
       [0.        , 0.9880713 ],
       [0.98834527, 0.        ],
       ...,
       [0.        , 1.0044284 ],
       [0.99937165, 0.        ],
       [1.0092589 , 0.        ]], dtype=float32)

In [168]:
pred = np.argmax(pred,axis=1) # raw probabilities to choose class (highest probability)
print(pred)

[0 1 0 ... 1 0 0]


In [169]:
y_true= np.argmax(y_test,axis=1) 

score = metrics.accuracy_score(y_true, pred)

print("Accuracy score: {}".format(score))

Accuracy score: 0.9905486715938127


In [170]:
score = metrics.precision_score(y_true, pred, average= "weighted")
print("Precision score: {}".format(score))

Precision score: 0.9906039663646914


In [171]:
score = metrics.f1_score(y_true, pred, average= "weighted")
print("F1 score: {}".format(score))

F1 score: 0.9905355464360532


In [172]:
print(metrics.classification_report(y_true, pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     21938
           1       1.00      0.98      0.99     14459

    accuracy                           0.99     36397
   macro avg       0.99      0.99      0.99     36397
weighted avg       0.99      0.99      0.99     36397

