In [2]:
import pandas as pd
import tensorflow as tf
import numpy as np
import scipy.io
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn import metrics
from keras.layers.core import Dense, Activation

Using TensorFlow backend.


In [3]:
df = pd.read_csv(r'c:\Users\Dataset.csv', index_col=None, na_values=['NA'], sep=',', low_memory=False)

In [4]:
#Convert Malicious packets into true and Benign into false

df['target'] = df[' Label'] != 'BENIGN'
df = df.drop(' Label', axis=1)
df = df.replace(['Infinity'], np.nan)


In [5]:
#cleaning up data and dropping NaN and Infinity Values
df = df.dropna()
assert pd.isnull(df).sum().sum() == 0 
assert pd.isna(df).sum().sum() == 0

#Replace NaN and infinity values 
df['Flow Bytes/s'] = df['Flow Bytes/s'].replace(np.nan, 0)
df.replace(np.nan, 0, inplace=True)

In [6]:
#Split Features 
x = df.drop('target', axis=1)
y = df[['target']]

In [7]:
#Scale the data using Standard scaler with Mean = 0 and SD = 1

# Get column names from features
col_names = x.columns
# Creating Scaler object
scaler_object = preprocessing.StandardScaler()
# Fit data within the scaler object
scaled_df = scaler_object.fit_transform(x)
scaled_df = pd.DataFrame(scaled_df, columns=col_names)

# concat into x feature 
x = pd.DataFrame(scaled_df)

x[0:5]

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
0,-0.35638,-0.654735,-0.01146,-0.009372,-0.089171,-0.007588,-0.377384,-0.17702,-0.34649,-0.366976,...,-0.008567,-1.06835,-0.131771,-0.100491,-0.148812,-0.104465,-0.580707,-0.105787,-0.585977,-0.57149
1,-0.336726,-0.65562,0.001911,-0.005312,-0.062264,-0.007446,-0.25657,-0.294406,-0.285411,-0.228011,...,-0.002979,0.829301,-0.131771,-0.100491,-0.148812,-0.104465,-0.580707,-0.105787,-0.585977,-0.57149
2,-0.355871,-0.655605,0.000574,-0.004297,0.420442,-0.006187,2.219292,-0.294406,1.612049,2.428138,...,-0.004376,0.829301,-0.131771,-0.100491,-0.148812,-0.104465,-0.580707,-0.105787,-0.585977,-0.57149
3,-0.336726,-0.655275,0.009934,0.001794,0.469393,-0.004622,1.785685,-0.294406,0.902531,1.514421,...,0.005403,0.829301,-0.131771,-0.100491,-0.148812,-0.104465,-0.580707,-0.105787,-0.585977,-0.57149
4,-0.355871,-0.655605,-0.000763,-0.004297,0.420442,-0.006186,2.219292,-0.294406,1.83389,2.70187,...,-0.005773,0.829301,-0.131771,-0.100491,-0.148812,-0.104465,-0.580707,-0.105787,-0.585977,-0.57149


In [8]:
# Turn Outcomes into dummy variables 

dummies = pd.get_dummies(y['target'])

# This will turn BENIGN values into 1 being False and all malicious packets into 0 being True. 
y = dummies.values

print(y)

[[1 0]
 [1 0]
 [1 0]
 ...
 [1 0]
 [1 0]
 [1 0]]


In [9]:
#Scale the data using Standard scaler with Mean = 0 and SD = 1

# Get column names from features
#col_names = x.columns
# Creating Scaler object
scaler_object = preprocessing.StandardScaler()
# Fit data within the scaler object
x = scaler_object.fit_transform(x)

#x_train_std = scaler_object.transform(x_train)
#x_test_std = scaler_object.transform(x_test)
# concat into x feature 
#x = pd.DataFrame(scaled_df)

x[0:5]

array([[-3.56379802e-01, -6.54735460e-01, -1.14597467e-02,
        -9.37235674e-03, -8.91708108e-02, -7.58815974e-03,
        -3.77384006e-01, -1.77019589e-01, -3.46489635e-01,
        -3.66976227e-01, -6.34194764e-01, -4.31602931e-01,
        -6.85570487e-01, -6.00592871e-01, -5.83887875e-02,
        -3.08153261e-01, -4.40917193e-01, -5.83028067e-01,
        -5.96029103e-01, -5.01998398e-02, -6.49547665e-01,
        -4.59857667e-01, -5.64036984e-01, -5.93899549e-01,
        -1.16362436e-01, -4.16135027e-01, -2.76194140e-01,
        -3.28408658e-01, -3.55996406e-01, -1.14423506e-01,
        -2.09939380e-01,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00, -1.42045858e-02, -1.16555317e-02,
        -2.98644071e-01, -1.30350124e-01, -2.81287032e-01,
        -6.53453702e-01, -7.38636978e-01, -6.68810514e-01,
        -5.08120498e-01, -3.28554200e-01, -2.09939380e-01,
        -1.83998745e-02, -4.90695577e-01,  1.18238142e+00,
         3.74975820e+00,  0.00000000e+00, -1.84391619e-0

In [10]:
#Perform PCA on the feature vector to get a more precise accuracy on the dataset. Rescale from 78 features to 2 features.

pca=PCA(n_components = 2)
pca.fit(x)
x_pca = pca.transform(x)
print ("original shape:   ", x.shape)
print ("transformed shape:",x_pca.shape)

x = x_pca


original shape:    (691406, 78)
transformed shape: (691406, 2)


In [11]:
#Split into test and training sets. 25% test sample 

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state=42)

In [12]:
from sklearn.neural_network import MLPClassifier

In [13]:
classifier = MLPClassifier(hidden_layer_sizes=(120,60,20), 
                           max_iter=50,activation = 'relu',solver='adam', 
                           early_stopping= True, learning_rate_init = 0.001, alpha = 1e-4, 
                           verbose = 5, random_state=42)
#Uses Log Loss function
#same as Categorical 

In [14]:
#Fitting the training data to the network
classifier.fit(x_train, y_train)

Iteration 1, loss = inf
Validation score: 0.915420
Iteration 2, loss = inf
Validation score: 0.918370
Iteration 3, loss = inf
Validation score: 0.929786
Iteration 4, loss = inf
Validation score: 0.935610
Iteration 5, loss = 0.25811321
Validation score: 0.946062
Iteration 6, loss = inf
Validation score: 0.939621
Iteration 7, loss = inf
Validation score: 0.952503
Iteration 8, loss = inf
Validation score: 0.963052
Iteration 9, loss = 0.20878373
Validation score: 0.965636
Iteration 10, loss = inf
Validation score: 0.945676
Iteration 11, loss = inf
Validation score: 0.967487
Iteration 12, loss = inf
Validation score: 0.964228
Iteration 13, loss = inf
Validation score: 0.955801
Iteration 14, loss = inf
Validation score: 0.963669
Iteration 15, loss = inf
Validation score: 0.967815
Iteration 16, loss = inf
Validation score: 0.968991
Iteration 17, loss = inf
Validation score: 0.970013
Iteration 18, loss = inf
Validation score: 0.967101
Iteration 19, loss = inf
Validation score: 0.961702
Iterati

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=True, epsilon=1e-08,
              hidden_layer_sizes=(120, 60, 20), learning_rate='constant',
              learning_rate_init=0.001, max_iter=50, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=42, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=5, warm_start=False)

In [15]:
#Predicting y for x_validation
y_pred = classifier.predict(x_test)

In [16]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_pred.argmax(axis = 1), y_test.argmax(axis = 1))

In [1]:
cm

NameError: name 'cm' is not defined

In [18]:
#True Positive (TP) : Observation is positive, and is predicted to be positive.
#False Negative (FN) : Observation is positive, but is predicted negative.
#True Negative (TN) : Observation is negative, and is predicted to be negative.
#False Positive (FP) : Observation is negative, but is predicted positive.

TP = 107588
FP = 2281
FN = 2303
TN = 60680

In [19]:
Acc = ((TP + TN)/(TP + TN + FP + FN))

print( "Accuracy", Acc)

Accuracy 0.973480202716775


In [20]:
# Recall 

Recall = ((TP)/(TP + FN))

print("Recall", Recall)

Recall 0.979042869752755


In [21]:
# Precision

Precision = ((TP)/(TP+FP))

print("Precision", Precision)

Precision 0.9792389117949558


In [22]:
#F1 Score 

F1 = ((2*Recall*Precision)/(Recall + Precision))

print ("F1 Score", F1)

F1 Score 0.9791408809610483


In [76]:
#parameter_space = {
  #  'hidden_layer_sizes': [(150,100,50), (100,50,25), (200,)],
  #  'activation': ['tanh', 'relu'],
  #  'solver': ['sgd', 'adam'],
  #  'alpha': [0.0001, 0.05],
  #  'learning_rate': ['constant','adaptive'],
#}

In [None]:
#from sklearn.model_selection import GridSearchCV

#clf = GridSearchCV(classifier, parameter_space, n_jobs=-1, cv=3)
#clf.fit(x_train, y_train)

In [23]:
from keras.models import load_model

model.save('y_pred')  # creates a HDF5 file 'my_model.h5'
del model  # deletes the existing model

# returns a compiled model
# identical to the previous one
model = load_model('y_pred')

NameError: name 'model' is not defined