# Libraries and Directory path

In [2]:
from google.colab import drive
import pandas as pd
import numpy as np
from sklearn.preprocessing import (StandardScaler, OrdinalEncoder,LabelEncoder, MinMaxScaler, OneHotEncoder)

!pip install -q keras  # Installing Keras

from keras.utils import to_categorical
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Normalizer, MaxAbsScaler , RobustScaler, PowerTransformer
import matplotlib.pyplot as plt
import seaborn as sns

# Mount the Google Drive directory containing the NSL-KDD datasets.
drive.mount('/content/drive') 

Mounted at /content/drive


# Datasets and Features

In [3]:
# Paths of the three NSL-KDD dataset files (read later as CSV) under the mounted Google Drive.
_nsl_kdd_dir = '/content/drive/My Drive/Projects | R&D/FYP advice papers/NSL-KDD-Dataset'

train = _nsl_kdd_dir + '/KDDTrain+.txt'    # KDDTrain+ file
test = _nsl_kdd_dir + '/KDDTest+.txt'      # KDDTest+ file
test21 = _nsl_kdd_dir + '/KDDTest-21.txt'  # KDDTest-21 file

In [4]:
# Vocabulary used when loading and encoding the datasets.
# Every list below is produced by splitting a whitespace-separated string,
# so the order of the names in the string is the order of the list.

# Column names, in file order (the last two are 'label' and 'difficulty').
featureV = """
  duration protocol_type service flag src_bytes dst_bytes land wrong_fragment urgent hot
  num_failed_logins logged_in num_compromised root_shell su_attempted num_root num_file_creations num_shells
  num_access_files num_outbound_cmds is_host_login is_guest_login count srv_count serror_rate srv_serror_rate
  rerror_rate srv_rerror_rate same_srv_rate diff_srv_rate srv_diff_host_rate dst_host_count dst_host_srv_count
  dst_host_same_srv_rate dst_host_diff_srv_rate dst_host_same_src_port_rate dst_host_srv_diff_host_rate dst_host_serror_rate
  dst_host_srv_serror_rate dst_host_rerror_rate dst_host_srv_rerror_rate label difficulty
""".split()

# Values of the 'flag' column.
flagV = "OTH RSTOS0 SF SH RSTO S2 S1 REJ S3 RSTR S0".split()

# Values of the 'protocol_type' column.
protocol_typeV = "tcp udp icmp".split()

# Values of the 'service' column.
serviceV = """
  http smtp finger domain_u auth telnet ftp eco_i ntp_u ecr_i other private pop_3 ftp_data
  rje time mtp link remote_job gopher ssh name whois domain login imap4 daytime ctf nntp
  shell IRC nnsp http_443 exec printer efs courier uucp klogin kshell echo discard systat
  supdup iso_tsap hostnames csnet_ns pop_2 sunrpc uucp_path netbios_ns netbios_ssn netbios_dgm
  sql_net vmnet bgp Z39_50 ldap netstat urh_i X11 urp_i pm_dump tftp_u tim_i red_i icmp
  http_2784 harvest aol http_8001
""".split()

# Attack names per category, written once (without a trailing '.').
_probe_attacks = ['ipsweep', 'nmap', 'portsweep', 'satan', 'saint', 'mscan']
_dos_attacks = [
  'back', 'land', 'neptune', 'pod', 'smurf', 'teardrop', 'apache2', 'udpstorm', 'processtable', 'mailbomb'
  ]
_u2r_attacks = ['buffer_overflow', 'loadmodule', 'perl', 'rootkit', 'xterm', 'ps', 'sqlattack']
_r2l_attacks = [
  'ftp_write', 'guess_passwd', 'imap', 'multihop', 'phf', 'spy', 'warezclient', 'warezmaster',
  'snmpgetattack', 'named', 'xlock', 'xsnoop', 'sendmail', 'httptunnel', 'worm', 'snmpguess'
  ]

# Labels for binary classification: 'normal' followed by every attack name,
# category by category (probe, dos, u2r, r2l).
binary_attack = ['normal'] + _probe_attacks + _dos_attacks + _u2r_attacks + _r2l_attacks

# Labels for multi-class classification, grouped by category. The attack
# names here carry a trailing '.'; the 'normal' entry is a plain string.
multiclass_attack = {
  'normal': 'normal',
  'probe': [name + '.' for name in _probe_attacks],
  'dos': [name + '.' for name in _dos_attacks],
  'u2r': [name + '.' for name in _u2r_attacks],
  'r2l': [name + '.' for name in _r2l_attacks],
  }

In [5]:
# Load every dataset file into its own pandas DataFrame.
def _load_split(path):
  """Read one comma-separated dataset file, labelling its columns with featureV."""
  return pd.read_csv(path, names=featureV)

# Training and testing DataFrames.
train_data = _load_split(train)
test_data = _load_split(test)

# Unseen testing DataFrame.
test_21 = _load_split(test21)

In [6]:
# Filter out rows by their 'service' value.
#
# Service labels to remove from the train DataFrame.
_train_services_to_drop = ['aol', 'harvest', 'http_2784', 'http_8001', 'red_i', 'urh_i', 'printer', 'rje']
# Service labels to remove from the test DataFrame.
_test_services_to_drop = ['printer', 'rje']

# Keep only the rows whose service is NOT in the corresponding list.
train_data = train_data[~train_data['service'].isin(_train_services_to_drop)]
test_data = test_data[~test_data['service'].isin(_test_services_to_drop)]

# Data pre-processing and Validation data

In [7]:
def preprocessing(data,cls,df):
  """Encode labels and build the scaled feature matrix for one dataset split.

  The input DataFrame is not modified; the encoded labels live in a new Series.

  Args:
    data: DataFrame holding a 'label' column plus the columns 'duration'
      through 'hot' (that range must contain 'protocol_type', 'service'
      and 'flag').
    cls: Label scheme. 'binary' maps every name in binary_attack to 1;
      'multiclass' maps the 'probe', 'dos', 'u2r' and 'r2l' groups of
      multiclass_attack to 1, 2, 3 and 4 (each name is matched with and
      without its last character). 'normal' / 'normal.' always map to 0.
      Labels not covered by the chosen scheme are left unchanged.
    df: 'train' returns one-hot encoded labels; any other value returns
      the encoded label Series.

  Returns:
    A tuple (x, labels). x is the array returned by
    MinMaxScaler.fit_transform over the remaining 'duration'..'hot' columns
    followed by the one-hot columns for 'protocol_type', 'service' and
    'flag'. labels is pd.get_dummies(y) when df == 'train', otherwise y.
  """
  # Build one lookup table instead of running a full-column replace per name.
  # setdefault keeps the first code assigned to a name, so 'normal' stays 0
  # even though binary_attack also lists it.
  label_codes = {'normal.': 0, 'normal': 0}

  if cls=='binary':
    for name in binary_attack:
      label_codes.setdefault(name, 1)

  elif cls=='multiclass':
    # probe -> 1, dos -> 2, u2r -> 3, r2l -> 4.
    for code, group in enumerate(('probe', 'dos', 'u2r', 'r2l'), start=1):
      for name in multiclass_attack[group]:
        label_codes.setdefault(name, code)
        label_codes.setdefault(name[:-1], code)

  # map() returns a new Series, so the caller's DataFrame keeps its labels.
  # infer_objects() yields an integer dtype once every label has been encoded.
  y = data['label'].map(lambda value: label_codes.get(value, value)).infer_objects()

  # NOTE(review): only the columns 'duration' through 'hot' are used as
  # features; every column after 'hot' is ignored.
  x = data.loc[:,'duration':'hot']

  # One-hot encode the three categorical columns in a single call. The
  # encoded columns are appended after the untouched ones in the order given
  # here. The per-column prefix avoids clashes between equal category names
  # (e.g. 'icmp' is both a protocol and a service label), and no index-based
  # join is involved.
  # NOTE(review): the dummy columns depend on the categories present in
  # `data`, so two splits only get the same width if they contain the same
  # categories.
  x = pd.get_dummies(x, columns=['protocol_type', 'service', 'flag'])

  # NOTE(review): a fresh scaler is fitted on every call, so each split is
  # scaled by its own min/max rather than by the training split's.
  x = MinMaxScaler(feature_range=(0, 1)).fit_transform(x)

  if df=='train':
    # One-hot labels, built only for the training split.
    return x, pd.get_dummies(y)
  return x, y

In [8]:
# Run preprocessing on each DataFrame with the binary label scheme.
#
# Training set (df='train' returns one-hot encoded labels).
x_train, Y_train = preprocessing(data=train_data, cls='binary', df='train')

# Testing set.
x_test, Y_test = preprocessing(data=test_data, cls='binary', df='test')

# Unseen testing set.
x_21_test, y_21_test = preprocessing(data=test_21, cls='binary', df='test21')

In [9]:
# Give every feature array a trailing axis of length 1:
# (rows, columns) -> (rows, columns, 1).
def _add_trailing_axis(matrix):
  """Reshape `matrix` to (matrix.shape[0], matrix.shape[1], 1)."""
  n_rows, n_cols = matrix.shape[0], matrix.shape[1]
  return np.reshape(matrix, (n_rows, n_cols, 1))

# Training set, testing set and unseen testing set.
x_train = _add_trailing_axis(x_train)
x_test = _add_trailing_axis(x_test)
x_21_test = _add_trailing_axis(x_21_test)

# Convolutional Neural Network Model

In [10]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, SimpleRNN , GRU , Activation
from keras.layers.normalization import BatchNormalization
from keras import optimizers
import tensorflow as tf
from keras.layers import Convolution1D, Dense, Dropout, Flatten, MaxPooling1D , AveragePooling1D

In [11]:
# Sequential 1-D convolutional network, declared as one ordered list of layers.
model = Sequential([
  # First convolution stage: 32 filters of width 3, "same" padding, ReLU;
  # the input shape follows the second dimension of x_train.
  Convolution1D(32, 3, padding="same", activation="relu", input_shape=(x_train.shape[1], 1)),
  MaxPooling1D(pool_size=4),
  Dropout(0.5),

  # Second convolution stage: 64 filters of width 3, "same" padding, ReLU.
  Convolution1D(64, 3, padding="same", activation="relu"),
  MaxPooling1D(pool_size=2),
  Dropout(0.5),

  # Flatten the convolutional output into a vector.
  Flatten(),

  # Fully connected layer with 50 % dropout.
  Dense(256, activation="relu"),
  Dropout(0.5),

  # Output layer: softmax over 2 classes.
  Dense(2, activation="softmax"),
])

# Fitting and Prediction

In [12]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split

In [13]:
# Compile and train the model.
#
# Adam optimizer with categorical cross-entropy loss (the output layer is a
# softmax); accuracy is reported as the training metric.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#
# Train on x_train / Y_train for 100 epochs with a batch size of 128.
# No validation split or cross validation is performed by this call.
model.fit(x=x_train, y=Y_train, batch_size=128, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7ff6800d1b10>

# Output

In [14]:
# Predict on the testing set.
#
# The arg-max over axis 1 of the model output gives one predicted class
# index per row.
pred = model.predict(x_test)
y_pred = np.argmax(pred, axis=1)
#
# Human-readable labels: 0 -> 'Benign', 1 -> 'Malignant'; any other value is
# passed through unchanged.
_class_names = {0: 'Benign', 1: 'Malignant'}
y_p = [_class_names.get(i, i) for i in y_pred]

In [15]:
# Write the predictions to 'pred.csv': string labels in the first data
# column, numeric class indices in the second.
_label_frame = pd.DataFrame(y_p)
_class_frame = pd.DataFrame(y_pred)
output = pd.concat([_label_frame, _class_frame], axis=1)
output.to_csv('pred.csv')

# Performance Measures

In [16]:
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.metrics import (precision_score, recall_score,
                             f1_score, accuracy_score,mean_squared_error,mean_absolute_error)

# --- Evaluation on the testing set ---
# Assumes y_pred still holds the predictions for x_test from the prediction cell.
# The confusion matrix is stored and printed; a bare call in the middle of a
# cell would compute it and discard the result without showing anything.
cm_test = confusion_matrix(Y_test, y_pred)
print("Confusion matrix (test) :")
print(cm_test)
accuracy = accuracy_score(Y_test, y_pred)*100
print(accuracy)
print(y_pred)
print(y_21_test)

# --- Evaluation on the unseen testing set ---
# NOTE(review): pred and y_pred are overwritten here, so after this cell they
# refer to x_21_test, not to the predictions that were written to 'pred.csv'.
pred = model.predict(x_21_test)
y_pred = np.argmax(pred, axis=1)
cm_21 = confusion_matrix(y_21_test, y_pred)
print("Confusion matrix (test21) :")
print(cm_21)
print(y_pred)

acc_21 = accuracy_score(y_21_test, y_pred)*100
print(acc_21)

# Binary-averaged scores, reported as percentages.
recall = recall_score(y_21_test, y_pred, average="binary")
precision = precision_score(y_21_test, y_pred, average="binary")
f1 = f1_score(y_21_test, y_pred, average="binary")
print("F-Score : ", f1*100)
print("Precision : " , precision*100)
print("Recall : ", recall*100)
print("Accuracy : ",acc_21)

79.54716981132076
[1 1 0 ... 1 0 1]
0        1
1        1
2        1
3        0
4        1
        ..
11845    0
11846    0
11847    1
11848    1
11849    1
Name: label, Length: 11850, dtype: int64
[0 1 0 ... 1 1 1]
61.45147679324895
F-Score :  72.66961828407324
Precision :  86.55929304446978
Recall :  62.62115900185605
Accuracy :  61.45147679324895
