In [1]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn.preprocessing
import math
import time
from time import process_time
from time import gmtime, strftime

In [2]:
# Read the comma-separated input file into the working DataFrame.
data = pd.read_csv("75-25MatlabOutput.csv", sep=",")

In [3]:
# Rename three columns by dropping the underscore that follows the "id"
# prefix; every other column keeps its name.
renamed_columns = {
    "id_resp_h": "idresp_h",
    "id_orig_p": "idorig_p",
    "id_resp_p": "idresp_p",
}
data = data.rename(columns=renamed_columns)
data.head()

Unnamed: 0,idorig_p,idresp_h,idresp_p,proto,service,duration,orig_bytes,resp_bytes,conn_state,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,label
0,58240,157.167.215.178,22,tcp,,,,,S0,S,1,40,0,0,8
1,52458,159.12.244.89,23,tcp,,,,,S0,S,1,40,0,0,0
2,44385,8.54.193.177,22,tcp,,,,,S0,S,1,40,0,0,8
3,50926,90.64.137.30,23,tcp,,,,,S0,S,1,60,0,0,0
4,45556,34.112.133.28,23,tcp,,3.133484,0.0,0.0,S0,S,3,180,0,0,0


In [4]:
from sklearn.preprocessing import LabelEncoder

# One encoder instance shared by the encoding cell below. It is refit for
# every column there, so after that cell it only remembers the mapping of the
# last column it was fitted on.
le = LabelEncoder()

In [5]:
# Replace each listed column with integer codes from the shared encoder.
# The order is kept as-is: `le` is refit on every pass, so it ends up fitted
# on `label`, the last column in the tuple.
for column in ("idresp_p", "idresp_h", "proto", "service",
               "conn_state", "history", "label"):
    if column == "service":
        # `service` is cast to str first, so missing values are encoded as
        # the literal string 'nan' instead of reaching the encoder as NaN.
        raw_values = data[column].astype(str)
    else:
        raw_values = data[column]
    data[column] = le.fit_transform(raw_values)

In [6]:
# Only the last expression of a cell is echoed automatically, so the shape has
# to be printed explicitly; otherwise its value is computed and thrown away.
print(data.shape)
# Preview the first rows after label encoding.
data.head()

Unnamed: 0,idorig_p,idresp_h,idresp_p,proto,service,duration,orig_bytes,resp_bytes,conn_state,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,label
0,58240,6136620,7,1,4,,,,6,26,1,40,0,0,8
1,52458,6302577,8,1,4,,,,6,26,1,40,0,0,0
2,44385,17941205,7,1,4,,,,6,26,1,40,0,0,8
3,50926,18938654,8,1,4,,,,6,26,1,60,0,0,0
4,45556,13588799,8,1,4,3.133484,0.0,0.0,6,26,3,180,0,0,0


In [7]:
# Separate the feature matrix from the target vector (both as NumPy arrays).
# NOTE(review): every column except `label` becomes a feature, which includes
# `Unnamed: 0` -- that looks like a saved row index; confirm it is intended.
X = data.drop(columns="label").to_numpy()
Y = data.loc[:, "label"].to_numpy()

In [8]:
# Hold out `test_size` of the rows for evaluation. The fixed seed makes the
# shuffle, and therefore the split, repeatable between runs.
seed = 41
test_size = 0.2
split_parts = train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Start the training timer; the matching stop is taken in the next cell.
t1_start = process_time()
# fit model on training data
model = XGBClassifier(
    objective='multi:softmax',
    learning_rate=0.1,
    max_depth=5,
)
model.fit(X_train, y_train)





In [None]:
# Stop the training timer. process_time() reports CPU time of this process
# (user + system), not wall-clock time.
t1_stop = process_time()
training_seconds = t1_stop - t1_start
print("Training Elapsed time:", training_seconds)

In [None]:
# Start the testing timer; the matching stop is taken in a later cell.
t2_start = process_time()
# make predictions for test data
y_pred = model.predict(X_test)
# Round every predicted value to the nearest integer class id.
predictions = list(map(round, y_pred))

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score, recall_score, classification_report

# Score the held-out predictions. The ground truth is `y_test` (lower-case),
# the name produced by train_test_split; `Y_test` is never defined and would
# raise NameError on a fresh kernel.
cm = confusion_matrix(y_test, predictions)
accuracy = accuracy_score(y_test, predictions)
# 'weighted' averages the per-class scores weighted by each class's support.
precision = precision_score(y_test, predictions, average='weighted')
f1 = f1_score(y_test, predictions, average='weighted')
recall = recall_score(y_test, predictions, average='weighted')

print(cm)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print('Precision: ', precision * 100.00)
print('F1: ', f1 * 100.00)
print('Recall: ', recall * 100.00)
print(classification_report(y_test, predictions))


In [None]:
# Stop the testing timer. process_time() reports CPU time of this process
# (user + system), not wall-clock time. The interval covers everything run
# since `t2_start` was taken, not only the predict call.
t2_stop = process_time()
testing_seconds = t2_stop - t2_start
print("Testing Elapsed time:", testing_seconds)

In [None]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error between true and predicted class ids. The ground
# truth is `y_test`, the name produced by train_test_split; `Y_test` is
# undefined.
# NOTE(review): the labels are LabelEncoder codes, so this distance is only
# meaningful if the classes have a natural order -- confirm.
rms = np.sqrt(mean_squared_error(y_test, predictions))
print("RMSE:", rms)

In [None]:
from sklearn.metrics import mean_absolute_error as mae

# Mean absolute error between true and predicted class ids. The score is
# stored and printed; printing `mae` itself would show the function object,
# not the metric. The ground truth is `y_test` (`Y_test` is undefined).
mae_value = mae(y_test, predictions)
print("MAE:", mae_value)