In [1]:
import pandas as pd 
import numpy as np 
from pandas import MultiIndex, Int64Index
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import model_from_json

# read data
mydt = pd.read_csv("../dataset/CIC-2018/balanced_data_bruteweb.csv")

print(mydt['Label'].value_counts())

# Create matrix of features and matrix of target variable 
dataset = mydt.values
X = dataset[:,0:79]
y = dataset[:,79]

# convert the scaled array to dataframe 
min_max_scaler = MinMaxScaler()
X_scale = min_max_scaler.fit_transform(X)

X_train = X_scale
y_train = y

#X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=0.3, random_state=1)

# read test data
mydt2 = pd.read_csv("../dataset/CIC-2018/testbruteweb.csv")

print(mydt2['Label'].value_counts())

# Create matrix of features and matrix of target variable 
dataset2 = mydt2.values
X2 = dataset2[:,0:79]
y2 = dataset2[:,79]

# convert the scaled array to dataframe 
min_max_scaler2 = MinMaxScaler()
X_scale2 = min_max_scaler2.fit_transform(X2)

X_test = X_scale2
y_test = y2

model = XGBClassifier(use_label_encoder=False)
# define the datasets to evaluate each iteration
evalset = [(X_train, y_train), (X_test,y_test)]
# fit the model
model.fit(X_train, y_train, eval_metric=["error", "logloss"], eval_set=evalset)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

%matplotlib inline
# Testing 
cm = confusion_matrix(y_test, y_pred) 
tnr = recall_score(y_test, y_pred) 
fpr = 1 - tnr

print ("confusion_matrix for test data\n",cm)
print(classification_report(y_test,y_pred,labels=np.unique(y_pred)))
print('Accuracy: %.4f' % accuracy_score(y_test, y_pred))
print('Precision: %.4f' % precision_score(y_test, y_pred))
print('Recall: %.4f' % recall_score(y_test, y_pred))
print('F1 Score: %.4f' % f1_score(y_test, y_pred))
print('FPR Score: %.4f' % fpr)

  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index


0.0    13934
1.0    13934
Name: Label, dtype: int64
0    6066
1      75
Name: Label, dtype: int64
[0]	validation_0-error:0.00194	validation_0-logloss:0.43927	validation_1-error:0.01221	validation_1-logloss:0.44509
[1]	validation_0-error:0.00183	validation_0-logloss:0.29893	validation_1-error:0.01189	validation_1-logloss:0.30965
[2]	validation_0-error:0.00079	validation_0-logloss:0.21017	validation_1-error:0.01189	validation_1-logloss:0.22556
[3]	validation_0-error:0.00133	validation_0-logloss:0.15074	validation_1-error:0.01189	validation_1-logloss:0.17047
[4]	validation_0-error:0.00036	validation_0-logloss:0.10912	validation_1-error:0.01221	validation_1-logloss:0.13411
[5]	validation_0-error:0.00036	validation_0-logloss:0.07971	validation_1-error:0.01254	validation_1-logloss:0.10969
[6]	validation_0-error:0.00032	validation_0-logloss:0.05866	validation_1-error:0.01286	validation_1-logloss:0.09349
[7]	validation_0-error:0.00032	validation_0-logloss:0.04340	validation_1-error:0.01286	val

[70]	validation_0-error:0.00000	validation_0-logloss:0.00013	validation_1-error:0.01824	validation_1-logloss:0.08769
[71]	validation_0-error:0.00000	validation_0-logloss:0.00013	validation_1-error:0.01824	validation_1-logloss:0.08743
[72]	validation_0-error:0.00000	validation_0-logloss:0.00013	validation_1-error:0.01824	validation_1-logloss:0.08721
[73]	validation_0-error:0.00000	validation_0-logloss:0.00013	validation_1-error:0.01824	validation_1-logloss:0.08736
[74]	validation_0-error:0.00000	validation_0-logloss:0.00012	validation_1-error:0.01824	validation_1-logloss:0.08723
[75]	validation_0-error:0.00000	validation_0-logloss:0.00012	validation_1-error:0.01824	validation_1-logloss:0.08708
[76]	validation_0-error:0.00000	validation_0-logloss:0.00012	validation_1-error:0.01824	validation_1-logloss:0.08721
[77]	validation_0-error:0.00000	validation_0-logloss:0.00012	validation_1-error:0.01824	validation_1-logloss:0.08708
[78]	validation_0-error:0.00000	validation_0-logloss:0.00012	val

In [None]:
#Plot Precision-Recall Curve 
import matplotlib.pyplot as plt 
from sklearn.metrics import precision_recall_curve

#calculate precision and recall
precision, recall, thresholds = precision_recall_curve(y_test, y_pred)

#create precision recall curve
fig, ax = plt.subplots()
ax.plot(recall, precision, color='blue')
ax.grid(False)

#add axis labels to plot
ax.set_title('Precision-Recall Curve')
ax.set_ylabel('Precision')
ax.set_xlabel('Recall')
#display plot
#plt.savefig('Precision-Recall_curve_XGBoost_CIC_after.png')
plt.show()

In [None]:
# Plot confusion matrix
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(cm, cmap='Oranges', alpha=0.3)
ax.grid(False)
plt.tick_params(axis=u'both', which=u'both',length=0)
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax.text(x=j, y=i,s=cm[i, j], va='center', ha='center', size='xx-large')
 
plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
#plt.savefig('confusion_matrix_XGBoost_CIC_after.png')
plt.show()

In [None]:
# retrieve performance metrics
results = model.evals_result()
epochs = len(results["validation_0"]["error"])
x_axis = range(0, epochs)
# plot learning curves
fig, ax = plt.subplots(figsize=(10, 5))
ax.grid(False)

plt.plot(x_axis, results['validation_0']['logloss'], label='train')
plt.plot(x_axis, results['validation_1']['logloss'], label='test')
# show the legend
plt.legend()
# show the plot
plt.show()
#plt.savefig('model_loss_XGBoost_CIC_after.png')

In [None]:
# plot classification error
fig, ax = plt.subplots(figsize=(10,5))
ax.plot(x_axis, results["validation_0"]["error"], label="Train")
ax.plot(x_axis, results["validation_1"]["error"], label="Test")
ax.legend()
plt.ylabel("Classification Error")
plt.title("XGBoost Classification Error")
#plt.savefig('model_error_XGBoost_CIC_after.png')
plt.show()

In [None]:
# Plot ROC
from sklearn.metrics import auc, plot_precision_recall_curve, roc_curve, roc_auc_score

false_positive_rate, true_positive_rate, threshold = roc_curve(y_test, y_pred)
print('roc_auc_score: ', roc_auc_score(y_test, y_pred))
plt.subplots(1, figsize=(10,5))
plt.title('ROC cure_b')
plt.plot(false_positive_rate, true_positive_rate)
plt.plot([0, 1], ls="--")
plt.plot([0, 0], [1, 0] , c=".7"), plt.plot([1, 1] , c=".7")
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
#plt.savefig('Roc_curve_XGBoost_CIC_after.png')
plt.show()