In [1]:
##Importing all the required libraries

import numpy as np
import pandas as pd 
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
##Reading the CSV files
## Each capture file is loaded into its own DataFrame, in the order listed.
(
    df_mon,
    df_tue,
    df_wed,
    df_thur_mor,
    df_thur_after,
    df_fri_mor,
    df_fri_after1,
    df_fri_after2,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Monday-WorkingHours.pcap_ISCX.csv",
        "Tuesday-WorkingHours.pcap_ISCX.csv",
        "Wednesday-workingHours.pcap_ISCX.csv",
        "Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
        "Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv",
        "Friday-WorkingHours-Morning.pcap_ISCX.csv",
        "Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
        "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
    )
)

In [3]:
## Initialise `df`; the merge cell that follows rebinds it to the combined data.
df = pd.DataFrame()

In [4]:
##Merging all the dataframes into one
## DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and every
## chained call re-copied all rows accumulated so far. A single pd.concat builds
## the same row-wise union (original per-file indices kept, columns aligned by
## name) in one pass, and re-running this cell no longer duplicates the data.
df = pd.concat(
    [
        df_mon,
        df_tue,
        df_wed,
        df_thur_mor,
        df_thur_after,
        df_fri_mor,
        df_fri_after1,
        df_fri_after2,
    ]
)

In [5]:
##Resetting Indices
## Renumber the rows 0..n-1 in place; drop=True discards the old index instead of
## inserting it as a new column.
df.reset_index(drop=True,inplace=True)

In [6]:
##Dropping NaN values
## Removes, in place, every row that has at least one missing value in any column.
df.dropna(inplace=True)

In [7]:
## Trim leading/trailing whitespace from every column name so columns can be
## addressed by their clean names (e.g. "Flow Bytes/s", "Label").
df.rename(columns=str.strip, inplace=True)

In [8]:
##Dropping values with infinity
## The previous check only removed +inf in "Flow Bytes/s". A -inf there, or an
## infinity in any other float column, would survive and break the model fit
## later on, so every float column is scanned for +/-inf.
## Columns are visited one at a time to avoid materialising a second copy of
## the whole frame.
inf_rows = np.zeros(len(df), dtype=bool)
for col_pos, col_dtype in enumerate(df.dtypes):
    # Only floating-point columns can hold infinities.
    if pd.api.types.is_float_dtype(col_dtype):
        inf_rows |= np.isinf(df.iloc[:, col_pos].to_numpy())
df.drop(df.index[inf_rows], inplace=True)

In [9]:
## Renumber the rows again in place: the row drops above leave gaps in the index.
df.reset_index(drop=True,inplace=True)

In [10]:
##Factorizing the labels in order to calculate the feature importance.
## pd.factorize returns (codes, uniques); the integer codes replace the Label
## column, numbered in order of first appearance.
label_codes, label_uniques = pd.factorize(df['Label'])
df['Label'] = label_codes

In [11]:
## Show the distinct integer codes now stored in the Label column.
df["Label"].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14],
      dtype=int64)

In [12]:
## Features are every column except the last; the target is the last column
## (expected to be "Label" - confirm the column order of the CSVs).
X, Y = df.iloc[:, :-1], df.iloc[:, -1]

## Feature Selection

In [13]:
##Calculating the importance score of all the features

from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt
## Extra-trees draws random split thresholds, so without a fixed seed the
## importance ranking (and therefore the features selected below) changes on
## every run. The seed matches the one used for the train/test split.
## NOTE(review): the importances are fitted on the full data set, i.e. before
## the train/test split - the test rows influence feature selection.
model = ExtraTreesClassifier(random_state=42)
model.fit(X,Y)
print(model.feature_importances_)

[4.51105558e-02 1.54724710e-02 1.23154348e-02 8.40118079e-03
 1.01789302e-02 6.18750128e-03 9.02717955e-03 1.02162384e-02
 1.00363357e-02 8.44466079e-03 2.84124112e-02 3.20908363e-02
 3.47362960e-02 3.72328189e-02 3.91930546e-03 1.31185387e-02
 9.47945496e-03 9.64529496e-03 2.39967703e-02 5.23883633e-03
 1.80403153e-02 1.04605458e-02 2.89777944e-02 3.31372848e-02
 5.76268522e-03 7.90803291e-03 4.64446280e-03 4.43152178e-03
 7.61356313e-03 2.17880948e-03 5.12266172e-03 0.00000000e+00
 4.31806529e-06 0.00000000e+00 9.80363898e-03 7.14381194e-03
 1.65306027e-02 5.99684921e-03 1.70101731e-02 1.31208942e-02
 2.92180604e-02 2.60674420e-02 1.42836210e-02 8.15285893e-03
 5.53791458e-03 1.07702386e-06 6.19548854e-02 2.93802133e-02
 9.60231504e-03 1.48876434e-05 1.44117579e-06 1.42018133e-02
 2.94273611e-02 1.25802887e-02 2.96149277e-02 8.69664467e-03
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 8.62746282e-03 5.66039662e-03
 5.94027264e-03 6.632935

In [14]:
## Keep the fitted model's importance scores (one value per feature column) for ranking.
feature_imp = model.feature_importances_

In [15]:
## Pair each feature name with its importance score. The scores cover the
## leading columns of df, so the column index is cut to the same length
## before zipping.
feature_imp_lst = list(zip(df.columns[:len(feature_imp)], feature_imp))

In [16]:
## Rank the (name, score) pairs from most to least important.
feature_imp_lst = sorted(feature_imp_lst, key=lambda pair: pair[1], reverse=True)

In [17]:
##Considering top 30 features
N_TOP_FEATURES = 30
top_f = feature_imp_lst[:N_TOP_FEATURES]

In [18]:
## Names of the selected features, with the target column appended last so that
## a "last column is the label" split keeps working on the reduced frame.
## A comprehension replaces the zip(*...) transposition, which was hard to read
## and raised IndexError when the selection was empty.
l = [feature_name for feature_name, _score in top_f]
l.append("Label")

In [19]:
## Reduced frame: only the selected feature columns plus the label, in the order given by `l`.
new_df = df.loc[:, l]

In [20]:
## Re-derive features and target from the reduced frame (the label is its last column).
X, Y = new_df.iloc[:, :-1], new_df.iloc[:, -1]

In [21]:
## Train Test Split

In [22]:
from sklearn.model_selection import train_test_split

## Hold out 40% of the rows for testing; shuffled with a fixed seed so the split is repeatable.
TEST_FRACTION = 0.4
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
    shuffle=True,
)

## XGBoost

In [23]:
import xgboost as xgb

In [24]:
## Wrap the train and test splits in XGBoost's DMatrix container, attaching the labels.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

In [25]:
## Using svmlight for less memory consumption: first dump the splits to disk in
## svmlight/libsvm text format, then pass just the file name to DMatrix.
from sklearn.datasets import dump_svmlight_file

dump_svmlight_file(X_train, y_train, 'dtrain.svm', zero_based=True)
dump_svmlight_file(X_test, y_test, 'dtest.svm', zero_based=True)
## XGBoost 2.0+ rejects text input whose format is not stated in the URI, so the
## "?format=libsvm" suffix is required; older releases accept it as well.
dtrain_svm = xgb.DMatrix('dtrain.svm?format=libsvm')
dtest_svm = xgb.DMatrix('dtest.svm?format=libsvm')

In [26]:
## Setting parameters for XGBoost.
## The objective is spelled out instead of relying on XGBoost silently switching
## to multi-class when `num_class` is present; the documented default objective
## is a regression one. 'multi:softmax' makes predict() return one class id per row.
param = {
    'objective': 'multi:softmax',  # multi-class classification, predicts class ids
    'max_depth': 10,  # the maximum depth of each tree
    'eta': 0.3,  # the training step (learning rate) for each iteration
    'num_class': 15}  # the number of classes that exist in this dataset
num_round = 20  # the number of training iterations (boosting rounds)

In [27]:
## Fit the booster on the in-memory training matrix for `num_round` boosting rounds.
bst = xgb.train(params=param, dtrain=dtrain, num_boost_round=num_round)

In [28]:
## Predicting labels
## NOTE(review): the result is passed straight to the classification metrics, so it is
## presumably one class id per test row - confirm against the objective in use.
y_pred = bst.predict(dtest)

In [29]:
## Score the predictions on the held-out set. Precision, recall and F1 are
## averaged across classes weighted by each class's support.
weighted_avg = {"average": "weighted"}
metric_report = [
    ("Accuracy :", accuracy_score(y_test, y_pred)),
    ("Precision Score :", precision_score(y_test, y_pred, **weighted_avg)),
    ("Recall Score :", recall_score(y_test, y_pred, **weighted_avg)),
    ("F1 Score :", f1_score(y_test, y_pred, **weighted_avg)),
]
for caption, score in metric_report:
    print(caption, score)

Accuracy : 0.9987154676961785
Precision Score : 0.9985982190081907
Recall Score : 0.9987154676961785
F1 Score : 0.9986131337564671
