In [17]:
# ----------------------------------------------------------------------------------------------------
# Trains a XGBoost model to predict pirating ips from the daily ip level traffic data.
#
# Author: Mohsen Mohammadi - Nov 2020
# Version: 1
# 
# ----------------------------------------------------------------------------------------------------

import os
import time
import numpy as np
import pandas as pd
import xgboost as xgb

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt 

size = 25
params = {'legend.fontsize': size,
          'figure.figsize': (25,15),
          'axes.labelsize': size,
          'axes.titlesize': size,
          'xtick.labelsize': size*0.75,
          'ytick.labelsize': size*0.75,
          'axes.titlepad': 25}
plt.rcParams.update(params)


# Training

In [131]:
# linear regression feature importance
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot

# define dataset
ip_train = pd.read_csv('/home/jupyter/preprocessing/data/3C/IP_TRAIN_d.csv')
ip_test = pd.read_csv('/home/jupyter/preprocessing/data/3C/IP_TEST_d.csv')

ip_train = ip_train.iloc[:,:-30]
ip_test = ip_test.iloc[:,:-30]

ip_train.columns = range(ip_train.shape[1])
ip_test.columns = range(ip_test.shape[1])
train_x, train_y = ip_train, ip_train.pop(0)
test_x, test_y = ip_test, ip_test.pop(0)
dtrain = xgb.DMatrix(train_x, label=train_y)
dtest = xgb.DMatrix(test_x, label=test_y)

# define the model parameters
# Best: 0.889797 using {'subsample': 0.8999999999999999, 'min_child_weight': 0.6, 'max_depth': 9,
#                       'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 0.5}
param = {
    'max_depth': 9,  
    'eta': 0.1,  
    'silent': 0,  
    'num_class': 3,
    'min_child_weight': 0.6,
    'gamma': 0.1,
    'subsample': 0.9,
    'colsample_bytree': 0.5,
    'objective': 'multi:softprob'
    }  
num_round = 500  # the number of training iterations


# fit the model
model = xgb.train(param, dtrain, num_round)

# evaluate the model
y_pred = model.predict(dtest)
y_pred = np.argmax(y_pred, axis=1)

accuracy = accuracy_score(test_y, y_pred)
print("Accuracy: %0.1f%% " % (accuracy * 100))



Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Accuracy: 83.0% 


# Importance Features

In [None]:
# get importance
importance = model.feature_importances_

# plot feature importance
pyplot.bar([x for x in range(len(importance))], importance)
pyplot.savefig('XGBClassifier.png')
pyplot.show()



# Evaluation

In [127]:
filename = 'detected_IPs_traffic_gw14_non_streaming_ips.csv'
model_path = 'tf-keras/logs/cnn_export/20201223-124940'

# Load and pre-process the IP level traffic data
ip_traffic = pd.read_csv(filename, usecols=['time', 'ip', 'gbps'])
ip_traffic['time'] = pd.to_datetime(ip_traffic['time'])
ip_traffic['date'] = ip_traffic['time'].apply(lambda d: d.date())
ip_traffic['time'] = ip_traffic['time'].apply(lambda d: d.time())

ip_traffic = pd.pivot_table(ip_traffic, values='gbps', index=['date', 'ip'], columns='time', aggfunc=np.sum).fillna(0)
ip_traffic = ip_traffic.iloc[:,:-30]

ip_traffic['pred_label'] = y_pred[:, 3]



In [128]:
# define dataset
ip_test = pd.read_csv('/home/jupyter/preprocessing/data/ML/IP_TEST_stream.csv')
test_x = ip_traffic.reset_index(drop=True)
test_x.columns = range(1, ip_test.shape[1]+1)

dtest = xgb.DMatrix(test_x)

y_pred = model.predict(dtest)
y_pred = np.argmax(y_pred, axis=1)

accuracy = accuracy_score(test_y, y_pred)
print("Accuracy: %0.1f%% " % (accuracy * 100))


array([1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 2, 2, 1, 1, 1, 1, 1, 1,
       1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2,
       1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 2, 2])

# Hyperparameter Tunning

In [None]:
import time
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score


start_time=time.time()

#### Create X and Y training data here.....


# grid search
model = XGBClassifier()

param_grid = {
        'max_depth': [3, 4, 5, 6, 7, 8, 9],
        'min_child_weight': np.arange(0.1, 2, 0.1),
        'gamma': [0, 0.0005, 0.001, 0.01, 0.1, 1, 10],
        'learning_rate': [0.0005, 0.001, 0.01, 0.1],
        'subsample': np.arange(0.5,1.0,0.1),
        'colsample_bytree': np.arange(0.5,1.0,0.1)
        }
                                      

kfold = KFold(n_splits=10, shuffle=True, random_state=10)
grid_search = RandomizedSearchCV(model, param_grid, scoring="accuracy", n_iter = 500, cv=kfold)
grid_result = grid_search.fit(train_x, train_y)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_[ 'mean_test_score' ]
stds = grid_result.cv_results_[ 'std_test_score' ]
params = grid_result.cv_results_[ 'params' ]

print(time.time()-start_time)
