In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
pd.set_option('display.width', 4000)
pd.set_option('display.max_colwidth', 4000)

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA as sklearnPCA

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score , recall_score, precision_score
from sklearn.metrics import roc_curve ,confusion_matrix , auc, roc_auc_score, classification_report , f1_score


In [2]:
data=pd.read_csv('UNSW_NB15_training-set.csv')

In [3]:
df=data.copy()

In [4]:
## is_ftp_login is binary feature, it should be either 0 or 1
df = df.drop(index=df[df['is_ftp_login']==2].index)


In [5]:
cols_to_delete=['id', 'attack_cat', 'proto']

In [6]:
df=df.drop(cols_to_delete, axis=1)


In [7]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

df_full_train = df_full_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_full_train.label
y_test = df_test.label

del df_full_train['label']
del df_test['label']

In [8]:
categorical_cols=['service', 'state'  , 'is_ftp_login' , 'ct_ftp_cmd' , 'is_sm_ips_ports' ]
numerical_cols= list(df.columns.drop(['service', 'state'  , 'is_ftp_login' , 'ct_ftp_cmd' , 'is_sm_ips_ports' , 'label']))

In [9]:
print(numerical_cols)

['dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst']


In [10]:
### Converting categorical features to numerical data using DictVectorizer

dv = DictVectorizer(sparse=False)

train_dict = df_full_train[categorical_cols].to_dict(orient='records')
X_train_categorical = dv.fit_transform(train_dict )

test_dict = df_test[categorical_cols].to_dict(orient='records')
X_test_categorical = dv.transform(test_dict)

In [11]:
len(dv.feature_names_)

23

In [12]:
##standardized numerical data

sc = StandardScaler()
X_std_train_numerical = sc.fit_transform(df_full_train[numerical_cols])
X_std_test_numerical  = sc.transform(df_test[numerical_cols])  

### combing the numerial arrays & categorical arrays
X_std_train = np.concatenate((X_train_categorical,X_std_train_numerical) , axis=1)
X_std_test = np.concatenate((X_test_categorical,X_std_test_numerical) , axis=1)

In [13]:
X_std_train.shape, X_std_test.shape

((65862, 59), (16466, 59))

In [14]:
pca = sklearnPCA()
sklearn_pca = pca.fit(X_std_train)
train_pca = sklearn_pca.transform(X_std_train)
test_pca = sklearn_pca.transform(X_std_test)

var_per = sklearn_pca.explained_variance_ratio_
cum_var_per = sklearn_pca.explained_variance_ratio_.cumsum()

n_comp=len(cum_var_per[cum_var_per <= 0.90])

sklearn_pca = sklearnPCA(n_components=n_comp)
train_pca = sklearn_pca.fit_transform(X_std_train)
test_pca = sklearn_pca.transform(X_std_test)

In [15]:
X_train= train_pca
X_test= test_pca

In [16]:
import xgboost as xgb

In [17]:
eta=0.1
max_depth=8

dtrain = xgb.DMatrix(X_train, label=y_train)

dtest = xgb.DMatrix(X_test, label=y_test)

In [18]:
watchlist = [(dtrain, 'train'), (dtest, 'test')]


In [19]:
xgb_params = {
    'eta': eta, 
    'max_depth': max_depth,
    'min_child_weight': 1,
    
    'objective': 'binary:logistic',
    'eval_metric': 'auc',

    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}



model = xgb.train(xgb_params, dtrain, num_boost_round=200,
                  verbose_eval=5,
                  evals=watchlist)

[0]	train-auc:0.96040	test-auc:0.95652
[5]	train-auc:0.96923	test-auc:0.96596
[10]	train-auc:0.97492	test-auc:0.97140
[15]	train-auc:0.97880	test-auc:0.97535
[20]	train-auc:0.98087	test-auc:0.97686
[25]	train-auc:0.98239	test-auc:0.97814
[30]	train-auc:0.98315	test-auc:0.97864
[35]	train-auc:0.98362	test-auc:0.97897
[40]	train-auc:0.98412	test-auc:0.97921
[45]	train-auc:0.98480	test-auc:0.97967
[50]	train-auc:0.98532	test-auc:0.97998
[55]	train-auc:0.98601	test-auc:0.98046
[60]	train-auc:0.98652	test-auc:0.98076
[65]	train-auc:0.98699	test-auc:0.98097
[70]	train-auc:0.98825	test-auc:0.98134
[75]	train-auc:0.98917	test-auc:0.98158
[80]	train-auc:0.99006	test-auc:0.98201
[85]	train-auc:0.99078	test-auc:0.98222
[90]	train-auc:0.99153	test-auc:0.98237
[95]	train-auc:0.99215	test-auc:0.98261
[100]	train-auc:0.99242	test-auc:0.98271
[105]	train-auc:0.99304	test-auc:0.98294
[110]	train-auc:0.99349	test-auc:0.98300
[115]	train-auc:0.99400	test-auc:0.98316
[120]	train-auc:0.99425	test-auc:0.983

In [20]:
#y_pred = model.predict_proba(X_val)[:, 1] 
scores={}
y_pred=model.predict(dtest)
y_predicted= (y_pred>=0.5).astype(int)
print(confusion_matrix(y_test, y_predicted))
print(classification_report(y_test, y_predicted))

fpr,tpr, th=roc_curve(y_test, y_predicted)
   
scores= {'classiffier': "XGboost" , 'accuracy':round(accuracy_score(y_test,y_predicted),3)
                , 'recall':round(recall_score(y_test,y_predicted),3)
                , 'precision': round(precision_score(y_test,y_predicted),3)
              , 'f1-score': round(f1_score(y_test,y_predicted), 3)  , 'roc_auc_score': round(roc_auc_score(y_test, y_pred),3)}

scores

[[6970  346]
 [ 845 8305]]
              precision    recall  f1-score   support

           0       0.89      0.95      0.92      7316
           1       0.96      0.91      0.93      9150

    accuracy                           0.93     16466
   macro avg       0.93      0.93      0.93     16466
weighted avg       0.93      0.93      0.93     16466



{'classiffier': 'XGboost',
 'accuracy': 0.928,
 'recall': 0.908,
 'precision': 0.96,
 'f1-score': 0.933,
 'roc_auc_score': 0.984}

In [21]:
print(y_pred[0])

0.06614746


In [22]:
print(y_pred[400])

0.999382


In [23]:
import bentoml

In [24]:
bentoml.xgboost.save_model(
    'cyber_attack_model',
    model,
    custom_objects={
        'dictVectorizer': dv,
        'StandardScaler': sc,
        'PCA':sklearn_pca
              
        
    },signatures={
"predict": {
    "batchable":True,
    "batch_dim":0
}
    }
)

Model(tag="cyber_attack_model:ecjll53bh2vcr5bz", path="C:\Users\Asia\bentoml\models\cyber_attack_model\ecjll53bh2vcr5bz\")

In [25]:
import json

In [26]:
request = df_test.iloc[0].to_dict()
print(json.dumps(request, indent=2))

{
  "dur": 0.453854,
  "service": "-",
  "state": "FIN",
  "spkts": 10,
  "dpkts": 6,
  "sbytes": 714,
  "dbytes": 268,
  "rate": 33.050276,
  "sttl": 254,
  "dttl": 252,
  "sload": 11334.04102,
  "dload": 3948.40625,
  "sloss": 2,
  "dloss": 0,
  "sinpkt": 6578.2406,
  "dinpkt": 10961.736,
  "sjit": 653840.0681,
  "djit": 24336.61,
  "swin": 255,
  "stcpb": 2906150044,
  "dtcpb": 3830409389,
  "dwin": 255,
  "tcprtt": 0.177939,
  "synack": 0.089164,
  "ackdat": 0.088775,
  "smean": 71,
  "dmean": 45,
  "trans_depth": 0,
  "response_body_len": 0,
  "ct_srv_src": 6,
  "ct_state_ttl": 1,
  "ct_dst_ltm": 4,
  "ct_src_dport_ltm": 4,
  "ct_dst_sport_ltm": 4,
  "ct_dst_src_ltm": 6,
  "is_ftp_login": 0,
  "ct_ftp_cmd": 0,
  "ct_flw_http_mthd": 0,
  "ct_src_ltm": 4,
  "ct_srv_dst": 7,
  "is_sm_ips_ports": 0
}


In [27]:
request = df_test.iloc[400].to_dict()
print(json.dumps(request, indent=2))

{
  "dur": 4e-06,
  "service": "dns",
  "state": "INT",
  "spkts": 2,
  "dpkts": 0,
  "sbytes": 114,
  "dbytes": 0,
  "rate": 250000.0006,
  "sttl": 254,
  "dttl": 0,
  "sload": 114000000.0,
  "dload": 0.0,
  "sloss": 0,
  "dloss": 0,
  "sinpkt": 0.004,
  "dinpkt": 0.0,
  "sjit": 0.0,
  "djit": 0.0,
  "swin": 0,
  "stcpb": 0,
  "dtcpb": 0,
  "dwin": 0,
  "tcprtt": 0.0,
  "synack": 0.0,
  "ackdat": 0.0,
  "smean": 57,
  "dmean": 0,
  "trans_depth": 0,
  "response_body_len": 0,
  "ct_srv_src": 7,
  "ct_state_ttl": 2,
  "ct_dst_ltm": 7,
  "ct_src_dport_ltm": 7,
  "ct_dst_sport_ltm": 3,
  "ct_dst_src_ltm": 7,
  "is_ftp_login": 0,
  "ct_ftp_cmd": 0,
  "ct_flw_http_mthd": 0,
  "ct_src_ltm": 8,
  "ct_srv_dst": 7,
  "is_sm_ips_ports": 0
}
