# Kaggle Dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
 # Выбор данных из большого объема данных
from DatasetHandler import *

In [None]:
# Load a 110000-row subset of the full DDoS CSV (get_small is not defined in
# this notebook; presumably it comes from the DatasetHandler star import).
data = get_small(
    path="/Users/evgenii/DDoS Dataset/final_dataset.csv",
    size=110000,
    random_state=42,
)

In [None]:
# Preview the first five rows of the loaded sample.
data.head(n=5)

## Preprocessing

In [None]:
# Drop the columns in which every value is identical.
# (unique() is an external helper, presumably from DatasetHandler.)
data = unique(data)
data

In [None]:
# Print the column dtypes and non-null counts of the cleaned sample.
data.info()

### Correlations

In [None]:
# All column names; the last column is taken as the target label and removed
# from the feature list.
features = data.columns.tolist()
target = features.pop(-1)

In [None]:
# Split the frame into the feature matrix and the target column.
X = data[features]
y = data[target]

In [None]:
# Correlation plot over all features before any are removed.
# (cormap is an external helper; presumably it draws a correlation heatmap
# with the given title — confirm in DatasetHandler.)
cormap(X,"All correlations")

In [None]:
#Удалим коррелирующие признаки при помощи фреймворка FeatureSelector
from feature_selector import FeatureSelector

In [None]:
# Flag collinear features at a 0.80 correlation threshold; the result is read
# back later from fs.ops['collinear'].
fs = FeatureSelector(data=X, labels=y)
fs.identify_collinear(correlation_threshold=0.80)

## Важное дополнение
При обучении модели, работающей с сетевыми данными, не рекомендуется использовать IP-адреса, так как злоумышленник может подменить их, а модель придаёт IP-адресам большую значимость (см. old_model/). В модели нужно использовать только системные признаки (т. е. не зависящие от хакера).

In [None]:
# Features to remove: identifiers tied to IP addresses plus the collinear
# features flagged by the FeatureSelector.
ip_features = ['Flow ID', 'Src IP', 'Dst IP']
# Build a new list instead of extending fs.ops['collinear'] in place:
# extending would silently modify the selector's own state and append the IP
# columns again every time this cell is re-run.
collinear_features = list(fs.ops['collinear']) + ip_features
X = X.drop(columns=collinear_features)
X.head()

In [None]:
# Same correlation plot, now on the reduced feature set.
cormap(X,"Without correlations")

In [None]:
# Encode the class labels as integers: "ddos" -> 1, "Benign" -> 0.
y = (
    y.replace("ddos", 1)
     .replace("Benign", 0)
)
y.head()

In [None]:
# Larger 200000-row subset, used below as the "big" evaluation set.
# NOTE(review): same file and same random_state=42 as the 110000-row sample;
# depending on how get_small samples, the two subsets may overlap, in which
# case metrics on this set are not fully out-of-sample — confirm.
df_data = get_small(
    path="/Users/evgenii/DDoS Dataset/final_dataset.csv",
    size=200000,
    random_state=42,
)


In [None]:
# Apply the same preprocessing to the big sample: drop constant columns,
# select the feature/target columns, drop the removed features, encode labels.
# NOTE(review): `features` was derived from the small sample; if unique()
# removes a different set of columns here, the selection may fail — confirm.
df_data = unique(df_data)
X_d = df_data[features].drop(columns=collinear_features)
y_d = (
    df_data[target]
    .replace("ddos", 1)
    .replace("Benign", 0)
)

In [None]:
# Independent copies for the "without categorical feature" experiments.
# Plain assignment would only create a second name for the same DataFrame, so
# the in-place Timestamp conversion applied to X_not / X_not_d and the
# 'category' cast applied to X / X_d later in this notebook would leak into
# each other's experiments.
X_not = X.copy()
X_not_d = X_d.copy()

## CatBoost

### With categorical feature

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier, Pool
from catboost.utils import get_roc_curve


In [None]:
# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# CatBoost pools; column index 2 is declared categorical
# (presumably the Timestamp column — TODO confirm the position).
train_data = Pool(data=X_train, label=y_train, cat_features=[2])
test_data = Pool(data=X_test, label=y_test, cat_features=[2])

In [None]:
# Sweep the number of boosting iterations (50..525, step 25) and record the
# train/test ROC AUC for each setting.
tr_roc_auc, test_roc_auc = [], []
it_array = np.arange(50, 550, 25)

for it in it_array:
    cb = CatBoostClassifier(
        iterations=it,
        max_depth=3,
        learning_rate=0.1,
        loss_function='Logloss',
        verbose=False,
        thread_count=-1,
    )
    cb.fit(train_data)
    tr_roc_auc.append(
        roc_auc_score(y_train, cb.predict_proba(X_train)[:, 1])
    )
    test_roc_auc.append(
        roc_auc_score(y_test, cb.predict_proba(X_test)[:, 1])
    )

plot_auc_array(it_array, tr_roc_auc, test_roc_auc)

In [None]:
# Sweep the learning rate over 10 evenly spaced values in [0.1, 0.8] at a
# fixed 200 iterations and depth 3.
l_r_array = np.linspace(0.1, 0.8, 10)
tr_roc_auc.clear()
test_roc_auc.clear()

for l in l_r_array:
    cb = CatBoostClassifier(
        iterations=200,
        max_depth=3,
        learning_rate=l,
        loss_function='Logloss',
        verbose=False,
        thread_count=-1,
    )
    cb.fit(train_data)
    tr_roc_auc.append(
        roc_auc_score(y_train, cb.predict_proba(X_train)[:, 1])
    )
    test_roc_auc.append(
        roc_auc_score(y_test, cb.predict_proba(X_test)[:, 1])
    )

plot_auc_array(l_r_array, tr_roc_auc, test_roc_auc)

In [None]:
# Sweep the tree depth from 1 to 9 at 200 iterations and learning rate 0.43.
depth_array = np.arange(1, 10, 1)
tr_roc_auc.clear()
test_roc_auc.clear()

for d in depth_array:
    cb = CatBoostClassifier(
        iterations=200,
        max_depth=d,
        learning_rate=0.43,
        loss_function='Logloss',
        verbose=False,
        thread_count=-1,
    )
    cb.fit(train_data)
    tr_roc_auc.append(
        roc_auc_score(y_train, cb.predict_proba(X_train)[:, 1])
    )
    test_roc_auc.append(
        roc_auc_score(y_test, cb.predict_proba(X_test)[:, 1])
    )

plot_auc_array(depth_array, tr_roc_auc, test_roc_auc)

In [None]:
# Final CatBoost model for the "with categorical feature" setup, trained on
# the pool that marks one column as categorical.
cat_all_model = CatBoostClassifier(
    iterations=200,
    depth=2,
    learning_rate=0.43,
    loss_function='Logloss',
    verbose=False,
)
cat_all_model.fit(train_data, plot=True)

Метрики для малой тестовой выборки

In [None]:
# ROC AUC on the held-out split, scored with the class-1 probability.
roc_auc_score(
    y_true=y_test,
    y_score=cat_all_model.predict_proba(X_test)[:, 1],
)

In [None]:
# Accuracy on the held-out split.
accuracy_score(
    y_true=y_test,
    y_pred=cat_all_model.predict(X_test),
)

Метрики для большой тестовой выборки

In [None]:
# ROC AUC on the big sample, scored with the class-1 probability.
roc_auc_score(
    y_true=y_d,
    y_score=cat_all_model.predict_proba(X_d)[:, 1],
)

In [None]:
# Accuracy on the big sample.
accuracy_score(
    y_true=y_d,
    y_pred=cat_all_model.predict(X_d),
)

In [None]:
import shap
# Load SHAP's JavaScript support into the notebook for its interactive plots.
shap.initjs()

In [None]:
# SHAP values of the CatBoost model on the training split, shown as a
# summary plot.
explainer = shap.TreeExplainer(cat_all_model)
shap_values = explainer.shap_values(X_train)
shap.summary_plot(shap_values, features=X_train)

### Without categorical feature

In [None]:
# Convert the Timestamp column element-wise with toTimestamp (external helper;
# presumably it turns the timestamp string into a number — confirm).
# NOTE(review): this overwrites the column in place, so re-running the cell
# applies toTimestamp to already-converted values.
X_not['Timestamp'] = X_not['Timestamp'].apply(toTimestamp)
X_not_d['Timestamp'] = X_not_d['Timestamp'].apply(toTimestamp)

In [None]:
# Same 70/30 split on the converted features; the pools are built without
# declaring any categorical column.
X_train, X_test, y_train, y_test = train_test_split(
    X_not, y, test_size=0.3, random_state=42
)
train_data = Pool(data=X_train, label=y_train)
test_data = Pool(data=X_test, label=y_test)

In [None]:
# Sweep the number of boosting iterations (50..525, step 25) and record the
# train/test ROC AUC for each setting.
tr_roc_auc, test_roc_auc = [], []
it_array = np.arange(50, 550, 25)

for it in it_array:
    cb = CatBoostClassifier(
        iterations=it,
        max_depth=3,
        learning_rate=0.1,
        loss_function='Logloss',
        verbose=False,
        thread_count=-1,
    )
    cb.fit(train_data)
    tr_roc_auc.append(
        roc_auc_score(y_train, cb.predict_proba(X_train)[:, 1])
    )
    test_roc_auc.append(
        roc_auc_score(y_test, cb.predict_proba(X_test)[:, 1])
    )

plot_auc_array(it_array, tr_roc_auc, test_roc_auc)

In [None]:
# Sweep the learning rate over 10 evenly spaced values in [0.1, 0.8] at a
# fixed 200 iterations and depth 3.
l_r_array = np.linspace(0.1, 0.8, 10)
tr_roc_auc.clear()
test_roc_auc.clear()

for l in l_r_array:
    cb = CatBoostClassifier(
        iterations=200,
        max_depth=3,
        learning_rate=l,
        loss_function='Logloss',
        verbose=False,
        thread_count=-1,
    )
    cb.fit(train_data)
    tr_roc_auc.append(
        roc_auc_score(y_train, cb.predict_proba(X_train)[:, 1])
    )
    test_roc_auc.append(
        roc_auc_score(y_test, cb.predict_proba(X_test)[:, 1])
    )

plot_auc_array(l_r_array, tr_roc_auc, test_roc_auc)

In [None]:
# Sweep the tree depth from 1 to 9 at 200 iterations and learning rate 0.43.
depth_array = np.arange(1, 10, 1)
tr_roc_auc.clear()
test_roc_auc.clear()

for d in depth_array:
    cb = CatBoostClassifier(
        iterations=200,
        max_depth=d,
        learning_rate=0.43,
        loss_function='Logloss',
        verbose=False,
        thread_count=-1,
    )
    cb.fit(train_data)
    tr_roc_auc.append(
        roc_auc_score(y_train, cb.predict_proba(X_train)[:, 1])
    )
    test_roc_auc.append(
        roc_auc_score(y_test, cb.predict_proba(X_test)[:, 1])
    )

plot_auc_array(depth_array, tr_roc_auc, test_roc_auc)

In [None]:
# Final CatBoost model for the "without categorical feature" setup.
cat_one_model = CatBoostClassifier(
    iterations=200,
    depth=4,
    learning_rate=0.63,
    loss_function='Logloss',
    verbose=False,
)
cat_one_model.fit(train_data, plot=True)

Метрики для малой тестовой выборки

In [None]:
# ROC AUC on the held-out split, scored with the class-1 probability.
roc_auc_score(
    y_true=y_test,
    y_score=cat_one_model.predict_proba(X_test)[:, 1],
)

In [None]:
# Accuracy on the held-out split.
accuracy_score(
    y_true=y_test,
    y_pred=cat_one_model.predict(X_test),
)

Метрики для большой тестовой выборки

In [None]:
# ROC AUC on the big sample (converted-Timestamp features).
roc_auc_score(
    y_true=y_d,
    y_score=cat_one_model.predict_proba(X_not_d)[:, 1],
)

In [None]:
# Accuracy on the big sample (converted-Timestamp features).
accuracy_score(
    y_true=y_d,
    y_pred=cat_one_model.predict(X_not_d),
)

In [None]:
# SHAP values of the second CatBoost model on the training split, shown as a
# summary plot.
explainer = shap.TreeExplainer(cat_one_model)
shap_values = explainer.shap_values(X_train)
shap.summary_plot(shap_values, features=X_train)

## LGBM

In [None]:
import lightgbm as lgb
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below — consider a narrower filter.
warnings.filterwarnings('ignore')

### With categorical feature

In [None]:
# Cast the categorical columns of X to pandas 'category' dtype (in place),
# then redo the 70/30 split.
cat_feat = ['Timestamp']
for col in cat_feat:
    X[col] = X[col].astype('category')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
# LightGBM datasets; the validation set references the training set, and the
# raw data is kept after construction (free_raw_data=False).
train = lgb.Dataset(
    X_train,
    label=y_train,
    categorical_feature=cat_feat,
    free_raw_data=False,
)
test = lgb.Dataset(
    X_test,
    label=y_test,
    reference=train,
    categorical_feature=cat_feat,
    free_raw_data=False,
)

In [None]:
def set_parametrs():
    """Return a fresh dict with the baseline LightGBM training parameters.

    A new dict is built on every call, so callers may modify the result
    freely without affecting later calls.
    """
    return {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'binary_logloss',
        'learning_rate': 0.1,
        'num_threads': -1,
        'max_depth': 3,
        'verbose': -1,
        'num_leaves': 32,
    }

In [None]:
# Working parameter dict; the sweep cells below modify it in place.
parametrs = set_parametrs()

In [None]:
# Sweep the number of boosting rounds (20..140, step 10) and record the
# train/test ROC AUC for each setting.
# NOTE(review): `verbose_eval` was removed from lgb.train in newer LightGBM
# releases — confirm the installed version supports it.
n_est = np.arange(20, 150, 10)
tr_roc_auc.clear()
test_roc_auc.clear()

for n in n_est:
    clf = lgb.train(
        parametrs,
        train,
        num_boost_round=n,
        valid_sets=test,
        verbose_eval=False,
    )
    tr_roc_auc.append(roc_auc_score(y_train, clf.predict(X_train)))
    test_roc_auc.append(roc_auc_score(y_test, clf.predict(X_test)))

plot_auc_array(n_est, tr_roc_auc, test_roc_auc)

In [None]:
# Sweep the learning rate over 100 evenly spaced values in [0.01, 1] at a
# fixed 80 boosting rounds. The shared `parametrs` dict is updated in place,
# so the last value tried stays set after the loop.
tr_roc_auc.clear()
test_roc_auc.clear()
l_r = np.linspace(0.01, 1, 100)

for l in l_r:
    parametrs['learning_rate'] = l
    clf = lgb.train(
        parametrs,
        train,
        num_boost_round=80,
        valid_sets=test,
        verbose_eval=False,
    )
    tr_roc_auc.append(roc_auc_score(y_train, clf.predict(X_train)))
    test_roc_auc.append(roc_auc_score(y_test, clf.predict(X_test)))

plot_auc_array(l_r, tr_roc_auc, test_roc_auc)

In [None]:
# Sweep max_depth from 1 to 9 (num_leaves = 2**depth) at learning rate 0.6
# and 50 boosting rounds.
# The loop variable `d` is still read by the next cell — do not rename it.
test_roc_auc.clear()
tr_roc_auc.clear()
depth = np.arange(1, 10, 1)

parametrs['learning_rate'] = 0.6
for d in depth:
    parametrs['max_depth'] = d
    parametrs['num_leaves'] = 2**d
    clf = lgb.train(
        parametrs,
        train,
        num_boost_round=50,
        valid_sets=test,
        verbose_eval=False,
    )
    tr_roc_auc.append(roc_auc_score(y_train, clf.predict(X_train)))
    test_roc_auc.append(roc_auc_score(y_test, clf.predict(X_test)))

plot_auc_array(depth, tr_roc_auc, test_roc_auc)

In [None]:
# Final LightGBM model for the "with categorical feature" setup.
parametrs['max_depth'] = 3
# Derive num_leaves from the chosen depth instead of the loop variable `d`
# left over from the depth sweep (which ends at 9, i.e. 512 leaves, and is
# undefined if that cell was not run).
parametrs['num_leaves'] = 2 ** parametrs['max_depth']
lgbm_all_model = lgb.train(
    parametrs,
    train,
    num_boost_round=75,
    valid_sets=test,
    verbose_eval=False,
)

Метрики для малой тестовой выборки

In [None]:
# ROC AUC on the held-out split.
roc_auc_score(
    y_true=y_test,
    y_score=lgbm_all_model.predict(X_test),
)

In [None]:
# Accuracy on the held-out split; predictions are rounded to 0/1 labels.
accuracy_score(
    y_true=y_test,
    y_pred=lgbm_all_model.predict(X_test).round(0),
)

Метрики для большой тестовой выборки

In [None]:
# Give the big sample the same 'category' dtype on the categorical columns
# before predicting with the LightGBM model.
for col in cat_feat:
    X_d[col] = X_d[col].astype('category')

In [None]:
# ROC AUC on the big sample.
roc_auc_score(
    y_true=y_d,
    y_score=lgbm_all_model.predict(X_d),
)

In [None]:
# Accuracy on the big sample; predictions are rounded to 0/1 labels.
accuracy_score(
    y_true=y_d,
    y_pred=lgbm_all_model.predict(X_d).round(0),
)

In [None]:
# SHAP values of the LightGBM model on the training split, shown as a
# summary plot.
explainer = shap.TreeExplainer(lgbm_all_model)
shap_values = explainer.shap_values(X_train)
shap.summary_plot(shap_values, features=X_train)

### Without categorical feature

In [None]:
# 70/30 split on the features prepared for the no-categorical experiments.
X_train, X_test, y_train, y_test = train_test_split(
    X_not, y, test_size=0.3, random_state=42
)

In [None]:
# LightGBM datasets without an explicit categorical_feature list.
train = lgb.Dataset(
    X_train,
    label=y_train,
    free_raw_data=False,
)
test = lgb.Dataset(
    X_test,
    label=y_test,
    reference=train,
    free_raw_data=False,
)

In [None]:
# Sweep the number of boosting rounds (20..140, step 10) and record the
# train/test ROC AUC for each setting.
# Start from the baseline parameters again: `parametrs` still holds the
# learning rate, depth and num_leaves left over from the previous section,
# which would make this sweep incomparable with the matching sweep there.
# NOTE(review): `verbose_eval` was removed from lgb.train in newer LightGBM
# releases — confirm the installed version supports it.
parametrs = set_parametrs()
n_est = np.arange(20, 150, 10)
tr_roc_auc.clear()
test_roc_auc.clear()

for n in n_est:
    clf = lgb.train(
        parametrs,
        train,
        num_boost_round=n,
        valid_sets=test,
        verbose_eval=False,
    )
    tr_roc_auc.append(roc_auc_score(y_train, clf.predict(X_train)))
    test_roc_auc.append(roc_auc_score(y_test, clf.predict(X_test)))

plot_auc_array(n_est, tr_roc_auc, test_roc_auc)


In [None]:
# Sweep the learning rate over 100 evenly spaced values in [0.01, 1] at a
# fixed 100 boosting rounds. The shared `parametrs` dict is updated in place,
# so the last value tried stays set after the loop.
tr_roc_auc.clear()
test_roc_auc.clear()
l_r = np.linspace(0.01, 1, 100)

for l in l_r:
    parametrs['learning_rate'] = l
    clf = lgb.train(
        parametrs,
        train,
        num_boost_round=100,
        valid_sets=test,
        verbose_eval=False,
    )
    tr_roc_auc.append(roc_auc_score(y_train, clf.predict(X_train)))
    test_roc_auc.append(roc_auc_score(y_test, clf.predict(X_test)))

plot_auc_array(l_r, tr_roc_auc, test_roc_auc)

In [None]:
# Sweep max_depth from 1 to 9 (num_leaves = 2**depth) at learning rate 0.6
# and 50 boosting rounds.
# The loop variable `d` is still read by the next cell — do not rename it.
test_roc_auc.clear()
tr_roc_auc.clear()
depth = np.arange(1, 10, 1)

parametrs['learning_rate'] = 0.6
for d in depth:
    parametrs['max_depth'] = d
    parametrs['num_leaves'] = 2**d
    clf = lgb.train(
        parametrs,
        train,
        num_boost_round=50,
        valid_sets=test,
        verbose_eval=False,
    )
    tr_roc_auc.append(roc_auc_score(y_train, clf.predict(X_train)))
    test_roc_auc.append(roc_auc_score(y_test, clf.predict(X_test)))

plot_auc_array(depth, tr_roc_auc, test_roc_auc)

In [None]:
# Final LightGBM model for the "without categorical feature" setup.
parametrs['max_depth'] = 3
# Derive num_leaves from the chosen depth instead of the loop variable `d`
# left over from the depth sweep (which ends at 9, i.e. 512 leaves, and is
# undefined if that cell was not run).
parametrs['num_leaves'] = 2 ** parametrs['max_depth']
lgbm_one_model = lgb.train(
    parametrs,
    train,
    num_boost_round=100,
    valid_sets=test,
    verbose_eval=False,
)

Метрики для малой тестовой выборки

In [None]:
# ROC AUC on the held-out split.
roc_auc_score(
    y_true=y_test,
    y_score=lgbm_one_model.predict(X_test),
)

In [None]:
# Accuracy on the held-out split; predictions are rounded to 0/1 labels.
accuracy_score(
    y_true=y_test,
    y_pred=lgbm_one_model.predict(X_test).round(0),
)

Метрики для большой тестовой выборки

In [None]:
# ROC AUC on the big sample.
roc_auc_score(
    y_true=y_d,
    y_score=lgbm_one_model.predict(X_not_d),
)

In [None]:
# Accuracy on the big sample; predictions are rounded to 0/1 labels.
accuracy_score(
    y_true=y_d,
    y_pred=lgbm_one_model.predict(X_not_d).round(0),
)

In [None]:
# SHAP values of the second LightGBM model on the training split, shown as a
# summary plot.
explainer = shap.TreeExplainer(lgbm_one_model)
shap_values = explainer.shap_values(X_train)
shap.summary_plot(shap_values, features=X_train)