In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import optuna.integration.lightgbm as olgb
import lightgbm as lgb

In [None]:
# Read the train and test payload CSVs, then keep only the rows whose
# attack_type is 'norm' or 'sqli'; rows with any other attack_type are dropped.
df = pd.read_csv('./HttpParamsDataset/payload_train.csv')
test_data = pd.read_csv('./HttpParamsDataset/payload_test.csv')

# Boolean row masks, one per frame.
train_rows = df['attack_type'].isin(['norm', 'sqli'])
test_train_rows = test_data['attack_type'].isin(['norm', 'sqli'])

df = df.loc[train_rows]
test_data = test_data.loc[test_train_rows]


In [None]:
# Features: every column except the last one (positional slice).
# Targets: the 'label' column, kept as a single-column DataFrame.
df_x, df_y = df.iloc[:, :-1], df[['label']]
test_x, test_y = test_data.iloc[:, :-1], test_data[['label']]

# Stack the rows of both CSVs into one pool (original row indices are kept).
X_all = pd.concat([df_x, test_x], axis=0)
y_all = pd.concat([df_y, test_y], axis=0)

In [None]:
# Encode the string labels as integers: 'norm' -> 0, 'anom' -> 1.
# The explicit cast makes the numeric dtype independent of how the installed
# pandas version infers dtypes inside `replace` (an object-dtype label column
# would otherwise reach the model), and it fails right here if any value other
# than 'norm'/'anom' (or an already-encoded 0/1 on re-run) is present.
rep = y_all.label.replace({'norm': 0, 'anom': 1}).astype('int64')
y_all = y_all.assign(label=rep)

In [None]:
# Character-level TF-IDF over the raw payload strings:
# single characters only (ngram_range (1, 1)), with min_df given as the float 0.1.
# NOTE(review): the vectorizer is fitted on the pooled X_all rows, i.e. before
# the train/test split done later in the notebook -- confirm this is intended.
vec_opts = dict(
    ngram_range=(1, 1),
    analyzer='char',
    min_df=0.1,
)
v = TfidfVectorizer(**vec_opts)

y = y_all
X = v.fit_transform(X_all['payload'])

In [None]:
# Feature names reported by the fitted vectorizer, shown as the cell output.
features = v.get_feature_names_out()
np.asarray(features)

In [None]:
# Dense view of the TF-IDF matrix with one column per entry of `features`.
# NOTE(review): this rebinds `df`, which earlier held the filtered training CSV.
df = pd.DataFrame(X.toarray(), columns=features)
df

In [None]:
# Hold out 20% of the rows (shuffled, fixed random_state) for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=101,
)

# Fixed LightGBM settings handed to the tuner.
params = dict(
    objective="binary",
    metric="binary_logloss",
    verbosity=-1,
    boosting_type="gbdt",
)

# Run the Optuna LightGBM tuner on the training portion only.
train = olgb.Dataset(X_train, label=y_train)
tuner = olgb.LightGBMTunerCV(params, train)
tuner.run()

In [None]:
# Report the tuner's best score, then the best parameter set both as a dict
# and as one "name: value" line per entry.
print('Best score:', tuner.best_score)
best_params = tuner.best_params
print('Best params:', best_params)
print("  Params: ")
for name in best_params:
    print("       {}: {}".format(name, best_params[name]))

In [None]:
# Wrap the split matrices as LightGBM datasets.
# NOTE(review): this rebinds `test_data`, which earlier held the test CSV frame.
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

# Names of the tuned values copied out of best_params, in insertion order.
tuned_keys = (
    'lambda_l1',
    'lambda_l2',
    'num_leaves',
    'feature_fraction',
    'bagging_fraction',
    'bagging_freq',
    'min_child_samples',
)

# Final training parameters: the fixed settings followed by the tuned values.
# A key missing from best_params raises KeyError before `params` is rebound.
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    **{name: best_params[name] for name in tuned_keys},
}

In [None]:
# Fit the final booster on train_data with the assembled params for a fixed
# 100 boosting rounds.
# `verbose_eval` is deliberately not passed: the keyword was removed from
# lgb.train() in LightGBM 4.0 (passing it raises TypeError there). It only
# controlled logging of validation metrics, and no valid_sets are supplied
# in this call, so omitting it changes nothing on older versions either.
gbm = lgb.train(
    params,
    train_data,
    num_boost_round=100,
)


In [None]:
# Score the held-out split: the booster's predictions are rounded to the
# nearest integer with np.rint before being compared with y_test.
preds = gbm.predict(X_test)
pred_labels = np.rint(preds)

accuracy_pct = 100 * accuracy_score(y_test, pred_labels)
print('Accuracy: {:.5f} %'.format(accuracy_pct))

print('Confusion Matrix:')
cm = confusion_matrix(y_test, pred_labels)
print(cm)