### データセットの取得
同じディレクトリで  
git clone https://github.com/Morzeux/HttpParamsDataset  
を実行してデータを取得する

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_validate
import optuna

In [None]:
# Load payload_train.csv from the cloned HttpParamsDataset directory.
df = pd.read_csv('./HttpParamsDataset/payload_train.csv')

In [None]:
# Bare expression: shows the loaded frame as the cell output.
df

In [None]:
def H_entropy(x):
    """Return the Shannon entropy, in bits, of the symbols in ``x``.

    Each distinct symbol's probability is its relative frequency in ``x``;
    the result is ``-sum(p * log2(p))`` over those probabilities.

    Args:
        x: A sized iterable of hashable symbols, typically a string whose
            characters are the symbols.

    Returns:
        The entropy as a float (0 for empty input or a single repeated
        symbol).
    """
    # Local import keeps this definition self-contained; Counter tallies all
    # symbols in one pass instead of calling x.count() once per distinct symbol.
    from collections import Counter

    total = len(x)
    if total == 0:
        # Nothing to measure; also avoids ever dividing by zero.
        return 0.0

    # Counter preserves first-occurrence order, so terms are summed in the
    # same order as the symbols first appear in x.
    probabilities = (count / total for count in Counter(x).values())
    return -sum(p * np.log2(p) for p in probabilities)

In [None]:
# Per-payload entropy for the rows whose attack_type is 'norm'.
df_norm = df.loc[df['attack_type'] == 'norm']
norm_entropies = [H_entropy(payload) for payload in df_norm['payload']]

In [None]:
# Arithmetic mean of the 'norm' payload entropies.
print(
    '通常の場合のエントロピー',
    sum(norm_entropies) / len(norm_entropies),
)

In [None]:
# Per-payload entropy for the rows whose attack_type is 'sqli'.
df_sqli = df.loc[df['attack_type'] == 'sqli']
sqli_entropies = [H_entropy(payload) for payload in df_sqli['payload']]

In [None]:
# Arithmetic mean of the 'sqli' payload entropies.
print(
    'SQLインジェクションの場合のエントロピー',
    sum(sqli_entropies) / len(sqli_entropies),
)

In [None]:
# Histogram of the 'norm' payload entropies: 30 bins over the range 0..6.
fig, ax = plt.subplots()
ax.set(
    title='Entropies of normal HTTP query string',
    xlabel='Entropy',
    ylabel='Numbers',
)
ax.hist(norm_entropies, bins=30, range=(0, 6), color='green')
plt.show()

In [None]:
# Histogram of the 'sqli' payload entropies: 30 bins over the range 0..6.
fig, ax = plt.subplots()
ax.set(
    title='Entropies of SQLi HTTP query string',
    xlabel='Entropy',
    ylabel='Numbers',
)
ax.hist(sqli_entropies, bins=30, range=(0, 6), color='red')
plt.show()

In [None]:
# Rows whose attack_type is 'sqli'; the bare name shows them as the cell output.
df_sqli = df.loc[df['attack_type'] == 'sqli']
df_sqli

In [None]:
# Percentage of 'sqli' payloads that contain a closing parenthesis.
# regex=False matches ')' literally, so no regex escaping is needed and the
# invalid '\)' escape sequence in a plain string literal is avoided.
print('{:.2f}%'.format(
    df_sqli['payload'].str.contains(')', regex=False).sum() / len(df_sqli['payload']) * 100
))

In [None]:
# Rows whose attack_type is 'norm'; the bare name shows them as the cell output.
df_norm = df.loc[df['attack_type'] == 'norm']
df_norm

In [None]:
# Percentage of 'norm' payloads that contain a closing parenthesis.
# regex=False matches ')' literally, so no regex escaping is needed and the
# invalid '\)' escape sequence in a plain string literal is avoided.
print('{:.2f}%'.format(
    df_norm['payload'].str.contains(')', regex=False).sum() / len(df_norm['payload']) * 100
))

In [None]:
def func_preprocessing(df):
    """Keep the norm/sqli rows and attach feature and numeric label columns.

    Args:
        df: DataFrame with at least ``attack_type``, ``payload`` and
            ``label`` columns.

    Returns:
        A DataFrame restricted to rows whose ``attack_type`` is ``'norm'`` or
        ``'sqli'``, with two added columns -- ``entropy`` (``H_entropy`` of
        the payload) and ``closing_parenthesis`` (1 if the payload contains
        ``')'``, else 0) -- and with ``label`` values ``"norm"``/``"anom"``
        replaced by 0/1.
    """
    wanted = df['attack_type'].isin(['norm', 'sqli'])
    selected = df[wanted]
    payloads = selected['payload']

    # Both features are built as plain lists, one entry per remaining row.
    selected = selected.assign(
        entropy=[H_entropy(text) for text in payloads],
        closing_parenthesis=[int(text.count(')') > 0) for text in payloads],
    )

    numeric_label = selected['label'].replace({"norm": 0, "anom": 1})
    return selected.assign(label=numeric_label)

In [None]:
# Rebind df to the preprocessed frame returned by func_preprocessing.
df = func_preprocessing(df)
# Bare expression: shows the processed frame as the cell output.
df

In [None]:
# Read payload_test.csv and run it through the same preprocessing as df.
test_data = func_preprocessing(
    pd.read_csv('./HttpParamsDataset/payload_test.csv')
)

In [None]:
# Select the same three feature columns from both frames.
df_x, test_x = (
    frame[['length', 'entropy', 'closing_parenthesis']]
    for frame in (df, test_data)
)

In [None]:
# Label column of each frame, kept as single-column DataFrames.
df_y, test_y = (frame[['label']] for frame in (df, test_data))

# NOTE(review): the training and test data are concatenated here and then
# split apart again in a later step -- why?
X_all = pd.concat([df_x, test_x], axis=0)
y_all = pd.concat([df_y, test_y], axis=0)

In [None]:
# Shuffle the combined rows and hold out 20% of them; the seed is fixed at 101.
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    test_size=0.2,
    shuffle=True,
    random_state=101,
)

In [None]:
class Objective_DTC:
    """Callable Optuna objective that scores a DecisionTreeClassifier.

    The instance holds the feature matrix and labels; calling it with a trial
    samples hyperparameters and returns the mean cross-validated accuracy.
    """

    def __init__(self, X, y):
        """Store the features ``X`` and labels ``y`` used for every trial."""
        self.X, self.y = X, y

    def __call__(self, trial):
        """Sample hyperparameters from ``trial`` and return the mean accuracy.

        ``criterion`` is drawn from {'gini', 'entropy'} and ``max_depth`` from
        the integers 1..64; the score is the mean of ``test_score`` from
        ``cross_validate`` with ``scoring='accuracy'``.
        """
        criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
        max_depth = trial.suggest_int('max_depth', 1, 64)

        classifier = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth)

        # n_jobs=-1 is passed through to cross_validate unchanged.
        cv_result = cross_validate(
            classifier, self.X, self.y, scoring='accuracy', n_jobs=-1
        )
        return cv_result['test_score'].mean()

In [None]:
objective = Objective_DTC(X_train, y_train)
# The objective returns mean cross-validated accuracy, so the study has to
# maximize it. create_study() minimizes by default, which would make
# best_params the lowest-accuracy trial.
study = optuna.create_study(direction='maximize')
# Keep sampling trials until the 60-second timeout elapses.
study.optimize(objective, timeout=60)
print('params:', study.best_params)

In [None]:
# Fit a tree with the study's best hyperparameters, then score it on the
# held-out split.
model = DecisionTreeClassifier(**study.best_params).fit(X_train, y_train)
pred = model.predict(X_test)

print('Accuracy: {:.5f} %'.format(accuracy_score(y_test, pred) * 100))
print('Confusion Matrix', confusion_matrix(y_test, pred), sep='\n')