In [34]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
sys.path.append('../')

import argparse
import numpy as np
import pandas as pd
import random
from importlib import reload  
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.utils import shuffle

from loglizer.models import InvariantsMiner, PCA, IsolationForest, OneClassSVM, LogClustering, LR, SVM
from loglizer import dataloader, preprocessing
from loglizer.utils import metrics

In [41]:
# ouput_dir = "../output/no_shuffle_bgl_325/"
# middle_dir = ""
# log_file = "BGL.log"

ouput_dir = r"E:\logbert-main\datasets\jeecgboot/"
middle_dir = ""
log_file = "jeecgboot.log"

<!-- # Produce event templates from train test dataset -->

# Split train test data

In [43]:
(x_train, y_train), (x_test, y_test) = dataloader.load_data(ouput_dir, middle_dir, log_file, is_mapping=False)

Train normal size: 2354
Train abnormal size: 558
Test normal size: 3886
Test abnormal size: 837


  train = np.array(train).reshape(-1, 1)
  test_normal = np.array(test_normal).reshape(-1,1)
  abnormal = np.array(abnormal).reshape(-1,1)


In [44]:
feature_extractor = preprocessing.FeatureExtractor()
x_train = feature_extractor.fit_transform(x_train)
x_test = feature_extractor.transform(x_test)

Train data shape: 2912-by-98

Test data shape: 4723-by-98



In [45]:
%%time
print("="*20 + " Model: PCA " + "="*20)
for th in np.arange(1):
    print("theshold", th)
    model = PCA(n_components=0.8, threshold=1, c_alpha = 1.9600)
    model.fit(x_train)
    print('Train validation:')
    precision, recall, f1 = model.evaluate(x_train, y_train)
    print('Test validation:')
    precision, recall, f1 = model.evaluate(x_test, y_test)

theshold 0
n_components: 2
Project matrix shape: 98-by-98
SPE threshold: 1

Train validation:
Confusion Matrix: TP: 558, FP: 1644, TN: 710, FN: 0
Precision: 25.341%, recall: 100.000%, F1-measure: 40.435%

Test validation:
Confusion Matrix: TP: 837, FP: 3573, TN: 313, FN: 0
Precision: 18.980%, recall: 100.000%, F1-measure: 31.904%

CPU times: total: 172 ms
Wall time: 1.26 s


In [46]:
%%time
print("="*20 + " Model: IsolationForest " + "="*20)
model = IsolationForest(n_estimators=100, max_samples='auto', contamination='auto', random_state=19)
model.fit(x_train)
print('Train validation:')
precision, recall, f1 = model.evaluate(x_train, y_train)
print('Test validation:')
precision, recall, f1 = model.evaluate(x_test, y_test)

Train validation:
Confusion Matrix: TP: 55, FP: 13, TN: 2341, FN: 503
Precision: 80.882, recall: 9.857, F1-measure: 17.572

Test validation:
Confusion Matrix: TP: 71, FP: 12, TN: 3874, FN: 766
Precision: 85.542, recall: 8.483, F1-measure: 15.435

CPU times: total: 391 ms
Wall time: 462 ms


In [47]:
%%time
print("="*20 + " Model: one class SVM " + "="*20)
model = OneClassSVM(kernel='rbf')
model.fit(x_train, y_train)

print('Train validation:')
precision, recall, f1 = model.evaluate(x_train, y_train)
print('Test validation:')
precision, recall, f1 = model.evaluate(x_test, y_test)

Train validation:
Confusion Matrix: TP: 185, FP: 2354, TN: 0, FN: 373
Precision: 7.286, recall: 33.154, F1-measure: 11.947

Test validation:
Confusion Matrix: TP: 276, FP: 3886, TN: 0, FN: 561
Precision: 6.631, recall: 32.975, F1-measure: 11.042

CPU times: total: 1.25 s
Wall time: 1.86 s


In [51]:
%%time
print("="*20 + " Model: LogClustering " + "="*20)
max_dist = 0.3  # the threshold to stop the clustering process
anomaly_threshold = 0  # the threshold for anomaly detection
model = LogClustering(max_dist=max_dist, anomaly_threshold=anomaly_threshold)
model.fit(x_train[y_train == 0, :])  # Use only normal samples for training
print('Train validation:')
precision, recall, f1 = model.evaluate(x_train, y_train)
print('Test validation:')
precision, recall, f1 = model.evaluate(x_test, y_test)


Starting offline clustering...
Processed 1000 instances.
Found 46 clusters offline.

Starting online clustering...
Processed 2000 instances.
Processed 2354 instances.
Found 79 clusters online.

Train validation:
Confusion Matrix: TP: 556, FP: 2286, TN: 68, FN: 2
Precision: 19.564, recall: 99.642, F1-measure: 32.706

Test validation:
Confusion Matrix: TP: 835, FP: 3867, TN: 19, FN: 2
Precision: 17.758, recall: 99.761, F1-measure: 30.150

CPU times: total: 8.67 s
Wall time: 11.3 s


In [1]:
# 读取文件seq.csv
import pandas as pd

seq = pd.read_csv(r"E:\logbert-main\datasets\jeecgboot\seq.csv")
# None改名为Label
seq.rename(columns={'None':'Label'}, inplace=True)
seq

D:\Anaconda\envs\LogBert\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
D:\Anaconda\envs\LogBert\lib\site-packages\numpy\.libs\libopenblas.GK7GX5KEQ4F6UYO3P26ULGBQYHGQO7J4.gfortran-win_amd64.dll


Unnamed: 0.1,Unnamed: 0,Label
0,0,False
1,1,False
2,2,False
3,3,False
4,4,False
...,...,...
10557,52 58 53 54 55 53 65 55 53 56 55 52 53 57 55,True
10558,53 56 55 53 56 55 53 56 55 52 58 53 54 55,True
10559,53 56 55 52 53 54 55 60 61 55 53 56 55,False
10560,53 56 55 53 56 55 53 56 55 52 53 57 55 53 56 5...,True


In [2]:
# 将Label列中的False改为0，True改为1
seq['Label'] = seq['Label'].map({False:0, True:1})
seq.rename(columns={'Unnamed: 0':'Seq'}, inplace=True)
seq

Unnamed: 0,Seq,Label
0,0,0.0
1,1,0.0
2,2,0.0
3,3,0.0
4,4,0.0
...,...,...
10557,52 58 53 54 55 53 65 55 53 56 55 52 53 57 55,1.0
10558,53 56 55 53 56 55 53 56 55 52 58 53 54 55,1.0
10559,53 56 55 52 53 54 55 60 61 55 53 56 55,0.0
10560,53 56 55 53 56 55 53 56 55 52 53 57 55 53 56 5...,1.0


In [14]:
deeplog_df_len = len(seq)
data_len = deeplog_df_len
train_len = int(data_len * 0.3)  # 训练集长度
valid_len = int(data_len * 0.2)  # 验证集长度
test_len = data_len - train_len - valid_len  # 测试集长度

train_deeplog_df = seq.iloc[:train_len]  # 训练集
valid_deeplog_df = seq.iloc[train_len:train_len + valid_len]  # 验证集
test_deeplog_df = seq.iloc[train_len + valid_len:]  # 测试集

In [19]:
# len(train_deeplog_df)
# len(valid_deeplog_df)
# len(test_deeplog_df)
print("train_len:", len(train_deeplog_df))
print("valid_len:", len(valid_deeplog_df))
print("test_len:", len(test_deeplog_df))

train_len: 3168
valid_len: 2112
test_len: 5282


In [21]:
train_deeplog_df_len = int(len(train_deeplog_df))
train_normal_df = train_deeplog_df[train_deeplog_df["Label"] == 0]
train_abnormal_df = train_deeplog_df[train_deeplog_df["Label"] == 1]
print("train_normal_df_len:", len(train_normal_df))
print("train_abnormal_df_len:", len(train_abnormal_df))

train_normal_df_len: 2354
train_abnormal_df_len: 814


In [22]:
valid_deeplog_df_len = int(len(valid_deeplog_df))
valid_normal_df = valid_deeplog_df[valid_deeplog_df["Label"] == 0]
valid_abnormal_df = valid_deeplog_df[valid_deeplog_df["Label"] == 1]
print("valid_normal_df_len:", len(valid_normal_df))
print("valid_abnormal_df_len:", len(valid_abnormal_df))


valid_normal_df_len: 1411
valid_abnormal_df_len: 701


In [24]:
test_deeplog_df_len = int(len(test_deeplog_df))
test_normal_df = test_deeplog_df[test_deeplog_df["Label"] == 0]
test_abnormal_df = test_deeplog_df[test_deeplog_df["Label"] == 1]
print("test_normal_df_len:", len(test_normal_df))
print("test_abnormal_df_len:", len(test_abnormal_df))

test_normal_df_len: 3886
test_abnormal_df_len: 1395


Unnamed: 0,Seq,Label
5282,53 56 55 53 56 55 52 53 54 55,0.0
5284,53 65 55,0.0
5285,60 61 55 60 61 55,0.0
5287,60 88 55 53 56 55,0.0
5288,60 61 55 53 56 55 53 56 55 60 61 55,0.0
...,...,...
10552,52 53 57 55,0.0
10553,53 56 55 53 56 55 53 56 55,0.0
10555,60 64 55 53 56 55 53 56 55 53 65 55,0.0
10556,53 65 55 52 53 57 55 53 56 55 53 56 55,0.0


In [25]:
# 将seq列下的所有值单独保存为numpy数组
seq_np = seq['Seq'].values
train_normal_df_np = train_normal_df['Seq'].values
valid_normal_df_np = valid_normal_df['Seq'].values
test_normal_df_np = test_normal_df['Seq'].values
test_abnormal_df_np = test_abnormal_df['Seq'].values

In [40]:
seq_np[500]

'60 61 55 60 64 55 '

In [27]:
seq_np_ls = []

for i in range(len(seq_np)):
    seq_np_ls.append(seq_np[i])

train_normal_df_np_ls = []
for i in range(len(train_normal_df_np)):
    train_normal_df_np_ls.append(train_normal_df_np[i])

valid_normal_df_np_ls = []
for i in range(len(valid_normal_df_np)):
    valid_normal_df_np_ls.append(valid_normal_df_np[i])

test_normal_df_np_ls = []
for i in range(len(test_normal_df_np)):
    test_normal_df_np_ls.append(test_normal_df_np[i])

test_abnormal_df_np_ls = []
for i in range(len(test_abnormal_df_np)):
    test_abnormal_df_np_ls.append(test_abnormal_df_np[i])


In [29]:
for i in range(len(seq_np_ls)):
    seq_np_ls[i] = seq_np_ls[i].split(' ')

for i in range(len(train_normal_df_np_ls)):
    train_normal_df_np_ls[i] = train_normal_df_np_ls[i].split(' ')

for i in range(len(valid_normal_df_np_ls)):
    valid_normal_df_np_ls[i] = valid_normal_df_np_ls[i].split(' ')

for i in range(len(test_normal_df_np_ls)):
    test_normal_df_np_ls[i] = test_normal_df_np_ls[i].split(' ')

for i in range(len(test_abnormal_df_np_ls)):
    test_abnormal_df_np_ls[i] = test_abnormal_df_np_ls[i].split(' ')

In [31]:
for i in range(len(seq_np_ls)):
    for j in range(len(seq_np_ls[i])):
        if seq_np_ls[i][j] == '':
            seq_np_ls[i].remove(seq_np_ls[i][j])
        else:
            seq_np_ls[i][j] = int(seq_np_ls[i][j])

for i in range(len(train_normal_df_np_ls)):
    for j in range(len(train_normal_df_np_ls[i])):
        if train_normal_df_np_ls[i][j] == '':
            train_normal_df_np_ls[i].remove(train_normal_df_np_ls[i][j])
        else:
            train_normal_df_np_ls[i][j] = int(train_normal_df_np_ls[i][j])

for i in range(len(valid_normal_df_np_ls)):
    for j in range(len(valid_normal_df_np_ls[i])):
        if valid_normal_df_np_ls[i][j] == '':
            valid_normal_df_np_ls[i].remove(valid_normal_df_np_ls[i][j])
        else:
            valid_normal_df_np_ls[i][j] = int(valid_normal_df_np_ls[i][j])

for i in range(len(test_normal_df_np_ls)):
    for j in range(len(test_normal_df_np_ls[i])):
        if test_normal_df_np_ls[i][j] == '':
            test_normal_df_np_ls[i].remove(test_normal_df_np_ls[i][j])
        else:
            test_normal_df_np_ls[i][j] = int(test_normal_df_np_ls[i][j])

for i in range(len(test_abnormal_df_np_ls)):
    for j in range(len(test_abnormal_df_np_ls[i])):
        if test_abnormal_df_np_ls[i][j] == '':
            test_abnormal_df_np_ls[i].remove(test_abnormal_df_np_ls[i][j])
        else:
            test_abnormal_df_np_ls[i][j] = int(test_abnormal_df_np_ls[i][j])

KeyboardInterrupt: 

In [13]:
from tqdm.notebook import tqdm

with open(r'E:\logbert-main\datasets\jeecgboot/all_seq', 'w') as f:
    for i in tqdm(range(len(seq_np_ls))):
        for j in range(len(seq_np_ls[i])):
            f.write(str(seq_np_ls[i][j]) + ' ')
        f.write('\n')
print('ok')

  0%|          | 0/10562 [00:00<?, ?it/s]

ok


In [32]:
from tqdm.notebook import tqdm

with open(r'E:\logbert-main\datasets\jeecgboot/train', 'w') as f:
    for i in tqdm(range(len(train_normal_df_np_ls))):
        for j in range(len(train_normal_df_np_ls[i])):
            f.write(str(train_normal_df_np_ls[i][j]) + ' ')
        f.write('\n')
print('ok')

with open(r'E:\logbert-main\datasets\jeecgboot/valid', 'w') as f:
    for i in tqdm(range(len(valid_normal_df_np_ls))):
        for j in range(len(valid_normal_df_np_ls[i])):
            f.write(str(valid_normal_df_np_ls[i][j]) + ' ')
        f.write('\n')
print('ok')

with open(r'E:\logbert-main\datasets\jeecgboot/test_normal', 'w') as f:
    for i in tqdm(range(len(test_normal_df_np_ls))):
        for j in range(len(test_normal_df_np_ls[i])):
            f.write(str(test_normal_df_np_ls[i][j]) + ' ')
        f.write('\n')
print('ok')

with open(r'E:\logbert-main\datasets\jeecgboot/test_abnormal', 'w') as f:
    for i in tqdm(range(len(test_abnormal_df_np_ls))):
        for j in range(len(test_abnormal_df_np_ls[i])):
            f.write(str(test_abnormal_df_np_ls[i][j]) + ' ')
        f.write('\n')
print('ok')

  0%|          | 0/2354 [00:00<?, ?it/s]

ok


  0%|          | 0/1411 [00:00<?, ?it/s]

ok


  0%|          | 0/3886 [00:00<?, ?it/s]

ok


  0%|          | 0/1395 [00:00<?, ?it/s]

ok
