# 模板匹配

> 用于二分类任务：区分标签为 0 和标签为 1 的两类样本。

In [12]:
import pandas as pd
from drain3 import TemplateMiner # 开源在线日志解析框架
from drain3.file_persistence import FilePersistence
from drain3.template_miner_config import TemplateMinerConfig

## 1. 读取日志数据

In [13]:
# Load all log data: the two training files and both preliminary test splits.
data_train = pd.read_csv('./data/train/preliminary_sel_log_dataset.csv')
data_add = pd.read_csv('./data/train/additional_sel_log_dataset.csv')
data_test_a = pd.read_csv('./data/test/preliminary_sel_log_dataset_a.csv')
data_test_b = pd.read_csv('./data/test/preliminary_sel_log_dataset_b.csv')
# Stack the four frames row-wise; the original per-file row indices are kept as-is.
data_all_msg = pd.concat([data_train, data_add, data_test_a, data_test_b])

## 2. 日志模板匹配

In [14]:
# Build the miner configuration from the ini file in the working directory.
config = TemplateMinerConfig()
config.load('./drain3.ini')
# Set after load(), so profiling ends up disabled whatever the ini file contains.
config.profiling_enabled = False

In [15]:
# Base path (directory + file stem, no extension) used for the miner's files.
drain_file = './model/template/comp_a_sellog'
# The persistence handler is pointed at '<drain_file>.bin'.
persistence = FilePersistence(drain_file + '.bin')
# Miner that combines the persistence handler with the configuration loaded earlier.
template_miner = TemplateMiner(persistence, config=config)

In [16]:
# Template extraction: feed every raw message of the combined dataset to the miner.
all_messages = data_all_msg['msg'].tolist()
for raw_msg in all_messages:
    template_miner.add_log_message(raw_msg)
# Number of clusters the miner holds after all messages were added.
temp_count = len(template_miner.drain.clusters)

In [17]:
# The upper limit on the number of templates can be changed in the Drain3 ini file
# (loaded earlier in this notebook as './drain3.ini'); further parameters and methods
# are described in the Drain3 documentation.
temp_count

473

In [18]:
# Template filtering: keep only the clusters whose size is at least 10,
# mapping cluster id -> cluster size.
template_dic = {
    c.cluster_id: c.size
    for c in template_miner.drain.clusters
    if c.size >= 10
}

# Number of templates that passed the size filter.
temp_count_f = len(template_dic)
print(temp_count_f)

259


## 3. 模板匹配

In [19]:
def match_template(df, template_miner, template_dic):
    """Attach the matched template id and template text to one log row.

    Args:
        df: A single row (used with ``DataFrame.apply(..., axis=1)``) that
            exposes the raw message as ``df.msg``.
        template_miner: Object whose ``match(msg)`` returns the matching
            cluster, or a falsy value when nothing matches.
        template_dic: Container of the cluster ids that are kept.

    Returns:
        The same row with ``template_id`` and ``template`` set. Both hold the
        string ``'None'`` when no cluster matched or the matched cluster is
        not in ``template_dic``, so unmatched rows can still be used as a
        feature category of their own.
    """
    matched = template_miner.match(df.msg)  # matching is done by the mining library

    # Guard: no match at all, or a match on a template that was filtered out.
    if not matched or matched.cluster_id not in template_dic:
        df['template_id'] = 'None'
        df['template'] = 'None'
        return df

    df['template_id'] = matched.cluster_id  # template id
    df['template'] = matched.get_template()  # template text
    return df

In [20]:
# Re-assemble the four datasets and tag every row with its matched template.
data = pd.concat([data_train, data_add, data_test_a, data_test_b])
data = data.apply(
    match_template,
    axis=1,
    template_miner=template_miner,
    template_dic=template_dic,
)
# Persist the matched data.
# NOTE(review): drain_file already starts with './model/template/', so this resolves to
# './model/template/./model/template/comp_a_sellog_result_match_data.pkl'. Confirm that the
# nested directory is intended; the reader of this pickle must use the same path.
match_result_path = './model/template/' + drain_file + '_result_match_data.pkl'
data.to_pickle(match_result_path)

In [21]:
# Load the template-matched data back from its pickle.
# NOTE(review): drain_file already contains the './model/template/' directory, so the prefix
# is applied twice in this path; it must stay identical to the path the pickle was written to.
df_data = pd.read_pickle('./model/template/' + drain_file + '_result_match_data.pkl')

In [22]:
# Preview the first rows that were assigned a real template (id other than 'None').
df_data.loc[df_data['template_id'] != 'None'].head()

Unnamed: 0,sn,time,msg,server_model,template_id,template
0,SERVER_25698,2020-10-09 08:32:21,System Boot Initiated BIOS_Boot_Up | State As...,SM0,1,System Boot Initiated <:*:> <:*:> <:*:> <:*:> ...
1,SERVER_25698,2020-10-09 07:43:48,System Boot Initiated BIOS_Boot_Up | State As...,SM0,1,System Boot Initiated <:*:> <:*:> <:*:> <:*:> ...
2,SERVER_25698,2020-10-09 08:16:22,System Boot Initiated BIOS_Boot_Up | State As...,SM0,1,System Boot Initiated <:*:> <:*:> <:*:> <:*:> ...
3,SERVER_25698,2020-10-09 05:46:41,System Boot Initiated BIOS_Boot_Up | State As...,SM0,1,System Boot Initiated <:*:> <:*:> <:*:> <:*:> ...
4,SERVER_25698,2020-10-09 12:59:13,System Boot Initiated BIOS_Boot_Up | State As...,SM0,1,System Boot Initiated <:*:> <:*:> <:*:> <:*:> ...


## 4. 特征生成

In [23]:
def feature_generation(df_data, gap_list, model_name, log_source, win_list, func_list):
    """Build and persist rolling template-count features for every requested time gap.

    For each gap, rows are bucketed by ``sn`` and by ``collect_time`` rounded up
    to that gap, template ids are one-hot encoded and summed per bucket, the
    rolling features are computed by ``feature_win_fun`` and the result is
    pickled under ``./model/template/``.

    Args:
        df_data: DataFrame with ``sn``, ``collect_time`` and ``template_id``
            columns. A ``collect_time_gap`` column is written to it in place
            (holding the values of the last processed gap).
        gap_list: Comma-separated pandas frequency strings, e.g. ``'1h'``.
        model_name: Unused; kept so existing callers keep working.
        log_source: Unused; kept so existing callers keep working.
        win_list: Comma-separated window sizes, forwarded to ``feature_win_fun``
            and used in the output file name.
        func_list: Comma-separated aggregation names, forwarded to
            ``feature_win_fun`` and used in the output file name.

    Returns:
        The feature DataFrame of the first gap in ``gap_list``. This is what the
        function has always returned; the difference is that every gap is now
        processed and pickled, instead of only the first one.
    """
    gaps = gap_list.split(',')

    dummy_list = set(df_data.template_id.unique())
    dummy_col = ['template_id_' + str(x) for x in dummy_list]

    # Parse the timestamps once; each gap only needs a different rounding.
    collect_time = pd.to_datetime(df_data.collect_time)

    first_result = None
    for gap in gaps:
        df_data['collect_time_gap'] = collect_time.dt.ceil(gap)

        # Work on a per-gap frame so the raw input stays available for the
        # remaining gaps (overwriting df_data dropped 'collect_time').
        df_gap = template_dummy(df_data)
        df_gap = df_gap.reset_index(drop=True)
        # .sum() is what agg(<builtin sum>) resolves to, without the pandas 2.x
        # deprecation warning for passing the builtin.
        df_gap = df_gap.groupby(['sn', 'collect_time_gap']).sum().reset_index()
        df_gap = feature_win_fun(df_gap, dummy_col, win_list, func_list, gap)
        # Persist the finished feature table for this gap.
        df_gap.to_pickle('./model/template/cpu_diag_comp_sel_log_all_feature' + gap + '_' + win_list + '_' + func_list +'.pkl')

        if first_result is None:
            first_result = df_gap
    return first_result
    
def template_dummy(df):
    """One-hot encode ``template_id`` next to the grouping keys.

    Args:
        df: DataFrame with ``sn``, ``collect_time_gap`` and ``template_id``.

    Returns:
        DataFrame holding ``sn`` and ``collect_time_gap`` followed by one
        ``template_id_<value>`` indicator column per distinct template id.
        All other input columns are dropped.
    """
    key_part = df[['sn', 'collect_time_gap']]
    indicator_part = pd.get_dummies(df['template_id'], prefix='template_id')
    return pd.concat([key_part, indicator_part], axis=1)

def feature_win_fun(df, dummy_col, win_list, func_list, gap):
    """Compute rolling features for every (window, function) combination.

    Args:
        df: Per-bucket template counts with ``sn``, ``collect_time_gap`` and
            the columns named in ``dummy_col``.
        dummy_col: Names of the feature columns to roll over.
        win_list: Comma-separated window sizes, passed to ``rolling_funcs``.
        func_list: Comma-separated aggregation names, passed to ``rolling_funcs``.
        gap: Not used inside this function.

    Returns:
        DataFrame with ``sn`` and ``collect_time_gap`` followed by one block of
        columns per combination, named ``<column>_<func>_<win>``.

    Note:
        The per-group results lose their index and are glued to the key columns
        by row position, so ``df`` presumably has to arrive ordered by ``sn``
        and ``collect_time_gap`` with a 0..n-1 index - verify against the caller.
    """
    group_cols = ['sn']
    time_cols = ['collect_time_gap']
    windows = win_list.split(',')
    funcs = func_list.split(',')

    # Start with the key columns, then append one feature block per combination.
    pieces = [df[group_cols + time_cols]]
    for win in windows:
        for func in funcs:
            rolled = df.groupby(group_cols).apply(rolling_funcs, win, func, dummy_col)
            suffixed = {col: col + '_' + func + '_' + win for col in dummy_col}
            pieces.append(rolled.reset_index(drop=True).rename(columns=suffixed))
    return pd.concat(pieces, axis=1)

def rolling_funcs(df, window, func, fea_col):
    """Apply a trailing time-window aggregation to the feature columns of one group.

    Args:
        df: Rows of a single group; must contain ``collect_time_gap``
            (datetime) and every column listed in ``fea_col``.
        window: Window length in hours (a number or a numeric string).
        func: Name of the aggregation. Only ``'sum'`` is supported.
        fea_col: List of the columns to aggregate.

    Returns:
        DataFrame indexed by ``collect_time_gap`` in ascending order, holding
        for each row the aggregate of ``fea_col`` over the trailing window.

    Raises:
        ValueError: If ``func`` is not a supported aggregation. Previously this
            case printed a message and then failed with an unbound-variable
            error on the return statement.
    """
    # Fail fast with a clear error instead of falling through to an unbound result.
    if func not in ['sum']:
        raise ValueError('func not existed: ' + str(func))

    # Time-based windows need a sorted datetime index.
    df = df.sort_values('collect_time_gap')
    df = df.set_index('collect_time_gap')
    df = df[fea_col]

    # Built-in rolling sum: for the NaN-free count columns used here it gives the
    # same totals as summing each window in Python, but runs vectorised.
    return df.rolling(str(window) + 'h').sum()

def sum_func(series):
    """Return the total of all values in ``series`` using the built-in ``sum``."""
    total = sum(series)
    return total

In [24]:
# feature_generation reads the timestamps from 'collect_time', so rename 'time' first.
df_data.rename(columns={'time': 'collect_time'}, inplace=True)
# One-hour buckets, a 3-hour rolling window and the 'sum' aggregation;
# model_name and log_source are passed as empty strings.
feature_generation(
    df_data,
    gap_list='1h',
    model_name='',
    log_source='',
    win_list='3',
    func_list='sum',
)

Unnamed: 0,sn,collect_time_gap,template_id_1_sum_3,template_id_2_sum_3,template_id_3_sum_3,template_id_4_sum_3,template_id_5_sum_3,template_id_6_sum_3,template_id_7_sum_3,template_id_8_sum_3,...,template_id_449_sum_3,template_id_451_sum_3,template_id_452_sum_3,template_id_460_sum_3,template_id_461_sum_3,template_id_464_sum_3,template_id_465_sum_3,template_id_467_sum_3,template_id_471_sum_3,template_id_472_sum_3
0,000d33b21436,2020-09-02 12:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,000d33b21436,2020-09-02 16:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0015fe530ad4,2020-05-02 00:00:00,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,00380f1435b0,2020-07-28 08:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0045a71d0221,2020-07-02 06:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
201994,fff42b378722,2020-05-06 10:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
201995,fff42b378722,2020-05-06 16:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
201996,fff73a9e5bd5,2020-03-01 14:00:00,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
201997,fffd22fffe19,2020-01-21 19:00:00,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 5. 使用

In [25]:
# Load the previously built feature data and list its columns.
# NOTE(review): feature_generation writes this pickle under './model/template/', while it is
# read here from the working directory - confirm the file location or align the two paths.
df_data = pd.read_pickle('cpu_diag_comp_sel_log_all_feature1h_3_sum.pkl')
df_data.columns

Index(['sn', 'collect_time_gap', 'template_id_1_sum_3', 'template_id_2_sum_3',
       'template_id_3_sum_3', 'template_id_4_sum_3', 'template_id_5_sum_3',
       'template_id_6_sum_3', 'template_id_7_sum_3', 'template_id_8_sum_3',
       ...
       'template_id_449_sum_3', 'template_id_451_sum_3',
       'template_id_452_sum_3', 'template_id_460_sum_3',
       'template_id_461_sum_3', 'template_id_464_sum_3',
       'template_id_465_sum_3', 'template_id_467_sum_3',
       'template_id_471_sum_3', 'template_id_472_sum_3'],
      dtype='object', length=262)