# 处理训练数据集
1. 获取所有的token，并存储下来（可以利用无标签数据集）
3. 求取最长长度和平均长度
3. 构造时间序列

In [19]:
# 导入需要的库
import pandas as pd
import ujson as json
import numpy as np
from datetime import datetime

In [25]:
def get_dataset(path):
    """Load a dataset: read the UTF-8 encoded CSV at ``path`` into a DataFrame."""
    return pd.read_csv(path, encoding="utf-8")

def split_msg(msg):
    """Split a raw log message into command-level tokens.

    The message is cut on the literal separator " | ". Leading spaces are
    removed from the first piece only; every other piece is returned exactly
    as it appears between separators.
    """
    # str.split always yields at least one piece, so the unpacking is safe.
    first, *remaining = msg.split(" | ")
    return [first.lstrip(" "), *remaining]

def split_word(msgs):
    """Split command-level tokens into word-level tokens.

    Each string in ``msgs`` is cut on single spaces and the pieces are
    returned as one flat list, in order. Empty strings produced by
    consecutive spaces are kept.

    The list is built in a single pass instead of repeatedly concatenating
    lists, which copied the accumulated result on every iteration.
    """
    return [word for piece in msgs for word in piece.split(" ")]

def get_all_tokens(df_total, path="../dataset/dict/all_tokens.json"):
    """Build the token -> integer id dictionary and save it as JSON.

    Args:
        df_total: DataFrame with "command_tokens" and "word_tokens" columns,
            each cell holding an iterable of token strings.
        path: Destination of the JSON dump. Defaults to the location the
            function previously had hard-coded.

    Returns:
        dict mapping every token to an integer id.

    Ids are positions in the sequence "unique word tokens followed by unique
    command tokens". A token that occurs in both groups keeps the later
    (command) position, so the ids are unique but not necessarily contiguous.
    """
    # dict.fromkeys de-duplicates while preserving first-seen order.
    unique_words = dict.fromkeys(
        token for row in df_total["word_tokens"].values for token in row
    )
    unique_commands = dict.fromkeys(
        token for row in df_total["command_tokens"].values for token in row
    )
    dict_total = {
        token: index
        for index, token in enumerate([*unique_words, *unique_commands])
    }
    # Context manager guarantees the file is flushed and closed after writing.
    with open(path, "w") as handle:
        json.dump(dict_total, handle)
    return dict_total
def groupby_token_list(lists):
    """Merge all token lists belonging to one sn into a single flat list."""
    merged = []
    for tokens in lists:
        merged.extend(tokens)
    return merged

def get_timeseq(time_str):
    """Extract the time-of-day features from a timestamp string.

    Args:
        time_str: Timestamp formatted as "%Y-%m-%d %H:%M:%S".

    Returns:
        ``[hour, minute]`` as a two-element list of ints.

    Raises:
        ValueError: if ``time_str`` does not match the expected format.
    """
    # Parse once and reuse the result instead of parsing per field.
    parsed = datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S")
    return [parsed.hour, parsed.minute]

def groupby_time_list(lists):
    """Convert all timestamps of one sn into an array of [hour, minute] rows."""
    hour_minute_rows = list(map(get_timeseq, lists))
    return np.array(hour_minute_rows)

def groupby_model_list(lists):
    """Collapse the server_model values of one sn to a single value.

    Returns the first value in iteration order; raises IndexError when
    ``lists`` is empty.
    """
    return list(lists)[0]

def pad_token(test_token_list, max_len = 321):
    """Pad or truncate a token list to exactly ``max_len`` entries.

    Shorter lists are extended on the right with the string 'PAD'; lists of
    length ``max_len`` or more are cut down to their first ``max_len`` items.
    """
    shortfall = max_len - len(test_token_list)
    if shortfall > 0:
        return test_token_list + ['PAD'] * shortfall
    return test_token_list[:max_len]

def pad_time(time_array, max_len = 35):
    """Pad or truncate an array of [hour, minute] rows to ``max_len`` rows.

    Shorter arrays get rows of [25, 61] appended (values outside the valid
    hour and minute ranges); arrays with ``max_len`` rows or more are cut
    down to their first ``max_len`` rows.
    """
    missing_rows = max_len - len(time_array)
    if missing_rows <= 0:
        return time_array[:max_len]
    filler = np.array([[25, 61]] * missing_rows)
    return np.concatenate((time_array, filler), axis=0)

def map_tokens(test_list, all_tokens):
    """Translate each token in ``test_list`` to its id from ``all_tokens``.

    Returns an integer numpy array in the same order as the input. A token
    missing from ``all_tokens`` raises KeyError.
    """
    token_ids = list(map(all_tokens.__getitem__, test_list))
    return np.array(token_ids).astype(int)

In [26]:
# Load both label files, stack them row-wise, and order the rows by server
# ("sn") and then by fault time.
path_label_s = '../dataset/raw_dataset/preliminary_train_label_dataset_s.csv'
path_label = '../dataset/raw_dataset/preliminary_train_label_dataset.csv'
df_label_s, df_label = (get_dataset(p) for p in (path_label_s, path_label))
df_total_label = pd.concat([df_label_s, df_label], axis=0).sort_values(by=["sn", "fault_time"])
df_total_label

Unnamed: 0,sn,fault_time,label
3061,SERVER_10001,2020-05-01 10:04:00,1
9088,SERVER_10003,2020-03-28 09:48:00,2
8488,SERVER_10008,2020-02-25 16:12:00,1
8371,SERVER_10008,2020-03-11 18:04:00,2
8359,SERVER_10009,2020-05-08 16:37:00,3
...,...,...,...
8949,SERVER_9991,2020-08-04 22:49:00,2
9035,SERVER_9991,2020-10-07 18:42:00,2
9090,SERVER_9993,2020-05-14 23:50:00,2
3062,SERVER_9998,2020-05-29 11:25:00,2


In [27]:
# Load the log dataset that has matching labels. The unlabeled log file
# ('../dataset/raw_dataset/additional_sel_log_dataset.csv') is not loaded in
# this cell.
path_with_label = '../dataset/raw_dataset/preliminary_sel_log_dataset.csv'
df_with_label = get_dataset(path_with_label)
# The shape is printed twice on purpose to keep the cell output unchanged; the
# second print used to follow a column drop that is currently disabled.
print(df_with_label.shape)
print(df_with_label.shape)
# Work on a copy, newest log entries first.
df_total = df_with_label.copy().sort_values(by='time', ascending=False)
df_total

(482536, 4)
(482536, 4)


Unnamed: 0,sn,time,msg,server_model
261330,SERVER_20339,2020-11-25 23:21:06,System Boot Initiated BIOS_Boot_Up | Initiate...,SM35
241259,SERVER_20339,2020-11-25 23:11:23,System Boot Initiated BIOS_Boot_Up | Initiate...,SM35
291022,SERVER_20339,2020-11-25 23:10:19,Management Subsystem Health System_Health | S...,SM35
267210,SERVER_20339,2020-11-25 23:10:19,Management Subsys Health System_Health | Sens...,SM35
257969,SERVER_20339,2020-11-25 23:10:08,System ACPI Power State ACPI_PWR_Status | S0/...,SM35
...,...,...,...,...
330791,SERVER_10657,2019-12-27 23:39:00,Memory CPU1C0_DIMM_Stat | Correctable ECC | A...,SM54
329200,SERVER_10657,2019-12-27 23:38:46,Memory CPU1C0_DIMM_Stat | Correctable ECC | A...,SM54
329772,SERVER_10657,2019-12-27 23:38:33,Memory CPU1C0_DIMM_Stat | Correctable ECC | A...,SM54
328692,SERVER_10657,2019-12-27 23:38:19,Memory CPU1C0_DIMM_Stat | Correctable ECC | A...,SM54


In [28]:
# Tokenize every message at command level and at word level, then keep only
# the combined list (command tokens followed by word tokens) per log row.
df_total["command_tokens"] = df_total["msg"].apply(split_msg)
df_total["word_tokens"] = df_total["command_tokens"].apply(split_word)
df_total["all_tokens"] = df_total["command_tokens"] + df_total["word_tokens"]
df_total = df_total.drop(columns=['msg', 'command_tokens', 'word_tokens'])
df_total

Unnamed: 0,sn,time,server_model,all_tokens
261330,SERVER_20339,2020-11-25 23:21:06,SM35,"[System Boot Initiated BIOS_Boot_Up, Initiated..."
241259,SERVER_20339,2020-11-25 23:11:23,SM35,"[System Boot Initiated BIOS_Boot_Up, Initiated..."
291022,SERVER_20339,2020-11-25 23:10:19,SM35,"[Management Subsystem Health System_Health, Se..."
267210,SERVER_20339,2020-11-25 23:10:19,SM35,"[Management Subsys Health System_Health, Senso..."
257969,SERVER_20339,2020-11-25 23:10:08,SM35,"[System ACPI Power State ACPI_PWR_Status, S0/G..."
...,...,...,...,...
330791,SERVER_10657,2019-12-27 23:39:00,SM54,"[Memory CPU1C0_DIMM_Stat, Correctable ECC, Ass..."
329200,SERVER_10657,2019-12-27 23:38:46,SM54,"[Memory CPU1C0_DIMM_Stat, Correctable ECC, Ass..."
329772,SERVER_10657,2019-12-27 23:38:33,SM54,"[Memory CPU1C0_DIMM_Stat, Correctable ECC, Ass..."
328692,SERVER_10657,2019-12-27 23:38:19,SM54,"[Memory CPU1C0_DIMM_Stat, Correctable ECC, Ass..."


In [29]:
# Aggregate the log rows per server ("sn"): one array of [hour, minute] rows,
# one flat token list, and one server_model value per sn.
df_grouped_time = df_total.groupby("sn")["time"].apply(groupby_time_list).to_frame()
df_grouped_time = df_grouped_time.reset_index()
df_grouped_all_tokens = df_total.groupby("sn")["all_tokens"].apply(groupby_token_list).to_frame()
df_grouped_all_tokens = df_grouped_all_tokens.reset_index()
df_grouped_model = df_total.groupby("sn")["server_model"].apply(groupby_model_list).to_frame()
df_grouped_model = df_grouped_model.reset_index()

# Join the three per-sn frames into a single feature table.
final_df = pd.merge(left=df_grouped_time, right=df_grouped_all_tokens, on="sn", how="inner")
final_df = pd.merge(left=final_df, right=df_grouped_model, on="sn", how="inner")

# Bring every token sequence to a fixed length, then replace tokens by ids.
final_df["all_tokens"] = final_df["all_tokens"].apply(lambda x: pad_token(x, max_len=321))
# Context manager closes the dictionary file as soon as it has been read.
with open("../dataset/dict/all_tokens.json", "r") as token_file:
    all_tokens = json.load(token_file)
# NOTE(review): 91798 is a hard-coded id for the padding token; presumably it
# lies outside the ids stored in all_tokens.json - confirm it cannot collide.
all_tokens["PAD"] = 91798
final_df["all_tokens"] = final_df["all_tokens"].apply(lambda x: map_tokens(x, all_tokens))

# Replace the server model name by its id from the saved dictionary.
with open("../dataset/dict/server_model_dict.json", "r") as model_file:
    server_model_dict = json.load(model_file)
final_df["server_model"] = final_df["server_model"].apply(lambda x: server_model_dict[x])

# Bring every time sequence to a fixed number of rows and save the result.
final_df["time"] = final_df["time"].apply(lambda x: pad_time(x, max_len=35))
final_df.to_json("../dataset/pre_process/final_df.json", orient="index")
final_df

Unnamed: 0,sn,time,all_tokens,server_model
0,SERVER_10001,"[[9, 5], [9, 0], [9, 0], [8, 59], [8, 59], [8,...","[45083, 45325, 45085, 0, 1, 2, 3, 2, 274, 278,...",51
1,SERVER_10003,"[[9, 48], [9, 48], [9, 48], [9, 48], [9, 48], ...","[45465, 45104, 45085, 45280, 420, 39, 40, 4508...",51
2,SERVER_10008,"[[16, 47], [16, 46], [16, 46], [1, 52], [15, 5...","[45086, 45118, 45085, 45287, 7, 45118, 45085, ...",20
3,SERVER_10009,"[[16, 7], [16, 7], [16, 7], [16, 7], [25, 61],...","[45938, 45172, 45090, 51, 52, 69, 70, 758, 51,...",20
4,SERVER_10012,"[[3, 2], [3, 2], [3, 2], [3, 2], [25, 61], [25...","[46000, 45172, 45090, 51, 52, 69, 70, 794, 51,...",20
...,...,...,...,...
13700,SERVER_999,"[[19, 8], [19, 8], [19, 6], [19, 6], [25, 61],...","[45280, 45293, 45085, 45280, 39, 40, 254, 4508...",50
13701,SERVER_9991,"[[18, 14], [22, 46], [19, 52], [25, 61], [25, ...","[45335, 45104, 45085, 45280, 292, 39, 40, 4508...",33
13702,SERVER_9993,"[[23, 48], [23, 43], [25, 61], [25, 61], [25, ...","[45459, 45104, 45085, 45280, 414, 39, 40, 4508...",51
13703,SERVER_9998,"[[11, 19], [11, 4], [25, 61], [25, 61], [25, 6...","[45477, 45104, 45085, 45280, 430, 39, 40, 4508...",51


In [30]:
# Attach the per-sn features to every label row (left join keeps all labels,
# including those whose sn has no features), then save the joined table.
dataset_laji = df_total_label.merge(final_df, how="left", on="sn")
dataset_laji.to_json("../dataset/pre_process/dataset_laji.json", orient="index")
dataset_laji

Unnamed: 0,sn,fault_time,label,time,all_tokens,server_model
0,SERVER_10001,2020-05-01 10:04:00,1,"[[9, 5], [9, 0], [9, 0], [8, 59], [8, 59], [8,...","[45083, 45325, 45085, 0, 1, 2, 3, 2, 274, 278,...",51
1,SERVER_10003,2020-03-28 09:48:00,2,"[[9, 48], [9, 48], [9, 48], [9, 48], [9, 48], ...","[45465, 45104, 45085, 45280, 420, 39, 40, 4508...",51
2,SERVER_10008,2020-02-25 16:12:00,1,"[[16, 47], [16, 46], [16, 46], [1, 52], [15, 5...","[45086, 45118, 45085, 45287, 7, 45118, 45085, ...",20
3,SERVER_10008,2020-03-11 18:04:00,2,"[[16, 47], [16, 46], [16, 46], [1, 52], [15, 5...","[45086, 45118, 45085, 45287, 7, 45118, 45085, ...",20
4,SERVER_10009,2020-05-08 16:37:00,3,"[[16, 7], [16, 7], [16, 7], [16, 7], [25, 61],...","[45938, 45172, 45090, 51, 52, 69, 70, 758, 51,...",20
...,...,...,...,...,...,...
16664,SERVER_9991,2020-08-04 22:49:00,2,"[[18, 14], [22, 46], [19, 52], [25, 61], [25, ...","[45335, 45104, 45085, 45280, 292, 39, 40, 4508...",33
16665,SERVER_9991,2020-10-07 18:42:00,2,"[[18, 14], [22, 46], [19, 52], [25, 61], [25, ...","[45335, 45104, 45085, 45280, 292, 39, 40, 4508...",33
16666,SERVER_9993,2020-05-14 23:50:00,2,"[[23, 48], [23, 43], [25, 61], [25, 61], [25, ...","[45459, 45104, 45085, 45280, 414, 39, 40, 4508...",51
16667,SERVER_9998,2020-05-29 11:25:00,2,"[[11, 19], [11, 4], [25, 61], [25, 61], [25, 6...","[45477, 45104, 45085, 45280, 430, 39, 40, 4508...",51
