# 待做事项
1. 特征工程构造数据集，处理字典中不存在的值（未登录 token）
2. 载入模型
3. 预测打标签

In [1]:
# 载入依赖
import pandas as pd
import ujson as json
import numpy as np
from datetime import datetime

In [21]:
# Load a raw dataset from disk.
def get_dataset(path):
    """Read the GBK-encoded CSV at ``path`` and return it as a DataFrame."""
    return pd.read_csv(path, encoding="gbk")

# Command-level split.
def split_msg(msg):
    """Split ``msg`` on the ``" | "`` separator.

    Leading spaces are removed from the first segment only; all other
    segments are returned exactly as they appear between separators.
    """
    first, *others = msg.split(" | ")
    return [first.lstrip(" "), *others]

# Word-level split.
def split_word(msgs):
    """Split every string in ``msgs`` on single spaces.

    Returns one flat list holding the pieces of all strings, in order.
    """
    return [piece for segment in msgs for piece in segment.split(" ")]

# Build the token -> id dictionary from command-level and word-level tokens.
def get_all_tokens(df_total, path="../dataset/dict/all_tokens.json"):
    """Build the token vocabulary, save it as JSON and return it.

    Args:
        df_total: DataFrame whose ``command_tokens`` and ``word_tokens``
            columns each hold an iterable of tokens per row.
        path: File the vocabulary is written to. Defaults to the location
            that used to be hard-coded in this function.

    Returns:
        dict mapping every distinct token to an integer id.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order.
    word_vocab = dict.fromkeys(
        token for row in df_total["word_tokens"].values for token in row
    )
    command_vocab = dict.fromkeys(
        token for row in df_total["command_tokens"].values for token in row
    )
    # Word-level tokens are numbered first, then command-level ones. A token
    # that occurs at both levels ends up with its later (command-level)
    # position, so ids are unique but not necessarily contiguous. This
    # numbering is kept unchanged so previously saved dictionaries stay valid.
    list_tokens = list(word_vocab) + list(command_vocab)
    dict_total = {token: idx for idx, token in enumerate(list_tokens)}
    # Write through a context manager so the file is flushed and closed
    # instead of leaking an open handle.
    with open(path, "w") as vocab_file:
        json.dump(dict_total, vocab_file)
    return dict_total
# Merge all token lists that belong to the same sn into one list.
def groupby_token_list(lists):
    """Concatenate the token lists in ``lists`` into one flat list, in order."""
    merged = []
    for tokens in lists:
        merged.extend(tokens)
    return merged

# Extract the time-of-day features from a timestamp string.
def get_timeseq(time_str):
    """Parse a ``%Y/%m/%d %H:%M`` timestamp and return ``[hour, minute]``."""
    parsed = datetime.strptime(time_str, "%Y/%m/%d %H:%M")
    return [parsed.hour, parsed.minute]

# Merge all timestamps that belong to the same sn into one array.
def groupby_time_list(lists):
    """Convert each timestamp string in ``lists`` with ``get_timeseq``.

    Returns a NumPy array built from the resulting ``[hour, minute]`` pairs,
    one row per timestamp.
    """
    return np.array(list(map(get_timeseq, lists)))

# Pick the server_model that represents one sn.
def groupby_model_list(lists):
    """Return the first element of ``lists`` in iteration order.

    The values are materialized into a list first, so the lookup is
    positional even when ``lists`` carries its own index labels.
    """
    return list(lists)[0]

# Pad or truncate a token list to a fixed length.
def pad_token(test_token_list, max_len=321):
    """Return a list of exactly ``max_len`` tokens.

    Shorter lists are right-padded with the string ``'PAD'``; longer lists
    are cut off after ``max_len`` tokens.
    """
    shortfall = max_len - len(test_token_list)
    if shortfall > 0:
        return test_token_list + ['PAD'] * shortfall
    return test_token_list[:max_len]

# Pad or truncate a time array to a fixed number of rows.
def pad_time(time_array, max_len=35):
    """Return ``time_array`` with exactly ``max_len`` rows.

    Shorter arrays get ``[25, 61]`` rows appended along axis 0; longer
    arrays keep only their first ``max_len`` rows.
    """
    missing = max_len - len(time_array)
    if missing <= 0:
        return time_array[:max_len]
    # 25 and 61 lie outside the hour/minute ranges; presumably chosen as a
    # padding sentinel.
    filler = np.array([[25, 61]] * missing)
    return np.concatenate((time_array, filler), axis=0)

# Map every token to its id using the vocabulary dictionary.
def map_tokens(test_list, all_tokens, unknown_id=91799):
    """Translate tokens into integer ids.

    Args:
        test_list: Iterable of tokens to encode.
        all_tokens: dict mapping token -> id.
        unknown_id: Id used for tokens missing from ``all_tokens``. Defaults
            to 91799, the value that used to be hard-coded here.

    Returns:
        NumPy integer array with one id per input token.
    """
    # A lookup with a default handles only the "token not in vocabulary"
    # case; the previous bare ``except`` also hid unrelated errors and
    # interrupts.
    ids = [all_tokens.get(token, unknown_id) for token in test_list]
    return np.array(ids).astype(int)

In [14]:
# Load the raw log dataset and continue on a copy, leaving df_log_a as loaded.
path_log_a = '../dataset/raw_dataset/preliminary_sel_log_dataset_a.csv'
df_log_a =  get_dataset(path_log_a)
df_total = df_log_a.copy()
# Bare expression: shown as the cell output.
df_total

Unnamed: 0,sn,time,msg,server_model
0,000d33b21436,2020/9/2 11:38,System Boot Initiated BIOS_Boot_Up | Initiate...,SM40
1,000d33b21436,2020/9/2 15:46,System Boot Initiated BIOS_Boot_Up | Initiate...,SM40
2,005c5a9218ba,2020/6/28 18:26,Memory Memory_Status | Correctable ECC | Asse...,SM99
3,005c5a9218ba,2020/6/28 18:40,System ACPI Power State #0x7d | S0/G0: workin...,SM99
4,005c5a9218ba,2020/6/28 18:26,Memory Memory_Status | Correctable ECC | Asse...,SM99
...,...,...,...,...
10986,fffd22fffe19,2020/1/21 19:16,Microcontroller/Coprocessor #0x16 | Transitio...,SM16
10987,fffd22fffe19,2020/1/21 19:17,System Event #0x10 | Timestamp Clock Sync | A...,SM16
10988,fffd22fffe19,2020/1/21 18:32,Memory #0xf9 | Uncorrectable ECC | Asserted,SM16
10989,fffd22fffe19,2020/1/21 19:18,System Boot Initiated BIOS_Boot_Up | Initiate...,SM16


In [23]:
# Load the submission template; the output below shows columns sn and fault_time.
path_label_a = '../dataset/raw_dataset/preliminary_submit_dataset_a.csv'
df_label_a =  get_dataset(path_label_a)
# Bare expression: shown as the cell output.
df_label_a

Unnamed: 0,sn,fault_time
0,000d33b21436,2020/9/2 16:42
1,005c5a9218ba,2020/6/28 19:05
2,0079283bde6e,2020/4/26 21:32
3,007bdf23b62f,2020/6/16 18:40
4,00a577a8e54f,2020/4/7 7:16
...,...,...
3006,ffbf46b4af21,2019/12/28 20:10
3007,ffc229b6cd9a,2020/6/27 2:39
3008,ffd44698a52b,2020/1/21 15:46
3009,fff73a9e5bd5,2020/3/1 22:43


In [15]:
# Tokenize every message at command level, then split those commands into words.
df_total["command_tokens"] = df_total["msg"].apply(split_msg)
df_total["word_tokens"] = df_total["command_tokens"].apply(split_word)
# Row-wise list concatenation: command-level tokens first, word-level tokens after.
df_total["all_tokens"] = df_total["command_tokens"] + df_total["word_tokens"]
# Only the combined token column is needed from here on.
df_total = df_total.drop(columns=['msg', 'command_tokens', 'word_tokens'])
df_total

Unnamed: 0,sn,time,server_model,all_tokens
0,000d33b21436,2020/9/2 11:38,SM40,"[System Boot Initiated BIOS_Boot_Up, Initiated..."
1,000d33b21436,2020/9/2 15:46,SM40,"[System Boot Initiated BIOS_Boot_Up, Initiated..."
2,005c5a9218ba,2020/6/28 18:26,SM99,"[Memory Memory_Status, Correctable ECC, Assert..."
3,005c5a9218ba,2020/6/28 18:40,SM99,"[System ACPI Power State #0x7d, S0/G0: working..."
4,005c5a9218ba,2020/6/28 18:26,SM99,"[Memory Memory_Status, Correctable ECC, Assert..."
...,...,...,...,...
10986,fffd22fffe19,2020/1/21 19:16,SM16,"[Microcontroller/Coprocessor #0x16, Transition..."
10987,fffd22fffe19,2020/1/21 19:17,SM16,"[System Event #0x10, Timestamp Clock Sync, Ass..."
10988,fffd22fffe19,2020/1/21 18:32,SM16,"[Memory #0xf9, Uncorrectable ECC, Asserted, Me..."
10989,fffd22fffe19,2020/1/21 19:18,SM16,"[System Boot Initiated BIOS_Boot_Up, Initiated..."


In [22]:
# Aggregate the per-row features into one row per sn.
df_grouped_time = df_total.groupby("sn")["time"].apply(groupby_time_list).to_frame()
df_grouped_time = df_grouped_time.reset_index()
df_grouped_all_tokens = df_total.groupby("sn")["all_tokens"].apply(groupby_token_list).to_frame()
df_grouped_all_tokens = df_grouped_all_tokens.reset_index()
df_grouped_model = df_total.groupby("sn")["server_model"].apply(groupby_model_list).to_frame()
df_grouped_model = df_grouped_model.reset_index()

# Join the three aggregates back together on sn.
final_df = pd.merge(left=df_grouped_time, right=df_grouped_all_tokens, on="sn", how="inner")
final_df = pd.merge(left=final_df, right=df_grouped_model, on="sn", how="inner")

# Pad / truncate every token list to 321 entries (matches the model's
# token_input width), then map the tokens to ids.
final_df["all_tokens"] = final_df["all_tokens"].apply(lambda x: pad_token(x, max_len=321))
# Load through a context manager so the file handle is closed after reading.
with open("../dataset/dict/all_tokens.json", "r") as dict_file:
    all_tokens = json.load(dict_file)
# NOTE(review): the PAD id is hard-coded; confirm it matches the id used in training.
all_tokens["PAD"] = 91798
final_df["all_tokens"] = final_df["all_tokens"].apply(lambda x: map_tokens(x, all_tokens))

# Encode server_model with its saved dictionary; a model name missing from
# the dictionary raises KeyError here.
with open("../dataset/dict/server_model_dict.json", "r") as dict_file:
    server_model_dict = json.load(dict_file)
final_df["server_model"] = final_df["server_model"].apply(lambda x: server_model_dict[x])

# Pad / truncate every time array to 35 rows (matches the model's time inputs).
final_df["time"] = final_df["time"].apply(lambda x: pad_time(x, max_len=35))
final_df

Unnamed: 0,sn,time,all_tokens,server_model
0,000d33b21436,"[[11, 38], [15, 46], [25, 61], [25, 61], [25, ...","[45083, 45325, 45085, 0, 1, 2, 3, 2, 274, 278,...",85
1,005c5a9218ba,"[[18, 26], [18, 40], [18, 26], [18, 38], [18, ...","[45566, 45104, 45085, 45280, 511, 39, 40, 4508...",76
2,0079283bde6e,"[[20, 54], [25, 61], [25, 61], [25, 61], [25, ...","[45269, 45131, 45085, 32, 62, 228, 73, 50, 450...",49
3,007bdf23b62f,"[[17, 11], [17, 13], [17, 10], [17, 16], [17, ...","[45310, 45104, 45085, 45280, 267, 39, 40, 4508...",74
4,00a577a8e54f,"[[6, 46], [6, 52], [6, 43], [6, 45], [6, 43], ...","[45412, 45092, 45085, 0, 1, 2, 0, 18, 0, 18, 4...",44
...,...,...,...,...
2878,ffbf46b4af21,"[[20, 9], [20, 9], [20, 8], [25, 61], [25, 61]...","[45460, 45104, 45085, 45280, 415, 39, 40, 4508...",23
2879,ffc229b6cd9a,"[[2, 29], [2, 36], [25, 61], [25, 61], [25, 61...","[45310, 45104, 45085, 45280, 267, 39, 40, 4508...",29
2880,ffd44698a52b,"[[13, 57], [13, 57], [25, 61], [25, 61], [25, ...","[46142, 45107, 45085, 0, 31, 32, 4, 856, 43, 4...",57
2881,fff73a9e5bd5,"[[13, 50], [13, 50], [13, 18], [13, 49], [13, ...","[46659, 45104, 45085, 45280, 1294, 39, 40, 450...",28


In [24]:
# Attach the features to the submission rows; the left join keeps every row of df_label_a.
dataset_laji = pd.merge(left=df_label_a, right=final_df, on="sn", how="left")
# Persist the pre-processed dataset so the prediction part can reload it.
dataset_laji.to_json("../dataset/pre_process/submit.json", orient="index")
dataset_laji

Unnamed: 0,sn,fault_time,time,all_tokens,server_model
0,000d33b21436,2020/9/2 16:42,"[[11, 38], [15, 46], [25, 61], [25, 61], [25, ...","[45083, 45325, 45085, 0, 1, 2, 3, 2, 274, 278,...",85
1,005c5a9218ba,2020/6/28 19:05,"[[18, 26], [18, 40], [18, 26], [18, 38], [18, ...","[45566, 45104, 45085, 45280, 511, 39, 40, 4508...",76
2,0079283bde6e,2020/4/26 21:32,"[[20, 54], [25, 61], [25, 61], [25, 61], [25, ...","[45269, 45131, 45085, 32, 62, 228, 73, 50, 450...",49
3,007bdf23b62f,2020/6/16 18:40,"[[17, 11], [17, 13], [17, 10], [17, 16], [17, ...","[45310, 45104, 45085, 45280, 267, 39, 40, 4508...",74
4,00a577a8e54f,2020/4/7 7:16,"[[6, 46], [6, 52], [6, 43], [6, 45], [6, 43], ...","[45412, 45092, 45085, 0, 1, 2, 0, 18, 0, 18, 4...",44
...,...,...,...,...,...
3006,ffbf46b4af21,2019/12/28 20:10,"[[20, 9], [20, 9], [20, 8], [25, 61], [25, 61]...","[45460, 45104, 45085, 45280, 415, 39, 40, 4508...",23
3007,ffc229b6cd9a,2020/6/27 2:39,"[[2, 29], [2, 36], [25, 61], [25, 61], [25, 61...","[45310, 45104, 45085, 45280, 267, 39, 40, 4508...",29
3008,ffd44698a52b,2020/1/21 15:46,"[[13, 57], [13, 57], [25, 61], [25, 61], [25, ...","[46142, 45107, 45085, 0, 31, 32, 4, 856, 43, 4...",57
3009,fff73a9e5bd5,2020/3/1 22:43,"[[13, 50], [13, 50], [13, 18], [13, 49], [13, ...","[46659, 45104, 45085, 45280, 1294, 39, 40, 450...",28


In [26]:
# Count the null values in each column to check whether any are present.
dataset_laji.isnull().sum()

sn              0
fault_time      0
time            0
all_tokens      0
server_model    0
dtype: int64

# 2. 载入模型进行预测
1. 载入模型
2. 进行预测
3. 对标签值进行处理
4. 给原始数据打上标签


In [27]:
# 载入依赖
from keras.models import load_model


In [28]:
# Load the saved Keras model from disk.
path_model = "../models/Embedding_v1.h5"
model = load_model(path_model)

In [29]:
# Print the layer-by-layer summary of the loaded model.
model.summary()

Model: "model_23"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 token_input (InputLayer)       [(None, 321)]        0           []                               
                                                                                                  
 server_input (InputLayer)      [(None, 1)]          0           []                               
                                                                                                  
 time_hour_input (InputLayer)   [(None, 35)]         0           []                               
                                                                                                  
 time_min_input (InputLayer)    [(None, 35)]         0           []                               
                                                                                           

In [30]:
# Reload the pre-processed dataset written earlier.
# In the output below, fault_time comes back as full timestamps
# (e.g. 2020-09-02 16:42:00) rather than the original strings.
raw_dataset = pd.read_json("../dataset/pre_process/submit.json", orient="index")
raw_dataset

Unnamed: 0,sn,fault_time,time,all_tokens,server_model
0,000d33b21436,2020-09-02 16:42:00,"[[11, 38], [15, 46], [25, 61], [25, 61], [25, ...","[45083, 45325, 45085, 0, 1, 2, 3, 2, 274, 278,...",85
1,005c5a9218ba,2020-06-28 19:05:00,"[[18, 26], [18, 40], [18, 26], [18, 38], [18, ...","[45566, 45104, 45085, 45280, 511, 39, 40, 4508...",76
2,0079283bde6e,2020-04-26 21:32:00,"[[20, 54], [25, 61], [25, 61], [25, 61], [25, ...","[45269, 45131, 45085, 32, 62, 228, 73, 50, 450...",49
3,007bdf23b62f,2020-06-16 18:40:00,"[[17, 11], [17, 13], [17, 10], [17, 16], [17, ...","[45310, 45104, 45085, 45280, 267, 39, 40, 4508...",74
4,00a577a8e54f,2020-04-07 07:16:00,"[[6, 46], [6, 52], [6, 43], [6, 45], [6, 43], ...","[45412, 45092, 45085, 0, 1, 2, 0, 18, 0, 18, 4...",44
...,...,...,...,...,...
3006,ffbf46b4af21,2019-12-28 20:10:00,"[[20, 9], [20, 9], [20, 8], [25, 61], [25, 61]...","[45460, 45104, 45085, 45280, 415, 39, 40, 4508...",23
3007,ffc229b6cd9a,2020-06-27 02:39:00,"[[2, 29], [2, 36], [25, 61], [25, 61], [25, 61...","[45310, 45104, 45085, 45280, 267, 39, 40, 4508...",29
3008,ffd44698a52b,2020-01-21 15:46:00,"[[13, 57], [13, 57], [25, 61], [25, 61], [25, ...","[46142, 45107, 45085, 0, 31, 32, 4, 856, 43, 4...",57
3009,fff73a9e5bd5,2020-03-01 22:43:00,"[[13, 50], [13, 50], [13, 18], [13, 49], [13, ...","[46659, 45104, 45085, 45280, 1294, 39, 40, 450...",28


In [45]:
# Split the feature columns into the arrays handed to the model.
# (Despite the name, X_train holds the data to predict on.)
X_train = raw_dataset[["time", "all_tokens", "server_model"]]
# Column 0 is "time": one sequence of [hour, minute] pairs per row.
X_train_times = np.array([list(steps) for steps in X_train.values[:, 0]])
# Column 1 is "all_tokens": one sequence of token ids per row.
X_train_tokens = np.array([list(ids) for ids in X_train.values[:, 1]])
# Column 2 is "server_model": one encoded value per row.
X_train_model = X_train.values[:, 2]

In [48]:
# Run the model. Inputs are passed in the order hours, minutes, tokens,
# server model, each cast to float64.
# NOTE(review): confirm this order matches the model's declared input order.
result = model.predict(
    x=(
        X_train_times[:, :, 0].astype("float64"),
        X_train_times[:, :, 1].astype("float64"),
        X_train_tokens.astype("float64"),
        X_train_model.astype("float64"),
    ),
    verbose=1,
)



In [69]:
# Turn each prediction row into its label: the index of the largest score.
# np.argmax always yields a single index (the first one when several scores
# tie), whereas int(np.where(x == max(x))[0]) fails on a tie because it is
# handed a multi-element array.
result_int = [[int(np.argmax(scores))] for scores in result]
result_int

[[2],
 [2],
 [3],
 [2],
 [2],
 [2],
 [1],
 [2],
 [2],
 [1],
 [2],
 [2],
 [2],
 [1],
 [3],
 [3],
 [2],
 [3],
 [2],
 [1],
 [2],
 [2],
 [1],
 [2],
 [1],
 [2],
 [1],
 [3],
 [2],
 [2],
 [1],
 [1],
 [2],
 [2],
 [2],
 [1],
 [1],
 [2],
 [2],
 [2],
 [1],
 [2],
 [1],
 [1],
 [1],
 [2],
 [2],
 [2],
 [1],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [3],
 [1],
 [3],
 [2],
 [2],
 [2],
 [1],
 [2],
 [3],
 [1],
 [2],
 [2],
 [2],
 [3],
 [1],
 [1],
 [1],
 [1],
 [2],
 [1],
 [2],
 [3],
 [1],
 [3],
 [3],
 [2],
 [3],
 [1],
 [3],
 [2],
 [2],
 [2],
 [1],
 [2],
 [3],
 [3],
 [2],
 [2],
 [3],
 [1],
 [3],
 [2],
 [3],
 [2],
 [2],
 [2],
 [2],
 [2],
 [1],
 [1],
 [1],
 [2],
 [2],
 [2],
 [2],
 [1],
 [3],
 [3],
 [2],
 [2],
 [1],
 [1],
 [2],
 [1],
 [2],
 [3],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [0],
 [1],
 [1],
 [2],
 [3],
 [1],
 [2],
 [3],
 [2],
 [1],
 [2],
 [2],
 [2],
 [2],
 [1],
 [2],
 [1],
 [2],
 [2],
 [1],
 [1],
 [1],
 [3],
 [2],
 [3],
 [3],
 [1],
 [1],
 [2],
 [1],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2],
 [2]

In [75]:
# Put the predicted labels into a single-column frame.
df_label = pd.DataFrame(result_int, columns=["label"])
# Column-wise concat aligns rows on the index.
# NOTE(review): assumes the predictions are in the same row order as df_label_a — confirm.
final_result = pd.concat([df_label_a, df_label], axis=1) 
# Write the labelled submission file, GBK-encoded and without the index column.
final_result.to_csv("../dataset/result/preliminary_submit_dataset_a.csv", encoding = "gbk", index=False)
final_result

Unnamed: 0,sn,fault_time,label
0,000d33b21436,2020/9/2 16:42,2
1,005c5a9218ba,2020/6/28 19:05,2
2,0079283bde6e,2020/4/26 21:32,3
3,007bdf23b62f,2020/6/16 18:40,2
4,00a577a8e54f,2020/4/7 7:16,2
...,...,...,...
3006,ffbf46b4af21,2019/12/28 20:10,2
3007,ffc229b6cd9a,2020/6/27 2:39,2
3008,ffd44698a52b,2020/1/21 15:46,2
3009,fff73a9e5bd5,2020/3/1 22:43,2
