# 1. 引入包

In [10]:
import os
import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.ensemble import RandomForestClassifier

# 2. 读取数据

In [11]:
# Load the SEL log and order it by server serial number, then by timestamp,
# renumbering the rows 0..n-1 afterwards.
sel_data = (
    pd.read_csv('../dataset/raw_dataset/preliminary_sel_log_dataset.csv')
    .sort_values(by=['sn', 'time'])
    .reset_index(drop=True)
)
sel_data

Unnamed: 0,sn,time,msg,server_model
0,SERVER_10001,2020-05-01 08:54:43,Processor CPU1_Status | IERR | Asserted,SM57
1,SERVER_10001,2020-05-01 08:54:43,Processor CPU0_Status | IERR | Asserted,SM57
2,SERVER_10001,2020-05-01 08:55:03,Management Subsys Health System_Health | Sens...,SM57
3,SERVER_10001,2020-05-01 08:59:48,Processor CPU0_Status | IERR | Deasserted,SM57
4,SERVER_10001,2020-05-01 08:59:48,Processor CPU1_Status | IERR | Deasserted,SM57
...,...,...,...,...
482531,SERVER_9999,2020-10-13 02:56:48,Processor CPU1_Status | Configuration Error |...,SM57
482532,SERVER_9999,2020-10-13 02:56:56,Memory CPU1C0_DIMM_Stat | Uncorrectable ECC |...,SM57
482533,SERVER_9999,2020-10-13 02:56:57,Processor CPU1_Status | Configuration Error |...,SM57
482534,SERVER_9999,2020-10-13 02:57:03,Memory CPU1C0_DIMM_Stat | Uncorrectable ECC |...,SM57


# 3. 分词

In [12]:
# Collect the last ten log messages of every server and join them into one document.
# A single groupby pass replaces the previous per-server boolean mask over the whole
# frame, which re-scanned every log row once per server.
# sort=False keeps servers in order of first appearance, and rows inside each group
# keep their order from sel_data, so tail(10) selects the same rows as before.
# NOTE(review): the '.' separator has no trailing space, so the tokenizer may not split
# neighbouring messages at the joining period -- confirm this is intended. It is kept
# as-is because it must match the separator used when inferring vectors later on.
last_msgs_by_sn = sel_data.groupby('sn', sort=False)['msg'].agg(
    lambda msgs: '.'.join(msgs.tail(10).to_list())
)
sn_list = last_msgs_by_sn.index.to_list()
tail_msg_list = last_msgs_by_sn.to_list()

# One lower-cased token list per server document.
tokenized_sent = [word_tokenize(s.lower()) for s in tail_msg_list]

# 4. 训练Embedding模型（Doc2Vec）

In [13]:
import logging

# Send gensim's INFO-level training progress to the cell output.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# Tag every tokenized server document with its position in tokenized_sent.
tagged_data = [
    TaggedDocument(words=tokens, tags=[doc_id])
    for doc_id, tokens in enumerate(tokenized_sent)
]

# Passing the corpus to the constructor builds the vocabulary and trains immediately.
model = Doc2Vec(
    tagged_data,
    vector_size=10,
    window=5,
    min_count=1,
    epochs=100,
)

2022-03-10 21:09:21,391 : INFO : collecting all words and their counts
2022-03-10 21:09:21,392 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2022-03-10 21:09:21,444 : INFO : PROGRESS: at example #10000, processed 664621 words (12962321/s), 1121 word types, 0 tags
2022-03-10 21:09:21,464 : INFO : collected 1212 word types and 13705 unique tags from a corpus of 13705 examples and 908997 words
2022-03-10 21:09:21,465 : INFO : Creating a fresh vocabulary
2022-03-10 21:09:21,468 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=1 retains 1212 unique words (100.0%% of original 1212, drops 0)', 'datetime': '2022-03-10T21:09:21.468234', 'gensim': '4.1.2', 'python': '3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'prepare_vocab'}
2022-03-10 21:09:21,468 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 908997 word corpus (100.0%% of original 90

# 5. 构建树模型的训练集

In [14]:
# Load the fault labels, ordered by server serial number and fault time.
label = (
    pd.read_csv('../dataset/raw_dataset/preliminary_train_label_dataset.csv')
    .sort_values(by=['sn', 'fault_time'])
    .reset_index(drop=True)
)

# Split the log by server once, so each label row only filters that server's slice
# instead of building a boolean mask over the whole log for every label.
# Rows inside each group keep their order from sel_data.
logs_by_sn = {sn: logs for sn, logs in sel_data.groupby('sn', sort=False)}
no_logs = sel_data.iloc[0:0]  # used for servers that have no log rows at all

# One Doc2Vec vector per label row, inferred from the (up to) ten most recent messages
# logged for that server at or before the fault time.
# NOTE(review): 'time' and 'fault_time' are compared exactly as read from the CSVs --
# confirm both columns use the same timestamp format.
train_data = []
for sn, fault_time in zip(label['sn'], label['fault_time']):
    server_logs = logs_by_sn.get(sn, no_logs)
    recent_msgs = server_logs.loc[server_logs['time'] <= fault_time, 'msg'].tail(10)
    # Same '.' separator and lower-casing as the documents used to train the Doc2Vec model.
    document = '.'.join(recent_msgs).lower()
    train_data.append(model.infer_vector(word_tokenize(document)))

train_feature = np.array(train_data)
train_label = label['label'].values

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the labelled samples for evaluation.
# random_state is fixed so the split, and therefore the reported score, is the same
# on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    train_feature, train_label, test_size=0.25, random_state=42
)

In [None]:
# Fit a random forest on the Doc2Vec features.
# random_state is fixed so bootstrap sampling and feature selection, and hence the
# score below, are reproducible between runs.
rf = RandomForestClassifier(oob_score=True, random_state=42)
rf.fit(X_train, y_train)

# Mean accuracy on the held-out split; left as the last expression so the cell displays it.
score = rf.score(X_test, y_test)
score

In [None]:
# Disabled submission step: loads the submission rows and infers one Doc2Vec vector per
# row from the (up to) ten most recent messages at or before fault_time, mirroring the
# training-feature cell. Uncomment to produce `submit` and `test_feature`.
# NOTE(review): the join separator below is '. ' while the training-feature cell joins
# with '.'; align the two before enabling so both texts are tokenized the same way.
# submit = pd.read_csv('../dataset/raw_dataset/preliminary_submit_dataset.csv')
# submit.sort_values(by=['sn', 'fault_time'], inplace=True)
# submit.reset_index(drop=True, inplace=True)
# test_data = []
# for i, row in submit.iterrows():
#     test_data.append(model.infer_vector(word_tokenize('. '.join(sel_data[(sel_data['sn']==row['sn'])&(sel_data['time']<=row['fault_time'])].tail(10)['msg']).lower())))
# test_feature = np.array(test_data)

In [None]:
# Disabled submission step: predicts a label for every submission row with the fitted
# random forest and writes the result to ./preliminary_pred_df.csv without the index
# column. Needs `submit` and `test_feature` from the commented-out cell above.
# test_label = rf.predict(test_feature)
# submit['label'] = test_label
# submit.to_csv('./preliminary_pred_df.csv', index=0)