In [1]:
import pandas as pd
import numpy as np
import os
import logging
logger = logging.getLogger(__name__)
logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s',level=logging.INFO)
from datetime import datetime, timedelta
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score
import gc

# data_path
train_pre_sel_log_path1 = "data/preliminary_sel_log_dataset.csv"
train_pre_sel_log_path2 = "data/preliminary_sel_log_dataset_a.csv"
train_label_path1 = "data/preliminary_train_label_dataset.csv"
train_label_path2 = "data/preliminary_train_label_dataset_s.csv"
test_submit_path = "data/preliminary_submit_dataset_a.csv"

2022-03-25 01:34:12,733 [INFO] Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2022-03-25 01:34:12,734 [INFO] NumExpr defaulting to 8 threads.


### 1.标签数据EDA

In [2]:
logger.info("label EDA")
train_label1 = pd.read_csv(train_label_path1, parse_dates=["fault_time"])
train_label2 = pd.read_csv(train_label_path2, parse_dates=["fault_time"])
train_label = pd.concat([train_label1,train_label2 ])
train_label.reset_index(drop=True, inplace=True)


print(train_label.head())

print("训练集样本个数:{}".format(len(train_label)))
print("每一类样本数:")
print(train_label["label"].value_counts())
print("每个服务器出现次数:")
print(train_label["sn"].value_counts())

2022-03-25 01:34:12,933 [INFO] label EDA


             sn          fault_time  label
0  SERVER_25698 2020-10-09 13:43:00      0
1  SERVER_25699 2020-08-25 18:50:00      0
2  SERVER_25712 2020-03-16 13:20:00      0
3  SERVER_25708 2020-07-25 12:44:00      0
4  SERVER_25711 2020-03-16 16:51:00      0
训练集样本个数:16669
每一类样本数:
2    9343
1    3387
3    2463
0    1476
Name: label, dtype: int64
每个服务器出现次数:
SERVER_13884    8
SERVER_23459    8
SERVER_12517    8
SERVER_23217    7
SERVER_1490     6
               ..
SERVER_14959    1
SERVER_10667    1
SERVER_9358     1
SERVER_19058    1
SERVER_4655     1
Name: sn, Length: 13705, dtype: int64


### 2.基础SEL日志数据EDA

In [3]:
train_pre_sel_log1 = pd.read_csv(train_pre_sel_log_path1, parse_dates=["time"])
train_pre_sel_log2 = pd.read_csv(train_pre_sel_log_path2, parse_dates=["time"])
train_pre_sel_log = pd.concat([train_pre_sel_log1, train_pre_sel_log2])

print(train_pre_sel_log.head())
print("日志个数：{}".format(len(train_pre_sel_log)))
print("每个服务器产生的日志个数:{}".format(train_pre_sel_log["sn"].value_counts()))
print("最大日志时间：{}".format(max(train_pre_sel_log["time"])))
print("最小日志时间：{}".format(min(train_pre_sel_log["time"])))
print("每种日志的个数：{}".format(train_pre_sel_log["msg"].value_counts()))

             sn                time  \
0  SERVER_25698 2020-10-09 08:32:21   
1  SERVER_25698 2020-10-09 07:43:48   
2  SERVER_25698 2020-10-09 08:16:22   
3  SERVER_25698 2020-10-09 05:46:41   
4  SERVER_25698 2020-10-09 12:59:13   

                                                 msg server_model  
0   System Boot Initiated BIOS_Boot_Up | State As...          SM0  
1   System Boot Initiated BIOS_Boot_Up | State As...          SM0  
2   System Boot Initiated BIOS_Boot_Up | State As...          SM0  
3   System Boot Initiated BIOS_Boot_Up | State As...          SM0  
4   System Boot Initiated BIOS_Boot_Up | State As...          SM0  
日志个数：493527
每个服务器产生的日志个数:SERVER_26689    6958
SERVER_23498    6821
SERVER_9056     6049
SERVER_4846     5519
SERVER_22927    5495
                ... 
SERVER_5730        1
SERVER_19751       1
7ff77080a768       1
SERVER_288         1
SERVER_3557        1
Name: sn, Length: 16588, dtype: int64
最大日志时间：2020-11-25 23:21:06
最小日志时间：2019-12-27 23:38:05
每种日志的个数： 

In [4]:
test_submit = pd.read_csv(test_submit_path, parse_dates=["fault_time"])
test_submit["fault_time_label"] = test_submit["fault_time"].dt.floor(freq="5T")  #四舍五入 分和秒
print("测试集样本个数:{}".format(len(test_submit)))
print("测试集中每个服务器的个数：{}".format(test_submit["sn"].value_counts()))
print(test_submit.head())

测试集样本个数:3011
测试集中每个服务器的个数：a49f03aa6bd9    3
f8097ee49558    3
eec3d17b0128    3
330d7e1153c4    3
16a8dff8cc19    3
               ..
26d148de95f7    1
19ef7f0fffcb    1
7781e5903e1c    1
4599ffc8c626    1
b182964bf2a1    1
Name: sn, Length: 2883, dtype: int64
             sn          fault_time    fault_time_label
0  000d33b21436 2020-09-02 16:42:54 2020-09-02 16:40:00
1  005c5a9218ba 2020-06-28 19:05:16 2020-06-28 19:05:00
2  0079283bde6e 2020-04-26 21:32:44 2020-04-26 21:30:00
3  007bdf23b62f 2020-06-16 18:40:39 2020-06-16 18:40:00
4  00a577a8e54f 2020-04-07 07:16:55 2020-04-07 07:15:00


### 3.数据处理

In [5]:
train_pre_sel_log["time_label"] = train_pre_sel_log["time"].dt.floor(freq="5T")  #四舍五入 分和秒
train_pre_sel_log = train_pre_sel_log.groupby(["sn", "time_label"]).agg({"msg":list}).reset_index(drop=False)  #一个服务器同一时间产生的msg合并
train_pre_sel_log["msg"] = train_pre_sel_log["msg"].apply(lambda x: ','.join(x))

# test_pre_sel_log["time_label"] = test_pre_sel_log["fault_time"].dt.floor(freq="5T")
# test_pre_sel_log = test_pre_sel_log.groupby(["sn", "time_label"]).agg({"msg":list}).reset_index(drop=False)
# test_pre_sel_log["msg"] = test_pre_sel_log["msg"].apply(lambda x: ','.join(x))

In [25]:
train_pre_sel_log["time_label"].dt.ceil(freq="5T")

KeyError: 'time_label'

### 4.分词、训练

In [38]:
embedding_size = 16
msg_list = list(train_pre_sel_log["msg"])
tokenized_sent = [word_tokenize(s.lower()) for s in msg_list] # 分词
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_sent)]
#model = Doc2Vec(tagged_data, vector_size = embedding_size, window = 2, min_count = 1, epochs = 10)
#model.save('./model/doc2vec_model/doc2vec.model')
model = Doc2Vec.load('./model/doc2vec_model/doc2vec.model')
embedding_vec = []
for i in tqdm(range(len(msg_list))):
    embedding_vec.append(model.infer_vector(word_tokenize(" ".join(msg_list[i]))))

embedding_vec = np.array(embedding_vec)

for i in tqdm(range(embedding_size)):
    train_pre_sel_log[f"embedding_feat{i}"] = embedding_vec[:, i]

print(train_pre_sel_log.head())

del embedding_vec;gc.collect()

2022-03-24 16:33:47,942 [INFO] loading Doc2Vec object from ./model/doc2vec_model/doc2vec.model
2022-03-24 16:33:47,972 [INFO] loading vocabulary recursively from ./model/doc2vec_model/doc2vec.model.vocabulary.* with mmap=None
2022-03-24 16:33:47,973 [INFO] loading trainables recursively from ./model/doc2vec_model/doc2vec.model.trainables.* with mmap=None
2022-03-24 16:33:47,973 [INFO] loading wv recursively from ./model/doc2vec_model/doc2vec.model.wv.* with mmap=None
2022-03-24 16:33:47,974 [INFO] loading docvecs recursively from ./model/doc2vec_model/doc2vec.model.docvecs.* with mmap=None
2022-03-24 16:33:47,975 [INFO] loaded ./model/doc2vec_model/doc2vec.model
100%|██████████████████████████████████████████████████████████████████████████| 79718/79718 [1:18:43<00:00, 16.88it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 4001.48it/s]

             sn          time_label  \
0  000d33b21436 2020-09-02 11:35:00   
1  000d33b21436 2020-09-02 15:45:00   
2  005c5a9218ba 2020-06-28 18:25:00   
3  005c5a9218ba 2020-06-28 18:35:00   
4  005c5a9218ba 2020-06-28 18:40:00   

                                                 msg  embedding_feat0  \
0   System Boot Initiated BIOS_Boot_Up | Initiate...         0.893904   
1   System Boot Initiated BIOS_Boot_Up | Initiate...         0.858545   
2   Memory Memory_Status | Correctable ECC | Asse...         1.879179   
3   System ACPI Power State #0x7d | S4/S5: soft-o...         0.855251   
4   System ACPI Power State #0x7d | S0/G0: workin...         1.254723   

   embedding_feat1  embedding_feat2  embedding_feat3  embedding_feat4  \
0         0.233783         0.359199         0.464951        -0.321756   
1         0.135370         0.450973         0.403031        -0.412947   
2         1.486713         1.353432         1.231778        -0.636971   
3        -0.206310         0.88306




55

In [7]:
#读取保存的template特征
template_fea = pd.read_pickle('./feature_data/template_fea/cpu_diag_comp_sel_log_all_feature1h_3_sum.pkl')
template_fea.head()

Unnamed: 0,sn,collect_time_gap,template_id_1_sum_3,template_id_2_sum_3,template_id_3_sum_3,template_id_4_sum_3,template_id_5_sum_3,template_id_6_sum_3,template_id_7_sum_3,template_id_8_sum_3,...,template_id_197_sum_3,template_id_198_sum_3,template_id_199_sum_3,template_id_200_sum_3,template_id_201_sum_3,template_id_202_sum_3,template_id_203_sum_3,template_id_204_sum_3,template_id_205_sum_3,template_id_206_sum_3
0,000d33b21436,2020-09-02 12:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,000d33b21436,2020-09-02 16:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,005c5a9218ba,2020-06-28 19:00:00,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0079283bde6e,2020-04-26 21:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,007bdf23b62f,2020-06-16 18:00:00,2.0,0.0,13.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
template_fea.rename(columns={'collect_time_gap':'fault_time_label'},inplace=True)

In [8]:
#读取保存的embeddding
embedding_train = pd.read_pickle('./feature_data/embedding_fea/embedding_train.pkl')
embedding_test = pd.read_pickle('./feature_data/embedding_fea/embedding_test.pkl')
embedding_train.head()

Unnamed: 0,sn,embedding_feat0,embedding_feat1,embedding_feat2,embedding_feat3,embedding_feat4,embedding_feat5,embedding_feat6,embedding_feat7,embedding_feat8,embedding_feat9,embedding_feat10,embedding_feat11,embedding_feat12,embedding_feat13,embedding_feat14,embedding_feat15,fault_time_label,fault_time,label
0,SERVER_25698,1.020277,0.297846,0.474779,0.499088,-0.413101,-0.495065,0.249191,0.35931,-1.15273,0.599747,-0.212463,-0.209093,-0.867964,0.370903,1.387727,-0.352047,2020-10-09 13:40:00,2020-10-09 13:43:00,0
1,SERVER_25699,2.070783,-1.092592,3.970862,0.975216,0.239782,0.081832,-0.406009,3.066064,-1.713196,2.249714,-1.178554,-0.943295,-1.027284,2.333962,1.700569,-0.351994,2020-08-25 18:50:00,2020-08-25 18:50:00,0
2,SERVER_25712,2.132418,-1.659015,5.298077,0.512517,0.095081,0.020548,-0.193147,4.359308,-1.869364,2.813022,-1.705419,-1.755826,-1.044548,2.987559,1.28612,-0.58049,2020-03-16 13:20:00,2020-03-16 13:20:00,0
3,SERVER_25708,1.542623,-2.590915,5.310016,-0.339423,0.127313,-0.233968,-0.386217,4.480275,-1.985875,2.827102,-1.653761,-2.214546,-1.002495,3.578571,0.555872,-0.187675,2020-07-25 12:40:00,2020-07-25 12:44:00,0
4,SERVER_25711,2.175311,-1.558408,4.927365,0.550914,0.059585,0.028171,0.076562,3.995207,-1.86202,2.510679,-1.62623,-1.578915,-0.989311,2.600618,1.263535,-0.436117,2020-03-16 16:50:00,2020-03-16 16:51:00,0


In [9]:
embedding_test.head()

Unnamed: 0,sn,embedding_feat0,embedding_feat1,embedding_feat2,embedding_feat3,embedding_feat4,embedding_feat5,embedding_feat6,embedding_feat7,embedding_feat8,embedding_feat9,embedding_feat10,embedding_feat11,embedding_feat12,embedding_feat13,embedding_feat14,embedding_feat15,fault_time_label,fault_time
0,000d33b21436,0.796288,0.208153,0.331902,0.466663,-0.353881,-0.589886,0.32246,0.261887,-0.960032,0.504131,-0.231247,-0.170639,-0.79519,0.319201,1.228289,-0.289095,2020-09-02 16:40:00,2020-09-02 16:42:54
1,005c5a9218ba,1.17622,0.431399,1.176222,0.638711,-0.182056,-0.04501,0.269417,0.887499,-1.173676,0.741454,-0.46756,-0.333037,-0.865643,0.477464,1.439878,-0.188226,2020-06-28 19:05:00,2020-06-28 19:05:16
2,0079283bde6e,0.767825,0.182314,0.569512,0.43602,-0.238626,-0.246788,0.13751,0.436921,-0.69675,0.519538,-0.358014,-0.107496,-0.519124,0.330847,1.001029,-0.172139,2020-04-26 21:30:00,2020-04-26 21:32:44
3,007bdf23b62f,1.155905,-0.058251,1.451123,0.422066,-0.049248,-0.127855,-0.115311,0.793798,-0.992192,0.848966,-0.406178,-0.454347,-0.72658,0.724698,1.205146,-0.180917,2020-06-16 18:40:00,2020-06-16 18:40:39
4,00a577a8e54f,1.025073,0.507007,0.886335,0.731065,-0.280406,-0.107155,0.288103,0.645884,-0.955032,0.575018,-0.454296,-0.138279,-0.741069,0.286128,1.21861,-0.169919,2020-04-07 07:15:00,2020-04-07 07:16:55


In [35]:
embedding_train['fault_time_label'] = embedding_train['fault_time'].dt.ceil(freq='1h')

In [39]:
embedding_train.head()

Unnamed: 0,sn,embedding_feat0,embedding_feat1,embedding_feat2,embedding_feat3,embedding_feat4,embedding_feat5,embedding_feat6,embedding_feat7,embedding_feat8,embedding_feat9,embedding_feat10,embedding_feat11,embedding_feat12,embedding_feat13,embedding_feat14,embedding_feat15,fault_time_label,fault_time,label
0,SERVER_25698,1.020277,0.297846,0.474779,0.499088,-0.413101,-0.495065,0.249191,0.35931,-1.15273,0.599747,-0.212463,-0.209093,-0.867964,0.370903,1.387727,-0.352047,2020-10-09 14:00:00,2020-10-09 13:43:00,0
1,SERVER_25699,2.070783,-1.092592,3.970862,0.975216,0.239782,0.081832,-0.406009,3.066064,-1.713196,2.249714,-1.178554,-0.943295,-1.027284,2.333962,1.700569,-0.351994,2020-08-25 19:00:00,2020-08-25 18:50:00,0
2,SERVER_25712,2.132418,-1.659015,5.298077,0.512517,0.095081,0.020548,-0.193147,4.359308,-1.869364,2.813022,-1.705419,-1.755826,-1.044548,2.987559,1.28612,-0.58049,2020-03-16 14:00:00,2020-03-16 13:20:00,0
3,SERVER_25708,1.542623,-2.590915,5.310016,-0.339423,0.127313,-0.233968,-0.386217,4.480275,-1.985875,2.827102,-1.653761,-2.214546,-1.002495,3.578571,0.555872,-0.187675,2020-07-25 13:00:00,2020-07-25 12:44:00,0
4,SERVER_25711,2.175311,-1.558408,4.927365,0.550914,0.059585,0.028171,0.076562,3.995207,-1.86202,2.510679,-1.62623,-1.578915,-0.989311,2.600618,1.263535,-0.436117,2020-03-16 17:00:00,2020-03-16 16:51:00,0


In [13]:
# data = pd.merge(template_fea, embedding_train, on=["sn", "fault_time_label"], how="inner")
# print(len(data),len(template_fea),len(embedding_train))

986 35180 16165


In [36]:
df_train_label = pd.read_csv('./data/preliminary_train_label_dataset.csv')
df_train_label_s = pd.read_csv('./data/preliminary_train_label_dataset_s.csv')
df_train_label = pd.concat([df_train_label, df_train_label_s])
df_train_label = df_train_label.drop_duplicates(['sn','fault_time','label'])

In [38]:
df_data_train = pd.merge(template_fea[template_fea.sn.isin(df_train_label.sn)],df_train_label, on='sn', how='left')
# y = df_data_train['label']
# x = df_data_train.drop(['sn','fault_time','label'],axis=1)
print(len(df_data_train))
print(df_data_train.head())

df_data_train = df_data_train.merge(embedding_train[embedding_train.sn.isin(df_train_label.sn)],on=['sn','fault_time_label'],how='left')
print(len(df_data_train))
print(df_data_train.head())

44110
             sn    fault_time_label  template_id_1_sum_3  template_id_2_sum_3  \
0  SERVER_10001 2020-05-01 09:00:00                  0.0                  0.0   
1  SERVER_10001 2020-05-01 10:00:00                  0.0                  0.0   
2  SERVER_10003 2020-03-28 10:00:00                  0.0                  0.0   
3  SERVER_10008 2020-02-25 16:00:00                  0.0                  0.0   
4  SERVER_10008 2020-02-25 16:00:00                  0.0                  0.0   

   template_id_3_sum_3  template_id_4_sum_3  template_id_5_sum_3  \
0                  0.0                  0.0                  0.0   
1                  0.0                  0.0                  0.0   
2                  0.0                  0.0                  0.0   
3                  0.0                  0.0                  0.0   
4                  0.0                  0.0                  0.0   

   template_id_6_sum_3  template_id_7_sum_3  template_id_8_sum_3  ...  \
0                  0.0   

In [23]:
embedding_train[embedding_train['sn']=='SERVER_23182']

Unnamed: 0,sn,embedding_feat0,embedding_feat1,embedding_feat2,embedding_feat3,embedding_feat4,embedding_feat5,embedding_feat6,embedding_feat7,embedding_feat8,embedding_feat9,embedding_feat10,embedding_feat11,embedding_feat12,embedding_feat13,embedding_feat14,embedding_feat15,fault_time_label,fault_time,label
10175,SERVER_23182,0.663335,-0.122767,0.711739,0.381165,-0.09904,-0.241903,-0.042311,0.556463,-0.648758,0.471126,-0.291186,-0.209479,-0.518813,0.49811,0.8402,-0.068447,2020-10-16 02:15:00,2020-10-16 02:16:00,2
10183,SERVER_23182,1.557626,0.111594,1.974448,0.479942,-0.038823,0.08391,0.190751,1.481791,-1.362267,0.899555,-0.986968,-0.572592,-1.083219,0.979055,1.507587,-0.010032,2020-04-16 04:35:00,2020-04-16 04:38:00,2
10184,SERVER_23182,1.557626,0.111594,1.974448,0.479942,-0.038823,0.08391,0.190751,1.481791,-1.362267,0.899555,-0.986968,-0.572592,-1.083219,0.979055,1.507587,-0.010032,2020-04-16 04:35:00,2020-04-16 04:35:00,2
10185,SERVER_23182,1.557626,0.111594,1.974448,0.479942,-0.038823,0.08391,0.190751,1.481791,-1.362267,0.899555,-0.986968,-0.572592,-1.083219,0.979055,1.507587,-0.010032,2020-04-16 04:35:00,2020-04-16 04:39:00,2
10186,SERVER_23182,1.557626,0.111594,1.974448,0.479942,-0.038823,0.08391,0.190751,1.481791,-1.362267,0.899555,-0.986968,-0.572592,-1.083219,0.979055,1.507587,-0.010032,2020-04-16 04:35:00,2020-04-16 04:38:00,2
10187,SERVER_23182,1.557626,0.111594,1.974448,0.479942,-0.038823,0.08391,0.190751,1.481791,-1.362267,0.899555,-0.986968,-0.572592,-1.083219,0.979055,1.507587,-0.010032,2020-04-16 04:35:00,2020-04-16 04:35:00,2
10188,SERVER_23182,1.557626,0.111594,1.974448,0.479942,-0.038823,0.08391,0.190751,1.481791,-1.362267,0.899555,-0.986968,-0.572592,-1.083219,0.979055,1.507587,-0.010032,2020-04-16 04:35:00,2020-04-16 04:39:00,2
10189,SERVER_23182,1.557626,0.111594,1.974448,0.479942,-0.038823,0.08391,0.190751,1.481791,-1.362267,0.899555,-0.986968,-0.572592,-1.083219,0.979055,1.507587,-0.010032,2020-04-16 04:35:00,2020-04-16 04:38:00,2
10190,SERVER_23182,1.557626,0.111594,1.974448,0.479942,-0.038823,0.08391,0.190751,1.481791,-1.362267,0.899555,-0.986968,-0.572592,-1.083219,0.979055,1.507587,-0.010032,2020-04-16 04:35:00,2020-04-16 04:35:00,2
10191,SERVER_23182,1.557626,0.111594,1.974448,0.479942,-0.038823,0.08391,0.190751,1.481791,-1.362267,0.899555,-0.986968,-0.572592,-1.083219,0.979055,1.507587,-0.010032,2020-04-16 04:35:00,2020-04-16 04:39:00,2


### 5.获取数据集label

In [40]:
train_label["fault_time_label"] = train_label["fault_time"].dt.floor(freq="5T")
print(train_label.head())
print(train_label.groupby(["sn", "fault_time_label"]).count())

             sn          fault_time  label    fault_time_label
0  SERVER_25698 2020-10-09 13:43:00      0 2020-10-09 13:40:00
1  SERVER_25699 2020-08-25 18:50:00      0 2020-08-25 18:50:00
2  SERVER_25712 2020-03-16 13:20:00      0 2020-03-16 13:20:00
3  SERVER_25708 2020-07-25 12:44:00      0 2020-07-25 12:40:00
4  SERVER_25711 2020-03-16 16:51:00      0 2020-03-16 16:50:00
                                  fault_time  label
sn           fault_time_label                      
SERVER_10001 2020-05-01 10:00:00           1      1
SERVER_10003 2020-03-28 09:45:00           1      1
SERVER_10008 2020-02-25 16:10:00           1      1
             2020-03-11 18:00:00           1      1
SERVER_10009 2020-05-08 16:35:00           1      1
...                                      ...    ...
SERVER_9991  2020-08-04 22:45:00           1      1
             2020-10-07 18:40:00           1      1
SERVER_9993  2020-05-14 23:50:00           1      1
SERVER_9998  2020-05-29 11:25:00           1      

### 6.训练集数据：特征为前一天所有的日志embedding均值

In [8]:
train_label.reset_index(drop=True, inplace=True)
feat_cols = ["embedding_feat{}".format(i) for i in range(embedding_size)]


data = pd.DataFrame()
for i in tqdm(range(len(train_label))):
    sn_i = train_label.loc[i, "sn"]
    fault_time = train_label.loc[i, "fault_time_label"]
    start_time = fault_time - timedelta(days=1)
    df_ = train_pre_sel_log[
        (train_pre_sel_log["sn"] == sn_i) & (train_pre_sel_log["time_label"] < fault_time) & (train_pre_sel_log["time_label"] >= start_time)]
    df_ = df_.groupby("sn")[feat_cols].mean().reset_index(drop=False)
    df_["fault_time_label"] = fault_time
    data = data.append(df_)
        
# 特征标签合并
logger.info(f"{len(data)}, {len(train_label)}")
data = pd.merge(data, train_label, on=["sn", "fault_time_label"], how="inner")
        
print(len(data))
print(data.head())

100%|███████████████████████████████████████████████████████████████████████████| 16669/16669 [01:43<00:00, 161.09it/s]
2022-03-23 23:37:08,371 [INFO] 15787, 16669


16165
             sn  embedding_feat0  embedding_feat1  embedding_feat2  \
0  SERVER_25698         1.020277         0.297846         0.474779   
1  SERVER_25699         2.070783        -1.092592         3.970862   
2  SERVER_25712         2.132418        -1.659015         5.298077   
3  SERVER_25708         1.542623        -2.590915         5.310016   
4  SERVER_25711         2.175311        -1.558408         4.927365   

   embedding_feat3  embedding_feat4  embedding_feat5  embedding_feat6  \
0         0.499088        -0.413101        -0.495065         0.249191   
1         0.975216         0.239782         0.081832        -0.406009   
2         0.512517         0.095081         0.020548        -0.193147   
3        -0.339423         0.127313        -0.233968        -0.386217   
4         0.550914         0.059585         0.028171         0.076562   

   embedding_feat7  embedding_feat8  embedding_feat9  embedding_feat10  \
0         0.359310        -1.152730         0.599747        

In [61]:
data.head()

Unnamed: 0,sn,embedding_feat0,embedding_feat1,embedding_feat2,embedding_feat3,embedding_feat4,embedding_feat5,embedding_feat6,embedding_feat7,embedding_feat8,embedding_feat9,embedding_feat10,embedding_feat11,embedding_feat12,embedding_feat13,embedding_feat14,embedding_feat15,fault_time_label,fault_time,label
0,SERVER_25698,1.020277,0.297846,0.474779,0.499088,-0.413101,-0.495065,0.249191,0.35931,-1.15273,0.599747,-0.212463,-0.209093,-0.867964,0.370903,1.387727,-0.352047,2020-10-09 13:40:00,2020-10-09 13:43:00,0
1,SERVER_25699,2.070783,-1.092592,3.970862,0.975216,0.239782,0.081832,-0.406009,3.066064,-1.713196,2.249714,-1.178554,-0.943295,-1.027284,2.333962,1.700569,-0.351994,2020-08-25 18:50:00,2020-08-25 18:50:00,0
2,SERVER_25712,2.132418,-1.659015,5.298077,0.512517,0.095081,0.020548,-0.193147,4.359308,-1.869364,2.813022,-1.705419,-1.755826,-1.044548,2.987559,1.28612,-0.58049,2020-03-16 13:20:00,2020-03-16 13:20:00,0
3,SERVER_25708,1.542623,-2.590915,5.310016,-0.339423,0.127313,-0.233968,-0.386217,4.480275,-1.985875,2.827102,-1.653761,-2.214546,-1.002495,3.578571,0.555872,-0.187675,2020-07-25 12:40:00,2020-07-25 12:44:00,0
4,SERVER_25711,2.175311,-1.558408,4.927365,0.550914,0.059585,0.028171,0.076562,3.995207,-1.86202,2.510679,-1.62623,-1.578915,-0.989311,2.600618,1.263535,-0.436117,2020-03-16 16:50:00,2020-03-16 16:51:00,0


In [62]:
data.to_pickle('./feature_data/embedding_fea/embedding_train.pkl')

### 7.预测集数据

In [9]:
test = pd.DataFrame()
for i in tqdm(range(len(test_submit))):
    sn_i = test_submit.loc[i, "sn"]
    fault_time = test_submit.loc[i, "fault_time_label"]
    start_time = fault_time - timedelta(days=1)
    df_ = train_pre_sel_log[
        (train_pre_sel_log["sn"] == sn_i) & (train_pre_sel_log["time_label"] < fault_time) & (train_pre_sel_log["time_label"] >= start_time)]
    df_ = df_.groupby("sn")[feat_cols].mean().reset_index(drop=False)
    df_["fault_time_label"] = fault_time
    test = test.append(df_)
        
test.drop_duplicates(keep="first", inplace=True)
# 特征标签合并
logger.info(f"{len(test)}, {len(test_submit)}")
test = pd.merge(test, test_submit, on=["sn", "fault_time_label"], how="right")
        
print(len(test))
print(test_submit.head())
del train_pre_sel_log
gc.collect()

100%|█████████████████████████████████████████████████████████████████████████████| 3011/3011 [00:18<00:00, 164.05it/s]
2022-03-23 23:38:36,869 [INFO] 2790, 3011


3011
             sn          fault_time    fault_time_label
0  000d33b21436 2020-09-02 16:42:54 2020-09-02 16:40:00
1  005c5a9218ba 2020-06-28 19:05:16 2020-06-28 19:05:00
2  0079283bde6e 2020-04-26 21:32:44 2020-04-26 21:30:00
3  007bdf23b62f 2020-06-16 18:40:39 2020-06-16 18:40:00
4  00a577a8e54f 2020-04-07 07:16:55 2020-04-07 07:15:00


4302

In [64]:
test.to_pickle('./feature_data/embedding_fea/embedding_test.pkl')

### 8.线上评价指标

In [52]:
def Macro_f1(y_true,y_pred):
    every_class_f1 = f1_score(y_true, y_pred, average=None)  #list  包含每类的f1score
    macro_f1 =  3/7*every_class_f1[0]+2/7*every_class_f1[1]+1/7*every_class_f1[2]+1/7*every_class_f1[3]
    return 'maroc_f1',macro_f1

### 9.多折交叉训练

In [59]:
folds = StratifiedKFold(n_splits=5,shuffle=True, random_state=2022)
test_x = test[feat_cols]
test_y = np.zeros((len(test_x), 4),np.float16)
oof_pred = np.zeros(len(data))
for i, (train_idx, valid_idx) in enumerate(folds.split(data[feat_cols], data["label"])):
    logger.info("#####第{}折#####".format(i+1))
    train_x, valid_x = data.loc[train_idx, feat_cols], data.loc[valid_idx, feat_cols]
    train_y, valid_y = data.loc[train_idx, "label"], data.loc[valid_idx, "label"]
    print(train_x.shape, valid_x.shape)
    clf = XGBClassifier(
        max_depth=10,
        learning_rate=0.08, 
        n_estimators=10000,
        gamma=0.001, 
        subsample=0.8, 
        colsample_bytree=0.8,
        reg_lambda=1, 
        reg_alpha=1,
        objective="multi:softmax",
        tree_method='gpu_hist' 
    )
    clf.fit(train_x, train_y, 
            eval_set=[(train_x, train_y), (valid_x, valid_y)],
            eval_metric=["merror", "mlogloss"],
            early_stopping_rounds=100, verbose=1)
    test_y += clf.predict_proba(test_x)
    oof_pred[valid_idx] = clf.predict(valid_x)
    
print(oof_pred)
macro_f1_score = Macro_f1(oof_pred,data.label)
print('加权f1_score:{}'.format(macro_f1_score))

2022-03-24 21:30:59,642 [INFO] #####training fold0#####


(12932, 16) (3233, 16)
Parameters: { "randon_state" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-merror:0.26601	validation_0-mlogloss:1.31799	validation_1-merror:0.36375	validation_1-mlogloss:1.33043




[1]	validation_0-merror:0.24366	validation_0-mlogloss:1.25855	validation_1-merror:0.33993	validation_1-mlogloss:1.28254
[2]	validation_0-merror:0.23152	validation_0-mlogloss:1.20330	validation_1-merror:0.33405	validation_1-mlogloss:1.23962
[3]	validation_0-merror:0.22471	validation_0-mlogloss:1.15415	validation_1-merror:0.32694	validation_1-mlogloss:1.20056
[4]	validation_0-merror:0.21976	validation_0-mlogloss:1.10774	validation_1-merror:0.32632	validation_1-mlogloss:1.16511
[5]	validation_0-merror:0.21443	validation_0-mlogloss:1.06647	validation_1-merror:0.32663	validation_1-mlogloss:1.13417
[6]	validation_0-merror:0.21234	validation_0-mlogloss:1.02798	validation_1-merror:0.32663	validation_1-mlogloss:1.10539
[7]	validation_0-merror:0.20863	validation_0-mlogloss:0.99362	validation_1-merror:0.32756	validation_1-mlogloss:1.08041
[8]	validation_0-merror:0.20608	validation_0-mlogloss:0.96178	validation_1-merror:0.32539	validation_1-mlogloss:1.05770
[9]	validation_0-merror:0.20213	validati

[69]	validation_0-merror:0.07849	validation_0-mlogloss:0.36169	validation_1-merror:0.30220	validation_1-mlogloss:0.77998
[70]	validation_0-merror:0.07771	validation_0-mlogloss:0.35940	validation_1-merror:0.30282	validation_1-mlogloss:0.77967
[71]	validation_0-merror:0.07671	validation_0-mlogloss:0.35651	validation_1-merror:0.30282	validation_1-mlogloss:0.77935
[72]	validation_0-merror:0.07555	validation_0-mlogloss:0.35262	validation_1-merror:0.30189	validation_1-mlogloss:0.77886
[73]	validation_0-merror:0.07346	validation_0-mlogloss:0.34914	validation_1-merror:0.30220	validation_1-mlogloss:0.77827
[74]	validation_0-merror:0.07246	validation_0-mlogloss:0.34635	validation_1-merror:0.30158	validation_1-mlogloss:0.77799
[75]	validation_0-merror:0.07106	validation_0-mlogloss:0.34320	validation_1-merror:0.30220	validation_1-mlogloss:0.77755
[76]	validation_0-merror:0.06913	validation_0-mlogloss:0.34065	validation_1-merror:0.30158	validation_1-mlogloss:0.77715
[77]	validation_0-merror:0.06774

[137]	validation_0-merror:0.01887	validation_0-mlogloss:0.21085	validation_1-merror:0.30034	validation_1-mlogloss:0.77198
[138]	validation_0-merror:0.01848	validation_0-mlogloss:0.20953	validation_1-merror:0.30003	validation_1-mlogloss:0.77208
[139]	validation_0-merror:0.01833	validation_0-mlogloss:0.20810	validation_1-merror:0.29879	validation_1-mlogloss:0.77212
[140]	validation_0-merror:0.01833	validation_0-mlogloss:0.20688	validation_1-merror:0.29941	validation_1-mlogloss:0.77238
[141]	validation_0-merror:0.01786	validation_0-mlogloss:0.20568	validation_1-merror:0.29941	validation_1-mlogloss:0.77243
[142]	validation_0-merror:0.01763	validation_0-mlogloss:0.20450	validation_1-merror:0.29972	validation_1-mlogloss:0.77261
[143]	validation_0-merror:0.01732	validation_0-mlogloss:0.20295	validation_1-merror:0.30034	validation_1-mlogloss:0.77268
[144]	validation_0-merror:0.01717	validation_0-mlogloss:0.20168	validation_1-merror:0.30034	validation_1-mlogloss:0.77291
[145]	validation_0-merro

2022-03-24 21:31:19,489 [INFO] #####training fold1#####


(12932, 16) (3233, 16)
Parameters: { "randon_state" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-merror:0.26284	validation_0-mlogloss:1.31770	validation_1-merror:0.36127	validation_1-mlogloss:1.32973
[1]	validation_0-merror:0.24296	validation_0-mlogloss:1.25782	validation_1-merror:0.34426	validation_1-mlogloss:1.28087
[2]	validation_0-merror:0.23438	validation_0-mlogloss:1.20383	validation_1-merror:0.33560	validation_1-mlogloss:1.23725
[3]	validation_0-merror:0.22417	validation_0-mlogloss:1.15493	validation_1-merror:0.32880	validation_1-mlogloss:1.19861
[4]	validation_0-merror:0.21690	validation_0-mlogloss:1.10856	validation_1-merror:0.32725	validation_1-mlogloss:1.16320
[5]	validation_0-merror:0.21126	validation_0-mlogloss:1.06

[65]	validation_0-merror:0.08498	validation_0-mlogloss:0.37461	validation_1-merror:0.29972	validation_1-mlogloss:0.77977
[66]	validation_0-merror:0.08251	validation_0-mlogloss:0.37049	validation_1-merror:0.29694	validation_1-mlogloss:0.77923
[67]	validation_0-merror:0.08174	validation_0-mlogloss:0.36749	validation_1-merror:0.29725	validation_1-mlogloss:0.77865
[68]	validation_0-merror:0.08065	validation_0-mlogloss:0.36423	validation_1-merror:0.29756	validation_1-mlogloss:0.77862
[69]	validation_0-merror:0.07973	validation_0-mlogloss:0.36183	validation_1-merror:0.29787	validation_1-mlogloss:0.77786
[70]	validation_0-merror:0.07795	validation_0-mlogloss:0.35889	validation_1-merror:0.29725	validation_1-mlogloss:0.77731
[71]	validation_0-merror:0.07655	validation_0-mlogloss:0.35492	validation_1-merror:0.29848	validation_1-mlogloss:0.77670
[72]	validation_0-merror:0.07501	validation_0-mlogloss:0.35149	validation_1-merror:0.29787	validation_1-mlogloss:0.77646
[73]	validation_0-merror:0.07377

[133]	validation_0-merror:0.02049	validation_0-mlogloss:0.21558	validation_1-merror:0.29230	validation_1-mlogloss:0.77191
[134]	validation_0-merror:0.02018	validation_0-mlogloss:0.21421	validation_1-merror:0.29385	validation_1-mlogloss:0.77169
[135]	validation_0-merror:0.02018	validation_0-mlogloss:0.21286	validation_1-merror:0.29385	validation_1-mlogloss:0.77180
[136]	validation_0-merror:0.01987	validation_0-mlogloss:0.21131	validation_1-merror:0.29385	validation_1-mlogloss:0.77183
[137]	validation_0-merror:0.01926	validation_0-mlogloss:0.20977	validation_1-merror:0.29230	validation_1-mlogloss:0.77166
[138]	validation_0-merror:0.01902	validation_0-mlogloss:0.20809	validation_1-merror:0.29230	validation_1-mlogloss:0.77164
[139]	validation_0-merror:0.01864	validation_0-mlogloss:0.20682	validation_1-merror:0.29292	validation_1-mlogloss:0.77194
[140]	validation_0-merror:0.01833	validation_0-mlogloss:0.20521	validation_1-merror:0.29199	validation_1-mlogloss:0.77183
[141]	validation_0-merro

[201]	validation_0-merror:0.00789	validation_0-mlogloss:0.14282	validation_1-merror:0.29261	validation_1-mlogloss:0.78429
[202]	validation_0-merror:0.00789	validation_0-mlogloss:0.14218	validation_1-merror:0.29168	validation_1-mlogloss:0.78444
[203]	validation_0-merror:0.00781	validation_0-mlogloss:0.14165	validation_1-merror:0.29292	validation_1-mlogloss:0.78478


2022-03-24 21:31:37,589 [INFO] #####training fold2#####


(12932, 16) (3233, 16)
Parameters: { "randon_state" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-merror:0.26593	validation_0-mlogloss:1.31827	validation_1-merror:0.35447	validation_1-mlogloss:1.32948
[1]	validation_0-merror:0.23685	validation_0-mlogloss:1.25700	validation_1-merror:0.32787	validation_1-mlogloss:1.28039
[2]	validation_0-merror:0.22750	validation_0-mlogloss:1.20211	validation_1-merror:0.32261	validation_1-mlogloss:1.23654
[3]	validation_0-merror:0.22239	validation_0-mlogloss:1.15355	validation_1-merror:0.32076	validation_1-mlogloss:1.19901
[4]	validation_0-merror:0.21860	validation_0-mlogloss:1.10688	validation_1-merror:0.32137	validation_1-mlogloss:1.16262
[5]	validation_0-merror:0.21474	validation_0-mlogloss:1.06

[65]	validation_0-merror:0.08846	validation_0-mlogloss:0.38157	validation_1-merror:0.28920	validation_1-mlogloss:0.76972
[66]	validation_0-merror:0.08699	validation_0-mlogloss:0.37783	validation_1-merror:0.28859	validation_1-mlogloss:0.76899
[67]	validation_0-merror:0.08545	validation_0-mlogloss:0.37378	validation_1-merror:0.28704	validation_1-mlogloss:0.76807
[68]	validation_0-merror:0.08406	validation_0-mlogloss:0.37045	validation_1-merror:0.28673	validation_1-mlogloss:0.76758
[69]	validation_0-merror:0.08305	validation_0-mlogloss:0.36743	validation_1-merror:0.28735	validation_1-mlogloss:0.76701
[70]	validation_0-merror:0.08104	validation_0-mlogloss:0.36434	validation_1-merror:0.28704	validation_1-mlogloss:0.76658
[71]	validation_0-merror:0.07911	validation_0-mlogloss:0.36085	validation_1-merror:0.28611	validation_1-mlogloss:0.76577
[72]	validation_0-merror:0.07741	validation_0-mlogloss:0.35663	validation_1-merror:0.28549	validation_1-mlogloss:0.76533
[73]	validation_0-merror:0.07632

[133]	validation_0-merror:0.02142	validation_0-mlogloss:0.21729	validation_1-merror:0.28642	validation_1-mlogloss:0.76156
[134]	validation_0-merror:0.02096	validation_0-mlogloss:0.21546	validation_1-merror:0.28642	validation_1-mlogloss:0.76167
[135]	validation_0-merror:0.02080	validation_0-mlogloss:0.21384	validation_1-merror:0.28673	validation_1-mlogloss:0.76188
[136]	validation_0-merror:0.02049	validation_0-mlogloss:0.21208	validation_1-merror:0.28549	validation_1-mlogloss:0.76196
[137]	validation_0-merror:0.02011	validation_0-mlogloss:0.21058	validation_1-merror:0.28549	validation_1-mlogloss:0.76204
[138]	validation_0-merror:0.01972	validation_0-mlogloss:0.20902	validation_1-merror:0.28518	validation_1-mlogloss:0.76213
[139]	validation_0-merror:0.01910	validation_0-mlogloss:0.20748	validation_1-merror:0.28549	validation_1-mlogloss:0.76237
[140]	validation_0-merror:0.01895	validation_0-mlogloss:0.20627	validation_1-merror:0.28673	validation_1-mlogloss:0.76248
[141]	validation_0-merro

[201]	validation_0-merror:0.00750	validation_0-mlogloss:0.14301	validation_1-merror:0.28642	validation_1-mlogloss:0.77133
[202]	validation_0-merror:0.00727	validation_0-mlogloss:0.14227	validation_1-merror:0.28673	validation_1-mlogloss:0.77170
[203]	validation_0-merror:0.00727	validation_0-mlogloss:0.14159	validation_1-merror:0.28673	validation_1-mlogloss:0.77173
[204]	validation_0-merror:0.00727	validation_0-mlogloss:0.14089	validation_1-merror:0.28642	validation_1-mlogloss:0.77197
[205]	validation_0-merror:0.00727	validation_0-mlogloss:0.14019	validation_1-merror:0.28766	validation_1-mlogloss:0.77225
[206]	validation_0-merror:0.00727	validation_0-mlogloss:0.13952	validation_1-merror:0.28766	validation_1-mlogloss:0.77231
[207]	validation_0-merror:0.00727	validation_0-mlogloss:0.13881	validation_1-merror:0.28735	validation_1-mlogloss:0.77260


2022-03-24 21:31:56,107 [INFO] #####training fold3#####


(12932, 16) (3233, 16)
Parameters: { "randon_state" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-merror:0.25093	validation_0-mlogloss:1.31625	validation_1-merror:0.35107	validation_1-mlogloss:1.32877
[1]	validation_0-merror:0.23322	validation_0-mlogloss:1.25575	validation_1-merror:0.33282	validation_1-mlogloss:1.27984
[2]	validation_0-merror:0.22379	validation_0-mlogloss:1.20095	validation_1-merror:0.32076	validation_1-mlogloss:1.23609
[3]	validation_0-merror:0.21760	validation_0-mlogloss:1.15113	validation_1-merror:0.30993	validation_1-mlogloss:1.19687
[4]	validation_0-merror:0.21172	validation_0-mlogloss:1.10478	validation_1-merror:0.31302	validation_1-mlogloss:1.16094
[5]	validation_0-merror:0.20770	validation_0-mlogloss:1.06

[65]	validation_0-merror:0.08336	validation_0-mlogloss:0.37185	validation_1-merror:0.28797	validation_1-mlogloss:0.76878
[66]	validation_0-merror:0.08174	validation_0-mlogloss:0.36788	validation_1-merror:0.28859	validation_1-mlogloss:0.76815
[67]	validation_0-merror:0.08019	validation_0-mlogloss:0.36428	validation_1-merror:0.28951	validation_1-mlogloss:0.76741
[68]	validation_0-merror:0.07918	validation_0-mlogloss:0.36020	validation_1-merror:0.28859	validation_1-mlogloss:0.76714
[69]	validation_0-merror:0.07787	validation_0-mlogloss:0.35739	validation_1-merror:0.29013	validation_1-mlogloss:0.76682
[70]	validation_0-merror:0.07671	validation_0-mlogloss:0.35486	validation_1-merror:0.28982	validation_1-mlogloss:0.76620
[71]	validation_0-merror:0.07485	validation_0-mlogloss:0.35146	validation_1-merror:0.28951	validation_1-mlogloss:0.76584
[72]	validation_0-merror:0.07369	validation_0-mlogloss:0.34815	validation_1-merror:0.28951	validation_1-mlogloss:0.76546
[73]	validation_0-merror:0.07338

[133]	validation_0-merror:0.01980	validation_0-mlogloss:0.21782	validation_1-merror:0.28457	validation_1-mlogloss:0.75861
[134]	validation_0-merror:0.01972	validation_0-mlogloss:0.21639	validation_1-merror:0.28457	validation_1-mlogloss:0.75865
[135]	validation_0-merror:0.01933	validation_0-mlogloss:0.21496	validation_1-merror:0.28457	validation_1-mlogloss:0.75875
[136]	validation_0-merror:0.01887	validation_0-mlogloss:0.21378	validation_1-merror:0.28364	validation_1-mlogloss:0.75889
[137]	validation_0-merror:0.01848	validation_0-mlogloss:0.21266	validation_1-merror:0.28457	validation_1-mlogloss:0.75897
[138]	validation_0-merror:0.01840	validation_0-mlogloss:0.21113	validation_1-merror:0.28457	validation_1-mlogloss:0.75893
[139]	validation_0-merror:0.01825	validation_0-mlogloss:0.20960	validation_1-merror:0.28518	validation_1-mlogloss:0.75917
[140]	validation_0-merror:0.01778	validation_0-mlogloss:0.20780	validation_1-merror:0.28518	validation_1-mlogloss:0.75900
[141]	validation_0-merro

[201]	validation_0-merror:0.00804	validation_0-mlogloss:0.14530	validation_1-merror:0.28054	validation_1-mlogloss:0.77067
[202]	validation_0-merror:0.00796	validation_0-mlogloss:0.14433	validation_1-merror:0.27931	validation_1-mlogloss:0.77077
[203]	validation_0-merror:0.00796	validation_0-mlogloss:0.14347	validation_1-merror:0.27869	validation_1-mlogloss:0.77112
[204]	validation_0-merror:0.00804	validation_0-mlogloss:0.14263	validation_1-merror:0.27838	validation_1-mlogloss:0.77156
[205]	validation_0-merror:0.00796	validation_0-mlogloss:0.14191	validation_1-merror:0.27931	validation_1-mlogloss:0.77193
[206]	validation_0-merror:0.00796	validation_0-mlogloss:0.14119	validation_1-merror:0.27962	validation_1-mlogloss:0.77226
[207]	validation_0-merror:0.00789	validation_0-mlogloss:0.14048	validation_1-merror:0.27962	validation_1-mlogloss:0.77242
[208]	validation_0-merror:0.00781	validation_0-mlogloss:0.13971	validation_1-merror:0.28024	validation_1-mlogloss:0.77285
[209]	validation_0-merro

2022-03-24 21:32:14,775 [INFO] #####training fold4#####


(12932, 16) (3233, 16)
Parameters: { "randon_state" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-merror:0.26005	validation_0-mlogloss:1.31690	validation_1-merror:0.36251	validation_1-mlogloss:1.32987
[1]	validation_0-merror:0.23631	validation_0-mlogloss:1.25452	validation_1-merror:0.33220	validation_1-mlogloss:1.27955
[2]	validation_0-merror:0.22843	validation_0-mlogloss:1.20043	validation_1-merror:0.32632	validation_1-mlogloss:1.23585
[3]	validation_0-merror:0.22193	validation_0-mlogloss:1.15124	validation_1-merror:0.31921	validation_1-mlogloss:1.19788
[4]	validation_0-merror:0.21567	validation_0-mlogloss:1.10465	validation_1-merror:0.31364	validation_1-mlogloss:1.16163
[5]	validation_0-merror:0.21203	validation_0-mlogloss:1.06

[65]	validation_0-merror:0.08220	validation_0-mlogloss:0.37343	validation_1-merror:0.29199	validation_1-mlogloss:0.77647
[66]	validation_0-merror:0.08119	validation_0-mlogloss:0.37010	validation_1-merror:0.29230	validation_1-mlogloss:0.77618
[67]	validation_0-merror:0.07973	validation_0-mlogloss:0.36648	validation_1-merror:0.29013	validation_1-mlogloss:0.77550
[68]	validation_0-merror:0.07741	validation_0-mlogloss:0.36249	validation_1-merror:0.29044	validation_1-mlogloss:0.77444
[69]	validation_0-merror:0.07601	validation_0-mlogloss:0.35919	validation_1-merror:0.29106	validation_1-mlogloss:0.77407
[70]	validation_0-merror:0.07431	validation_0-mlogloss:0.35532	validation_1-merror:0.28982	validation_1-mlogloss:0.77366
[71]	validation_0-merror:0.07338	validation_0-mlogloss:0.35195	validation_1-merror:0.28982	validation_1-mlogloss:0.77305
[72]	validation_0-merror:0.07184	validation_0-mlogloss:0.34801	validation_1-merror:0.28951	validation_1-mlogloss:0.77274
[73]	validation_0-merror:0.07114

[133]	validation_0-merror:0.02134	validation_0-mlogloss:0.21529	validation_1-merror:0.29199	validation_1-mlogloss:0.76824
[134]	validation_0-merror:0.02080	validation_0-mlogloss:0.21396	validation_1-merror:0.29044	validation_1-mlogloss:0.76848
[135]	validation_0-merror:0.02057	validation_0-mlogloss:0.21255	validation_1-merror:0.28982	validation_1-mlogloss:0.76853
[136]	validation_0-merror:0.02011	validation_0-mlogloss:0.21100	validation_1-merror:0.28982	validation_1-mlogloss:0.76868
[137]	validation_0-merror:0.01941	validation_0-mlogloss:0.20965	validation_1-merror:0.29075	validation_1-mlogloss:0.76869
[138]	validation_0-merror:0.01879	validation_0-mlogloss:0.20808	validation_1-merror:0.29044	validation_1-mlogloss:0.76912
[139]	validation_0-merror:0.01825	validation_0-mlogloss:0.20658	validation_1-merror:0.28951	validation_1-mlogloss:0.76908
[140]	validation_0-merror:0.01833	validation_0-mlogloss:0.20492	validation_1-merror:0.28920	validation_1-mlogloss:0.76912
[141]	validation_0-merro

[201]	validation_0-merror:0.00820	validation_0-mlogloss:0.14283	validation_1-merror:0.29168	validation_1-mlogloss:0.77915
[202]	validation_0-merror:0.00812	validation_0-mlogloss:0.14205	validation_1-merror:0.29013	validation_1-mlogloss:0.77932
[203]	validation_0-merror:0.00796	validation_0-mlogloss:0.14146	validation_1-merror:0.29137	validation_1-mlogloss:0.77947
[204]	validation_0-merror:0.00804	validation_0-mlogloss:0.14086	validation_1-merror:0.29137	validation_1-mlogloss:0.77978
[205]	validation_0-merror:0.00789	validation_0-mlogloss:0.14002	validation_1-merror:0.29075	validation_1-mlogloss:0.77986
[206]	validation_0-merror:0.00796	validation_0-mlogloss:0.13938	validation_1-merror:0.29106	validation_1-mlogloss:0.77994
[207]	validation_0-merror:0.00773	validation_0-mlogloss:0.13858	validation_1-merror:0.29044	validation_1-mlogloss:0.78029
[208]	validation_0-merror:0.00773	validation_0-mlogloss:0.13794	validation_1-merror:0.29106	validation_1-mlogloss:0.78047
[209]	validation_0-merro

In [None]:
folds = StratifiedKFold(n_splits=5,shuffle=True, random_state=2022)
test_x = test[feat_cols]
test_y = np.zeros((len(test_x), 4),np.float16)
for i, (train_idx, valid_idx) in enumerate(folds.split(data[feat_cols], data["label"])):
    logger.info(f"#####training fold{i}#####")
    train_x, valid_x = data.loc[train_idx, feat_cols], data.loc[valid_idx, feat_cols]
    train_y, valid_y = data.loc[train_idx, "label"], data.loc[valid_idx, "label"]
    print(train_x.shape, valid_x.shape)
    clf = XGBClassifier(
        learning_rate=0.08, 
        n_estimators=200,
        gamma=0.001, 
        subsample=0.8, 
        colsample_bytree=0.8,
        reg_lambda=1, 
        reg_alpha=1,
        objective="multi:softmax",
        tree_method='gpu_hist' 
    )
    clf.fit(train_x, train_y, 
            eval_set=[(train_x, train_y), (valid_x, valid_y)],
#             eval_metric = Macro_f1
            eval_metric=["merror", "mlogloss"]
            , early_stopping_rounds=30, verbose=1)
    test_y += clf.predict_proba(test_x)
    

### 9.预测结果生成csv文件提交

In [60]:
pred_y = np.argmax(test_y, axis=1)

test_submit["label"] = pred_y
df_submit = test_submit[["sn", "fault_time", "label"]]


csv_file = "./submit/submit.csv"
df_submit.to_csv(csv_file, index=False)

# from zipfile import ZipFile
# zip_file = ZipFile(f"{csv_file}.zip", "w")
# zip_file.write(csv_file )
# zip_file.close()

logger.info("#####Well Done <_>#####")

2022-03-24 21:33:02,001 [INFO] #####Well Done <_>#####


In [None]:
res['label'] = clf.predict(x_test)
res = res.sort_values(['sn','fault_time'])
res = res.drop_duplicates(['sn','fault_time'],keep='last')      #去重，同一个sn出现多次诊断结果，则评测程序会选取第一条用于评分
res.to_csv('comp_a_result_1.csv', index=0)