In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt

In [2]:
# Names of the columns that are selected from the input CSVs as model inputs:
# 130 `Event_Id_<k>` columns (k = 0..129) followed by 12 named columns.
event_columns = [f'Event_Id_{k}' for k in range(130)]
extra_columns = [
    'num_log', 'span_delta', 'before',
    'min_delta', 'max_delta', 'mean_delta', 'std_delta',
    'min_delta_diff', 'max_delta_diff', 'mean_delta_diff', 'std_delta_diff',
    'server_model',
]
feature = event_columns + extra_columns

In [3]:
# Single seed passed both to the KFold shuffle and to CatBoostClassifier below.
random_seed = 42

In [4]:
# Labelled table: pull out the feature matrix, the target, and the two
# identifying columns (`sn`, `fault_time`) that are carried into the
# per-fold validation report.
data = pd.read_csv('./dataset/v1/label_pro.csv')
X, y, sn, fault_time = (
    data[columns].values
    for columns in (feature, 'label', 'sn', 'fault_time')
)

# Rows to predict for the submission, restricted to the same feature columns
# in the same order as the training matrix.
submit_frame = pd.read_csv('./dataset/v1/submit_pro.csv')
test = submit_frame[feature].values

# 10-fold splitter; shuffling is seeded so the fold assignment is repeatable.
kf = KFold(n_splits=10, shuffle=True, random_state=random_seed)

In [5]:
# Per-class weights, used both as CatBoost `class_weights` and as the weights
# of the per-class F1 terms in the validation score. Defined once so the two
# uses cannot drift apart.
CLASS_WEIGHTS = [3 / 7, 2 / 7, 1 / 7, 1 / 7]


def _weighted_f1(y_true, y_pred, weights):
    """Return the weighted sum of per-class F1 scores.

    Class ``c`` is the label equal to position ``c`` in ``weights``.
    Precision, recall and F1 are each taken as 0 when their denominator is 0.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted labels; both are flattened to 1-D.
    weights : sequence of float
        One weight per class, indexed by class label.

    Returns
    -------
    float
        ``sum(weights[c] * F1_c)`` over all classes.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    score = 0
    for cls, weight in enumerate(weights):
        is_true, is_pred = (y_true == cls), (y_pred == cls)
        tp = int(np.sum(is_true & is_pred))
        fp = int(np.sum(~is_true & is_pred))
        fn = int(np.sum(is_true & ~is_pred))
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        score += weight * f1
    return score


# `preds` collects one test-set probability array per fold;
# `valid_dfs` collects one validation report frame per fold.
preds, valid_dfs = [], []
for fold, (train_index, valid_index) in enumerate(kf.split(X, y)):
    print('第'+str(fold+1)+'折...')
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    sn_valid = sn[valid_index]
    fault_time_valid = fault_time[valid_index]

    # Training.
    # NOTE(review): the validation fold is also the eval_set that
    # `use_best_model=True` uses to pick the final iteration, so the score
    # printed below is measured on data that influenced model selection.
    model = CatBoostClassifier(loss_function='MultiClass', verbose=0,
                               class_weights=CLASS_WEIGHTS, random_seed=random_seed,
                               learning_rate=0.1, use_best_model=True)
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid), plot=False)

    # Validation.
    proba = model.predict_proba(X_valid)
    # CatBoost can return MultiClass predictions as an (n, 1) column;
    # flatten so the labels are a plain 1-D vector either way.
    pred = np.asarray(model.predict(X_valid)).ravel()

    overall_df = pd.DataFrame({'sn': sn_valid, 'fault_time': fault_time_valid})
    for cls in range(len(CLASS_WEIGHTS)):
        overall_df['proba_' + str(cls)] = proba[:, cls]
    overall_df['label_gt'] = y_valid
    overall_df['label_pr'] = pred
    # 1 where the prediction matches the ground truth, else 0 (vectorized).
    overall_df['true'] = (overall_df['label_gt'] == overall_df['label_pr']).astype(int)
    valid_dfs.append(overall_df)

    macro_F1 = _weighted_f1(y_valid, pred, CLASS_WEIGHTS)
    print('valid score: '+str(macro_F1))

    # Test: keep this fold's class probabilities for later averaging.
    test_proba = model.predict_proba(test)
    preds.append(test_proba)

第1折...
valid score: 0.6525008644346393
第2折...
valid score: 0.6455576001879212
第3折...
valid score: 0.6583470571774973
第4折...
valid score: 0.6364535073886086
第5折...
valid score: 0.7029956942725135
第6折...
valid score: 0.66423376934522
第7折...
valid score: 0.6690577849703133
第8折...
valid score: 0.6564941964887316
第9折...
valid score: 0.6654103555267306
第10折...
valid score: 0.671253516725976


In [6]:
# Stack the per-fold validation frames into one table and save it as CSV.
# The row index is not written out (index=False).
valid = pd.concat(valid_dfs)
valid.to_csv('./valid.csv', index=False)

In [7]:
# Average the per-fold class-probability arrays element-wise, then take the
# index of the largest averaged probability in each row as the predicted class.
# NOTE: this rebinds `preds` from the list of per-fold arrays to a vector of
# class indices, so re-running this cell alone (without re-running the
# cross-validation cell first) will not work.
fold_mean_proba = np.mean(preds, axis=0)
preds = fold_mean_proba.argmax(axis=1)

In [8]:
# Load the submission template, attach the predicted class as `label`,
# write the result file, and preview the first rows.
submission_template = pd.read_csv('./dataset/src/preliminary_submit_dataset_a.csv')
result = submission_template.assign(label=preds)
result.to_csv('./result.csv', index=False)
result.head()

Unnamed: 0,sn,fault_time,label
0,000d33b21436,2020-09-02 16:42:54,3
1,005c5a9218ba,2020-06-28 19:05:16,2
2,0079283bde6e,2020-04-26 21:32:44,3
3,007bdf23b62f,2020-06-16 18:40:39,2
4,00a577a8e54f,2020-04-07 07:16:55,2
