In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt

In [2]:
# Build the list of model input columns: every column of the labelled table
# except the identifier, the timestamp and the target.
# Only the header row is parsed (nrows=0); the column names do not depend on
# the data rows, so there is no need to load the whole file just to list them.
non_feature_columns = ('sn', 'fault_time', 'label')
feature = pd.read_csv('./dataset/v1/label_pro.csv', nrows=0).columns.tolist()
for column in non_feature_columns:
    # list.remove raises ValueError if an expected column is missing, which
    # surfaces a schema change immediately instead of training on bad inputs.
    feature.remove(column)

In [3]:
# Single seed shared by the KFold shuffling and every CatBoost model below,
# so that fold assignment and training are repeatable between runs.
random_seed = 42

In [4]:
# Labelled table -> feature matrix X and target vector y (plain numpy arrays,
# columns ordered as in `feature`).
data = pd.read_csv('./dataset/v1/label_pro.csv')
X, y = data.loc[:, feature].values, data.loc[:, 'label'].values

# Rows to predict for the submission, restricted to the same feature columns
# in the same order as X.
test = (
    pd.read_csv('./dataset/v1/submit_pro.csv')
    .loc[:, feature]
    .values
)

# 10-fold cross-validation with shuffled, seed-controlled fold assignment.
kf = KFold(shuffle=True, n_splits=10, random_state=random_seed)

In [5]:
def weighted_macro_f1(y_true, y_pred, class_weights):
    """Return the class-weighted sum of one-vs-rest F1 scores.

    Class ``k`` is the positional index into ``class_weights``. For each class
    the F1 score is computed from its true positives, false positives and
    false negatives, multiplied by ``class_weights[k]`` and accumulated.
    Precision, recall and F1 fall back to 0 when their denominator is 0.

    Args:
        y_true: Ground-truth class ids (array-like); flattened to 1-D.
        y_pred: Predicted class ids (array-like); flattened to 1-D.
        class_weights: Sequence of per-class weights indexed by class id.

    Returns:
        The weighted score as a float.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    score = 0
    for cls, weight in enumerate(class_weights):
        is_true = y_true == cls
        is_pred = y_pred == cls
        TP = int(np.sum(is_true & is_pred))
        FP = int(np.sum(~is_true & is_pred))
        FN = int(np.sum(is_true & ~is_pred))
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        F1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        score += weight * F1
    return score


# Per-class weights, indexed by class id 0..3. The same list drives both the
# training loss (class_weights) and the validation metric, so it is defined
# once to keep the two from drifting apart.
weights = [3 / 7, 2 / 7, 1 / 7, 1 / 7]

# One (n_test_rows, n_classes) probability matrix per fold, averaged later.
preds = []
for fold, (train_index, valid_index) in enumerate(kf.split(X, y), start=1):
    print('第'+str(fold)+'折...')
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]

    # Train.
    # NOTE(review): the validation fold is also the eval_set used by
    # use_best_model to pick the final iteration, so the score printed below
    # is measured on data that influenced model selection and is likely
    # optimistic.
    model = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0:1', verbose=0,
                               class_weights=weights, random_seed=random_seed,
                               learning_rate=0.1, use_best_model=True)
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid), plot=False)

    # Validate.
    pred = model.predict(X_valid)
    # Flatten explicitly so the column is 1-D whatever shape predict() returns.
    overall_df = pd.DataFrame({
        'label_gt': y_valid,
        'label_pr': np.asarray(pred).ravel(),
    })
    macro_F1 = weighted_macro_f1(overall_df['label_gt'], overall_df['label_pr'], weights)
    print('valid score: '+str(macro_F1))

    # Predict class probabilities for the submission rows with this fold's model.
    pred = model.predict_proba(test)
    preds.append(pred)

第1折...
valid score: 0.647415154884297
第2折...
valid score: 0.6538197214663478
第3折...
valid score: 0.6716511078145372
第4折...
valid score: 0.631715525834514
第5折...
valid score: 0.7017892569150029
第6折...
valid score: 0.6581594307173915
第7折...
valid score: 0.6829919283976753
第8折...
valid score: 0.669712569680072
第9折...
valid score: 0.6465597769879424
第10折...
valid score: 0.6844547400897085


In [6]:
# Soft-voting ensemble: average the class-probability matrices collected from
# every fold, then keep the index of the most probable class for each row.
# NOTE: `preds` is rebound here from the list of per-fold matrices to the final
# label vector, so this cell cannot be re-run without re-running the training
# cell first.
fold_mean_proba = np.asarray(preds).mean(axis=0)
preds = fold_mean_proba.argmax(axis=1)

In [8]:
# Attach the ensembled labels to the submission template and write it out.
# NOTE(review): this relies on the template rows being in the same order as the
# rows of submit_pro.csv that produced `preds` - confirm. The template's
# unnamed first column is carried through to the output file as 'Unnamed: 0'.
result = (
    pd.read_csv('./dataset/src/preliminary_submit_dataset_b.csv')
    .assign(label=preds)
)
result.to_csv('./result_0.csv', index=False)
result.head()

Unnamed: 0,sn,fault_time,label
0,0015fe530ad4,2020-05-01 23:48:17,2
1,00380f1435b0,2020-07-28 07:51:13,3
2,0045a71d0221,2020-07-02 06:33:54,1
3,004d5a7954e7,2020-08-24 08:27:55,2
4,004d5a7954e7,2020-08-24 09:42:45,0
