In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt

In [2]:
# Build the list of model input columns from the training table's header.
# nrows=0 parses only the header row, so the column names are obtained
# without loading the whole CSV (the data itself is read in a later cell).
feature = pd.read_csv('./dataset/v1/label_pro.csv', nrows=0).columns.values.tolist()

# Drop the columns that are not used as model inputs. list.remove() is kept on
# purpose: it raises ValueError if an expected column is missing from the file.
for non_feature_column in ('sn', 'fault_time', 'label'):
    feature.remove(non_feature_column)

In [3]:
# Single seed shared by the KFold splitter and every CatBoostClassifier below,
# so that fold assignment and model training are repeatable between runs.
random_seed = 42

In [4]:
# Training table: the label column becomes the target vector and the columns
# listed in `feature` become the input matrix (both as NumPy arrays).
data = pd.read_csv('./dataset/v1/label_pro.csv')
y = data['label'].values
X = data[feature].values

# Rows to predict for the submission, restricted to the same feature columns
# (and therefore the same column order) as the training matrix.
submit_frame = pd.read_csv('./dataset/v1/submit_pro.csv')
X_test = submit_frame[feature].values

# Shuffled 10-fold splitter; seeded so the folds are identical on every run.
kf = KFold(shuffle=True, random_state=random_seed, n_splits=10)

In [5]:
# Per-label weights of the weighted F1 used as validation score (index = label).
# The binary class_weights passed to the three models below are consistent with
# these: [2/7, 5/7] = weight of labels {2,3} vs {0,1}, [2/5, 3/5] = label 1 vs 0.
weights = [3 / 7, 2 / 7, 1 / 7, 1 / 7]


def fit_binary_model(X_fit, y_fit, X_eval, y_eval, class_weights):
    """Fit one binary CatBoost model and return it.

    Parameters
    ----------
    X_fit, y_fit : training features and 0/1 targets.
    X_eval, y_eval : features and 0/1 targets passed as ``eval_set``.
    class_weights : two-element list, weight of target 0 and of target 1.

    Notes
    -----
    ``use_best_model=True`` lets CatBoost choose the final iteration from the
    F1 measured on ``eval_set``.
    """
    model = CatBoostClassifier(loss_function='Logloss', verbose=0, eval_metric='F1',
                               class_weights=class_weights, random_seed=random_seed,
                               learning_rate=0.1, use_best_model=True)
    model.fit(X_fit, y_fit, eval_set=(X_eval, y_eval), plot=False)
    return model


def predict_labels(coarse_model, model_01, model_23, X_in):
    """Predict the original labels 0-3 with the three-stage model cascade.

    ``coarse_model`` outputs 1 for "label is 0 or 1" and 0 for "label is 2 or 3".
    Rows routed to the first group are decoded by ``model_01`` (output 1 -> label 0,
    output 0 -> label 1); rows routed to the second group are decoded by
    ``model_23`` (output 1 -> label 2, output 0 -> label 3).

    Returns an array with the same shape and dtype as ``coarse_model.predict(X_in)``.
    """
    coarse = coarse_model.predict(X_in)
    # Both routing masks are computed before anything is overwritten, so the
    # result does not depend on the order of the two assignments below.
    in_01 = coarse == 1
    in_23 = coarse == 0

    labels = coarse.copy()
    # Skip a second-stage model when no row was routed to it instead of
    # calling predict() on an empty matrix.
    if in_01.any():
        fine = model_01.predict(X_in[in_01])
        labels[in_01] = np.where(fine == 1, 0, 1)
    if in_23.any():
        fine = model_23.predict(X_in[in_23])
        labels[in_23] = np.where(fine == 1, 2, 3)
    return labels


def weighted_f1(y_true, y_pred, label_weights):
    """Return sum_k label_weights[k] * F1_k, with F1_k the one-vs-rest F1 of label k.

    Precision, recall and F1 are defined as 0 when their denominator is 0.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    score = 0
    for label, weight in enumerate(label_weights):
        is_true = y_true == label
        is_pred = y_pred == label
        # Plain Python ints keep the arithmetic (and the printed score) in
        # built-in float precision and formatting.
        TP = int(np.sum(is_true & is_pred))
        FP = int(np.sum(~is_true & is_pred))
        FN = int(np.sum(is_true & ~is_pred))
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        F1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        score += weight * F1
    return score


# One entry per fold: that fold's label predictions for every row of X_test.
preds = []
for fold, (train_index, valid_index) in enumerate(kf.split(X, y)):
    print('第'+str(fold+1)+'折...')
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]

    # Row masks for the two label groups; assumes labels only take the values 0-3.
    train_01, valid_01 = np.isin(y_train, [0, 1]), np.isin(y_valid, [0, 1])
    train_23, valid_23 = np.isin(y_train, [2, 3]), np.isin(y_valid, [2, 3])

    # Stage 1: labels {0, 1} -> 1, labels {2, 3} -> 0.
    model1 = fit_binary_model(X_train, train_01.astype(int),
                              X_valid, valid_01.astype(int),
                              [2/7, 5/7])

    # Stage 2 (rows with label 0 or 1 only): label 0 -> 1, label 1 -> 0.
    model2 = fit_binary_model(X_train[train_01], (y_train[train_01] == 0).astype(int),
                              X_valid[valid_01], (y_valid[valid_01] == 0).astype(int),
                              [2/5, 3/5])

    # Stage 3 (rows with label 2 or 3 only): label 2 -> 1, label 3 -> 0.
    model3 = fit_binary_model(X_train[train_23], (y_train[train_23] == 2).astype(int),
                              X_valid[valid_23], (y_valid[valid_23] == 2).astype(int),
                              [1/2, 1/2])

    # Validation score on the held-out fold.
    # NOTE: the same fold is also the eval_set that use_best_model selects the
    # iteration on, so this score is optimistic rather than a clean estimate.
    macro_F1 = weighted_f1(y_valid, predict_labels(model1, model2, model3, X_valid), weights)
    print('valid score: '+str(macro_F1))

    # Test-set prediction of this fold's cascade; folds are combined in a later cell.
    pred = predict_labels(model1, model2, model3, X_test)
    preds.append(pred)

第1折...
valid score: 0.68094451205944
第2折...
valid score: 0.6864528260249338
第3折...
valid score: 0.678875394062602
第4折...
valid score: 0.6663009029019122
第5折...
valid score: 0.7131654991261889
第6折...
valid score: 0.6689767619156742
第7折...
valid score: 0.706317195398102
第8折...
valid score: 0.6822654153198716
第9折...
valid score: 0.6819596472752716
第10折...
valid score: 0.6807518622850676


In [6]:
# Majority vote over the folds: one row per test sample, one column per fold.
# `preds` itself is left untouched so this cell can be re-run safely; rebinding
# it to its own transpose made a second run vote per fold instead of per sample.
fold_votes = np.array(preds).T

# np.bincount counts how often each (non-negative integer) label was predicted;
# np.argmax returns the most frequent one and, on a tie, the smallest label.
preds1 = [np.argmax(np.bincount(sample_votes)) for sample_votes in fold_votes]

In [7]:
# Attach the voted labels to the submission template and save it.
# NOTE(review): this relies on the template rows being in the same order as the
# rows of submit_pro.csv that X_test was built from - confirm.
# NOTE(review): the displayed head shows an 'Unnamed: 0' column, so the template
# seems to carry a saved index that is written through to result_step.csv -
# confirm the expected submission format includes it.
submission_template = './dataset/src/preliminary_submit_dataset_b.csv'
result = pd.read_csv(submission_template).assign(label=np.array(preds1))
result.to_csv('./result_step.csv', index=False)
result.head()

Unnamed: 0,sn,fault_time,label
0,0015fe530ad4,2020-05-01 23:48:17,2
1,00380f1435b0,2020-07-28 07:51:13,3
2,0045a71d0221,2020-07-02 06:33:54,1
3,004d5a7954e7,2020-08-24 08:27:55,1
4,004d5a7954e7,2020-08-24 09:42:45,0
