In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [2]:
# Seed passed to the KFold shuffle and to every CatBoostClassifier below.
random_seed = 42

In [3]:
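# NOTE: earlier hand-written feature lists, kept for reference only;
# `feature` and `feature_1` are derived from the CSV columns in the next cell.
# feature = []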
# for i in range(70):
#     feature.append('Event_Id_'+str(i))

# feature.extend(['num_log', 'span_delta', 'before', 
#                 'min_delta',  'max_delta', 'mean_delta', 'std_delta', 
#                 'min_delta_diff',  'max_delta_diff', 'mean_delta_diff', 'std_delta_diff',
#                 'server_model'])

# feature_1 = []
# for i in range(70):
#     for j in range(70):
#         feature_1.append('Event_Id_'+str(i)+'->'+str(j))

In [4]:
# Feature selection for the first feature table: remove the identifier and
# target columns, treat zeros as missing, and keep only the columns that still
# hold at least one value.
X = pd.read_csv('./dataset/v1/label_pro.csv')
X = X.drop(columns=['sn', 'fault_time', 'label'])
X = X.replace(0, np.nan).dropna(axis='columns', how='all')
feature = X.columns.tolist()

# Same selection for the second feature table.
X_1 = pd.read_csv('./dataset/v1/label_pro_1.csv')
X_1 = X_1.drop(columns=['sn', 'fault_time', 'label'])
X_1 = X_1.replace(0, np.nan).dropna(axis='columns', how='all')
feature_1 = X_1.columns.tolist()

In [5]:
# Training set
# Parse label_pro.csv once and take both the labels and the features from it.
train_df = pd.read_csv('./dataset/v1/label_pro.csv')
y = train_df['label'].values

# Disabled experiment: LDA projection of the first feature table.
# lda = LDA(n_components=3)
X_base = train_df.fillna(0)[feature].values
# X_base = lda.fit_transform(X_base, y)

# The second feature table is reduced to 3 LDA components.
# NOTE(review): lda_1 is fitted on all labelled rows before the K-fold split,
# so each validation fold has already influenced the projection; per-fold
# validation scores are likely optimistic. Fit inside the fold loop if an
# unbiased estimate is needed.
lda_1 = LDA(n_components=3)
X_1_raw = pd.read_csv('./dataset/v1/label_pro_1.csv').fillna(0)[feature_1].values
X_1 = lda_1.fit_transform(X_1_raw, y)

# Final training matrix: selected raw features followed by the LDA components.
X = np.hstack((X_base, X_1))


# Test set: same column selection and the already-fitted LDA projection.
test_base = pd.read_csv('./dataset/v1/submit_pro.csv').fillna(0)[feature].values
# test_base = lda.transform(test_base)

test_1_raw = pd.read_csv('./dataset/v1/submit_pro_1.csv').fillna(0)[feature_1].values
test_1 = lda_1.transform(test_1_raw)

test = np.hstack((test_base, test_1))

# 10-fold shuffled split, seeded so the folds are the same on every run.
kf = KFold(n_splits=10, shuffle=True, random_state=random_seed)

In [6]:
def weighted_macro_f1(y_true, y_pred, weights):
    """Return the weighted sum of per-class F1 scores.

    Class ``k`` is the integer label ``k`` and contributes ``weights[k] * F1_k``.
    Precision, recall and F1 fall back to 0 when their denominator is 0.

    Args:
        y_true: ground-truth labels (any array-like; flattened to 1-D).
        y_pred: predicted labels (any array-like; flattened to 1-D).
        weights: one weight per class, indexed by class label.

    Returns:
        The weighted F1 score as a number.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    score = 0
    for cls, weight in enumerate(weights):
        is_true = y_true == cls
        is_pred = y_pred == cls
        tp = np.count_nonzero(is_true & is_pred)
        fp = np.count_nonzero(~is_true & is_pred)
        fn = np.count_nonzero(is_true & ~is_pred)
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        score += weight * f1
    return score


# Per-class weights, used both as CatBoost class weights and in the validation
# metric; defined once so the two can never drift apart.
weights = [3 / 7, 2 / 7, 1 / 7, 1 / 7]

# One array of test-set class probabilities per fold.
preds = []
for fold, (train_index, valid_index) in enumerate(kf.split(X, y), start=1):
    print('第' + str(fold) + '折...')
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]

    # Train
    # NOTE(review): the validation fold is also the eval_set that
    # use_best_model uses to pick the final iteration, so the score printed
    # below is measured on data that influenced the model and is optimistic.
    model = CatBoostClassifier(loss_function='MultiClass', task_type="GPU", devices='0:1', verbose=0,
                               class_weights=weights, random_seed=random_seed,
                               learning_rate=0.1, use_best_model=True)
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid), plot=False)

    # Validate
    # Class predictions may come back as an (n, 1) column; flatten before scoring.
    pred = np.asarray(model.predict(X_valid)).ravel()
    macro_F1 = weighted_macro_f1(y_valid, pred, weights)
    print('valid score: ' + str(macro_F1))

    # Predict class probabilities on the test set with this fold's model
    preds.append(model.predict_proba(test))

第1折...
valid score: 0.6756101662985209
第2折...
valid score: 0.676317695381624
第3折...
valid score: 0.6695560371159327
第4折...
valid score: 0.6364936144879751
第5折...
valid score: 0.7050037050461428
第6折...
valid score: 0.6581462220287438
第7折...
valid score: 0.6951223563979062
第8折...
valid score: 0.6759794044162535
第9折...
valid score: 0.6739679151851479
第10折...
valid score: 0.6818564453834968


In [7]:
# import matplotlib.pyplot as plt 
# fea_ = model.feature_importances_
# fea_name = model.feature_names_
# plt.figure(figsize=(10, 100))
# plt.barh(fea_name,fea_,height =0.5)

In [8]:
# Soft-voting ensemble: average the per-fold class probabilities, then take the
# most probable class for each test row.
# `preds` is rebound from per-fold probabilities to final labels, so the
# reduction only runs while the fold axis is still present; re-running this
# cell is then a no-op instead of an axis error.
_fold_probas = np.asarray(preds)
if _fold_probas.ndim == 3:  # presumably (n_folds, n_samples, n_classes)
    preds = np.argmax(_fold_probas.mean(axis=0), axis=1)
elif _fold_probas.ndim != 1:
    raise ValueError(
        'unexpected shape for preds: ' + str(_fold_probas.shape)
        + '; re-run the cross-validation cell first'
    )

In [9]:
# Attach the ensembled labels to the submission template and write the result.
# NOTE(review): assumes the rows of `preds` are in the same order as this
# template — confirm against how submit_pro.csv was generated.
# NOTE(review): the recorded output shows an 'Unnamed: 0' column, i.e. the
# template's unnamed first column is written back out under that name —
# confirm the submission format accepts it.
submission_template = pd.read_csv('./dataset/src/preliminary_submit_dataset_a.csv')
result = submission_template.assign(label=preds)
result.to_csv('./result.csv', index=False)
result.head()

Unnamed: 0,sn,fault_time,label
0,000d33b21436,2020-09-02 16:42:54,3
1,005c5a9218ba,2020-06-28 19:05:16,2
2,0079283bde6e,2020-04-26 21:32:44,3
3,007bdf23b62f,2020-06-16 18:40:39,2
4,00a577a8e54f,2020-04-07 07:16:55,2
