@Copyright tianchi 2022
https://tianchi.aliyun.com/competition/entrance/531947/introduction?spm=5176.12281957.1004.3.5b713eafy3IvEA

In [137]:
import pandas as pd
import numpy as np
import json
import math
import tensorflow as tf
import os
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import Sequence,to_categorical
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from tensorflow.keras.layers import Reshape,Flatten
from tqdm import tqdm

In [2]:
model_dir = "./model/rnn_model/best_model.h5"
submit_dir = "./data/submit/"

In [3]:
df = pd.read_pickle(r'./feature_data/template_fea/cpu_diag_comp_sel_log_all_feature1h_3_sum.pkl') # 读取之前构造好的特征数据

In [4]:
fea_cols = [i for i in list(df.columns) if i not in ['sn', 'collect_time_gap']]

In [162]:
#制作一个迭代器，迭代器里面的每个元素是一个bt=n的step
#https://blog.csdn.net/weixin_37737254/article/details/103884255
class DataGenerator(Sequence):
    def __init__(self, df, batch_size):
        self.data = df
        self.num = df.shape[0]
        self.batch_size = batch_size
        self.fea = fea_cols
        
    def __len__(self):
        return math.ceil(self.num / self.batch_size)

    def __getitem__(self,idx):
        batch_data = self.data.iloc[idx*self.batch_size:(idx+1)*self.batch_size]
        input_1 = np.array(batch_data[self.fea])
        output = np.array(to_categorical(batch_data.label,4))

        return input_1, output
    

In [172]:
def build_model(feature_num):
    input_1 = tf.keras.Input(shape=(feature_num))
    output_1 = tf.keras.layers.Dense(256, activation="relu")(input_1)
#     output = tf.keras.layers.Dense(128, activation="relu")(output_1)
#     dp = tf.keras.layers.Dropout(0.15)(output)
    output = tf.keras.layers.Dense(64, activation="relu")(output_1)
    output = tf.keras.layers.Dense(4, activation="softmax")(output)

    model = tf.keras.Model(inputs=input_1, outputs=output)

    return model

# train

In [173]:
df_train_label = pd.read_csv('./data/preliminary_train_label_dataset.csv')
df_train_label_s = pd.read_csv('./data/preliminary_train_label_dataset_s.csv')
df_train_label = pd.concat([df_train_label, df_train_label_s])  #12261+4410
df_train_label = df_train_label.drop_duplicates(['sn','fault_time','label'])

In [174]:
train = pd.merge(df[df.sn.isin(df_train_label.sn)],df_train_label, on='sn', how='left')
train = train.drop(['sn', 'collect_time_gap', 'fault_time'], axis=1)
train = shuffle(train)
# y = df_data_train['label']
# x = df_data_train.drop(['sn','collect_time_gap','fault_time','label'],axis=1)

In [175]:
test_df = pd.read_csv('./data/preliminary_submit_dataset_a.csv',index_col=0).reset_index()
print(len(test_df))
test = pd.merge(df[df.sn.isin(test_df.sn)],test_df, on='sn', how='left')
print(len(test))
test = test.drop(['sn', 'collect_time_gap', 'fault_time'], axis=1)
test['label'] = -1   #为了能让DataGenerator读入

3011
4292


In [176]:
len(test)

4292

In [177]:
# new_train = DataGenerator(train[:40000],100)
# next(iter(new_train))

In [178]:
def Macro_f1(y_true,y_pred):
    every_class_f1 = f1_score(y_true, y_pred, average=None)  #list  包含每类的f1score
    macro_f1 =  3/7*every_class_f1[0]+2/7*every_class_f1[1]+1/7*every_class_f1[2]+1/7*every_class_f1[3]
    return 'maroc_f1',macro_f1

In [180]:
new_test = DataGenerator(test,100)

new_train = DataGenerator(train[:40000],100)
new_val = DataGenerator(train[40000:],100)
        
model = build_model(feature_num=206)
model.summary()

model.compile(optimizer=tf.keras.optimizers.Adam(0.001),loss='categorical_crossentropy',metrics='accuracy')
early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_acc", patience=3, restore_best_weights=True)
lr_reduce = tf.keras.callbacks.ReduceLROnPlateau(patience=2,monitor='val_acc', factor=0.1)
best_checkpoint = tf.keras.callbacks.ModelCheckpoint(model_dir,save_best_only=True, save_weights_only=False,verbose=1)
#model.fit(iter(train_bt),steps_per_epoch=len(train_bt),validation_data=iter(val_bt),validation_steps=len(val_bt),epochs=20,callbacks=[best_checkpoint,early_stopping,lr_reduce])
#model.save('./data/model/model_fold{}.h5'.format(kf))
model.fit_generator(generator=new_train,
                    steps_per_epoch=len(new_train),
                    epochs=20,
                    verbose=1,
                    validation_data=new_val,
                    validation_steps=len(new_val),
#                     use_multiprocessing=False,
#                     workers=1,
                    callbacks=[best_checkpoint,early_stopping,lr_reduce])
    
#重新加载当前折最优的模型
best_model = tf.keras.models.load_model(model_dir)
#测试集推理
test_pred =  best_model.predict(new_test, steps=len(new_test))

#验证集推理
val_pred =  best_model.predict(new_val, steps=len(new_val))
val_pred = [np.argmax(i) for i in val_pred]
#计算整体验证集得分
y_true = train.iloc[40000:]['label']
score = Macro_f1(y_true,val_pred)
print('得分：{}'.format(score))

Model: "model_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_23 (InputLayer)        [(None, 206)]             0         
_________________________________________________________________
dense_31 (Dense)             (None, 256)               52992     
_________________________________________________________________
dense_32 (Dense)             (None, 64)                16448     
_________________________________________________________________
dense_33 (Dense)             (None, 4)                 260       
Total params: 69,700
Trainable params: 69,700
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.96447, saving model to ./model/rnn_model\best_model.h5
Epoch 2/20

Epoch 00002: val_loss improved from 0.96447 to 0.76860, saving model to ./model/rnn_model\best_model.h5
Epoch 3/20

Epoch 00003:

Epoch 15/20

Epoch 00015: val_loss did not improve from 0.64402
Epoch 16/20

Epoch 00016: val_loss did not improve from 0.64402
Epoch 17/20

Epoch 00017: val_loss did not improve from 0.64402
Epoch 18/20

Epoch 00018: val_loss did not improve from 0.64402
Epoch 19/20

Epoch 00019: val_loss did not improve from 0.64402
Epoch 20/20

Epoch 00020: val_loss did not improve from 0.64402
得分：('maroc_f1', 0.5273656902530937)


#### 多折交叉验证

In [30]:
oof_pred = np.zeros((len(train),4))
test_preds = np.zeros((len(test),4))

y = train['label']
x = train.drop('label',axis=1)
kfold = StratifiedKFold(random_state=2021,n_splits=5,shuffle=True)

new_test = DataGenerator(test,256)
for kf,(train_idx,val_idx) in enumerate(kfold.split(x,y)):
    print('#####第{}折####'.format(kf+1))
    print('train len: {}'.format(len(train_idx)))
    print('val len: {}'.format(len(val_idx)))
    train_x = x.iloc[train_idx]
    train_y = y.iloc[train_idx]
    train_df = pd.concat([train_x,train_y],axis=1)
    val_x = x.iloc[val_idx]
    val_y = y.iloc[val_idx]
    val_df = pd.concat([val_x,val_y],axis=1)

    train_bt = DataGenerator(train_df,256)
    val_bt = DataGenerator(val_df,256)
    
    #model = tf.keras.models.load_model('./data/model/best_model.h5')
    model = build_model(feature_num=206)
    model.compile(optimizer=tf.keras.optimizers.Adam(0.0009),loss='categorical_crossentropy',metrics='accuracy')
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_acc", patience=3, restore_best_weights=True)
    lr_reduce = tf.keras.callbacks.ReduceLROnPlateau(patience=2,monitor='val_acc', factor=0.1)
    best_checkpoint = tf.keras.callbacks.ModelCheckpoint(model_dir.format(kf+1),save_best_only=True, save_weights_only=False,verbose=1)
    #model.fit(iter(train_bt),steps_per_epoch=len(train_bt),validation_data=iter(val_bt),validation_steps=len(val_bt),epochs=20,callbacks=[best_checkpoint,early_stopping,lr_reduce])
    #model.save('./data/model/model_fold{}.h5'.format(kf))
    model.fit_generator(generator=train_bt,
                    steps_per_epoch=len(train_bt),
                    epochs=10,
                    verbose=1,
                    validation_data=val_bt,
                    validation_steps=len(val_bt),
#                     use_multiprocessing=False,
#                     workers=1,
                    callbacks=[best_checkpoint,early_stopping,lr_reduce])
 
    #重新加载当前折最优的模型
    best_model = tf.keras.models.load_model(model_dir.format(kf+1))
    
    #验证集推理
    val_pred =  best_model.predict(val_bt, steps=len(val_bt))
    oof_pred[val_idx] = val_pred
    
    #测试集推理
    test_pred =  best_model.predict(new_test, steps=len(new_test))
    test_preds += test_pred
    
    
#计算整体验证集得分
y_true = train.label
oof_pred = [np.argmax(i) for i in oof_pred]
score = Macro_f1(y_true,oof_pred)
print('多折验证集总体得分：{}'.format(score))

#####第1折####
train len: 35288
val len: 8822
Note that input tensors are instantiated via `tensor = tf.keras.Input(shape)`.
The tensor that caused the issue was: tf.expand_dims_1/ExpandDims:0


ValueError: Graph disconnected: cannot obtain value for tensor KerasTensor(type_spec=TensorSpec(shape=(None, 206), dtype=tf.float32, name='input_4'), name='input_4', description="created by layer 'input_4'") at layer "tf.expand_dims_1". The following previous layers were accessed without issue: []

# predict

In [18]:
testpred = [np.argmax(i) for i in test_preds]

In [37]:
len(testpred)

4292

In [38]:
test_df = pd.read_csv('./data/preliminary_submit_dataset_a.csv',index_col=0).reset_index()
test_df = pd.merge(df[df.sn.isin(test_df.sn)],test_df, on='sn', how='left')
res = test_df[['sn','fault_time']]
len(res)

4292

In [39]:
res['label'] = testpred
res = res.sort_values(['sn','fault_time'])
res = res.drop_duplicates(['sn','fault_time'],keep='last')
res.to_csv('./submit/submit_{}.csv'.format(round(score,4)), index=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  res['label'] = testpred
