In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import keras
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping,ReduceLROnPlateau
from keras.utils import to_categorical
from tensorflow.keras.layers import Conv1D,InputLayer,Dense,Dropout,Flatten,MaxPooling1D,BatchNormalization,MaxPool1D,Add,Input,concatenate,Activation
from tensorflow.keras.models import Sequential,Model
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings("ignore")

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [4]:
# Read the raw training CSV and the testA CSV, then preview the first training rows.
train = pd.read_csv('./data/train.csv',encoding='utf-8')
test=pd.read_csv('./data/testA.csv',encoding='utf-8')
train.head()

Unnamed: 0,id,heartbeat_signals,label
0,0,"0.9912297987616655,0.9435330436439665,0.764677...",0.0
1,1,"0.9714822034884503,0.9289687459588268,0.572932...",0.0
2,2,"1.0,0.9591487564065292,0.7013782792997189,0.23...",2.0
3,3,"0.9757952826275774,0.9340884687738161,0.659636...",0.0
4,4,"0.0,0.055816398940721094,0.26129357194994196,0...",2.0


In [3]:
# Preview the last rows of the training frame.
# NOTE(review): this cell's execution count (3) precedes the load cell's (4), so the
# recorded output may come from a different `train` than the one loaded above — re-run in order.
train.tail()

Unnamed: 0,id,heartbeat_signals,label
119995,119995,"1.0,0.8330283177934747,0.6340472606311671,0.63...",0.0
119996,119996,"1.0,0.8259705825857048,0.4521053488322387,0.08...",0.0
119997,119997,"0.951744840752379,0.9162611283848351,0.6675251...",2.0
119998,119998,"0.9276692903808186,0.6771898159607004,0.242906...",0.0
119999,119999,"0.6653212231837624,0.527064114047737,0.5166625...",0.0


In [5]:
# Downcast column dtypes to reduce the DataFrame's memory footprint.
def reduce_mem_usage(df):
    """Cast every column of ``df`` to a narrower dtype and report the memory saved.

    * ``object`` columns become ``category``.
    * Signed / unsigned integer columns are cast to the smallest integer type
      of the same signedness whose range contains the column's min and max.
    * Float columns are cast to ``float16`` or ``float32`` when the column's
      min and max lie strictly inside that type's finite range, otherwise to
      ``float64``.  Only the range is checked, so precision may be lost.
    * Every other dtype (bool, datetime, categorical, extension dtypes, ...)
      is left untouched.

    Args:
        df: pandas DataFrame. Its columns are reassigned in place.

    Returns:
        The same DataFrame object, after conversion.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ints = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are downcast.  Anything else
        # is skipped so that bool/unsigned columns are not turned into floats
        # and the function can safely be re-applied to its own output.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None
        if kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if kind == 'f':
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                # Strict bounds; an all-NaN column fails both tests and falls
                # through to float64.
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                df[col] = df[col].astype(np.float64)
        else:
            candidates = signed_ints if kind == 'i' else unsigned_ints
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-byte starting size.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [6]:
# Basic preprocessing: expand each comma-separated signal string into one float
# column per sample point, keeping the first column (and, for train, the last).
train_list = [
    [row[0], *map(float, row[1].split(',')), row[2]]
    for row in train.values
]
train = pd.DataFrame(np.array(train_list))
# Every column between the first and the last one holds a signal value.
train.columns = (
    ['id']
    + ['s_{}'.format(k) for k in range(len(train_list[0]) - 2)]
    + ['label']
)
train = reduce_mem_usage(train)

# Same expansion for the test rows, which carry no trailing label value.
test_list = [
    [row[0], *map(float, row[1].split(','))]
    for row in test.values
]
test = pd.DataFrame(np.array(test_list))
test.columns = ['id'] + ['s_{}'.format(k) for k in range(len(test_list[0]) - 1)]
test = reduce_mem_usage(test)

Memory usage of dataframe is 157.93 MB
Memory usage after optimization is: 39.67 MB
Decreased by 74.9%
Memory usage of dataframe is 31.43 MB
Memory usage after optimization is: 7.90 MB
Decreased by 74.9%


In [7]:
# Separate the target column from the signal columns; `id` is dropped from both frames.
Y = train['label']
X = train.drop(columns=['id', 'label'])
test = test.drop(columns=['id'])
print(X.shape, test.shape)

(100000, 205) (20000, 205)


In [8]:
def abs_sum(y_pre,y_tru):
    """Return the sum of absolute element-wise differences between two arrays.

    Args:
        y_pre: array-like of predictions.
        y_tru: array-like of targets, broadcast-compatible with ``y_pre``.

    Returns:
        A NumPy scalar equal to ``sum(|y_pre - y_tru|)`` over all elements.
        Inputs of any dimensionality, including empty ones, are accepted.
    """
    y_pre = np.asarray(y_pre)
    y_tru = np.asarray(y_tru)
    # A single vectorised reduction replaces the nested builtin sum(), which
    # only worked for non-empty 2-D input and looped in Python.
    return np.abs(y_pre - y_tru).sum()

In [9]:
# Add a trailing channel axis so each sample has shape (205, 1).
X = np.array(X).reshape(-1,205,1)   # the model requires 3-D input
test = np.array(test).reshape(-1,205,1)

In [10]:
# One-hot encode the class labels for the softmax output / categorical_crossentropy loss.
# NOTE(review): the cell overwrites Y, so it is presumably meant to run once per session — confirm.
Y = to_categorical(Y)

In [11]:
def buildmodel3():  # author's note: scored 172 online; fine-tuning from that 172 model scored 168
    """Build the multi-branch 1-D CNN classifier.

    Three parallel Conv1D branches with kernel sizes 32, 16 and 48 read the
    same (205, 1) input and are summed element-wise. The sum is
    batch-normalised, passed through two more Conv1D layers, max-pooled,
    regularised with dropout, flattened and classified by two dense layers
    followed by a 4-way softmax.

    Returns:
        An uncompiled Keras ``Model`` mapping (205, 1) inputs to 4 class
        probabilities.
    """
    inputs=Input(shape=(205,1))
    # Parallel branches with different receptive fields over the same input.
    x1 = Conv1D(32,kernel_size=32, strides=1, padding='SAME',activation='relu')(inputs)
    x2 = Conv1D(32,kernel_size=16, strides=1, padding='SAME',activation='relu')(inputs)
    x3 = Conv1D(32,kernel_size=48, strides=1, padding='SAME',activation='relu')(inputs)
    x4 = Add()([x1,x2,x3])
    x = BatchNormalization()(x4)

    # Feed the normalised tensor forward. Previously this layer consumed x4,
    # which left BatchNormalization disconnected from the model output.
    # NOTE(review): scores recorded for earlier runs were obtained with the
    # old wiring; retrain before comparing.
    x = Conv1D(64,kernel_size=16, strides=1, padding='SAME',activation='relu')(x)
    x = Conv1D(128,kernel_size=8, strides=1, padding='SAME',activation='relu')(x)

    x = MaxPool1D(pool_size=4, strides=2, padding='SAME')(x)
    x = Dropout(rate=0.25)(x)
    x = Flatten()(x)
    x = Dense(512,activation='relu')(x)
    x = Dense(1024,activation='relu')(x)
    output = Dense(4,activation='softmax')(x)
    model = Model(inputs=inputs,outputs=output)

    return model


In [12]:
# Instantiate the network and print its layer table.
model = buildmodel3()
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 205, 1)]     0                                            
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 205, 32)      1056        input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 205, 32)      544         input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 205, 32)      1568        input_1[0][0]                    
______________________________________________________________________________________________

In [31]:
def _one_hot_argmax(probs):
    """Return a float array with a 1 at each row's arg-max column and 0 elsewhere."""
    probs = np.asarray(probs)
    one_hot = np.zeros(probs.shape)
    one_hot[np.arange(probs.shape[0]), probs.argmax(axis=1)] = 1
    return one_hot


def cv_model(train_x, train_y, test, folds=10, seeds=(600,),
             pretrained_path='/root/model/cnnbestnew_425_1_2000.h5',
             checkpoint_pattern='/root/model/cnnbestnew_429_{}_{}.h5'):
    """Fine-tune a saved model with K-fold cross-validation and score every fold.

    For each seed and each fold the pretrained model is loaded, recompiled and
    trained on the fold's training split. The checkpoint with the best
    ``val_acc`` is reloaded and its one-hot validation predictions are scored
    with ``abs_sum``. Each score and the running mean are printed.

    Args:
        train_x: training inputs, indexable by integer index arrays
            (presumably a NumPy array — confirm against the caller).
        train_y: one-hot training targets, indexed the same way.
        test: inputs to predict with every fold's best model, or ``None`` to
            skip test prediction.
        folds: number of cross-validation splits.
        seeds: iterable of shuffle seeds; one full K-fold pass runs per seed.
        pretrained_path: model file every fold starts from.
        checkpoint_pattern: format string for the per-fold checkpoint file,
            filled with ``(fold_number, seed)``.

    Returns:
        A list with one one-hot prediction array for ``test`` per trained
        fold (empty when ``test`` is ``None``).
    """
    tests = []
    cv_scores = []

    for seed in seeds:
        kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
        for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
            print('************************************ 随机种子{}***第{}折 ************************************'.format(seed,str(i+1)))
            trn_x, trn_y = train_x[train_index], train_y[train_index]
            val_x, val_y = train_x[valid_index], train_y[valid_index]

            # Start every fold from the same pretrained model.
            model = tf.keras.models.load_model(pretrained_path)
            model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.0009),
                          loss='categorical_crossentropy',
                          metrics=['acc'])

            best_weights_filepath = checkpoint_pattern.format(i+1, seed)

            earlystop = tf.keras.callbacks.EarlyStopping(monitor='val_acc',patience=4,verbose=2,mode='max')
            reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=3, mode='min',verbose=2,factor=0.5)
            saveBestModel = tf.keras.callbacks.ModelCheckpoint(best_weights_filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max',save_weights_only=False)

            my_callbacks = [earlystop,reduce_lr,saveBestModel]
            model.fit(trn_x,trn_y,epochs=30,batch_size=256,validation_data=(val_x,val_y),shuffle=True,callbacks=my_callbacks)

            # Reload the saved best checkpoint; otherwise inference would use
            # the weights from the last epoch.
            model = tf.keras.models.load_model(best_weights_filepath)

            # Size the one-hot buffer from the predictions themselves, so any
            # fold size works (it was hard-coded to 10000 rows).
            val_temp = _one_hot_argmax(model.predict(val_x))
            if test is not None:
                tests.append(_one_hot_argmax(model.predict(test)))

            score = abs_sum(val_y, val_temp)
            cv_scores.append(score)
            print(score)
        print("score_mean:{}".format(np.mean(cv_scores)))

    return tests

In [32]:
# Run the cross-validated fine-tuning on the full training arrays.
result = cv_model(X,Y,test)

************************************ 随机种子600***第1折 ************************************
Train on 90000 samples, validate on 10000 samples
Epoch 1/30
Epoch 00001: val_acc improved from -inf to 0.99720, saving model to /root/model/cnnbestnew_429_1_600.h5
Epoch 2/30
Epoch 00002: val_acc did not improve from 0.99720
Epoch 3/30
Epoch 00003: val_acc did not improve from 0.99720
Epoch 4/30
Epoch 00004: ReduceLROnPlateau reducing learning rate to 0.00044999999227002263.

Epoch 00004: val_acc did not improve from 0.99720
Epoch 5/30
Epoch 00005: val_acc improved from 0.99720 to 0.99840, saving model to /root/model/cnnbestnew_429_1_600.h5
Epoch 6/30
Epoch 00006: val_acc did not improve from 0.99840
Epoch 7/30
Epoch 00007: val_acc did not improve from 0.99840
Epoch 8/30
Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.00022499999613501132.

Epoch 00008: val_acc did not improve from 0.99840
Epoch 9/30
Epoch 00009: val_acc did not improve from 0.99840
Epoch 00009: early stopping
32.0
*****

In [34]:
# Average the predictions of the saved fold models over the test set.
# NOTE(review): cv_model in this notebook writes checkpoints under /root/model
# with seed 600, while this cell reads seed-2000 files from ./model — confirm
# which checkpoints are intended.
preds = 0
n_models = 0
for i in range(10):
    for seed in [2000]:
        # The path previously started with a full-width '。' instead of '.'.
        model = tf.keras.models.load_model('./model/cnnbestnew_429_{}_{}.h5'.format(str(i+1),seed))
        pred = model.predict(test)
        preds+=pred
        n_models += 1
# Divide by the number of models actually loaded rather than a hard-coded 10.
temp = preds/n_models   # author's note: fine-tuned from the model3 run that scored 172

In [None]:
# Alternative ensembling path: average the per-fold arrays held in `result`.
# NOTE(review): this overwrites `temp` from the previous cell and needs `result`
# to be an iterable of per-fold predictions — confirm before running.
tmp = sum(result) / 10
temp = pd.DataFrame(tmp)

In [35]:
# Sanity-check the shape of the averaged predictions.
print(np.shape(temp))

(20000, 4)


In [36]:
# Turn the averaged class scores into one-hot predictions by marking each row's
# arg-max column. (Author's note: this works better than threshold-based splitting.)
temp = pd.DataFrame(temp)
# Vectorised, and sized from `temp` instead of a hard-coded (20000, 4) buffer.
one_hot = np.zeros(temp.shape)
one_hot[np.arange(len(temp)), temp.values.argmax(axis=1)] = 1
temp1 = pd.DataFrame(one_hot)
temp1.head()

Unnamed: 0,0,1,2,3
0,1.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0


In [37]:
# Load the sample submission and fill its four label columns from the one-hot predictions.
results = pd.read_csv('./submit/sample_submit.csv')
for class_id in range(4):
    results['label_{}'.format(class_id)] = temp1[class_id]

In [38]:
# Disabled alternatives kept for reference: rescaling `result` and thresholding
# the label columns at 0.5.
#result = result/5
# for i in range(len(result)):
#     print(result.iloc[i,:])

# for col in ['label_0','label_1','label_2','label_3']:
#     results[col] = results[col].apply(lambda x:1 if x>0.5 else 0)
    
# Write the submission file without the index column.
results.to_csv('./submit/cnnbest_4_29_1.csv',index=False)

In [None]:
#CNN_4_12_2.CSV  线下十折，每折样本10000，得分156，线上测试样本20000，得分173
#cnnbest_   cnn_4_13_1.csv只在第一折初始化模型   线上191
#cnnbest1_ cnn_4_13_2.csv每折都初始化模型  线下144.9  线上174
#cnnbest2_{seed} cnn_4_14_1.csv  三组随机数，十折  线下137  以上前三层卷积核尺寸 32,16,16  线上174
#cnnbest2_{seed} cnn_4_14_2.csv  三组随机数，十折  线下130  以上前三层卷积核尺寸 64,32,16   线上177
#cnnbest_4_22_1.csv   ... ... ..             线下129.9 前三层32,16,16 
#4231  seed700 十折 前三层的卷积核64,128,256 buildModel1   202
#4232  seed700 十折  前两层相加输入第三层  线下129  buildmodel  //199,可能用的仍是buildModel1
#4233 seed2000 十折  前三层相加   线下125 buildmodel2  cnnbestnew_422_{}_{}.h5
#4241 seed2000 十折  前三层相加   线下117 buildmodel2  cnnbestnew_423_{}_{}.h5  监控valacc  线上188  
#4242  和上述4241的模型一样，在最后数据处理部分，改用取最大值为正确类别
#4252         前两层相加  卷积核32和64     192
#4253         前两层相加  卷积核32和48        cnnbestnew_425_{}_{}.h5 
#4261         model3前三层相加  卷积核32和48，16 + 归一化    cnnbestnew_425_{}_{}.h5  线下<118  4_26_1  线上172  **********best
#4262         model4    cnnbestnew_426_{}_{}.h5  线下112.4   线上188   X
#4263        model3   去掉了归一化层  线上192
#4264         model3  没去掉归一化   用之前的模型425第一折作为预训练模型继续训练 随机种子2000 cnnbestnew_427_{}_{}.h5 线下42 线上168
#4271        尝试用监控valacc去再跑一遍原本的那套cnn,换全连接层的激活函数 cnnbestnew_428_{}_{}.h5  线上198
#4272       model3  没去掉归一化   用之前的模型作为预训练模型继续训练（425即线上172的模型）  cnnbestnew_428_{}_{}.h5  线下37.5  168  加入了测试集训练
#4273       model3  没去掉归一化   用之前的模型作为预训练模型继续训练（427模型即线上168的作为预训练）  cnnbestnew_429_{}_{}.h5  线下13 178  初始学习率0.0009 加入了测试集训练
#4291      model3  没去掉归一化   用之前的模型（425线上得分172的第一折）作为预训练模型继续训练 学习率0.0005  cnnbestnew_429_{}_{}.h5  三组随机数种子交叉验证 线下25  线上194

In [None]:
#新思路，构造新的特征加到cnn，如均值，方差