In [2]:
# Mount Google Drive at /content/gdrive so the notebook can read and write files stored there.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import tensorflow as tf
import numpy as np
from tensorflow import keras
import pandas as pd
import os
from sklearn.model_selection import StratifiedKFold
import gc

# Seed passed as random_state to the k-fold splitter, so fold assignments are reproducible.
kfcv_seed = 1998
# Splitter class instantiated by kfcv_evaluate and kfcv_fit.
kfold_func = StratifiedKFold
# Names of the augmentation methods applied in kfcv_fit; replaced through set_data_enhance().
data_enhance_method = []
# Number of folds, which is also the number of per-fold model files (part_<i>.h5).
k = 5

# Prefix prepended to every dataset/model path built in this notebook.
data_path = ''

def set_data_enhance(val):
    """Select the augmentation method(s) applied during k-fold training.

    A list is stored as-is in the module-level ``data_enhance_method``;
    any other value is wrapped in a one-element list first.
    """
    global data_enhance_method
    data_enhance_method = val if isinstance(val, list) else [val]

# Class index -> '<scene letter>_<action digit>' label string.
mapping = {
    0: 'A_0', 1: 'A_1', 2: 'A_2', 3: 'A_3',
    4: 'D_4', 5: 'A_5', 6: 'B_1', 7: 'B_5',
    8: 'B_2', 9: 'B_3', 10: 'B_0', 11: 'A_6',
    12: 'C_1', 13: 'C_3', 14: 'C_0', 15: 'B_6',
    16: 'C_2', 17: 'C_5', 18: 'C_6',
}

# Label string -> class index (inverse of ``mapping``).
reversed_mapping = {value: key for key, value in mapping.items()}

def decode_label(label_code):
    """Split a class index into its scene and action codes.

    Args:
        label_code: key of ``mapping`` (0..18).

    Returns:
        ``(scene_code, action_code)`` where ``scene_code`` is the offset of the
        label's letter from 'A' and ``action_code`` is the offset of its digit
        from '0'.

    Raises:
        KeyError: if ``label_code`` is not a key of ``mapping``.
    """
    # Split once and avoid naming a local ``str``, which shadowed the builtin.
    scene, action = mapping[label_code].split('_')
    return ord(scene) - ord('A'), ord(action) - ord('0')

def kfcv_evaluate(model_name, x, y):
    """Average validation loss and accuracy of the saved per-fold models.

    The folds are rebuilt with the same splitter class, fold count and seed
    used elsewhere in this file. For every fold the model stored at
    ``<data_path>/models/<model_name>/part_<fold>.h5`` is loaded and evaluated
    on that fold's validation indices.

    Returns:
        dict with keys ``'loss'`` and ``'accuracy'``, each the mean over ``k`` folds.
    """
    splitter = kfold_func(n_splits=k, shuffle=True, random_state=kfcv_seed)
    # Stratify on the class index obtained by arg-maxing the last axis of y.
    class_ids = np.argmax(y, axis=-1)
    evals = {'loss': 0.0, 'accuracy': 0.0}

    for fold, (train, val) in enumerate(splitter.split(x, class_ids)):
        print('Processing fold: %d (%d, %d)' % (fold, len(train), len(val)))

        model_file = data_path + '/models/%s/part_%d.h5' % (model_name, fold)
        model = keras.models.load_model(model_file)

        loss, acc = model.evaluate(x=x[val], y=y[val])
        # Accumulate the running mean directly.
        evals['loss'] += loss / k
        evals['accuracy'] += acc / k
    return evals

def kfcv_predict(model_name, inputs):
    """Return the mean prediction of the ``k`` saved fold models for ``inputs``.

    All models under ``<data_path>/models/<model_name>/part_<i>.h5`` are loaded
    first, then each one predicts on ``inputs`` and the outputs are averaged.
    """
    model_dir = data_path + '/models/' + model_name + '/'
    fold_models = [
        keras.models.load_model(model_dir + 'part_%d.h5' % fold)
        for fold in range(k)
    ]
    print('%s loaded.' % model_name)

    predictions = [fold_model.predict(inputs) for fold_model in fold_models]
    print('result got')

    # Element-wise mean of the per-fold outputs.
    return sum(predictions) / k

def kfcv_fit(builder, x, y, epochs, checkpoint_path, verbose=2, batch_size=64):
    """Train one freshly built model per cross-validation fold.

    Args:
        builder: zero-argument callable returning a compiled Keras model.
        x: input array, indexed with the integer index arrays from the splitter.
        y: label array; folds are stratified on ``np.argmax(y, axis=-1)``.
        epochs: number of epochs per fold.
        checkpoint_path: directory receiving ``part_<fold>.h5``. A trailing
            '/' is added when missing; an empty string is left untouched so
            the files land in the current working directory.
        verbose: forwarded to ``model.fit``.
        batch_size: forwarded to ``model.fit``.

    Returns:
        ``(histories, evals)``: the ``model.fit`` return value of every fold
        and the result of ``model.evaluate`` on every fold's validation split.
        The evaluation uses the weights at the end of training, not the
        checkpointed best epoch.
    """
    kfold = kfold_func(n_splits=k, shuffle=True, random_state=kfcv_seed)
    histories = []
    evals = []

    # endswith() is safe on an empty string; indexing the last character
    # raised IndexError there.
    if checkpoint_path and not checkpoint_path.endswith('/'):
        checkpoint_path += '/'

    # Remove checkpoints left by a previous run so stale files are never
    # mistaken for the result of this one.
    for i in range(k):
        stale = checkpoint_path + 'part_%d.h5' % i
        if os.path.exists(stale):
            os.remove(stale)

    # Loop-invariant class weights: each entry w is turned into (1 - w) ** 2.
    # NOTE(review): the table sums to ~1, so these are presumably per-class
    # frequencies of the training set -- confirm.
    class_weights = np.array([0.0357285 , 0.08708201, 0.05632962, 0.03861465, 0.07593551,
        0.04339172, 0.09335191, 0.02657245, 0.04090366, 0.0294586 ,
        0.02935908, 0.0678742 , 0.12012341, 0.03324045, 0.01582404,
        0.06458997, 0.06220143, 0.02866242, 0.05075637])
    class_weight = dict(enumerate((1 - class_weights) ** 2))

    for index, (train, val) in enumerate(kfold.split(x, np.argmax(y, axis=-1))):
        print('Processing fold: %d (%d, %d)' % (index, len(train), len(val)))
        model = builder()

        x_train = x[train]
        y_train = y[train]

        if len(data_enhance_method) > 0:
            # Every method augments the original fold data, not the output of
            # a previous method; the results are appended to the training set.
            x_train_copy = np.copy(x_train)
            y_train_copy = np.copy(y_train)
            for method in data_enhance_method:
                x_, y_ = data_enhance(method, x_train_copy, y_train_copy)
                x_train = np.r_[x_train, x_]
                y_train = np.r_[y_train, y_]
            x_train, y_train = shuffle(x_train, y_train)
            print('Data enhanced (%s) => %d' % (' '.join(data_enhance_method), len(x_train)))

        # Keep only the epoch with the best validation accuracy on disk.
        checkpoint = keras.callbacks.ModelCheckpoint(checkpoint_path + 'part_%d.h5' % index,
                                                     monitor='val_accuracy',
                                                     verbose=0,
                                                     mode='max',
                                                     save_best_only=True)

        h = model.fit(x=x_train, y=y_train,
                      epochs=epochs,
                      verbose=verbose,
                      validation_data=(x[val], y[val]),
                      callbacks=[checkpoint],
                      batch_size=batch_size,
                      class_weight=class_weight,
                      shuffle=True
                      )
        evals.append(model.evaluate(x=x[val], y=y[val]))
        histories.append(h)
        # Drop the model before building the next fold's model.
        del model
        gc.collect()
    return histories, evals

def data_enhance(method, train_data, train_labels):
    """Create an augmented copy of a training set.

    Args:
        method: ``'noise'`` or ``'mixup'``.
        train_data: array of samples (first axis indexes samples).
        train_labels: array of labels aligned with ``train_data``.

    Returns:
        ``(data, labels)``:
        * ``'noise'``: ``train_data`` plus Gaussian noise (mean 0, std 0.1)
          and the unchanged ``train_labels`` object.
        * ``'mixup'``: every sample and label blended with a randomly chosen
          partner using a factor drawn from Beta(0.2, 0.2); both outputs are
          new arrays created with ``np.zeros``.

    Raises:
        ValueError: if ``method`` is not a supported name. Previously the
            function fell through and returned ``None``, which surfaced in
            the caller as an unrelated unpacking error.
    """
    if method == 'noise':
        noise = train_data + np.random.normal(0, 0.1, size=train_data.shape)
        return noise, train_labels

    if method == 'mixup':
        # Random partner for every sample.
        index = [i for i in range(len(train_labels))]
        np.random.shuffle(index)

        x_mixup = np.zeros(train_data.shape)
        y_mixup = np.zeros(train_labels.shape)

        for i in range(len(train_labels)):
            x1 = train_data[i]
            x2 = train_data[index[i]]
            y1 = train_labels[i]
            y2 = train_labels[index[i]]

            # One blending factor per sample, shared by data and label.
            factor = np.random.beta(0.2, 0.2)

            x_mixup[i] = x1 * factor + x2 * (1 - factor)
            y_mixup[i] = y1 * factor + y2 * (1 - factor)

        return x_mixup, y_mixup

    raise ValueError("Unknown data enhance method: %r (expected 'noise' or 'mixup')" % (method,))

def save_results(path, output):
    """Write predictions to ``path`` as a two-column CSV.

    Args:
        path: destination CSV file.
        output: sequence of predicted behavior ids; position ``i`` becomes the
            row with ``fragment_id == i``.

    The file has the header ``fragment_id,behavior_id`` and no index column.
    """
    print('saving...')

    # Build the frame in one go: DataFrame.append was removed in pandas 2.0,
    # and appending row by row copies the whole frame on every iteration.
    df_r = pd.DataFrame({
        'fragment_id': range(len(output)),
        'behavior_id': list(output),
    })
    df_r.to_csv(path, index=False)

def infer(model_name, inputs, csv_output):
    """Predict with the fold ensemble and write labels and probabilities.

    Args:
        model_name: sub-directory of ``<data_path>/models`` holding the fold models.
        inputs: array passed to ``kfcv_predict``.
        csv_output: destination of the predicted-label CSV.

    Besides ``csv_output``, the averaged class probabilities are written to
    ``<data_path>/dataset/proba_open777.csv`` with one ``pred_<i>`` column per
    class.
    """
    proba_t = kfcv_predict(model_name, inputs)
    output = np.argmax(proba_t, axis=-1)
    save_results(csv_output, output)
    print('- END -')
    print('Your file locates at %s' % csv_output)
    # Take the column count from the prediction width rather than hard-coding
    # 19, so the function keeps working if the number of classes changes.
    proba_columns = ['pred_{}'.format(i) for i in range(proba_t.shape[-1])]
    pd.DataFrame(proba_t, columns=proba_columns).to_csv(data_path + '/dataset/proba_open777.csv', index=False)

def shuffle(data, labels, seed=None):
    """Return ``data`` and ``labels`` reordered by one shared random permutation.

    Args:
        data: array indexable with a list of integer positions.
        labels: array aligned with ``data``; its length sets the permutation size.
        seed: optional seed. When given, the global NumPy random state is
            reseeded with it before shuffling, which also affects later
            ``np.random`` calls.

    Returns:
        ``(data[perm], labels[perm])``.
    """
    index = list(range(len(labels)))
    # Identity comparison is the idiomatic None test.
    if seed is not None:
        np.random.seed(seed)
    np.random.shuffle(index)
    return data[index], labels[index]



import tensorflow as tf
import numpy as np
from tensorflow import keras

def BLOCK(seq, filters, kernal_size):
    """Residual unit: three Conv1D + LayerNormalization stages plus a shortcut.

    The main branch applies convolutions with kernel sizes 1, ``kernal_size``
    and 1 (ReLU, 'SAME' padding), each followed by LayerNormalization. The
    shortcut is a kernel-size-1 Conv1D of the input, and the two are summed.
    """
    branch = seq
    for width in (1, kernal_size, 1):
        branch = keras.layers.Conv1D(filters, width, padding='SAME', activation='relu')(branch)
        branch = keras.layers.LayerNormalization()(branch)

    # Project the input to `filters` channels so it can be added to the branch.
    shortcut = keras.layers.Conv1D(filters, 1)(seq)
    return keras.layers.Add()([shortcut, branch])

def BLOCK2(seq, filters=128, kernal_size=5):
    """Two stacked BLOCKs reduced to a single vector per sample.

    Order: BLOCK(filters) -> MaxPooling1D(2) -> SpatialDropout1D(0.3)
    -> BLOCK(filters // 2) -> GlobalAveragePooling1D.
    """
    wide = BLOCK(seq, filters, kernal_size)
    pooled = keras.layers.MaxPooling1D(2)(wide)
    dropped = keras.layers.SpatialDropout1D(0.3)(pooled)
    narrow = BLOCK(dropped, filters // 2, kernal_size)
    return keras.layers.GlobalAveragePooling1D()(narrow)

def ComplexConv1D(input_shape, num_classes):
    """Build and compile the multi-branch 1-D convolutional classifier.

    Args:
        input_shape: shape whose first entry is dropped; the rest becomes the
            Keras ``Input`` shape.
        num_classes: width of the final softmax layer.

    Returns:
        A compiled ``keras.models.Model`` (Adam with learning rate 1e-3,
        categorical cross-entropy with label smoothing 0.1, accuracy metric).
    """
    inputs = keras.layers.Input(shape=input_shape[1:])

    # Three parallel BLOCK2 branches that differ only in kernel size.
    branches = [BLOCK2(inputs, kernal_size=size) for size in (3, 5, 7)]
    hidden = keras.layers.concatenate(branches)

    # Dense head: 512 -> 128 units, each followed by dropout.
    for units in (512, 128):
        hidden = keras.layers.Dense(units, activation='relu')(hidden)
        hidden = keras.layers.Dropout(0.3)(hidden)
    outputs = keras.layers.Dense(num_classes, activation='softmax')(hidden)

    model = keras.models.Model(inputs=[inputs], outputs=[outputs])
    model.compile(
        optimizer=tf.optimizers.Adam(1e-3),
        loss=tf.losses.CategoricalCrossentropy(label_smoothing=0.1),
        metrics=['accuracy'],
    )
    return model



import pandas as pd
import numpy as np
import random
import pickle
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

# Feature column names (the columns that get standardized by the scaler)
src_names = ['acc_x', 'acc_y', 'acc_z', 'acc_xg', 'acc_yg', 'acc_zg', 'acc', 'acc_g']

def handle_features(data):
    """Drop ``time_point`` and add the two magnitude columns, in place.

    ``acc`` is the Euclidean norm of (acc_x, acc_y, acc_z) and ``acc_g`` the
    norm of (acc_xg, acc_yg, acc_zg). The input frame itself is modified and
    returned.
    """
    def _norm(first, second, third):
        # Square root of the sum of squares of three columns.
        return (data[first] ** 2 + data[second] ** 2 + data[third] ** 2) ** 0.5

    data.drop(columns=['time_point'], inplace=True)
    data['acc'] = _norm('acc_x', 'acc_y', 'acc_z')
    data['acc_g'] = _norm('acc_xg', 'acc_yg', 'acc_zg')
    return data

# Build the numpy feature matrix
def handle_mats(grouped_data, seq_len=61):
    """Stack per-fragment frames into one zero-padded 3-D float array.

    Args:
        grouped_data: sequence of objects exposing ``.values`` as a 2-D array
            (rows x features), one per fragment.
        seq_len: number of rows every fragment is padded to (default 61, the
            length previously hard-coded here).

    Returns:
        Array of shape ``(len(grouped_data), seq_len, n_features)`` created
        with ``np.zeros``; rows past a fragment's length stay zero. An empty
        input yields an array of shape ``(0, seq_len, 0)``.

    Raises:
        ValueError: if a fragment has more than ``seq_len`` rows.
    """
    mats = [frame.values for frame in grouped_data]
    if not mats:
        # Nothing to stack; the feature count is unknown.
        return np.zeros([0, seq_len, 0])

    mats_padded = np.zeros([len(mats), seq_len, mats[0].shape[1]])
    for i, mat in enumerate(mats):
        rows = mat.shape[0]
        if rows > seq_len:
            raise ValueError(
                'fragment %d has %d rows, more than seq_len=%d' % (i, rows, seq_len))
        # Copy straight into the zero buffer instead of growing each matrix
        # one padding row at a time with np.append.
        mats_padded[i, :rows] = mat

    return mats_padded

def get_test_data(use_scaler=True):
    """Load the test CSV and return it as a padded feature array.

    Args:
        use_scaler: when true, the columns in ``src_names`` are transformed
            with the scaler stored in ``scaler.pkl`` in the working directory
            (the file ``get_train_data`` writes).

    Returns:
        The output of ``handle_mats`` for the fragments of the test file,
        grouped by ``fragment_id``.
    """
    # Leading '/' added so this path is built like every other dataset path
    # in this file (data_path + "/dataset/...").
    test_csv = data_path + "/dataset/sensor_test.csv"
    data = handle_features(pd.read_csv(test_csv))
    if use_scaler:
        # NOTE(security): pickle.load can execute arbitrary code; only load a
        # scaler.pkl that was produced locally.
        with open('scaler.pkl', 'rb') as f:
            scaler = pickle.load(f)
        data[src_names] = scaler.transform(data[src_names].values)

    grouped_data = [i.drop(columns='fragment_id') for _, i in data.groupby('fragment_id')]
    return handle_mats(grouped_data)

def get_train_data(use_scaler=True, shuffle=True, pseudo_labels_file=None):
    """Load the training CSV and return ``(train_data, train_labels)``.

    Args:
        use_scaler: when true, fit a ``StandardScaler`` on the ``src_names``
            columns, save it to ``scaler.pkl`` in the working directory and
            apply it.
        shuffle: when true, reseed the global NumPy RNG with 2020 and permute
            samples and labels together.
        pseudo_labels_file: optional CSV whose rows are concatenated to the
            training rows before any processing.

    Returns:
        ``train_data`` from ``handle_mats`` and ``train_labels``, an integer
        array holding the ``behavior_id`` of each fragment's first row.
    """
    df = pd.read_csv(data_path + "/dataset/sensor_train.csv")

    # Simply concatenate the pseudo-labelled rows.
    # NOTE(review): rows are grouped by fragment_id below, so pseudo-label ids
    # that collide with training ids would be merged into one fragment --
    # confirm the pseudo-label file uses distinct ids.
    if pseudo_labels_file is not None:
        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # equivalent (index kept as-is, columns not sorted).
        df = pd.concat([df, pd.read_csv(pseudo_labels_file)])
    data = handle_features(df)

    # Standardize and persist the fitted statistics for get_test_data.
    if use_scaler:
        scaler = StandardScaler()
        scaler.fit(data[src_names].values)
        with open('scaler.pkl', 'wb') as f:
            pickle.dump(scaler, f)
        data[src_names] = scaler.transform(data[src_names].values)

    grouped_data = [i.drop(columns='fragment_id') for _, i in data.groupby('fragment_id')]
    train_labels = np.array([int(i['behavior_id'].iloc[0]) for i in grouped_data])
    # Build label-free copies rather than dropping in place on groupby slices.
    train_data = handle_mats([i.drop(columns='behavior_id') for i in grouped_data])

    if shuffle:
        index = [i for i in range(len(train_labels))]
        np.random.seed(2020)
        np.random.shuffle(index)

        train_data = train_data[index]
        train_labels = train_labels[index]

    return train_data, train_labels

def get_train_test_data(use_scaler=True, shuffle=True, pseudo_labels_file=None):
    """Return ``(train_data, train_labels, test_data)``.

    The training set is loaded first, so ``scaler.pkl`` has been written by
    the time the test set is loaded.
    """
    train_data, train_labels = get_train_data(
        use_scaler, shuffle, pseudo_labels_file=pseudo_labels_file)
    return train_data, train_labels, get_test_data(use_scaler)

In [None]:
import tensorflow as tf
import numpy as np
from tensorflow import keras
# from utils import *
# from models import *
# from preprocess import *

# Load the data together with the hand-picked pseudo labels
train_data, train_labels, test_data = get_train_test_data(pseudo_labels_file=data_path + '/dataset/pick.csv')
# Choose the data augmentation method (noise, mixup or both)
set_data_enhance(['noise'])

In [None]:
# Cast to float32 to save GPU memory, and one-hot encode the labels
num_classes = 19
train_data = tf.cast(train_data, tf.float32).numpy()
train_labels = tf.one_hot(train_labels, num_classes).numpy()

In [None]:
# Training: one model per fold, best epoch of each fold saved under <data_path>/models/conv1d/.
# The class count comes from num_classes (set when the labels were one-hot
# encoded) instead of repeating the literal 19.
histories, evals = kfcv_fit(
    builder=lambda: ComplexConv1D(train_data.shape, num_classes),
    x=train_data,
    y=train_labels,
    epochs=40,
    checkpoint_path=data_path + '/models/conv1d/',
    batch_size=64,
)

Processing fold: 0 (8038, 2010)
Data enhanced (noise) => 24114
Epoch 1/40
377/377 - 8s - loss: 1.6767 - accuracy: 0.4905 - val_loss: 1.4900 - val_accuracy: 0.6159
Epoch 2/40
377/377 - 6s - loss: 1.3681 - accuracy: 0.6210 - val_loss: 1.3842 - val_accuracy: 0.6731
Epoch 3/40
377/377 - 6s - loss: 1.2479 - accuracy: 0.6788 - val_loss: 1.2893 - val_accuracy: 0.7174
Epoch 4/40
377/377 - 6s - loss: 1.1693 - accuracy: 0.7200 - val_loss: 1.1875 - val_accuracy: 0.7751
Epoch 5/40
377/377 - 6s - loss: 1.1089 - accuracy: 0.7502 - val_loss: 1.1543 - val_accuracy: 0.7791
Epoch 6/40
377/377 - 6s - loss: 1.0629 - accuracy: 0.7725 - val_loss: 1.1274 - val_accuracy: 0.7841
Epoch 7/40
377/377 - 6s - loss: 1.0268 - accuracy: 0.7878 - val_loss: 1.0986 - val_accuracy: 0.8090
Epoch 8/40
377/377 - 6s - loss: 0.9922 - accuracy: 0.8046 - val_loss: 1.1085 - val_accuracy: 0.8000
Epoch 9/40
377/377 - 7s - loss: 0.9618 - accuracy: 0.8196 - val_loss: 1.0502 - val_accuracy: 0.8338
Epoch 10/40
377/377 - 6s - loss: 0.93

In [None]:
# Evaluation: average validation loss/accuracy of the saved fold models
kfcv_evaluate('conv1d', train_data, train_labels)

# Results noted from earlier runs (the meaning of the trailing numbers is not stated here):
# {'accuracy': 0.8664415955543519, 'loss': 0.9580374240875242}
# {'accuracy': 0.8672372341156006, 'loss': 0.9618942022323609}  0.7779
# {'accuracy': 0.8688300132751465, 'loss': 0.9568594574928283}  0.775
# {'accuracy': 0.8643508791923523, 'loss': 0.9691726446151732}  0.778

Processing fold: 0 (8038, 2010)
Processing fold: 1 (8038, 2010)
Processing fold: 2 (8038, 2010)
Processing fold: 3 (8039, 2009)
Processing fold: 4 (8039, 2009)


{'accuracy': 0.8698249459266664, 'loss': 0.966798758506775}

In [None]:
# Inference: predict on the test set with the fold ensemble and write the submission CSV
infer('conv1d', get_test_data(), data_path + '/dataset/submit_open777.csv')

conv1d loaded.
result got
saving...
- END -
Your file locates at /content/gdrive/My Drive/jiaozibei/xwbank2020_baseline_keras-master/dataset/result.csv
