# 导入包

In [None]:
import numpy as np
import pandas as pd

import lightgbm as lgb
from scipy.stats import skew
from scipy.stats import kurtosis
from scipy.stats import mode

from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from collections import Counter
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
import warnings 
warnings.filterwarnings("ignore")                                               

# 导入数据

In [None]:
# Google Drive has to be mounted before any path under /content/drive can be
# read; mounting here keeps a fresh-kernel "Run All" working. Re-mounting an
# already mounted drive is harmless (Colab only reports that it is mounted).
from google.colab import drive
drive.mount('/content/drive')

# Directory on the mounted drive that holds the competition CSV files.
DATA_DIR = '/content/drive/My Drive/Colab Notebooks/sensor_class'

# Row cap passed to every read_csv call; None loads the complete files.
nrows = None

# Training samples, test samples and the example submission file.
df_train = pd.read_csv(f'{DATA_DIR}/sensor_train.csv', sep=',', nrows=nrows)
df_test = pd.read_csv(f'{DATA_DIR}/sensor_test.csv', sep=',', nrows=nrows)
df_submit = pd.read_csv(f'{DATA_DIR}/提交结果示例.csv', sep=',', nrows=nrows)



In [None]:
# Preview the first rows of the training frame to check the loaded columns.
df_train.head()

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read.
# NOTE(review): the CSV-loading cell earlier in the notebook reads from
# /content/drive, so the drive must already be mounted when that cell runs on
# a fresh kernel — confirm the execution order.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 合并数据

In [None]:
# Euclidean magnitude of each 3-axis acceleration triple in the training set:
# 'acc_all' from acc_x/acc_y/acc_z and 'acc_allg' from acc_xg/acc_yg/acc_zg.
for magnitude_col, axis_cols in (
    ('acc_all', ('acc_x', 'acc_y', 'acc_z')),
    ('acc_allg', ('acc_xg', 'acc_yg', 'acc_zg')),
):
    col_x, col_y, col_z = axis_cols
    squared_norm = df_train[col_x] ** 2 + df_train[col_y] ** 2 + df_train[col_z] ** 2
    df_train[magnitude_col] = squared_norm ** 0.5

In [None]:
# Same magnitude features as for the training set, computed on the test set so
# both frames expose identical feature columns.
for magnitude_col, axis_cols in (
    ('acc_all', ('acc_x', 'acc_y', 'acc_z')),
    ('acc_allg', ('acc_xg', 'acc_yg', 'acc_zg')),
):
    col_x, col_y, col_z = axis_cols
    squared_norm = df_test[col_x] ** 2 + df_test[col_y] ** 2 + df_test[col_z] ** 2
    df_test[magnitude_col] = squared_norm ** 0.5

In [None]:
# Gap between consecutive time_point values (row-to-row difference); the first
# row has no predecessor, so its NaN is dropped.
time_point_col = df_train['time_point']
time_point_col.diff().dropna()

1         81.0
2         90.0
3         99.0
4         91.0
5         83.0
          ... 
425354    84.0
425355    86.0
425356    88.0
425357    95.0
425358    87.0
Name: time_point, Length: 425358, dtype: float64

# 数据聚合

In [None]:
# One label per fragment: the minimum behavior_id found among the fragment's rows.
fragment_groups = df_train.groupby('fragment_id')
y = fragment_groups['behavior_id'].min()

In [None]:
from scipy.signal import resample

# Fixed number of time steps every fragment is resampled to. The tensors below
# are allocated with this length, so the resample target must use the same
# value (resampling to a different length cannot be stored in the tensor).
N_TIMESTEPS = 60


def fragments_to_tensor(df, n_fragments, non_feature_cols, n_timesteps=N_TIMESTEPS):
    """Turn a long-format sensor frame into a (fragment, time, feature, 1) array.

    Args:
        df: frame with a ``fragment_id`` column, a ``time_point`` column and the
            feature columns; fragments are looked up by the ids 0..n_fragments-1.
        n_fragments: number of fragments to convert.
        non_feature_cols: columns dropped before resampling (ids, time, label).
        n_timesteps: length each fragment is resampled to.

    Returns:
        float array of shape (n_fragments, n_timesteps, n_features, 1).
    """
    n_features = df.shape[1] - len(non_feature_cols)
    out = np.zeros((n_fragments, n_timesteps, n_features, 1))
    for i in tqdm(range(n_fragments)):
        # Keep at most n_timesteps rows of fragment i before resampling.
        tmp = df[df.fragment_id == i][:n_timesteps]
        # With the time axis supplied, resample returns (values, times);
        # only the resampled values are stored.
        resampled, _ = resample(tmp.drop(non_feature_cols, axis=1),
                                n_timesteps,
                                np.array(tmp.time_point))
        out[i, :, :, 0] = resampled
    return out


# Number of distinct fragments in each set.
size_df_train = df_train.groupby('fragment_id').size().count()
size_df_test = df_test.groupby('fragment_id').size().count()

x = fragments_to_tensor(df_train, size_df_train, ['fragment_id', 'time_point', 'behavior_id'])
t = fragments_to_tensor(df_test, size_df_test, ['fragment_id', 'time_point'])


100%|██████████| 7292/7292 [00:15<00:00, 480.00it/s]
100%|██████████| 7500/7500 [00:15<00:00, 480.16it/s]


# 模型训练 

In [None]:
import os
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM 
from keras.layers.core import Dense, Dropout
from numpy import mean
from numpy import std
from numpy import dstack
from pandas import read_csv
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import LSTM
from keras.layers import TimeDistributed
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import ConvLSTM2D,Bidirectional
from keras.utils import to_categorical
from matplotlib import pyplot

In [None]:
# Aliases used by the training cells: X is the training tensor, X_test the test tensor.
X, X_test = x, t

In [None]:
# Per-fragment target: smallest behavior_id within each fragment_id group.
behavior_by_fragment = df_train.groupby('fragment_id')['behavior_id']
y = behavior_by_fragment.min()

In [None]:
# Display the training tensor's dimensions as a sanity check.
X.shape

(7292, 60, 8, 1)

# 模型：InceptionResNetV2 迁移学习（本节并未使用 LSTM）

In [None]:
# Zero-initialised holders for predicted class probabilities: one row per
# fragment and 19 columns (filled by the cross-validation loop).
n_train_rows, n_test_rows = len(X), len(X_test)
df_train_stacking = pd.DataFrame(np.zeros(shape=(n_train_rows, 19)))
df_test_stacking = pd.DataFrame(np.zeros(shape=(n_test_rows, 19)))

In [None]:
# Cross-validation setup: a fixed seed and a shuffled, stratified split into
# `folds` parts. `folds` is also the divisor used when test predictions are
# averaged in the training loop.
seed = 2020
folds = 2
kfold = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
# Training callbacks.
# NOTE(review): these come from two packages (tensorflow.keras and keras);
# confirm both resolve to the same Keras implementation in this environment.
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.callbacks import ModelCheckpoint


In [None]:

# Build an ImageNet-pretrained InceptionResNetV2 without its classification
# head, to be used as a frozen feature extractor.
# NOTE(review): the Keras documentation states a minimum input size of 75x75
#   for this network; input_shape=(60, 10, 3) is below that — confirm this
#   call succeeds.
# NOTE(review): the data tensor displayed earlier has shape (7292, 60, 8, 1),
#   which does not match (60, 10, 3) — confirm how the inputs are adapted.
# NOTE(review): `classes` is presumably ignored when include_top=False — verify
#   against the Keras documentation.
from keras.applications.inception_resnet_v2 import InceptionResNetV2
pre_trained_model = InceptionResNetV2(include_top=False, weights='imagenet',  input_shape=(60, 10, 3), pooling='avg', classes=19)

# Make all the layers in the pre-trained model non-trainable
for layer in pre_trained_model.layers:
  layer.trainable = False
  
# Print the model summary
pre_trained_model.summary()


In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): the same mount call already appears earlier in the notebook;
# this repetition looks redundant — confirm it is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Pick an intermediate layer of the pre-trained network; its output tensor is
# what the new classification head is attached to in the training loop.
# NOTE(review): 'mixed7' looks like an InceptionV3 layer name; InceptionResNetV2
#   appears to use names such as 'mixed_7a' — confirm this lookup succeeds.
# NOTE(review): `output_shape` may not exist on layers in newer Keras
#   releases — verify against the installed version.
last_layer = pre_trained_model.get_layer('mixed7')
print('last layer output shape: ', last_layer.output_shape)
last_output = last_layer.output

last layer output shape:  (None, 7, 7, 768)


In [None]:

from tensorflow.keras.optimizers import RMSprop


# 训练模型

In [None]:
# Keras names used below that no earlier cell brings into scope.
from keras import layers
from keras.models import Model

# NOTE(review): the base network was built with input_shape=(60, 10, 3) while
# the data tensor shown earlier is (n, 60, 8, 1); confirm the inputs are
# adapted to the network's input shape before fitting.

# Training hyper-parameters (identical for every fold).
verbose, epochs, batch_size = 1, 25, 32
# Width of the one-hot targets / softmax output; taken from the stacking frame
# so every fold's predictions fit it even if a fold lacks the highest class id.
n_outputs = df_train_stacking.shape[1]

# The test tensor and the label vector do not change between folds.
X_test = np.asarray(X_test)
# kfold.split yields positional indices, so index a plain array rather than the
# fragment_id-indexed Series (which would do label-based lookup).
y_values = np.asarray(y)

for fold, (train_index, val_index) in enumerate(kfold.split(X, y_values)):
  print('--------------- begin ---------------')
  X_train, X_val = X[train_index], X[val_index]
  # One-hot encode the targets with a fixed number of classes.
  y_train = to_categorical(y_values[train_index], num_classes=n_outputs)
  y_val = to_categorical(y_values[val_index], num_classes=n_outputs)

  # Classification head on top of the frozen pre-trained features. A dedicated
  # name is used so the global data array `x` is not overwritten.
  head = layers.Flatten()(last_output)
  # Fully connected layer with 1,024 hidden units and ReLU activation
  head = layers.Dense(1024, activation='relu')(head)
  # Dropout rate of 0.2
  head = layers.Dropout(0.2)(head)
  # Final softmax layer: one probability per class
  head = layers.Dense(n_outputs, activation='softmax')(head)

  model = Model(pre_trained_model.input, head)

  # The optimizer must be passed by keyword (a positional argument after a
  # keyword argument is a syntax error); `learning_rate` replaces the
  # deprecated `lr` argument.
  model.compile(loss='categorical_crossentropy',
                optimizer=RMSprop(learning_rate=0.0001),
                metrics=['accuracy'])

  # Reduce the learning rate when validation accuracy stalls.
  plateau = ReduceLROnPlateau(monitor="val_accuracy",
                              verbose=0,
                              mode='max',
                              factor=0.1,
                              patience=6)
  # Stop once validation accuracy has not improved for 10 epochs.
  early_stopping = EarlyStopping(monitor='val_accuracy',
                                  verbose=0,
                                  mode='max',
                                  patience=10)
  # Keep only the best weights of this fold on disk.
  checkpoint = ModelCheckpoint(f'./fold{fold}.h5',
                                monitor='val_accuracy',
                                verbose=0,
                                mode='max',
                                save_weights_only=True,
                                save_best_only=True)

  # Resume from an existing checkpoint of this fold, if one is present.
  if os.path.exists(f'./fold{fold}.h5'):
      print('-------------load the model-----------------')
      model.load_weights(f'./fold{fold}.h5')

  # `history` is overwritten on every fold; after the loop it holds the last fold.
  history = model.fit(X_train, y_train,
            epochs=epochs,
            batch_size=batch_size,
            verbose=verbose,
            validation_data=(X_val, y_val),
            callbacks=[plateau, early_stopping, checkpoint])
  # Evaluate the model
  model.summary()
  X_val_predict = model.predict(X_val)
  X_test_predict = model.predict(X_test)

  # Out-of-fold probabilities go to the validation rows; test probabilities
  # are averaged over the folds.
  df_train_stacking.loc[val_index,:] = X_val_predict
  df_test_stacking[:] += X_test_predict / folds
  print('--------------- end ---------------')

# 验证和输出结果

In [None]:
def accuracy(y_true, y_pred):
    """Return the top-1 result of Keras' sparse top-k categorical accuracy.

    Thin wrapper around ``tf.keras.metrics.sparse_top_k_categorical_accuracy``
    with ``k=1``; ``y_true`` and ``y_pred`` are forwarded unchanged.
    """
    top1_result = tf.keras.metrics.sparse_top_k_categorical_accuracy(
        y_true, y_pred, k=1
    )
    return top1_result

In [None]:
def acc_combo(y, y_pred):
    """Score one prediction with partial credit for partially matching codes.

    Every numeric behavior id maps to a code of the form ``<letter>_<digit>``.

    Args:
        y: true behavior id (a key of the mapping, 0..18).
        y_pred: predicted behavior id (a key of the mapping, 0..18).

    Returns:
        1.0 when both codes are identical, 1.0/7 when only the letter part
        matches, 1.0/3 when only the digit part matches, otherwise 0.0.

    Raises:
        KeyError: if either id is not a key of the mapping.
    """
    # Numeric id -> behavior code
    mapping = {0: 'A_0', 1: 'A_1', 2: 'A_2', 3: 'A_3',
        4: 'D_4', 5: 'A_5', 6: 'B_1',7: 'B_5',
        8: 'B_2', 9: 'B_3', 10: 'B_0', 11: 'A_6',
        12: 'C_1', 13: 'C_3', 14: 'C_0', 15: 'B_6',
        16: 'C_2', 17: 'C_5', 18: 'C_6'}
    # Translate both ids into their codes
    code_y, code_y_pred = mapping[y], mapping[y_pred]
    if code_y == code_y_pred:  # identical code: full score
        return 1.0
    letter_true, digit_true = code_y.split("_")
    letter_pred, digit_pred = code_y_pred.split("_")
    if letter_true == letter_pred:  # only the letter part matches
        return 1.0/7
    if digit_true == digit_pred:  # only the digit part matches
        return 1.0/3
    return 0.0

In [None]:
# Predicted class = column with the highest probability.
labels = np.argmax(df_test_stacking.values, axis=1)   # test-set predictions
pred_y = np.argmax(df_train_stacking.values, axis=1)  # out-of-fold training predictions

# Plain accuracy and the partial-credit score, both on the out-of-fold predictions.
acc_scores = round(accuracy_score(y, pred_y), 5)
acc_combo_scores = round(
    sum(acc_combo(y_true, y_pred) for y_true, y_pred in zip(y, pred_y)) / len(y), 5)

print('--------')
print(' acc : ', acc_scores, 'acc_combo : ', acc_combo_scores)

# Build a two-column submission frame. The groupby result is a Series, so it is
# converted to a DataFrame first; assigning 'behavior_id' on the Series itself
# would not create a column.
# NOTE(review): assumes row i of df_test_stacking belongs to the i-th fragment_id
# in sorted order — confirm against how the test tensor was built.
df_out = (df_test.groupby('fragment_id')['fragment_id'].min()
          .reset_index(drop=True)
          .to_frame())
df_out['behavior_id'] = labels
df_out.to_csv('./submit_lstm_%.5f_%.5f.csv' % (acc_scores, acc_combo_scores), index=False)

In [None]:
import matplotlib.pyplot as plt

def plot_graphs(history, string):
  """Plot a training metric and its validation counterpart against epochs.

  Reads ``history.history[string]`` and ``history.history['val_' + string]``,
  draws both curves on the current pyplot figure and shows it.
  """
  val_key = 'val_'+string
  for series_name in (string, val_key):
    plt.plot(history.history[series_name])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, val_key])
  plt.show()
  
# Draw the accuracy and loss curves; `history` is whatever the most recent
# model.fit call returned.
for metric_name in ("accuracy", "loss"):
  plot_graphs(history, metric_name)