In [None]:
import pandas as pd
import numpy as np
import os
from matplotlib import pyplot as plt

In [None]:
# Show the current working directory; the CSV paths below are relative to it.
current_directory = os.getcwd()
print(current_directory)
# Load the training rows, the per-sequence labels, and the test rows.
train = pd.read_csv('data/train.csv')
train_labels = pd.read_csv('data/train_labels.csv')
test = pd.read_csv('data/test.csv')

In this competition, you'll classify 60-second sequences of sensor data, indicating whether a subject was in either of two activity states for the duration of the sequence.

在本次競賽中，您將對 60 秒的感測器資料序列進行分類，找出受試者在序列持續時間內處於兩種活動狀態的其中一種。

## Files and Field Descriptions
- train.csv - the training set, comprising ~26,000 60-second recordings of thirteen biological sensors for almost one thousand experimental participants
    - sequence - a unique id for each sequence
    - subject - a unique id for the subject in the experiment
    - step - time step of the recording, in one second intervals
    - sensor_00 - sensor_12 - the value for each of the thirteen sensors at that time step
- train_labels.csv - the class label for each sequence.
    - sequence - the unique id for each sequence.
    - state - the state associated to each sequence. This is the target which you are trying to predict.
- test.csv - the test set. For each of the ~12,000 sequences, you should predict a value for that sequence's state.
- sample_submission.csv - a sample submission file in the correct format.

In [None]:
train

In [None]:
train_labels

In [None]:
test

由上表與題目提示可得知，每一秒會記錄一列，每 60 列代表一個 `sequence`

確認資料是否有缺漏：本題的資料都很完美，沒有缺漏的地方。

In [None]:
train.info()

## 特徵工程

萃取需要的資料

In [None]:
# Keep the per-row sequence ids now: a later cell reduces `train` to the sensor
# columns only, and the cross-validation split reads `groups.unique()`.
groups = train['sequence']

In [None]:
train

In [None]:
# Number of rows (time steps) that make up one sequence.
Window = 60

# One target value per sequence, in the row order of train_labels.
y = train_labels['state'].to_numpy()

# Keep only the thirteen sensor readings sensor_00 ... sensor_12 as features.
sensor_columns = [f'sensor_{idx:02d}' for idx in range(13)]
train = train.loc[:, sensor_columns]
test = test.loc[:, sensor_columns]


In [None]:
y

In [None]:
y.shape

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training sensors only, then apply that same fitted
# transform to both the training and the test rows.
sc = StandardScaler().fit(train)
X_train, X_test = sc.transform(train), sc.transform(test)

In [None]:
X_train

## 整理資料

準備進入訓練的資料格式：設定 sequence
sequence 設為 60 ，代表過去 60 秒的資訊。

In [None]:
# Work on a copy of the labels so the original `y` array is left untouched.
y_train = y.copy()

# Fold the flat (rows, sensors) matrices into (sequences, Window, sensors):
# every consecutive run of `Window` rows becomes one sample.
X_train = X_train.reshape((-1, Window, X_train.shape[-1]))
X_test = X_test.reshape((-1, Window, X_test.shape[-1]))

In [None]:
print(y_train.shape, X_train.shape, X_test.shape)

In [None]:
X_train

## 搭建 LSTM 網路

In [None]:
from keras import Sequential, Model
from keras.layers import *
from keras.callbacks import *
from keras.metrics import AUC

In [None]:
def get_model():
    """Build and compile the Conv1D + stacked bidirectional-LSTM classifier.

    The per-sample input shape is read from the module-level ``X_train``
    (all axes except the first one).

    Architecture, in order:
      * Conv1D(64, kernel 3, relu) -> MaxPooling1D(3) -> Dropout(0.5)
      * three stages of bidirectional LSTMs (64, 32, 10 units), all with
        ``return_sequences=True``, where each later stage also has a branch fed
        from an earlier tensor; branches are concatenated on the feature axis
      * Dense(32, selu) -> Dropout(0.3) -> Flatten -> Dense(1, sigmoid)

    Returns:
        A Keras ``Model`` named ``'DNN_Model'``, compiled with the Adam
        optimizer, binary cross-entropy loss and an AUC metric named ``'auc'``.
    """
    inputs = Input(shape=(X_train.shape[1:]))

    # Convolutional stem: local feature extraction, then temporal down-sampling.
    conv = Conv1D(64, 3, activation='relu')(inputs)
    pooled = MaxPooling1D(3)(conv)
    stem = Dropout(0.5)(pooled)

    # Stage 1 and stage 2: the second stage has one branch stacked on stage 1
    # and one branch that reads the stem directly.
    stage1 = Bidirectional(LSTM(64, return_sequences=True))(stem)
    stage2_deep = Bidirectional(LSTM(32, return_sequences=True))(stage1)
    stage2_skip = Bidirectional(LSTM(32, return_sequences=True))(stem)
    stage2 = Concatenate(axis=2)([stage2_deep, stage2_skip])

    # Stage 3: one branch on the merged stage 2, one on its deep branch only.
    stage3_deep = Bidirectional(LSTM(10, return_sequences=True))(stage2)
    stage3_skip = Bidirectional(LSTM(10, return_sequences=True))(stage2_deep)
    stage3 = Concatenate(axis=2)([stage3_deep, stage3_skip])

    # Classification head over the features of all three stages.
    merged = Concatenate(axis=2)([stage1, stage2, stage3])
    hidden = Dense(32, activation='selu')(merged)
    hidden = Dropout(0.3)(hidden)
    # The tensor is still (batch, steps, features) here; flatten it so the
    # final Dense layer produces a single value per sample.
    flat = Flatten()(hidden)
    outputs = Dense(units=1, activation="sigmoid")(flat)

    model = Model(inputs=inputs, outputs=outputs, name='DNN_Model')
    model.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=[AUC(name='auc')],
    )
    return model


# Build one model instance and print its layer-by-layer summary.
model = get_model()
model.summary()


Reason for adding a Flatten layer before the last Dense layer

An LSTM layer consists of several LSTM cells that are processed sequentially. As seen in the figure below, the first cell takes an input/embedding and calculates a hidden state; the next cell uses its own input together with the hidden state from the previous time step to compute its own hidden state. In other words, the arrows between the cells pass the hidden states along. <b>If you set return_sequences=False, the LSTM layer outputs only the very last hidden state (h_4 in the figure). All the information from every input and cell then has to be squeezed into a single fixed-size vector, which cannot hold much information.</b> This is why accuracy is poor when you use only the last hidden state.

When you set `return_sequences=True`, the LSTM layer outputs every hidden state, so the following layers have access to all hidden states, which naturally contain more information. However, the LSTM layer then returns a matrix, as you can see in the model summary: in the quoted example it returns a matrix of size (None, 500, 128). None is the number of samples in the batch and can be ignored, 500 is the input sequence length, and 128 is the hidden state size. <b>The dense layer cannot process a matrix; it needs a vector. That is why you need to apply Flatten, which simply unrolls the 2D matrix into a 1D vector.</b> The Flatten layer therefore has size 64000, because 500*128 = 64000. And of course, with more hidden states the accuracy is better, as they contain more information.

In [None]:
# NOTE(review): the `keras.utils.vis_utils` import path is version-dependent —
# confirm it exists in the installed Keras release.
from keras.utils.vis_utils import plot_model
# Draw the layer graph of the model built above.
plot_model(model)

In [None]:
import matplotlib.pyplot as plt

def plot_hist(hist, metric='auc', ax=None, fold=0):
    """Plot the training and validation curves of one metric for one fold.

    Args:
        hist: Object exposing a ``history`` mapping with the per-epoch values
            stored under ``metric`` and ``"val_" + metric``.
        metric: Key of the metric to plot.
        ax: Axes-like object to draw on. When ``None``, the current pyplot
            axes are used and the figure is shown immediately.
        fold: Fold number shown in the title.

    Returns:
        None.
    """
    # Identity check instead of `== None`: equality can be overridden (or be
    # element-wise for array-like objects) and then picks the wrong branch.
    standalone = ax is None
    if standalone:
        ax = plt.gca()

    # Single drawing path for both the standalone and the subplot case.
    ax.plot(hist.history[metric])
    ax.plot(hist.history["val_" + metric])
    ax.set_title(f"model performance fold {fold}")
    ax.set_ylabel("area_under_curve")
    ax.set_xlabel("epoch")
    ax.legend(["train", "validation"], loc="upper left")

    # Only display right away when the caller did not supply its own axes.
    if standalone:
        plt.show()

In [None]:
from sklearn.model_selection import KFold, GroupKFold
from sklearn.metrics import roc_auc_score

def fit_model(folds):
    """Train one model per cross-validation fold and collect its predictions.

    Relies on the module-level ``X_train``, ``y_train``, ``X_test``,
    ``groups``, ``get_model`` and ``plot_hist``. Each fold's model is saved to
    ``fold_<k>.h5`` (k starting at 1) and its training curve is drawn into one
    cell of a shared subplot grid.

    Args:
        folds: Number of ``GroupKFold`` splits; one model is trained per split.

    Returns:
        A tuple ``(test_preds, auc)``: a list holding every fold model's
        predictions on ``X_test`` and a list holding every fold's validation
        ROC-AUC score.
    """
    test_preds = []
    auc = []
    nfold = folds
    ncols = 5
    # Ceiling division: round() under-allocated rows (0 rows for 2 folds,
    # 1 row for 7 folds), which broke the grid for those fold counts.
    nrows = (nfold + ncols - 1) // ncols
    # squeeze=False keeps `axes` two-dimensional even when there is a single
    # row, so axes[row][col] is valid for any fold count.
    fig, axes = plt.subplots(nrows, ncols, figsize=(20, round(nrows*20/ncols)), squeeze=False)
    kf = GroupKFold(n_splits=nfold)
    # NOTE(review): groups.unique() yields one id per sequence, so every sample
    # is its own group — confirm whether grouping by subject was intended.
    for fold, (train_idx, valid_idx) in enumerate(kf.split(X_train, y_train, groups.unique())):
        row, col = divmod(fold, ncols)
        print(f"Fold: {fold+1}", end=' ')
        X_train_part, X_valid = X_train[train_idx], X_train[valid_idx]
        y_train_part, y_valid = y_train[train_idx], y_train[valid_idx]

        # Fresh model per fold; both callbacks track the validation AUC.
        model = get_model()
        lr = ReduceLROnPlateau(monitor="val_auc", mode='max', factor=0.7, patience=4, verbose=False)
        es = EarlyStopping(monitor='val_auc',mode='max', patience=10, verbose=False,restore_best_weights=True)
        history = model.fit(X_train_part, y_train_part, validation_data=(X_valid, y_valid), epochs=60, batch_size=32,
                            callbacks=[es,lr], verbose=1)

        y_pred = model.predict(X_valid).squeeze()
        auc_score = roc_auc_score(y_valid, y_pred)
        print(f'auc: {round(auc_score, 5)}')
        test_preds.append(model.predict(X_test).squeeze())
        auc.append(auc_score)

        # Name the file after the fold index; using the fold count made every
        # fold overwrite the same file.
        model.save('fold_{}.h5'.format(fold + 1))
        plot_hist(history, metric='auc', ax=axes[row][col], fold=fold+1)
        # Drop the per-fold references before the next fold starts.
        del X_train_part, X_valid, y_train_part, y_valid, model, history
    return (test_preds, auc)

In [None]:
# Number of cross-validation folds; fit_model trains one model per fold.
folds = 4
(test_preds, auc) = fit_model(folds)

# Show the raw per-fold predictions on the test set.
print(test_preds)

## 預測真實資料

In [None]:
# Report the mean validation AUC across folds, as a percentage.
print(f"the mean AUC for the {folds} folds is : {round(np.mean(auc)*100,3)}")

In [None]:
# Load the sample submission; its `state` column is overwritten below.
sub_data = pd.read_csv("data/sample_submission.csv")

In [None]:
# Average the per-fold test predictions. Divide by the actual number of fold
# models rather than a hard-coded 10: with fewer than 10 folds the old value
# was not a mean and could never pass the 0.5 threshold applied afterwards.
sub_data['state'] = sum(test_preds) / len(test_preds)

In [None]:
# Turn the averaged scores into hard 0/1 labels with a 0.5 cut-off.
# NOTE(review): the models are monitored with AUC; if the submission is scored
# the same way, hard labels discard ranking information — confirm this step.
sub_data['state'] = (sub_data['state'] > 0.5).astype(int)

In [None]:
sub_data

In [None]:
# Write the submission file without the DataFrame index column.
sub_data.to_csv('submission.csv', index=False)