# Dreem Project - CNN

In [1]:
import h5py
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

## Data (NO NEED TO RE-RUN)

In [2]:
root_path = "original_data/"

y_train = pd.read_csv(root_path + "y_train.csv").as_matrix()[:, 1].squeeze()
X_train = h5py.File(root_path + "X_train.h5", "r")

#y_test = pd.read_csv("y_test.csv").as_matrix()[:, 1].squeeze()
X_test = h5py.File(root_path + "X_test.h5", "r") #we don't really care about that for now

  This is separate from the ipykernel package so we can avoid doing imports until


In [3]:
def label_columns(df):
    l_columns = [
        'num_pso', 
        'mean_amp_pso',
        'mean_dur_pso',
        'amp_cso',
        'dur_cso',
        'curr_sleep_stage',
        'time_since_sleep',
        'time_in_ds',
        'time_in_ls',
        'time_in_rs',
        'time_in_ws'
    ]
    
    for i in range(12, 1261+1):
        l_columns.append('eeg_signal_%s'%(i-12+1))
    
    df.columns = l_columns

In [4]:
df_train = pd.DataFrame(data=X_train["features"][:])

# Label columns
label_columns(df_train)

# Add dummies
df_train = pd.get_dummies(
    df_train,
    prefix=['curr_sleep_stage'],
    columns=['curr_sleep_stage']
)

# Add outcome variable 'SO'
y_train_df = pd.DataFrame(
    y_train,
    columns=['SO'],
    index=df_train.index  #!important to keep same index
)
df_train['SO'] = y_train_df['SO']


df_train.head()

Unnamed: 0,num_pso,mean_amp_pso,mean_dur_pso,amp_cso,dur_cso,time_since_sleep,time_in_ds,time_in_ls,time_in_rs,time_in_ws,...,eeg_signal_1244,eeg_signal_1245,eeg_signal_1246,eeg_signal_1247,eeg_signal_1248,eeg_signal_1249,eeg_signal_1250,curr_sleep_stage_2.0,curr_sleep_stage_3.0,SO
0,237.0,152.658761,341.523207,128.017491,429.0,11379.0,2730.0,3780.0,0.0,480.0,...,7.30548,5.344436,2.674903,-0.055816,-1.212385,-2.461937,-4.930397,0,1,0
1,176.0,146.883435,338.039773,119.130849,196.0,6721.0,2580.0,2100.0,0.0,480.0,...,3.975107,1.00024,2.870631,7.071897,7.848365,4.033517,-2.110046,1,0,1
2,456.0,152.376541,335.629386,164.29258,417.0,26832.0,3240.0,7440.0,2130.0,750.0,...,5.708701,-0.753271,-5.627993,-9.804085,-12.863908,-11.951175,-5.531799,1,0,1
3,21.0,139.720772,336.285714,159.237082,407.0,1289.0,0.0,60.0,0.0,450.0,...,6.335396,-4.008689,-8.589818,-5.876062,0.166707,6.054539,12.086351,0,1,1
4,72.0,140.649432,349.875,130.184278,297.0,2262.0,630.0,960.0,0.0,450.0,...,9.222596,21.236168,28.245889,27.024864,17.794644,7.602379,-4.548318,0,1,2


In [5]:
df_train.to_hdf('df_train_CNN.h5', key='df_train_CNN')

## Split data

In [6]:
df_train = pd.read_hdf('df_train_CNN.h5')

var_to_pred = 'SO'
X_train, X_val, y_train, y_val = train_test_split(df_train.loc[:, df_train.columns != var_to_pred],
                                                  df_train[var_to_pred],
                                                  test_size=0.10,
                                                  random_state=0,
                                                  stratify=df_train[var_to_pred])
y_train = pd.DataFrame(y_train)
y_val = pd.DataFrame(y_val)

In [7]:
from keras.utils import to_categorical

# Convert labels to categorical one-hot encoding
y_train = to_categorical(y_train.values, num_classes=3)
y_val = to_categorical(y_val.values, num_classes=3)

Using TensorFlow backend.


In [8]:
df_train.head()

Unnamed: 0,num_pso,mean_amp_pso,mean_dur_pso,amp_cso,dur_cso,time_since_sleep,time_in_ds,time_in_ls,time_in_rs,time_in_ws,...,eeg_signal_1244,eeg_signal_1245,eeg_signal_1246,eeg_signal_1247,eeg_signal_1248,eeg_signal_1249,eeg_signal_1250,curr_sleep_stage_2.0,curr_sleep_stage_3.0,SO
0,237.0,152.658761,341.523207,128.017491,429.0,11379.0,2730.0,3780.0,0.0,480.0,...,7.30548,5.344436,2.674903,-0.055816,-1.212385,-2.461937,-4.930397,0,1,0
1,176.0,146.883435,338.039773,119.130849,196.0,6721.0,2580.0,2100.0,0.0,480.0,...,3.975107,1.00024,2.870631,7.071897,7.848365,4.033517,-2.110046,1,0,1
2,456.0,152.376541,335.629386,164.29258,417.0,26832.0,3240.0,7440.0,2130.0,750.0,...,5.708701,-0.753271,-5.627993,-9.804085,-12.863908,-11.951175,-5.531799,1,0,1
3,21.0,139.720772,336.285714,159.237082,407.0,1289.0,0.0,60.0,0.0,450.0,...,6.335396,-4.008689,-8.589818,-5.876062,0.166707,6.054539,12.086351,0,1,1
4,72.0,140.649432,349.875,130.184278,297.0,2262.0,630.0,960.0,0.0,450.0,...,9.222596,21.236168,28.245889,27.024864,17.794644,7.602379,-4.548318,0,1,2


In [97]:
eeg_indexes = np.arange(10, df_train.shape[1] - 3)

X_train_eeg = X_train.iloc[:, eeg_indexes]
X_val_eeg = X_val.iloc[:, eeg_indexes]

# we feed 2 downsampled datasets + the original dataset = 3
model_train = [
    X_train_eeg.iloc[:, np.arange(0, 1250, 5)].values[:,:,None], # small
    X_train_eeg.iloc[:, np.arange(0, 1250, 2)].values[:,:,None], # medium
    X_train_eeg.values[:,:,None],                                # original
]

model_val = [
    X_val_eeg.iloc[:, np.arange(0, 1250, 5)].values[:,:,None], # small
    X_val_eeg.iloc[:, np.arange(0, 1250, 2)].values[:,:,None], # medium
    X_val_eeg.values[:,:,None],                                # original
]

In [100]:
print(model_train[0].shape)
print(model_val[0].shape)

(235470, 250, 1)
(26164, 250, 1)


In [133]:
# Inspired from https://towardsdatascience.com/how-to-use-convolutional-neural-networks-for-time-series-classification-56b1b0a07a57

from keras.layers import Conv1D, Dense, Dropout, Input, Concatenate, GlobalMaxPooling1D
from keras.models import Model, Sequential
from keras.backend import expand_dims

# Base model
# it takes a time series as an input, performs 1-D convolution
# and returns it as an output ready for concatenation
def get_base_model(input_len, kernel_size):
    # the input is a time series of length n and width 1 (1 channel)
    input_seq = Input(shape=(input_len, 1))
    
    # expanding dim because Conv1D expects a 3D vector
#     input_seq_expanded = expand_dims(input_seq, axis=-1)
    
    # choose the number of convolution filters
    nb_filters = 10

    # 1-D convolution and global max-pooling
    convolved = Conv1D(nb_filters, kernel_size, padding="same", activation="tanh")(input_seq)
#     convolved = Conv1D(nb_filters, kernel_size, padding="same", activation="tanh")(input_seq_expanded)
    processed = GlobalMaxPooling1D()(convolved)
    
    # dense layer with dropout regularization
    compressed = Dense(50, activation="tanh")(processed)
    compressed = Dropout(0.3)(compressed)
    
    model = Model(inputs=input_seq, outputs=compressed)
    return model

# Main model
# it takes the original time series and its down-sampled versions as an input
# and returns the result of classification as an output
def main_model(inputs_lens = [250, 625, 1250], kernel_sizes = [4, 8, 16]):
    # the inputs to the branches are the original time series, and its down-sampled versions
    input_smallseq = Input(shape=(inputs_lens[0], 1))
    input_medseq   = Input(shape=(inputs_lens[1], 1))
    input_origseq  = Input(shape=(inputs_lens[2], 1))

    # the more down-sampled the time series, the shorter the corresponding filter
    base_net_small    = get_base_model(inputs_lens[0], kernel_sizes[0])
    base_net_med      = get_base_model(inputs_lens[1], kernel_sizes[1])
    base_net_original = get_base_model(inputs_lens[2], kernel_sizes[2])
    
    embedding_small    = base_net_small(input_smallseq)
    embedding_med      = base_net_med(input_medseq)
    embedding_original = base_net_original(input_origseq)
    
    # concatenate all the outputs
    merged = Concatenate()([embedding_small, embedding_med, embedding_original])
    out = Dense(3, activation='softmax')(merged)
    
    model = Model(inputs=[input_smallseq, input_medseq, input_origseq], outputs=out)
    return model

In [145]:
def get_base_model(input_len, kernel_size):
    nb_filters = 10
    
    model = Sequential([
        Conv1D(nb_filters, kernel_size, padding="same", activation="relu", input_shape=(input_len, 1)),
        GlobalMaxPooling1D(),
        Dense(50, activation="relu"),
        Dropout(0.3),
    ])
    
    return model

def main_model(inputs_lens = [250, 625, 1250], kernel_sizes = [4, 8, 16]):
    input_smallseq = Input(shape=(inputs_lens[0], 1))
    input_medseq   = Input(shape=(inputs_lens[1], 1))
    input_origseq  = Input(shape=(inputs_lens[2], 1))
    
    base_net_small    = get_base_model(inputs_lens[0], kernel_sizes[0])
    base_net_med      = get_base_model(inputs_lens[1], kernel_sizes[1])
    base_net_original = get_base_model(inputs_lens[2], kernel_sizes[2])
    
    model = Sequential([
        Concatenate(input[
            base_net_small(input_smallseq),
            base_net_med(input_medseq),
            base_net_original(input_origseq),
        ]),
        Dense(3, activation="softmax"),
    ])
    
    return model

# def main_model(inputs_lens = [250, 625, 1250], kernel_sizes = [4, 8, 16]):
#     # the inputs to the branches are the original time series, and its down-sampled versions
#     input_smallseq = Input(shape=(inputs_lens[0], 1))
#     input_medseq   = Input(shape=(inputs_lens[1], 1))
#     input_origseq  = Input(shape=(inputs_lens[2], 1))

#     # the more down-sampled the time series, the shorter the corresponding filter
#     base_net_small    = get_base_model(inputs_lens[0], kernel_sizes[0])
#     base_net_med      = get_base_model(inputs_lens[1], kernel_sizes[1])
#     base_net_original = get_base_model(inputs_lens[2], kernel_sizes[2])
    
#     embedding_small    = base_net_small(input_smallseq)
#     embedding_med      = base_net_med(input_medseq)
#     embedding_original = base_net_original(input_origseq)
    
#     # concatenate all the outputs
#     merged = Concatenate()([embedding_small, embedding_med, embedding_original])
#     out = Dense(3, activation='softmax')(merged)
    
#     model = Model(inputs=[input_smallseq, input_medseq, input_origseq], outputs=out)
#     return model

In [146]:
batch_size = 32
epochs = 10

model = main_model()

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(model_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(model_val, y_val))

TypeError: The added layer must be an instance of class Layer. Found: Tensor("concatenate_34/concat:0", shape=(None, 150), dtype=float32)

- (binary entropy) Batch size 32 & 3 epochs --> Train acuracy = 0.6667
- (binary entropy) Batch size 512 & 5 epochs --> Train acuracy = 0.6815
- (binary entropy) Batch size 512 & 20 epochs --> Train acuracy = 0.6823 & Test accuracy = 0.6811
- (sigmoid) (categorical) () Batch size 128 & 8 epochs --> Train = 0.4358 & Test = 0.4359
- (sigmoid) (categorical) () Batch size 128 & 8 epochs --> Train = 0.4358 & Test = 0.4359

In [122]:
model.predict(model_val)

array([[0.38286135, 0.28085998, 0.33627862],
       [0.40009502, 0.2817061 , 0.3181989 ],
       [0.3826717 , 0.2808529 , 0.33647534],
       ...,
       [0.38605964, 0.28098685, 0.33295348],
       [0.38429302, 0.2809149 , 0.33479205],
       [0.38428128, 0.28091446, 0.33480427]], dtype=float32)

In [123]:
y_val

array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.]], dtype=float32)

In [124]:
model.summary()

Model: "model_132"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_255 (InputLayer)          (None, 250, 1)       0                                            
__________________________________________________________________________________________________
input_256 (InputLayer)          (None, 625, 1)       0                                            
__________________________________________________________________________________________________
input_257 (InputLayer)          (None, 1250, 1)      0                                            
__________________________________________________________________________________________________
model_129 (Model)               (None, 50)           5550        input_255[0][0]                  
__________________________________________________________________________________________