# In [None]:
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.models import Sequential
import optuna
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D, Conv1D, Dropout, MaxPool1D, LeakyReLU
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import KFold
from scipy.interpolate import interp1d
import tensorflow as tf
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import h5py # Read and write HDF5 files from Python
import os
import numpy as np

# Locations of the challenge files under the Kaggle input directory.
data_path = "/kaggle/input/dreem-2-sleep-classification-challenge-2020/"
file_xtrain = f"{data_path}X_train.h5/X_train.h5"
file_xtest = f"{data_path}X_test.h5/X_test.h5"
file_ytrain = f"{data_path}y_train.csv"

# Peek at the training labels (shown only when this is the last
# expression of a notebook cell).
pd.read_csv(file_ytrain)

# List the names of the datasets stored in the training HDF5 file.
with h5py.File(file_xtrain, "r") as hf:
    print(list(hf.keys()))

# Read the first dataset fully into memory, then inspect its Python type
# and array shape.
with h5py.File(file_xtrain, "r") as hf:
    field = list(hf.keys())[0]
    x_data = hf[field][()]
type(x_data), x_data.shape

def normalize_data(eeg_array, clip_value=150):
    """Clip a signal to ``[-clip_value, clip_value]`` and scale it to ``[-1, 1]``.

    Values outside the clipping window saturate at -1 or 1; values inside
    it are scaled linearly, so 0 stays 0.

    Args:
        eeg_array: Array-like of signal values, any shape.
        clip_value: Positive amplitude at which the signal saturates.
            Defaults to 150, the threshold this function always used.

    Returns:
        A new floating-point ``numpy.ndarray`` with the same shape as the
        input and every value inside ``[-1, 1]``.

    Raises:
        ValueError: If ``clip_value`` is not strictly positive.
    """
    if clip_value <= 0:
        # A zero or negative window would divide by zero or flip the sign.
        raise ValueError("clip_value must be strictly positive")

    return np.clip(eeg_array, -clip_value, clip_value) / clip_value

def _resample_to_length(signal, target_len):
    """Linearly resample a 2-D ``signal`` along axis 1 to ``target_len`` points."""
    src_len = signal.shape[1]
    if src_len == target_len:
        return signal
    resampler = interp1d(np.arange(src_len), signal, axis=1)
    # Evaluate over the WHOLE source grid (0 .. src_len - 1). Sampling only
    # a sub-range such as 0..30 would stretch the first few source points
    # over the full output and discard the rest of the recording.
    return resampler(np.linspace(0, src_len - 1, num=target_len))


def split_data(input_signals_list, validation_ratio=0.2):
    """Load the selected signals and labels and split them into train/validation.

    Each name in ``input_signals_list`` is read as a dataset from the HDF5
    file at the module-level path ``file_xtrain``. Labels come from the
    ``sleep_stage`` column of the CSV at ``file_ytrain``.

    With several signals, they are stacked along a trailing axis into an
    array of shape ``(n_rows, 1500, n_signals)``; any signal whose second
    dimension is not 1500 is linearly interpolated to 1500 points first.
    With a single signal, the dataset is returned as stored in the file
    (no trailing signal axis is added and no resampling is done).

    Rows are shuffled with NumPy's global random state, so the split is
    only reproducible if the caller seeds ``np.random`` beforehand. Both
    feature arrays are passed through ``normalize_data``.

    Args:
        input_signals_list: Names of the HDF5 datasets to load.
        validation_ratio: Fraction of rows placed in the validation set.

    Returns:
        Tuple ``(x_train, y_train, x_validation, y_validation)``.

    Raises:
        ValueError: If the number of signal rows differs from the number
            of labels.
    """
    target_len = 1500  # points per row that the stacked array holds

    with h5py.File(file_xtrain, "r") as fi:
        if len(input_signals_list) == 1:
            x_data = fi[input_signals_list[0]][()]
        else:
            # Take the row count from the file instead of hard-coding it.
            n_rows = fi[input_signals_list[0]].shape[0]
            x_data = np.zeros([n_rows, target_len, len(input_signals_list)])
            for i, name in enumerate(input_signals_list):
                x_data[:, :, i] = _resample_to_length(fi[name][()], target_len)

    y_data = pd.read_csv(file_ytrain)['sleep_stage'].to_numpy()
    dataset_size = len(y_data)
    if len(x_data) != dataset_size:
        raise ValueError(
            f"signal rows ({len(x_data)}) and labels ({dataset_size}) differ"
        )

    # Shuffle row indices, then cut them into a train part and a validation part.
    indices = list(range(dataset_size))
    split = int((1 - validation_ratio) * dataset_size)
    np.random.shuffle(indices)
    train_indices, val_indices = indices[:split], indices[split:]

    x_train, x_validation = x_data[train_indices], x_data[val_indices]
    y_train, y_validation = y_data[train_indices], y_data[val_indices]

    x_train, x_validation = normalize_data(x_train), normalize_data(x_validation)

    return x_train, y_train, x_validation, y_validation
# Signals fed to the network. Other combinations that were tried:
#   ['eeg_4','eeg_1']
#   ['eeg_1', 'eeg_2', 'eeg_4', 'eeg_5', 'eeg_6', 'eeg_7','y','x']
input_signals_list = [
    'eeg_2',
    'eeg_3',
    'eeg_4',
    'eeg_5',
    'y',
]

# Load, shuffle, split and normalise the selected signals.
(x_train, y_train,
 x_validation, y_validation) = split_data(input_signals_list)




# --- Training hyper-parameters -------------------------------------------
no_classes = 5
no_epochs = 20
batch_size = 32
num_folds = 3
verbosity = 1
loss_function = sparse_categorical_crossentropy
optimizer = Adam(learning_rate=0.001)

# --- Per-fold result accumulators ----------------------------------------
acc_per_fold, loss_per_fold = [], []

# --- Data used for cross-validation --------------------------------------
# Re-join the train and validation parts so that K-fold draws its own
# splits from the complete set.
inputs = np.concatenate([x_train, x_validation], axis=0)
targets = np.concatenate([y_train, y_validation], axis=0)

# Shuffled K-fold splitter with `num_folds` partitions.
kfold = KFold(n_splits=num_folds, shuffle=True)

# K-fold Cross Validation model evaluation.
# Each fold builds a brand-new network, trains it on the fold's training
# indices, evaluates it on the held-out indices, and records loss/accuracy.
fold_no = 1
for train, test in kfold.split(inputs, targets):

    # Define the model architecture: five strided 1-D convolutions over the
    # (1500, n_signals) input, followed by two dense layers and a softmax
    # over `no_classes` outputs.
    model = Sequential()
    model.add(Conv1D(128, kernel_size=7, strides=2, input_shape=(1500, len(input_signals_list))))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Conv1D(128, kernel_size=7, strides=2))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Dropout(0.5))
    model.add(Conv1D(128, kernel_size=7, strides=2))
    model.add(LeakyReLU(alpha=0.1))
    model.add(MaxPool1D(4))
    model.add(Conv1D(256, kernel_size=7, strides=2))
    model.add(Conv1D(128, kernel_size=7, strides=2))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Dropout(0.5))
    model.add(Flatten())
    model.add(Dense(256))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Dense(128))
    model.add(LeakyReLU(alpha=0.1))
    model.add(Dense(no_classes, activation='softmax'))

    # Give every fold its own optimizer, configured like the shared
    # `optimizer`. Reusing one optimizer instance for several models carries
    # its internal state (step counter, moment estimates) from fold to fold,
    # and recent Keras versions raise an error when an optimizer is applied
    # to variables it was not built for.
    fold_optimizer = type(optimizer).from_config(optimizer.get_config())

    # Compile the model
    model.compile(loss=loss_function,
                  optimizer=fold_optimizer,
                  metrics=['accuracy'])

    # Generate a print
    print('------------------------------------------------------------------------')
    print(f'Training for fold {fold_no} ...')

    # Fit data to model
    history = model.fit(inputs[train], targets[train],
                        batch_size=batch_size,
                        epochs=no_epochs,
                        verbose=verbosity)

    # Generate generalization metrics on the held-out part of this fold.
    # `scores` is indexed below as [loss, accuracy].
    scores = model.evaluate(inputs[test], targets[test], verbose=0)
    print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
    acc_per_fold.append(scores[1] * 100)
    loss_per_fold.append(scores[0])

    # Increase fold number
    fold_no = fold_no + 1

# == Cross-validation report ==
# First one line per fold, then the mean accuracy (with its standard
# deviation) and the mean loss over all folds.
print('------------------------------------------------------------------------')
print('Score per fold')
for fold_idx, (fold_loss, fold_acc) in enumerate(zip(loss_per_fold, acc_per_fold), start=1):
    print('------------------------------------------------------------------------')
    print(f'> Fold {fold_idx} - Loss: {fold_loss} - Accuracy: {fold_acc}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
mean_accuracy = np.mean(acc_per_fold)
accuracy_spread = np.std(acc_per_fold)
mean_loss = np.mean(loss_per_fold)
print(f'> Accuracy: {mean_accuracy} (+- {accuracy_spread})')
print(f'> Loss: {mean_loss}')
print('------------------------------------------------------------------------')