In [1]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from tools.data import DreemDatasets
from preprocessing import Compose, ExtractBands, ExtractSpectrum

In [2]:
# Names of the EEG channels to keep: eeg_1 through eeg_7.
use_datasets = ['eeg_{}'.format(channel) for channel in range(1, 8)]

# Seed passed to DreemDatasets so the train/validation split is repeatable.
seed = 1

## Des transformations

In [3]:
# One independent ExtractBands transform per EEG channel (eeg_1 .. eeg_7).
dataset_transform_extract_bands = {
    "eeg_{}".format(channel): ExtractBands()
    for channel in range(1, 8)
}

In [4]:
# Per-channel pipeline (eeg_1 .. eeg_7): band extraction followed by a spectrum
# computed with window=100. Each channel gets its own transform instances.
dataset_transform_spectrum = {
    "eeg_{}".format(channel): Compose([ExtractBands(), ExtractSpectrum(window=100)])
    for channel in range(1, 8)
}

## Générer le dataset

Si les données ont déjà été générées, sauter cette étape.

### Données brutes
Avec directement tous les eeg

In [5]:
# Build the train/validation split (80% train) over the raw EEG signals,
# without any transform, and cache both parts on disk.
train_set, val_set = DreemDatasets(
    'dataset/train.h5', 'dataset/train_y.csv',
    split_train_val=0.8, seed=seed, keep_datasets=use_datasets,
).get()

try:
    train_set.save_data("dataset/all_eegs/train")
    val_set.save_data("dataset/all_eegs/val")
finally:
    # Only closes the h5 files. If the data was loaded in memory, it stays accessible.
    # Done in `finally` so the files are released even when saving fails or the
    # cell is interrupted part-way through.
    train_set.close()
    val_set.close()

Saving into dataset/all_eegs/train ...
Loading dataset eeg_1 ...


KeyboardInterrupt: 

### Séparés en bandes

In [None]:
# Same split as for the raw data, but every channel goes through the
# band-extraction transform before being cached on disk.
train_set, val_set = DreemDatasets(
    'dataset/train.h5', 'dataset/train_y.csv',
    split_train_val=0.8, seed=seed, keep_datasets=use_datasets,
    transforms=dataset_transform_extract_bands,
).get()

try:
    train_set.save_data("dataset/eegs_bands/train")
    val_set.save_data("dataset/eegs_bands/val")
finally:
    # Only closes the h5 files. If the data was loaded in memory, it stays accessible.
    # Done in `finally` so the files are released even when saving fails or the
    # cell is interrupted part-way through.
    train_set.close()
    val_set.close()

### Spectre sur les données séparées en bandes

In [6]:
# Same split again, this time applying band extraction followed by the
# spectrum transform to every channel before caching the result on disk.
train_set, val_set = DreemDatasets(
    'dataset/train.h5', 'dataset/train_y.csv',
    split_train_val=0.8, seed=seed, keep_datasets=use_datasets,
    transforms=dataset_transform_spectrum,
).get()

try:
    train_set.save_data("dataset/eegs_band_spectrum/train")
    val_set.save_data("dataset/eegs_band_spectrum/val")
finally:
    # Only closes the h5 files. If the data was loaded in memory, it stays accessible.
    # Done in `finally` so the files are released even when saving fails or the
    # cell is interrupted part-way through.
    train_set.close()
    val_set.close()

Saving into dataset/eegs_band_spectrum/train ...
Loading dataset eeg_1 ...
Apply transformations...
Applied.
Loading dataset eeg_2 ...
Apply transformations...
Applied.
Loading dataset eeg_3 ...
Apply transformations...
Applied.
Loading dataset eeg_4 ...
Apply transformations...
Applied.
Loading dataset eeg_5 ...
Apply transformations...
Applied.
Loading dataset eeg_6 ...
Apply transformations...
Applied.
Loading dataset eeg_7 ...
Apply transformations...
Applied.
Saved.
Saving into dataset/eegs_band_spectrum/val ...
Loading dataset eeg_1 ...
Apply transformations...
Applied.
Loading dataset eeg_2 ...
Apply transformations...
Applied.
Loading dataset eeg_3 ...
Apply transformations...
Applied.
Loading dataset eeg_4 ...
Apply transformations...
Applied.
Loading dataset eeg_5 ...
Apply transformations...
Applied.
Loading dataset eeg_6 ...
Apply transformations...
Applied.
Loading dataset eeg_7 ...
Apply transformations...
Applied.
Saved.


## Charger les données en mémoire

In [5]:
# Recreate the same split (same seed and channels) and fill it from the raw
# EEG data previously saved under dataset/all_eegs.
train_set, val_set = DreemDatasets(
    'dataset/train.h5', 'dataset/train_y.csv',
    split_train_val=0.8, seed=seed, keep_datasets=use_datasets,
    verbose=False,
).get()

try:
    train_set.load_data("dataset/all_eegs/train")
    val_set.load_data("dataset/all_eegs/val")
finally:
    # Only closes the h5 files. If the data was loaded in memory, it stays accessible.
    # Done in `finally` so the files are released even when loading fails.
    train_set.close()
    val_set.close()

Loading data in memory...
5412 in 7 datasets to load
Loading dataset eeg_1 ...
Loading dataset eeg_2 ...
Loading dataset eeg_3 ...
Loading dataset eeg_4 ...
Loading dataset eeg_5 ...
Loading dataset eeg_6 ...
Loading dataset eeg_7 ...
Done.
Loading data in memory...
1353 in 7 datasets to load
Loading dataset eeg_1 ...
Loading dataset eeg_2 ...
Loading dataset eeg_3 ...
Loading dataset eeg_4 ...
Loading dataset eeg_5 ...
Loading dataset eeg_6 ...
Loading dataset eeg_7 ...
Done.


## Simple random forest sur les données brutes

In [7]:
# Baseline model: a 100-tree random forest with a fixed random_state so that
# repeated fits on the same data give the same forest.
clf = RandomForestClassifier(
    random_state=0,
    n_estimators=100,
)

In [6]:
# Pull everything out of the in-memory datasets; the middle element returned
# by the slice is not used in this notebook.
X, _, y = train_set[:]
X_val, _, y_val = val_set[:]

# Move axis 1 to the front so that it becomes the row axis, then flatten all
# remaining axes into one feature vector per row. The row count is taken from
# the array itself rather than from a hard-coded 7*1500 feature width, so a
# different number of signals or signal length cannot silently mis-shape rows.
X = X.transpose((1, 0, 2))
X = X.reshape(X.shape[0], -1)

X_val = X_val.transpose((1, 0, 2))
X_val = X_val.reshape(X_val.shape[0], -1)

In [8]:
# Train the forest on the flattened raw signals. As the last expression of the
# cell, the value returned by fit() is what the notebook displays below.
clf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [9]:
# Predict the validation labels once and score them three ways.
labels_pred = clf.predict(X_val)

acc = accuracy_score(y_val, labels_pred)
f1 = f1_score(y_val, labels_pred, average='macro')  # unweighted mean over classes
cm = confusion_matrix(y_val, labels_pred)

# Confusion matrix first, then accuracy and macro-F1.
print(cm, acc, f1)

[[110  43  42  41  41]
 [ 63  72  52  41  57]
 [ 23  17  89  70  59]
 [ 30  12  44 170  11]
 [ 25  27  75  24 115]] 0.4109386548410939 0.404781216562207


## En utilisant les données séparées en bandes

In [10]:
# Recreate the same split (same seed and channels) and fill it from the
# band-separated data previously saved under dataset/eegs_bands.
train_set, val_set = DreemDatasets(
    'dataset/train.h5', 'dataset/train_y.csv',
    split_train_val=0.8, seed=seed, keep_datasets=use_datasets,
    verbose=False,
).get()

try:
    train_set.load_data("dataset/eegs_bands/train")
    val_set.load_data("dataset/eegs_bands/val")
finally:
    # Only closes the h5 files. If the data was loaded in memory, it stays accessible.
    # Done in `finally` so the files are released even when loading fails.
    train_set.close()
    val_set.close()

In [11]:
# Fresh, unfitted forest for the band-separated features; same settings as the
# raw-data baseline (100 trees, fixed random_state).
clf = RandomForestClassifier(
    random_state=0,
    n_estimators=100,
)

In [14]:
# Pull everything out of the in-memory datasets; the middle element returned
# by the slice is not used in this notebook.
X, _, y = train_set[:]
X_val, _, y_val = val_set[:]

# Move axis 1 to the front so that it becomes the row axis.
X = X.transpose((1, 0, 2, 3))
X_val = X_val.transpose((1, 0, 2, 3))

# Flatten all remaining axes into one feature vector per row. The row count is
# taken from the array itself rather than from a hard-coded 7*4*1500 feature
# width, so a change in channels, bands or signal length cannot silently
# mis-shape the rows.
X = X.reshape(X.shape[0], -1)
X_val = X_val.reshape(X_val.shape[0], -1)

# Show the resulting matrix shapes as a sanity check.
print(X.shape)
print(X_val.shape)

(5412, 42000)
(1353, 42000)


In [15]:
# Train the forest on the flattened band-separated signals. As the last
# expression of the cell, the value returned by fit() is displayed below.
clf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [16]:
# Predict the validation labels once and score them three ways.
labels_pred = clf.predict(X_val)

acc = accuracy_score(y_val, labels_pred)
f1 = f1_score(y_val, labels_pred, average='macro')  # unweighted mean over classes
cm = confusion_matrix(y_val, labels_pred)

# Confusion matrix first, then accuracy and macro-F1.
print(cm, acc, f1)

[[137  38  49  23  30]
 [ 65  64  53  42  61]
 [ 27  27  78  65  61]
 [ 27  17  42 174   7]
 [ 41  24  59  26 116]] 0.4205469327420547 0.41196910424927663


## En utilisant les bandes en fréquence

In [7]:
# Recreate the same split (same seed and channels) and fill it from the
# per-band spectrum data previously saved under dataset/eegs_band_spectrum.
train_set, val_set = DreemDatasets(
    'dataset/train.h5', 'dataset/train_y.csv',
    split_train_val=0.8, seed=seed, keep_datasets=use_datasets,
    verbose=False,
).get()

try:
    train_set.load_data("dataset/eegs_band_spectrum/train")
    val_set.load_data("dataset/eegs_band_spectrum/val")
finally:
    # Only closes the h5 files. If the data was loaded in memory, it stays accessible.
    # Done in `finally` so the files are released even when loading fails.
    train_set.close()
    val_set.close()

In [8]:
# Fresh, unfitted forest for the spectrum features; same settings as the two
# previous experiments (100 trees, fixed random_state).
clf = RandomForestClassifier(
    random_state=0,
    n_estimators=100,
)

In [11]:
# Pull everything out of the in-memory datasets; the middle element returned
# by the slice is not used in this notebook.
X, _, y = train_set[:]
X_val, _, y_val = val_set[:]

# The spectrum data is 5-dimensional; move axis 2 to the front so that it
# becomes the row axis, keeping the other axes in their original order.
X = X.transpose((2, 0, 1, 3, 4))
X_val = X_val.transpose((2, 0, 1, 3, 4))

# Flatten all remaining axes into one feature vector per row. The row count is
# taken from the array itself rather than from a hard-coded 7*4*1500 feature
# width, so a different spectrum window or channel count cannot silently
# mis-shape the rows.
X = X.reshape(X.shape[0], -1)
X_val = X_val.reshape(X_val.shape[0], -1)

In [12]:
# Train the forest on the flattened per-band spectrum features. As the last
# expression of the cell, the value returned by fit() is displayed below.
clf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [13]:
# Predict the validation labels once and score them three ways.
labels_pred = clf.predict(X_val)

acc = accuracy_score(y_val, labels_pred)
f1 = f1_score(y_val, labels_pred, average='macro')  # unweighted mean over classes
cm = confusion_matrix(y_val, labels_pred)

# Confusion matrix first, then accuracy and macro-F1.
print(cm, acc, f1)

[[163  43  24  15  32]
 [ 61  77  50  30  67]
 [ 25  20  96  64  53]
 [ 14  11  41 195   6]
 [ 31  23  54  23 135]] 0.49223946784922396 0.4826248025434654
