# Test on epirest data

In [9]:
import logging
import warnings
from dataclasses import asdict

import chika
from imblearn.metrics import geometric_mean_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import cohen_kappa_score, roc_auc_score
from spikebench import load_allen, load_fcx1, load_retina, load_epirest
from spikebench.helpers import set_random_seed, simple_undersampling, tsfresh_vectorize, tsfresh_vectorize_spike_count

# Suppress every warning raised from this point on (nothing is shown, whatever the category).
warnings.filterwarnings('ignore')
# Show INFO-level log records; the vectorization step reports its start at this level.
logging.basicConfig(level=logging.INFO)

@chika.config
class ModelConfig:
    """Hyperparameters unpacked into ``RandomForestClassifier`` via ``asdict``."""

    # Number of trees in the forest.
    n_estimators: int = 200
    # Maximum depth allowed for each tree.
    max_depth: int = 10

@chika.config
class Config:
    """Settings for one experiment run (data loading, balancing, features, model)."""

    # Nested classifier hyperparameters.
    model: ModelConfig
    # Seed passed to set_random_seed, the dataset loader and the classifier.
    seed: int = 0
    # Key selecting the loader: one of 'fcx1', 'retina', 'allen', 'epirest'.
    dataset: str = 'epirest'
    # When True, the training split goes through simple_undersampling.
    balance_train: bool = False
    # When True, the test split goes through simple_undersampling.
    balance_test: bool = False
    # The remaining fields are not read directly in this notebook; the whole
    # config object is handed to tsfresh_vectorize, which presumably consumes
    # them -- TODO confirm against spikebench.helpers.
    tsfresh_scale_features: bool = True
    tsfresh_remove_low_variance: bool = True
    tsfresh_feature_set: str = 'distribution_features'
    window_size: int = 200
    step_size: int = 100

# Build the experiment configuration explicitly; every value below repeats the
# default declared on Config, spelled out so the run is self-describing.
cfg = Config(
    model=ModelConfig(),
    seed=0,
    dataset='epirest',
    balance_train=False,
    balance_test=False,
    tsfresh_scale_features=True,
    tsfresh_remove_low_variance=True,
    tsfresh_feature_set='distribution_features',
    window_size=200,
    step_size=100
)

set_random_seed(cfg.seed)

# Pick the loader that matches the configured dataset name.
loader_fn = {
    'fcx1': load_fcx1,
    'retina': load_retina,
    'allen': load_allen,
    'epirest': load_epirest,
}[cfg.dataset]
X_train, X_test, y_train, y_test, gr_train, gr_test = loader_fn(random_seed=cfg.seed)

# Optional class balancing, applied independently to each split.
if cfg.balance_train:
    X_train, y_train = simple_undersampling(X_train, y_train)
if cfg.balance_test:
    X_test, y_test = simple_undersampling(X_test, y_test)

# Replace the raw windows with tsfresh feature vectors.
X_train, X_test, y_train, y_test = tsfresh_vectorize(X_train, X_test, y_train, y_test, cfg)

print(f'X_train shape: {X_train.shape}, X_test shape: {X_test.shape}')

# Train the model
model = RandomForestClassifier(
    **asdict(cfg.model),
    random_state=cfg.seed,
    n_jobs=-1,
)
model.fit(X_train, y_train)

# Evaluate the model. Predictions are computed once and shared by the two
# label-based metrics instead of calling model.predict twice.
y_score = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)
roc_auc = roc_auc_score(y_test, y_score)
gmean = geometric_mean_score(y_test, y_pred)
kappa = cohen_kappa_score(y_test, y_pred)

# Report the results
print(f'AUC ROC score value on {cfg.dataset} test set (random forest): {roc_auc}')
print(f'G-mean score value on {cfg.dataset} test set (random forest): {gmean}')
print(f'Kappa score value on {cfg.dataset} test set (random forest): {kappa}')

(8680, 200) (6546, 200) (8680,) (6546,)


42.327880859375

INFO:root:Started time series vectorization and preprocessing
Feature Extraction: 100%|██████████| 50/50 [00:03<00:00, 12.82it/s]
Feature Extraction: 100%|██████████| 50/50 [00:03<00:00, 14.75it/s]


X_train shape: (8680, 31), X_test shape: (6546, 31)
AUC ROC score value on epirest test set (random forest): 0.5
G-mean score value on epirest test set (random forest): 0.497708430511213
Kappa score value on epirest test set (random forest): 0.0


# Breakdown

# Prepare dataset

In [69]:
from spikebench.load_datasets import epirest_dataset
from sklearn.model_selection import GroupShuffleSplit
from pathlib import Path
import numpy as np
import os
import pandas as pd

# Parameters for this walkthrough (several are only used by later cells).
label_delimitation = 24
random_seed = 0
test_size = 0.3
n_samples = None
window_size = 200
step_size = 100
encoding = 'isi'
bin_size = 80
DELIMITER = ','

# The CSV files live in a sub-folder named after the label delimitation value.
dataset_path = Path('./data/epirest') / str(label_delimitation)
if not dataset_path.exists():
    raise FileNotFoundError(
        'Epirest dataset not found.'
    )

# One table per class.
preictal_spikes = epirest_dataset(dataset_path / 'preictal.csv')
interictal_spikes = epirest_dataset(dataset_path / 'interictal.csv')

display(preictal_spikes.head())
display(interictal_spikes.head())
print(f'Preictal spikes: {preictal_spikes.shape}')
print(f'Interictal spikes: {interictal_spikes.shape}')

# Stack both classes: preictal rows are labelled 1, interictal rows 0.
X = np.concatenate(
    [preictal_spikes['series'].values, interictal_spikes['series'].values]
)
y = np.concatenate(
    [np.ones(len(preictal_spikes)), np.zeros(len(interictal_spikes))]
)
groups = np.concatenate(
    [preictal_spikes['groups'].values, interictal_spikes['groups'].values]
)

print(f'X shape: {X.shape}, y shape: {y.shape}')
print(f'Groups shape: {groups.shape}')

# Group-aware split: a given group never appears on both sides.
# n_splits=1, so the generator yields exactly one (train, test) index pair.
group_split = GroupShuffleSplit(
    n_splits=1, test_size=test_size, random_state=random_seed
)
train_index, test_index = next(group_split.split(X, y, groups))
y_train, y_test = y[train_index], y[test_index]

# Keep each series next to its group label in a DataFrame per split.
X_train = pd.DataFrame({'series': X[train_index], 'groups': groups[train_index]})
X_test = pd.DataFrame({'series': X[test_index], 'groups': groups[test_index]})
display(X_train.head())
display(X_test.head())

Unnamed: 0,groups,series
0,epirest_neuron_7,"575.836181640625, 621.39892578125, 8.666992187..."
1,epirest_neuron_8,"2319.366455078125, 8.056640625, 1308.10546875,..."
2,epirest_neuron_9,"65.49072265625, 992.3095703125, 8382.8125, 101..."
3,epirest_neuron_10,"38.848876953125, 183.990478515625, 269.8974609..."
4,epirest_neuron_11,"3370.60546875, 1648.681640625, 810.08911132812..."


Unnamed: 0,groups,series
0,epirest_neuron_0,"100.64697265625, 408.8134765625, 111.663818359..."
1,epirest_neuron_1,"1295.59326171875, 412.353515625, 87.890625, 27..."
2,epirest_neuron_2,"616.88232421875, 9270.721435546875, 355.804443..."
3,epirest_neuron_3,"7.9345703125, 317.535400390625, 56.396484375, ..."
4,epirest_neuron_4,"22.308349609375, 22.491455078125, 129.57763671..."


Preictal spikes: (135, 2)
Interictal spikes: (364, 2)
X shape: (499,), y shape: (499,)
Groups shape: (499,)


Unnamed: 0,series,groups
0,"575.836181640625, 621.39892578125, 8.666992187...",epirest_neuron_7
1,"2319.366455078125, 8.056640625, 1308.10546875,...",epirest_neuron_8
2,"38.848876953125, 183.990478515625, 269.8974609...",epirest_neuron_10
3,"3370.60546875, 1648.681640625, 810.08911132812...",epirest_neuron_11
4,"13285.400390625, 9589.111328125, 21207.3059082...",epirest_neuron_12


Unnamed: 0,series,groups
0,"65.49072265625, 992.3095703125, 8382.8125, 101...",epirest_neuron_9
1,"287.078857421875, 88.07373046875, 10.650634765...",epirest_neuron_14
2,"206.48193359375, 161.865234375, 133.8806152343...",epirest_neuron_26
3,"282.379150390625, 885.009765625, 2137.05444335...",epirest_neuron_38
4,"1188.41552734375, 11.71875, 9.09423828125, 898...",epirest_neuron_89


# Basic normalization

In [66]:
import numpy as np
import spikebench.transforms as transforms
from spikebench.encoders import DFSpikeTrainTransform, TrainBinarizationTransform

# Inputs: X_train / X_test / y_train / y_test as produced by the dataset
# preparation cell. This cell overwrites them, so re-run that cell before
# running this one a second time.
delimiter = ','
encoding = 'isi'
window_size = 200
step_size = 100
n_samples = None
bin_size = 80


def _windows_to_series_frame(windows):
    """Render each row of a 2-D array as one space-separated string in a 'series' column."""
    return pd.DataFrame(
        {'series': [' '.join(str(v) for v in row) for row in windows]}
    )


# Cut the spike trains into windows; the same transform is applied to both splits.
normalizer = transforms.TrainNormalizeTransform(
    window=window_size, step=step_size, n_samples=n_samples
)
X_train, y_train, groups_train = normalizer.transform(
    X_train, y_train, delimiter=delimiter
)
print(f'X_train shape: {X_train.shape}, y_train shape: {y_train.shape}, groups_train shape: {groups_train.shape}')
print(f'label distribution: {np.unique(y_train, return_counts=True)}')
X_test, y_test, groups_test = normalizer.transform(
    X_test, y_test, delimiter=delimiter
)

# Only the 'sce' encoding needs the extra binarization pass; with the
# 'isi' setting above this branch is skipped.
if encoding == 'sce':
    binarizer = TrainBinarizationTransform(bin_size=bin_size)
    X_train = binarizer.transform(_windows_to_series_frame(X_train))
    X_test = binarizer.transform(_windows_to_series_frame(X_test))

# Results: X_train, X_test, y_train, y_test, groups_train, groups_test

X_train shape: (26952, 200), y_train shape: (26952,), groups_train shape: (26952,)
label distribution: (array([0., 1.]), array([21553,  5399]))


In [51]:
# Toy parameters: non-overlapping windows of five elements.
window_size, step_size = 5, 5

# Toy signal holding the integers 1 through 10.
a = np.arange(1, 11)

# Seed row of zeros that later rows get stacked under.
normalized_train = np.zeros((window_size,))
print(normalized_train)

def rolling_window(a, window, step):
    """Split a 1-D array into fixed-length windows taken every ``step`` items.

    Args:
        a: 1-D NumPy array to split.
        window: Number of elements per window (expected to be positive).
        step: Offset between the starts of consecutive windows (expected to
            be positive).

    Returns:
        Array of shape ``(n_chunks, window)`` with
        ``n_chunks = (len(a) - window) // step + 1``; trailing elements that
        do not fill a complete window are dropped. ``None`` when ``a`` is too
        short to hold a single window.
    """
    n_chunks = (a.shape[0] - window) // step + 1
    if n_chunks <= 0:
        # No complete window fits: keep signalling that with None, as before.
        return None
    # Decide on the number of windows, not on their values: the former
    # `split_chunks.any()` test also returned None for an all-zero signal.
    # Direct slices give the same rows as np.roll(a, -start)[:window] because
    # start + window never exceeds len(a), without copying the whole array
    # for every window.
    return np.vstack(
        [a[step * index:step * index + window] for index in range(n_chunks)]
    )

# Window the first toy signal and inspect the result.
split_chunks = rolling_window(a, window_size, step_size)
print(split_chunks, split_chunks.shape, sep='\n')

# Stack the new windows under the rows collected so far.
normalized_train = np.vstack((normalized_train, split_chunks))
print(normalized_train, normalized_train.shape, sep='\n')

# One label (1.0) per window of the first signal.
target = np.ones(split_chunks.shape[0])
print(target)

# Second toy signal: the integers 11 through 20, windowed the same way.
b = np.arange(11, 21)
new_split_chunks = rolling_window(b, window_size, step_size)
print(new_split_chunks)

normalized_train = np.vstack((normalized_train, new_split_chunks))
print(normalized_train, normalized_train.shape, sep='\n')

# Drop the all-zero seed row that only existed to start the stack.
normalized_train = normalized_train[1:]
print(normalized_train, normalized_train.shape, sep='\n')

# Stacking an already 2-D array row by row leaves it unchanged.
restacked = np.vstack(normalized_train)
print(restacked, restacked.shape, sep='\n')

[0. 0. 0. 0. 0.]
[[ 1  2  3  4  5]
 [ 6  7  8  9 10]]
(2, 5)
[[ 0.  0.  0.  0.  0.]
 [ 1.  2.  3.  4.  5.]
 [ 6.  7.  8.  9. 10.]]
(3, 5)
[1. 1.]
[[11 12 13 14 15]
 [16 17 18 19 20]]
[[ 0.  0.  0.  0.  0.]
 [ 1.  2.  3.  4.  5.]
 [ 6.  7.  8.  9. 10.]
 [11. 12. 13. 14. 15.]
 [16. 17. 18. 19. 20.]]
(5, 5)
[[ 1.  2.  3.  4.  5.]
 [ 6.  7.  8.  9. 10.]
 [11. 12. 13. 14. 15.]
 [16. 17. 18. 19. 20.]]
(4, 5)
[[ 1.  2.  3.  4.  5.]
 [ 6.  7.  8.  9. 10.]
 [11. 12. 13. 14. 15.]
 [16. 17. 18. 19. 20.]]
(4, 5)


# Test: re-run the random forest baseline on the epirest dataset

In [22]:
# Import the required libraries
import logging
import warnings
from dataclasses import asdict

import chika
from imblearn.metrics import geometric_mean_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import cohen_kappa_score, roc_auc_score
from spikebench import load_allen, load_fcx1, load_retina
from spikebench.helpers import set_random_seed, simple_undersampling, tsfresh_vectorize

# Suppress every warning raised from this point on (nothing is shown, whatever the category).
warnings.filterwarnings('ignore')
# Show INFO-level log records; the vectorization step reports its start at this level.
logging.basicConfig(level=logging.INFO)

# Define the model configuration
@chika.config
class ModelConfig:
    """Hyperparameters unpacked into ``RandomForestClassifier`` via ``asdict``."""

    # Number of trees in the forest.
    n_estimators: int = 200
    # Maximum depth allowed for each tree.
    max_depth: int = 10

# Define the general configuration
@chika.config
class Config:
    """Settings for one experiment run (data loading, balancing, features, model)."""

    # Nested classifier hyperparameters.
    model: ModelConfig
    # Seed passed to set_random_seed, the dataset loader and the classifier.
    seed: int = 0
    # Key selecting the loader: one of 'fcx1', 'retina', 'allen', 'epirest'.
    dataset: str = 'epirest'
    # When True, the training split goes through simple_undersampling.
    balance_train: bool = False
    # When True, the test split goes through simple_undersampling.
    balance_test: bool = False
    # The remaining fields are not read directly in this notebook; the whole
    # config object is handed to tsfresh_vectorize, which presumably consumes
    # them -- TODO confirm against spikebench.helpers.
    tsfresh_scale_features: bool = True
    tsfresh_remove_low_variance: bool = True
    tsfresh_feature_set: str = 'distribution_features'
    window_size: int = 200
    step_size: int = 100

# Load the configuration (every value repeats the default declared on Config)
cfg = Config(
    model=ModelConfig(),
    seed=0,
    dataset='epirest',
    balance_train=False,
    balance_test=False,
    tsfresh_scale_features=True,
    tsfresh_remove_low_variance=True,
    tsfresh_feature_set='distribution_features',
    window_size=200,
    step_size=100
)

set_random_seed(cfg.seed)

# This cell's own import line leaves out load_epirest; importing it here keeps
# the cell runnable without relying on names left behind by an earlier cell.
from spikebench import load_epirest

# Load the data
loader_fn = {
    'fcx1': load_fcx1,
    'retina': load_retina,
    'allen': load_allen,
    'epirest': load_epirest,
}[cfg.dataset]
X_train, X_test, y_train, y_test, gr_train, gr_test = loader_fn(random_seed=cfg.seed)

# Balance the datasets if requested
if cfg.balance_train:
    X_train, y_train = simple_undersampling(X_train, y_train)
if cfg.balance_test:
    X_test, y_test = simple_undersampling(X_test, y_test)

# Preprocess the data (tsfresh feature extraction)
X_train, X_test, y_train, y_test = tsfresh_vectorize(X_train, X_test, y_train, y_test, cfg)

# Train the model
model = RandomForestClassifier(
    **asdict(cfg.model),
    random_state=cfg.seed,
    n_jobs=-1,
)
model.fit(X_train, y_train)

# Evaluate the model. Predictions are computed once and shared by the two
# label-based metrics instead of calling model.predict twice.
y_score = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)
roc_auc = roc_auc_score(y_test, y_score)
gmean = geometric_mean_score(y_test, y_pred)
kappa = cohen_kappa_score(y_test, y_pred)

# Report the results
print(f'AUC ROC score value on {cfg.dataset} test set (random forest): {roc_auc}')
print(f'G-mean score value on {cfg.dataset} test set (random forest): {gmean}')
print(f'Kappa score value on {cfg.dataset} test set (random forest): {kappa}')

INFO:root:Started time series vectorization and preprocessing
Feature Extraction: 100%|██████████| 50/50 [00:08<00:00,  6.18it/s]
Feature Extraction: 100%|██████████| 50/50 [00:04<00:00, 11.87it/s]


AUC ROC score value on epirest test set (random forest): 0.7018561316464829
G-mean score value on epirest test set (random forest): 0.2753861364476074
Kappa score value on epirest test set (random forest): 0.09495868550219166


In [73]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code while loading; only open
# files that were produced by this project.
# NOTE(review): the path is absolute and machine-specific; consider building it
# from a configurable base directory.
# The context manager closes the file handle, which the inline open() never did.
with open(r'/Users/anthonypinto/Documents/These/git/spikebench/bin/tsfresh_features_epirest_simple_baseline.bin', 'rb') as features_file:
    data = pickle.load(features_file)
data

(         value__abs_energy  value__mean  value__median  value__maximum  \
 0.0               1.975995     1.944457       1.396913        2.228016   
 1.0               1.842754     1.833572       1.040507        2.225846   
 2.0               1.071235     1.534704       0.857052        1.708665   
 3.0               0.935080     1.481393       0.883686        1.701797   
 4.0               0.883857     1.503783       0.875302        2.054136   
 ...                    ...          ...            ...             ...   
 26947.0          -0.196850     0.286590       0.472808        0.388865   
 26948.0          -0.181972     0.270470       0.449306        0.390778   
 26949.0           0.027485     0.215518       0.512743       -0.023935   
 26950.0           0.143021     0.289113       0.563544        0.353705   
 26951.0          -0.099589     0.428003       0.588974        0.353685   
 
          value__standard_deviation  
 0.0                       1.442532  
 1.0                  

In [77]:
# First element of the unpickled object: the tsfresh feature table.
data[0]

Unnamed: 0,value__abs_energy,value__mean,value__median,value__maximum,value__standard_deviation
0.0,1.975995,1.944457,1.396913,2.228016,1.442532
1.0,1.842754,1.833572,1.040507,2.225846,1.528021
2.0,1.071235,1.534704,0.857052,1.708665,1.002900
3.0,0.935080,1.481393,0.883686,1.701797,0.891034
4.0,0.883857,1.503783,0.875302,2.054136,0.745654
...,...,...,...,...,...
26947.0,-0.196850,0.286590,0.472808,0.388865,0.551316
26948.0,-0.181972,0.270470,0.449306,0.390778,0.587037
26949.0,0.027485,0.215518,0.512743,-0.023935,0.986303
26950.0,0.143021,0.289113,0.563544,0.353705,1.154570


In [78]:
# Build a labelled copy rather than adding the column through an alias of
# data[0]: the unpickled feature table stays untouched, so cells that display
# data[0] keep showing features only and this cell is safe to re-run.
df_tsfresh = data[0].assign(label=data[1])
df_tsfresh.head()

Unnamed: 0,value__abs_energy,value__mean,value__median,value__maximum,value__standard_deviation,label
0.0,1.975995,1.944457,1.396913,2.228016,1.442532,1.0
1.0,1.842754,1.833572,1.040507,2.225846,1.528021,1.0
2.0,1.071235,1.534704,0.857052,1.708665,1.0029,1.0
3.0,0.93508,1.481393,0.883686,1.701797,0.891034,1.0
4.0,0.883857,1.503783,0.875302,2.054136,0.745654,1.0


In [88]:
# NOTE(review): `path_to_file` is not defined in any cell visible in this
# notebook; define it in a configuration cell before running this one.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
# Stored order is (X_train, y_train, X_test, y_test), which differs from the
# loader order used earlier in the notebook.
with open(path_to_file, 'rb') as split_file:
    X_train, y_train, X_test, y_test = pickle.load(split_file)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((26952, 691), (26952,), (10488, 691), (10488,))

In [82]:
# NOTE(review): `data_full` is not defined in any cell visible in this
# notebook; it presumably holds a (features, labels) pair -- confirm where it
# is loaded.
# Build a labelled copy so data_full[0] itself does not gain a 'label' column.
df_tsfresh_full = data_full[0].assign(label=data_full[1])
df_tsfresh_full.shape

(26952, 692)

In [84]:
# Group label (neuron id) table for the test split, read from a CSV export.
# Presumably one row per test window -- confirm against X_test's row count.
# NOTE(review): absolute, machine-specific path; consider a configurable base directory.
test_group = pd.read_csv(r'/Users/anthonypinto/Documents/These/git/spikebench/csv/epirest_test_groups.csv')
test_group

Unnamed: 0,0
0,epirest_neuron_9
1,epirest_neuron_9
2,epirest_neuron_9
3,epirest_neuron_9
4,epirest_neuron_9
...,...
10483,epirest_neuron_510
10484,epirest_neuron_510
10485,epirest_neuron_510
10486,epirest_neuron_510


In [85]:
# Group label (neuron id) table for the training split, read from a CSV export.
# Presumably one row per training window -- confirm against X_train's row count.
# NOTE(review): absolute, machine-specific path; consider a configurable base directory.
train_group = pd.read_csv(r'/Users/anthonypinto/Documents/These/git/spikebench/csv/epirest_train_groups.csv')
train_group

Unnamed: 0,0
0,epirest_neuron_7
1,epirest_neuron_7
2,epirest_neuron_7
3,epirest_neuron_7
4,epirest_neuron_7
...,...
26947,epirest_neuron_512
26948,epirest_neuron_512
26949,epirest_neuron_512
26950,epirest_neuron_512
