In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.model_selection import cross_val_score, cross_val_predict
from xgboost import XGBClassifier
import numpy as np
import itertools
import seaborn
from sklearn.metrics import roc_auc_score
from experiments.libs import loaders
from sklearn.model_selection import StratifiedKFold

from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA

from xgboost import XGBModel

from lightgbm import LGBMClassifier

import mne
from scipy.io import loadmat

%matplotlib inline



In [2]:
# Location of the Subject A training recording (MATLAB .mat file).
filepath = '/fileshare/BCI_Comp_III_Wads_2004/Subject_A_Train.mat'
# Sampling rate of the recording, in Hz; passed to the epoching helpers below.
srate = 240 # Hz
# Original note: the data were filtered 0.1 - 60 Hz
# (presumably at acquisition time -- confirm against the data set description).

In [3]:
def create_event_frame(flash_series, stimtype_series):
    """Build a per-sample event table from the flash and stimulus-type signals.

    Parameters
    ----------
    flash_series : pandas.Series
        Per-sample flashing indicator (assumed to be 0 while idle and 1 while
        a flash is on -- the epoching code looks for a change of exactly +1).
    stimtype_series : pandas.Series
        Per-sample stimulus type, sharing the index of ``flash_series``.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``flash_onset`` holds the sample-to-sample change of the
        flash signal (+1 at an onset, -1 at an offset, 0 otherwise) and
        ``stim_type`` holds ``stimtype_series`` unchanged.
    """
    flash_onset = flash_series.diff()
    if len(flash_onset) > 0:
        # diff() leaves the first sample undefined. Treat the signal as 0
        # before the recording started, so sample 0 is an onset only when the
        # recording begins with a flash already on (instead of always
        # reporting an onset there, which created a spurious event whenever
        # the recording started idle).
        flash_onset.iloc[0] = flash_series.iloc[0]
    return pd.DataFrame({'flash_onset': flash_onset,
                         'stim_type': stimtype_series})

In [4]:
def epoch_generator(eeg_data, event_df, duration=0.8, srate=240):
    """Lazily produce one ``(epoch, label)`` pair per flash onset.

    Parameters
    ----------
    eeg_data : 2-D array
        Signal indexed as ``[sample, channel]``.
    event_df : pandas.DataFrame
        Event table with ``flash_onset`` and ``stim_type`` columns; its index
        values are used directly as row positions into ``eeg_data``.
    duration : float
        Epoch length in seconds.
    srate : int
        Sampling rate in Hz.

    Yields
    ------
    tuple
        The epoch (rows starting at the onset, with the onset sample
        subtracted from every row) and the ``stim_type`` value at the onset.
    """
    n_samples = int(np.round(duration * srate))
    is_onset = event_df['flash_onset'] == 1
    for start in event_df.loc[is_onset].index:
        window = eeg_data[start:start + n_samples, :]
        # Re-reference the epoch to its first sample to remove the DC offset.
        baseline = eeg_data[start, :]
        yield window - baseline, event_df.at[start, 'stim_type']

In [5]:
def dataset_epoch_generator(filename, srate=240, duration=0.8, h_freq=18):
    """Yield ``(epoch, label)`` pairs for every run stored in a .mat recording.

    Parameters
    ----------
    filename : str
        Path of the MATLAB file; it must provide the ``Signal``, ``Flashing``
        and ``StimulusType`` arrays, each indexed by run along the first axis.
    srate : int
        Sampling rate of the recording in Hz.
    duration : float
        Epoch length in seconds (previously fixed at 0.8).
    h_freq : float
        Low-pass cutoff in Hz applied to each run before epoching
        (previously fixed at 18).

    Yields
    ------
    tuple
        Whatever ``epoch_generator`` yields for each flash onset of each run.
    """
    dataset_dict = loadmat(filename)
    num_runs = dataset_dict['Flashing'].shape[0]

    for run in range(num_runs):
        eeg_array = dataset_dict['Signal'][run, :, :]
        flash_series = pd.Series(dataset_dict['Flashing'][run, :])
        stimtype_series = pd.Series(dataset_dict['StimulusType'][run, :])
        event_df = create_event_frame(flash_series, stimtype_series)
        # The run is stored time-by-channel; transpose to channel-by-time for
        # the filter call and transpose back afterwards. l_freq=None means
        # only the low-pass edge (h_freq) is applied.
        eeg_array = mne.filter.filter_data(eeg_array.astype(np.float64).T,
                                           srate, None, h_freq,
                                           verbose=False).T
        for epoch in epoch_generator(eeg_array, event_df,
                                     duration=duration, srate=srate):
            yield epoch

In [6]:
# Build the lazy epoch generator for the whole recording ...
egen = dataset_epoch_generator(filepath, srate=srate)
# ... and materialise it: data_list holds one (epoch, label) tuple per flash onset.
data_list = list(egen)

In [10]:
# Number of extracted epochs (one per detected flash onset, across all runs).
len(data_list)

15300

In [32]:
# Feature matrix: keep every 6th time sample of each epoch (240 Hz -> 40 Hz
# effective rate) and flatten the result into one row per epoch.
X = np.array([epoch[::6].ravel() for epoch, _ in data_list])

In [33]:
# Target vector: the stimulus-type label paired with each epoch.
y = np.array([label for _, label in data_list])

In [34]:
# Sanity check: (n_epochs, n_features); each row is one flattened, down-sampled epoch.
X.shape

(15300, 2048)

In [35]:
# Sanity check: one label per epoch, so the length must match X's first dimension.
y.shape

(15300,)

In [89]:
# Feature pipeline: standardise every feature, then project onto 800 principal
# components.
pipeline_steps = [
    ('scale', StandardScaler()),
    # With svd_solver='auto', PCA can pick the randomized solver for a
    # 800-of-2048 reduction; pin the seed so repeated runs give the same
    # components and therefore the same cross-validation scores.
    ('pca', PCA(n_components=800, random_state=0)),
]
continuous_pipeline = Pipeline(steps=pipeline_steps)

In [90]:
# Named feature extractors handed to FeatureUnion; currently only the
# scale + PCA pipeline for the continuous EEG features.
featurisers = [('continuous', continuous_pipeline)]

In [124]:
# Model 1: feature extraction followed by a shallow gradient-boosted tree
# ensemble (XGBoost). scale_pos_weight=2 up-weights the positive class.
classification_pipeline = Pipeline(
    steps=[
        ('features', FeatureUnion(featurisers)),
        ('clf', XGBClassifier(
            n_estimators=100,
            max_depth=2,
            learning_rate=0.1,
            gamma=0.1,
            subsample=1,
            scale_pos_weight=2,
        )),
    ]
)

In [128]:
# 3-fold stratified split; shuffling with a fixed seed keeps the folds reproducible.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

In [129]:
%%time
# Cross-validated ROC AUC of the XGBoost pipeline; the whole pipeline
# (scaling, PCA, classifier) is refit on each training fold.
cv_results = cross_val_score(classification_pipeline, X, y, cv=cv, scoring='roc_auc')

CPU times: user 7min 9s, sys: 2min 12s, total: 9min 22s
Wall time: 57.6 s


In [130]:
# Per-fold ROC AUC scores for the XGBoost pipeline.
cv_results

array([ 0.67144304,  0.67513938,  0.67762962])

In [135]:
# Model 2: same feature extraction, with LightGBM as the classifier.
# NOTE: this rebinds classification_pipeline, replacing the XGBoost pipeline
# defined earlier in the notebook.
classification_pipeline = Pipeline(
    steps=[
        ('features', FeatureUnion(featurisers)),
        ('clf', LGBMClassifier(
            n_estimators=100,
            max_depth=2,
            learning_rate=0.1,
            subsample=1,
            scale_pos_weight=2,
        )),
    ]
)

In [136]:
# Same 3-fold stratified, seeded split as before, so both models are scored on identical folds.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

In [137]:
%%time
# Cross-validated ROC AUC of the LightGBM pipeline (overwrites the previous cv_results).
cv_results = cross_val_score(classification_pipeline, X, y, cv=cv, scoring='roc_auc')

CPU times: user 4min 13s, sys: 1min 34s, total: 5min 47s
Wall time: 42.7 s


In [138]:
# Per-fold ROC AUC scores for the LightGBM pipeline.
cv_results

array([ 0.67125979,  0.68006145,  0.67293426])