In [3]:
import itertools

import mne
import numpy as np
import pandas as pd
import seaborn
from lightgbm import LGBMClassifier
from matplotlib import pyplot as plt
from scipy.io import loadmat
from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from xgboost import XGBModel

from experiments.libs import loaders
%matplotlib inline



In [4]:
# Training recording (MATLAB .mat file) that the training epochs are cut from.
filepath = '/fileshare/BCI_Comp_III_Wads_2004/Subject_A_Train.mat'
srate = 240  # sampling rate in Hz
# Author's note: the recording is already band-pass filtered to 0.1 - 60 Hz
# (not verified anywhere in this notebook).

In [5]:
def create_event_frame(flash_series, stimtype_series):
    """Build a per-sample event table from the flashing and stimulus-type traces.

    The 'flash_onset' column holds the sample-to-sample difference of
    `flash_series`; rows where it equals 1 are treated as flash onsets by
    `epoch_generator`. 'stim_type' is `stimtype_series` passed through as-is.
    """
    onsets = flash_series.diff()
    # diff() leaves NaN in the first row (no previous sample); that row is
    # unconditionally marked as an onset.
    onsets = onsets.fillna(1)
    columns = {'flash_onset': onsets, 'stim_type': stimtype_series}
    return pd.DataFrame(columns)

In [6]:
def epoch_generator(eeg_data, event_df, duration=0.8, srate=240):
    """Yield one baseline-corrected epoch and its label per flash onset.

    Parameters
    ----------
    eeg_data : 2-D array indexed as [sample, channel].
    event_df : DataFrame with 'flash_onset' and 'stim_type' columns. Its index
        labels are used directly as sample positions into `eeg_data`.
    duration : epoch length in seconds.
    srate : sampling rate in Hz.

    Yields
    ------
    (epoch, label) where `epoch` has exactly round(duration * srate) rows and
    `label` is the 'stim_type' value at the onset sample.

    Onsets too close to the end of the recording to provide a full-length
    epoch are skipped, so every yielded epoch has the same shape and the
    epochs can be stacked into a regular array.
    """
    offset = int(np.round(duration * srate))
    n_samples = eeg_data.shape[0]
    for idx in event_df[event_df['flash_onset']==1].index:
        stop = idx + offset
        if stop > n_samples:
            # Slicing would silently return a shorter epoch here.
            continue
        # Subtract the onset sample from the whole window (per-channel baseline).
        data = eeg_data[idx:stop, :] - eeg_data[idx, :]
        yield data, event_df['stim_type'].at[idx]

In [7]:
def dataset_epoch_generator(filename, srate=240):
    """Yield (epoch, label) pairs for every run stored in a training .mat file.

    For each run the 'Flashing' and 'StimulusType' traces are turned into an
    event table, the 'Signal' array is low-pass filtered at 18 Hz, and 0.8 s
    epochs are cut at every flash onset via `epoch_generator`.
    """
    mat = loadmat(filename)
    run_count = mat['Flashing'].shape[0]

    for run_idx in range(run_count):
        recording = mat['Signal'][run_idx, :, :]
        flashes = pd.Series(mat['Flashing'][run_idx, :])
        stim_types = pd.Series(mat['StimulusType'][run_idx, :])
        events = create_event_frame(flashes, stim_types)

        # The recording is time-by-channel; MNE filters along the last axis,
        # so transpose to channel-by-time for filtering and back afterwards.
        channels_by_time = recording.astype(np.float64).T
        filtered = mne.filter.filter_data(channels_by_time, srate, None, 18, verbose=False)
        smoothed = filtered.T

        for epoch in epoch_generator(smoothed, events, duration=0.8, srate=srate):
            yield epoch

In [8]:
# Materialise every (epoch, label) pair from the training file in memory.
egen = dataset_epoch_generator(filepath, srate=srate)
data_list = list(egen)

In [9]:
# Number of (epoch, label) pairs extracted from the training file.
len(data_list)

15300

In [10]:
# Keep every 6th sample of each epoch (240 Hz -> 40 Hz) and flatten the
# result into one feature vector per epoch.
X = np.array([epoch[::6].ravel() for epoch, label in data_list])

In [11]:
# Classification target: the stim_type label paired with each epoch.
y = np.array([label for epoch, label in data_list])

In [12]:
# Sanity check: (number of epochs, flattened features per epoch).
X.shape

(15300, 2048)

In [13]:
# Sanity check: one label per row of X.
y.shape

(15300,)

In [14]:
# Pre-processing for the continuous features: standardise each column.
pipeline_steps = [('scale', StandardScaler())]
continuous_pipeline = Pipeline(steps=pipeline_steps)

In [15]:
# Named transformers handed to FeatureUnion by the classifier pipelines below.
# NOTE: the list holds the `continuous_pipeline` object itself, not a copy.
featurisers = [('continuous', continuous_pipeline)]

In [16]:
# Standardised features feeding a shallow (depth-2) XGBoost classifier.
# scale_pos_weight=2 up-weights the positive class during training.
xgb_clf_pipeline = Pipeline(steps=[
    ('features', FeatureUnion(featurisers)),
    ('clf', XGBClassifier(
        max_depth=2,
        learning_rate=0.1,
        scale_pos_weight=2,
        n_estimators=100,
        gamma=0.1,
        subsample=1,
    )),
])

In [17]:
# Standardised features feeding a shallow (depth-2) LightGBM classifier.
#
# `featurisers` is also passed to the XGBoost pipeline. Handing the same list
# to a second FeatureUnion would make both pipelines share one StandardScaler
# instance, so fitting either pipeline would refit the scaler used by the
# other. Cloning gives this pipeline its own unfitted copies.
lgbm_clf_pipeline = Pipeline(steps=[
    ('features', FeatureUnion([(name, clone(transformer))
                               for name, transformer in featurisers])),
    ('clf', LGBMClassifier(
        max_depth=2,
        learning_rate=0.1,
        scale_pos_weight=2,
        n_estimators=100,
        subsample=1,
    )),
])

In [18]:
# Speller symbols in groups of six. Each symbol maps to a pair of stimulus
# codes: its position within the group (1-6) and its group number offset to
# 7-12 (presumably the column and row codes of the speller matrix).
letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ123456789_'
stim_code_translation_dict = {
    symbol: (position % 6 + 1, position // 6 + 7)
    for position, symbol in enumerate(letters)
}

In [19]:
def event_frame_from(stim_code, target_letter, stim_code_translation_dict):
    """Build a per-sample event table from a stimulus-code trace.

    Parameters
    ----------
    stim_code : pandas Series of stimulus codes, one per sample.
    target_letter : key into `stim_code_translation_dict` for the attended symbol.
    stim_code_translation_dict : maps a symbol to the codes that count as targets.

    Returns
    -------
    DataFrame with 'flash_onset' (sign of the sample-to-sample code change:
    1, -1 or 0) and 'stim_type' (1 where the code change equals one of the
    target codes, else 0).
    """
    # Sample-to-sample change in code; the first sample has no predecessor,
    # so its own code is used as the change.
    code_steps = stim_code.diff()
    code_steps = code_steps.fillna(stim_code)

    wanted_codes = stim_code_translation_dict[target_letter]
    target_mask = code_steps.isin(wanted_codes)
    target_rows = code_steps[target_mask].index

    labels = pd.Series(0, index=stim_code.index)
    labels[target_rows] = 1

    # x / |x| keeps only the sign; 0 / 0 gives NaN, which is mapped back to 0.
    direction = (code_steps / code_steps.abs()).fillna(0)

    return pd.DataFrame({'flash_onset': direction,
                         'stim_type': labels})

In [20]:
# Test recording and the text file holding its target characters
# (only the first line of the labels file is read, see read_labels).
test_filename = '/fileshare/BCI_Comp_III_Wads_2004/Subject_A_Test.mat'
labels_filename = '/fileshare/BCI_Comp_III_Wads_2004/true_labels_a.txt'

In [21]:
def read_labels(filename):
    """Return the first line of `filename` with surrounding whitespace stripped.

    Callers iterate over the returned string one character at a time, using
    each character as the target symbol of one run.
    """
    # Context manager so the file handle is closed deterministically.
    with open(filename) as label_file:
        return label_file.readline().strip()

In [22]:
def test_dataset_epoch_generator(data_filename, labels_filename, stim_code_translation_dict, srate=240):
    """Yield (epoch, label) pairs for every run of a test .mat file.

    The test file has no stimulus-type trace here, so labels are rebuilt from
    'StimulusCode' and the known target character of each run (one character
    of the labels file per run) via `event_frame_from`. The signal is low-pass
    filtered at 18 Hz and cut into 0.8 s epochs by `epoch_generator`.
    """
    mat = loadmat(data_filename)
    target_letters = read_labels(labels_filename)

    for run_idx, letter in enumerate(target_letters):
        recording = mat['Signal'][run_idx, :, :]
        codes = pd.Series(mat['StimulusCode'][run_idx, :])
        events = event_frame_from(codes, letter, stim_code_translation_dict)

        # The recording is time-by-channel; MNE filters along the last axis,
        # so transpose to channel-by-time for filtering and back afterwards.
        channels_by_time = recording.astype(np.float64).T
        filtered = mne.filter.filter_data(channels_by_time, srate, None, 18, verbose=False)
        smoothed = filtered.T

        for epoch in epoch_generator(smoothed, events, duration=0.8, srate=srate):
            yield epoch

In [23]:
# Lazy generator over the test file's (epoch, label) pairs; srate uses the
# function default of 240.
data_gen = test_dataset_epoch_generator(test_filename, labels_filename, stim_code_translation_dict)

In [24]:
# NOTE: this rebinds `data_list`, which previously held the training epochs.
# X and y were already built from it, but re-running the earlier feature
# cells after this one would silently use the test epochs instead.
data_list = list(data_gen)

In [25]:
# Same featurisation as the training set: every 6th sample, flattened.
X_test = np.array([epoch[::6].ravel() for epoch, label in data_list])

In [26]:
# Labels reconstructed from the test file's stimulus codes and true characters.
y_test = np.array([label for epoch, label in data_list])

In [27]:
# Training features = all training-file epochs plus the first 9000 test-file
# epochs; the remaining test epochs (X_test[9000:]) are held out for scoring.
# TODO: 9000 is a magic number - name it and document what it corresponds to.
X_train = np.concatenate([X, X_test[:9000]])

In [29]:
# Labels matching X_train; must use the same 9000 split point as X_train.
y_train = np.concatenate([y, y_test[:9000]])

In [30]:
%%time
# Fit the scaler and the XGBoost classifier on the combined training set.
xgb_clf_pipeline.fit(X_train, y_train)

CPU times: user 3min 43s, sys: 16.5 s, total: 3min 59s
Wall time: 13.9 s


Pipeline(steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('continuous', Pipeline(steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True))]))],
       transformer_weights=None)), ('clf', XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0...logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=2, seed=0, silent=True, subsample=1))])

In [31]:
# Score the held-out part of the test set (epochs from 9000 onwards).
# Column 1 of predict_proba is the second class in the classifier's
# `classes_` (presumably stim_type == 1 - confirm against y_train's values).
y_pred = xgb_clf_pipeline.predict_proba(X_test[9000:])
roc_auc_score(y_test[9000:], y_pred[:, 1])

0.67662137777777787

In [32]:
%%time
# Fit the scaler and the LightGBM classifier on the combined training set.
lgbm_clf_pipeline.fit(X_train, y_train)

CPU times: user 35.4 s, sys: 1.2 s, total: 36.6 s
Wall time: 6.91 s


Pipeline(steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('continuous', Pipeline(steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True))]))],
       transformer_weights=None)), ('clf', LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, drop_rate=0.1,
        is_un...   subsample_for_bin=50000, subsample_freq=1, uniform_drop=False,
        xgboost_dart_mode=False))])

In [33]:
# Same held-out evaluation as for XGBoost. NOTE: this rebinds `y_pred`, so
# the XGBoost predictions are no longer available after this cell runs.
y_pred = lgbm_clf_pipeline.predict_proba(X_test[9000:])
roc_auc_score(y_test[9000:], y_pred[:, 1])

0.67720866666666668

In [None]:
for X_chunk, y_chunk in zip()