# Preparation of data set A from BCI Competition 2008

In [1]:
#Load relevant packages
import scipy.io as sio
import numpy as np
import pandas as pd
from sklearn.decomposition import FastICA
from sklearn import svm
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
def extract_data(subject, training, artifactius = False):
    '''
    Load the trials of one recording session of one subject.

    The file 'A0<subject>T.mat' (training) or 'A0<subject>E.mat' (evaluation) is
    read from the current working directory. For every kept trial a window of
    win_length samples, starting offset samples after the trial's start index,
    is cut from the first n_channels columns of the run's signal matrix.

    input:
        subject     - integer in range 1,..,9. Specifies from which subject to load data.
        training    - boolean. Specifies if training file or evaluation file should be loaded.
        artifactius - boolean. If True, trials flagged as artifacts are kept as well;
                      otherwise only trials whose artifact flag is 0 are returned.
    output:
        tuple: first entry contains a 3d float array of signal data with shape
               n_trial x n_channels x signal length.
               second entry contains a float vector with the class of each returned trial.
    raises:
        ValueError if the window of a kept trial runs past the end of its run.
    '''
    n_channels = 22    #Corresponding to the 22 EEG channels
    offset = 3*250     #start the window 3 seconds (at 250 samples per second) after the trial marker
    win_length = 3*250 #stop after 3 seconds of recording

    session = 'T' if training else 'E'
    Z = sio.loadmat('A0'+str(subject)+session+'.mat')
    Z_data = Z["data"]

    # Collected in lists so the number of trials in the file is not limited up front.
    windows = []
    labels = []
    for i in range(Z_data.size):
        run = Z_data[0,i][0,0]

        # Fields are read by position. NOTE(review): presumably ordered
        # X, trial, y, fs, classes, artifacts, gender, age - confirm against the files.
        Z_X = run[0]
        # Flatten the column vectors so indexing yields scalars; int() on a
        # size-1 array is deprecated in recent NumPy versions.
        Z_trial = np.ravel(run[1])
        Z_y = np.ravel(run[2])
        Z_artifacts = np.ravel(run[5])

        for trial in range(Z_trial.size):
            if Z_artifacts[trial] == 0 or artifactius:
                # NOTE(review): the start index is used as a 0-based row index; if the
                # files store 1-based MATLAB indices the window is shifted by one sample.
                start = int(Z_trial[trial]) + offset
                window = Z_X[start:start+win_length, :n_channels]
                if window.shape != (win_length, n_channels):
                    raise ValueError(
                        'trial %d of run %d: window has shape %s, expected %s'
                        % (trial, i, window.shape, (win_length, n_channels))
                    )
                windows.append(np.transpose(window))
                labels.append(int(Z_y[trial]))

    # Copy into float arrays; an empty session gives arrays with zero trials.
    data_return = np.zeros((len(windows), n_channels, win_length))
    for k, window in enumerate(windows):
        data_return[k,:,:] = window
    class_return = np.array(labels, dtype=float)

    return data_return, class_return

Try extracting data from the training session of subject 1.

In [3]:
# Load the training session of subject 1 (artifact trials are excluded by default)
# and show the shapes of the signal array and of the class vector.
testData = extract_data(1,True)
print(testData[0].shape)
print(testData[1].shape)

(273, 22, 750)
(273,)


## Classification on raw data to sanity check

In [4]:
# Band-power features: the logarithm of each channel's signal variance.
def bandpower_feat(Data):
    """
    Compute log-variance features for a 3d array of trials.

    input:
        Data - 3d array; the variance is taken along the last axis.
    output:
        2d float array of shape Data.shape[:2] holding the natural logarithm
        of the variance for every entry of the first two axes.
    """
    n_trials, n_channels, _ = Data.shape
    features = np.empty((n_trials, n_channels))
    # Variance along the last axis for all trials at once, then the logarithm.
    features[...] = np.log(np.var(Data, axis=2))
    return features

In [5]:
# Log-variance features of the unfiltered trials, one value per trial and channel.
# NOTE(review): despite its name, `bandwidths` holds band-power features.
bandwidths = bandpower_feat(testData[0])

Try fitting a classifier, e.g. an SVM (scikit-learn's `SVC`, which uses an RBF kernel by default)

In [6]:
# SVC with default settings apart from the one-vs-one decision function shape;
# no kernel is passed, so scikit-learn's default kernel is used.
clf = svm.SVC(decision_function_shape='ovo')
clf.fit(bandwidths, testData[1])

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

Predict on the same data to see if data is loaded sensibly

In [7]:
# Fraction of correctly classified trials on the very data the model was fitted on.
predicts = clf.predict(bandwidths)
np.mean(predicts==testData[1])

0.4542124542124542

Well above chance, so the data should be loaded correctly. (Note that this is accuracy on the training data itself, not a held-out estimate.)

# Try bandpass filtering

Setup parameters for filtering

In [8]:
# Coefficients (b, a) of a Butterworth band-pass filter. butter() receives the
# cutoffs normalised by the Nyquist frequency. b and a are reused by later cells.
from scipy.signal import butter, lfilter
lowcut = 8    # Lower frequency cutoff
highcut = 30  # Upper frequency cutoff
fs = 250      # sampling frequency
order = 3     # order argument passed to butter()
nyq = 0.5 * fs        # Nyquist frequency: half the sampling frequency
low = lowcut / nyq    # lower cutoff as a fraction of the Nyquist frequency
high = highcut / nyq  # upper cutoff as a fraction of the Nyquist frequency
b, a = butter(order, [low, high], btype='band')

Define a function that concatenates signals

In [9]:
def reshape_signal(arr):
    """
    Concatenate all trials of a 3d array along the last axis.

    input:
        arr - 3d array of shape n_trials x n_channels x signal length.
    output:
        new 2d array of shape n_channels x (n_trials * signal length); the
        samples of trial k occupy columns k*length .. (k+1)*length - 1.
        An input with zero trials gives an array with zero columns.
    """
    n_trials, n_channels, n_samples = arr.shape
    # Bring the channel axis to the front and make a C-ordered copy, so the
    # result never shares memory with the input; the reshape then joins the
    # trial and time axes.
    by_channel = np.swapaxes(arr, 0, 1).copy()
    return by_channel.reshape((n_channels, n_trials*n_samples))

Try on the testdata

In [10]:
# Concatenate all trials of subject 1's training session along the time axis.
sig = reshape_signal(testData[0])

In [11]:
# Band-pass filter every row of the concatenated signal (lfilter works along the last axis by default).
sig_filter = lfilter(b, a, sig)

In [12]:
# Undo reshape_signal: split the concatenated signal back into trials
# (n_trials x 22 channels x 750 samples). Guarded so that re-running the cell
# does not reshape the already 3d array again, which would silently scramble it.
if sig_filter.ndim == 2:
    sig_filter = np.stack((sig_filter).reshape(22,sig.shape[1]//750,750),axis=1)

In [13]:
# Log-variance features of the band-pass filtered trials (overwrites the features of the raw data).
bandwidths = bandpower_feat(sig_filter)

In [14]:
# Refit the same classifier configuration on the features of the filtered data.
clf = svm.SVC(decision_function_shape='ovo')
clf.fit(bandwidths, testData[1])

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [15]:
# Accuracy on the training data, now with features of the band-pass filtered signal.
predicts = clf.predict(bandwidths)
np.mean(predicts==testData[1])

0.608058608058608

This significantly improves the prediction accuracy on the training data.

# CAR

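Define functions for CAR (common average referencing) and for projection onto the orthogonal complement of the common-average direction (the null space of the all-ones vector)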

In [16]:
def car(samples):
    """
    Common average reference: remove the mean over the first axis.

    input:
        samples - array whose first axis has length d (e.g. channels x time).
    output:
        the d x d centering matrix I - (1/d) * ones applied to samples from the
        left; for 2d input this subtracts, from every column, its mean.
    """
    n_channels = samples.shape[0]
    averaging = np.ones((n_channels, n_channels)) / n_channels
    projector = np.identity(n_channels) - averaging
    return np.dot(projector, samples)

# Basis of the null space of A, obtained from its singular value decomposition.
def null(A, eps=1e-15):
    """
    Return rows spanning the null space of the matrix A.

    input:
        A   - 2d array.
        eps - singular values less than or equal to eps count as zero.
    output:
        2d array whose rows are the rows of the SVD's third factor that belong
        to a singular value <= eps or to no singular value at all.
    """
    _, sing_vals, vh = np.linalg.svd(A)
    # A has more columns than singular values when it is wider than tall; the
    # surplus rows of vh always belong to the null space.
    n_surplus = max(0, np.shape(A)[1] - np.shape(sing_vals)[0])
    always_null = np.ones((n_surplus,), dtype=bool)
    in_null_space = np.concatenate([sing_vals <= eps, always_null], axis=0)
    return np.compress(in_null_space, vh, axis=0)


def carcomplement(samples):
    """
    Project samples onto the null space of the all-ones row vector.

    input:
        samples - array whose first axis has length d.
    output:
        null(ones((1, d))) applied to samples from the left; the basis has
        d - 1 rows, so the first axis of the result has length d - 1.
    """
    n_channels = samples.shape[0]
    ones_row = np.ones((1, n_channels))
    basis = null(ones_row)
    return np.dot(basis, samples)

Try again with CAR signals

In [17]:
# Apply common average referencing to the concatenated (unfiltered) signal.
sig_car = car(sig)

In [31]:
# Same pipeline as before, but on the CAR signal: filter, split into trials,
# compute log-variance features, then fit and score on the training data.
sig_filter = lfilter(b, a, sig_car)
# Drop the row with index 21 (the last of the 22 channels), leaving 21 rows.
# NOTE(review): presumably because the CAR channels sum to zero and are
# therefore linearly dependent - confirm.
sig_filter = np.delete(sig_filter, (21), axis=0)
sig_filter = np.stack((sig_filter).reshape(21,sig.shape[1]//750,750),axis=1)
bandwidths = bandpower_feat(sig_filter)
clf = svm.SVC(decision_function_shape='ovo')
clf.fit(bandwidths, testData[1])

predicts = clf.predict(bandwidths)
np.mean(predicts==testData[1])

0.7106227106227107

# Cleaning all data

In [23]:
def _clean_session(subject, training, filt, art):
    """Load one session and return its projected (optionally filtered) trials and classes."""
    trials, classes = extract_data(subject, training, art)
    win_length = trials.shape[2]

    #Concatenate signals over trials and project onto the complement of the common average
    sig = carcomplement(reshape_signal(trials))

    #Filter with the module-level coefficients b, a.
    #NOTE(review): the filter runs over the concatenation of all trials, so its
    #state carries over from the end of one trial into the start of the next.
    if filt:
        sig = lfilter(b, a, sig)

    #Split the time axis back into trials: n_trials x n_channels x signal length
    sig = sig.reshape(sig.shape[0], -1, win_length).swapaxes(0, 1)
    return sig, classes


def clean_data(subs = range(1,10),datatype="float32",filt = False,art = False):
    """
    Load and preprocess the training and evaluation session of several subjects.

    Every session is loaded with extract_data, concatenated over trials,
    projected with carcomplement, optionally filtered with the module-level
    filter coefficients b, a, and split back into trials.

    input:
        subs     - iterable of subject numbers.
        datatype - dtype the stacked signals are cast to.
        filt     - boolean. If True the projected signals are filtered.
        art      - boolean. Passed to extract_data: keep artifact trials or not.
    output:
        tuple: first entry is the array of all trials, stacked along the first
               axis (trials x channels after projection x signal length).
               second entry is the vector with the class of every trial.
               third entry is a list with the number of trials of every session,
               ordered per subject as training session, then evaluation session.
    """
    Signal_list = []
    Class_list = []
    Session_lengths = []

    for i in tqdm(subs):
        #Training session first, then the evaluation session
        for training in (True, False):
            sig, classes = _clean_session(i, training, filt, art)
            Signal_list.append(sig)
            Class_list.append(classes)
            Session_lengths.append(classes.shape[0])

    Signals = np.concatenate(Signal_list).astype(datatype)
    Classes = np.concatenate(Class_list)

    return Signals,Classes, Session_lengths

In [24]:
# Clean both sessions of subjects 1-9 with the defaults (no filtering, artifact trials excluded).
Signals,Classes,n_se = clean_data()

HBox(children=(IntProgress(value=0, max=9), HTML(value='')))




In [25]:
# Shapes of the stacked signals and classes, and the number of trials per session.
print(Signals.shape, Classes.shape,n_se)

(4696, 21, 750) (4696,) [273, 281, 270, 283, 270, 273, 262, 228, 262, 276, 219, 215, 271, 277, 264, 271, 237, 264]


In [32]:
#np.save("Signals1.npy", Signals)

In [33]:
#np.save("Classes1.npy", Classes)