In [1]:
import numpy as np
from scipy.signal import butter, lfilter
from scipy.io import loadmat
import pandas as pd
from sklearn.model_selection  import train_test_split
from mne.decoding import CSP
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# View Data

In [2]:
mat  = loadmat("BCI/A01T.mat")

In [19]:
pd.DataFrame(mat['data'][0][3][0][0][1])  

Unnamed: 0,0
0,251
1,2254
2,4172
3,6124
4,8132
5,10243
6,12160
7,14210
8,16141
9,18139


In [20]:
pd.DataFrame(mat['data'][0][3][0][0][0]) # trials

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,0.341797,0.244141,-3.222656,-7.861328,-6.152344,-4.833984,0.976562,-6.347656,-10.595703,-11.962891,...,-10.400391,-10.302734,-7.128906,-8.544922,-7.519531,-6.982422,-3.564453,10.253906,20.507812,5.859375
1,-6.347656,-7.958984,-10.498047,-15.332031,-9.179688,-9.667969,-4.394531,-13.916016,-17.187500,-17.968750,...,-14.746094,-12.060547,-9.277344,-11.767578,-10.546875,-9.716797,-7.324219,2.441406,7.812500,-4.882812
2,-1.806641,-7.177734,-8.154297,-10.644531,-4.785156,-5.126953,-0.292969,-10.888672,-12.841797,-13.916016,...,-9.619141,-8.984375,-5.419922,-7.666016,-6.005859,-5.761719,-1.904297,6.347656,13.671875,-0.488281
3,-9.570312,-11.767578,-15.527344,-15.771484,-13.525391,-9.863281,-8.105469,-15.039062,-20.849609,-19.677734,...,-15.039062,-13.525391,-11.376953,-11.376953,-10.937500,-11.523438,-6.738281,4.394531,12.695312,-1.953125
4,-12.939453,-14.697266,-17.480469,-21.777344,-16.943359,-16.259766,-4.052734,-15.087891,-20.117188,-23.388672,...,-17.578125,-17.333984,-15.136719,-12.402344,-13.916016,-15.136719,-12.500000,1.464844,4.882812,-5.371094
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96730,15.722656,8.007812,10.986328,10.302734,13.574219,13.378906,3.466797,4.736328,8.105469,6.250000,...,5.712891,6.445312,6.054688,12.011719,9.472656,8.691406,18.554688,0.488281,54.199219,-2.929688
96731,5.078125,2.832031,1.464844,1.171875,1.855469,4.248047,1.367188,-0.048828,-2.636719,-2.832031,...,-2.392578,-2.539062,-2.441406,6.250000,2.441406,2.099609,14.208984,-8.789062,46.875000,-10.253906
96732,0.292969,0.048828,-3.076172,-2.343750,-4.052734,0.927734,-2.294922,-2.880859,-7.617188,-5.810547,...,-3.710938,-2.294922,-0.634766,7.421875,2.832031,4.687500,17.871094,-2.441406,45.898438,-4.394531
96733,-6.347656,-5.126953,-9.716797,-10.546875,-11.572266,-4.882812,-2.294922,-5.615234,-12.109375,-11.621094,...,-6.396484,-6.542969,-3.027344,4.980469,0.390625,2.978516,16.845703,-4.882812,38.574219,-4.394531


In [245]:
pd.DataFrame(mat['data'][0][3][0][0][2]) # labels

Unnamed: 0,0
0,4
1,3
2,2
3,1
4,1
5,2
6,3
7,4
8,2
9,3


# Preprocessing

In [21]:
# Band Pass Filter with low cut of 4 hz and high cut of 38 hz
# This choice is because of the fact that motor imagery features generally happen in alpha and beta band of EEG.
def butter_bandpass (lowcut, highcut, fs, order=6):
    nyq = 0.5 * fs
    low = lowcut / nyq
    high = highcut / nyq
    b, a = butter(order, [low, high], btype='band')
    return b, a


def butter_bandpass_filter (signal, lowcut, highcut, fs, order=6):
    b, a = butter_bandpass(lowcut, highcut, fs, order=order)
    y = lfilter(b, a, signal)
    return y


def Preprocessing(path):
    # load data
    mat  = loadmat(path)
    data = mat['data']
    
    finalResult_x = np.empty([1, 22, 1500-500]) # signals
    finalResult_y = np.empty([1])              # labels
    N  = data.shape[1]              # number of samples


    # iteration on motor imagery task sessions (left hand, right hand, feet, tongue)
    # iterative from 3 to 8 => 6 runs 
    # each run callect 48 trials
    # then 6*48 = 288 trials
    
    # The signals were obtained using 22 electrodes for EEG and 3 electrodes for EOG => 22+3 = 25 channel
    # each signal have 25 channel and number of signals is ~ 10000
    
    for j in range(3,N):
        samples = data[0][j][0][0][0] # whole signal of the session
        trials  = data[0][j][0][0][1] # indices of successive trials
        labels  = data[0][j][0][0][2] # labels of corresponding task

        # iteration on tasks in each session
        for i in range(48):
            
            # we interested on signals between trials so we store it in x variable 
            if i < 47:
                x = samples[trials[i,0]:trials[i+1,0]]
            else:
                x = samples[trials[i,0]:]
            
            # remove the last 3 channels (EOG) and transpose the matrix
            x = x[500: 1500, 0: -3].T

            # apply band pass filter
            x = butter_bandpass_filter(
                signal  = x,
                lowcut  = 4,
                highcut = 38,
                fs      = 250,
                order   = 6
            )
            
            # normalize
            x = (x - np.mean(x))/np.std(x)

            x = np.expand_dims(x, axis=0)
            y = np.array([labels[i,0]])

            finalResult_x = np.concatenate((finalResult_x, x),axis=0)
            finalResult_y = np.concatenate((finalResult_y, y),axis=0)

    return finalResult_x[1:], finalResult_y[1:] # first sample is empty
    
    

In [101]:
data, labels = Preprocessing('BCI/A03T.mat')
data2, labels2 = Preprocessing('BCI/A07T.mat')

data_test, labels_test = Preprocessing('BCI/A03E.mat')

In [87]:
np.array(labels).shape

(288,)

In [88]:
# convert labels from list of float to list of int 
labels=[int(x) for x in labels]
labels_test=[int(x) for x in labels_test]
labels2=[int(x) for x in labels2]

labels=np.array(labels)
labels2=np.array(labels2)
labels_test=np.array(labels_test)

# Feature Extraction

In [89]:
train_data=np.concatenate((data,data2))

In [90]:
train_labels=np.concatenate((labels,labels2))

In [91]:
train_labels.shape

(576,)

In [92]:
train_data.shape

(576, 22, 1000)

In [171]:
csp =CSP(n_components=4)
csp.fit(train_data,train_labels)
train_data_csp= csp.transform(train_data)

csp.fit(data_test,labels_test)
data_test_csp  = csp.fit_transform(data_test)

Computing rank from data with rank=None
    Using tolerance 7.9 (2.2e-16 eps * 22 dim * 1.6e+15  max singular value)
    Estimated rank (mag): 22
    MAG: rank 22 computed from 22 data channels with 0 projectors
Reducing data rank from 22 -> 22
Estimating covariance using EMPIRICAL
Done.
Computing rank from data with rank=None
    Using tolerance 7.9 (2.2e-16 eps * 22 dim * 1.6e+15  max singular value)
    Estimated rank (mag): 22
    MAG: rank 22 computed from 22 data channels with 0 projectors
Reducing data rank from 22 -> 22
Estimating covariance using EMPIRICAL
Done.
Computing rank from data with rank=None
    Using tolerance 7.9 (2.2e-16 eps * 22 dim * 1.6e+15  max singular value)
    Estimated rank (mag): 22
    MAG: rank 22 computed from 22 data channels with 0 projectors
Reducing data rank from 22 -> 22
Estimating covariance using EMPIRICAL
Done.
Computing rank from data with rank=None
    Using tolerance 7.8 (2.2e-16 eps * 22 dim * 1.6e+15  max singular value)
    Estimated ra

In [164]:
pd.DataFrame(train_data_csp)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.733600,-0.177453,-0.426954,-0.163532,-0.210341,0.720986,0.239567,0.054083,0.313896,-0.107626
1,-0.615410,0.308617,-0.452539,-0.457616,0.090186,0.658806,0.407262,-0.121972,-0.001217,0.158775
2,-0.796599,0.364215,-0.167576,-0.468015,0.222380,0.767567,0.459140,0.072891,0.363212,0.257564
3,1.327256,-0.542011,-0.146094,0.108523,-0.208823,0.727262,0.461565,0.275923,0.119646,-0.021262
4,-0.839230,-0.086894,-0.008300,-0.374210,0.098287,0.589875,0.313578,-0.042497,0.187630,-0.019570
...,...,...,...,...,...,...,...,...,...,...
571,-0.138794,0.141265,-0.081002,0.094717,0.087938,-0.316946,0.089228,0.206999,0.093415,-0.259403
572,0.157916,0.444683,0.387248,0.436527,0.280027,-0.173717,0.199442,0.433964,0.248878,-0.327285
573,-0.210566,0.262181,0.533352,0.008891,0.029210,-0.277333,0.027725,0.159828,0.299822,-0.564756
574,-0.645045,0.148998,-0.037759,-0.141174,0.037075,0.004279,-0.003817,0.337132,0.173641,-0.387868


# Split data

In [165]:
train_data_csp.shape,train_labels.shape

((576, 10), (576,))

In [166]:
x_train,x_test ,y_train,y_test = train_test_split(train_data_csp,train_labels,test_size = 0.25 , random_state =42 , shuffle = True,stratify = train_labels )
x_train.shape,x_test.shape ,y_train.shape,y_test.shape

((432, 10), (144, 10), (432,), (144,))

# Classifiers

In [167]:
randomforest =RandomForestClassifier().fit(x_train,y_train)
y_te_random = randomforest.predict(x_test)

clf = svm.SVC().fit(x_train, y_train)
y_te_svm = clf.predict(x_test)

knn = KNeighborsClassifier(n_neighbors=3).fit(x_train, y_train)
knn_pred = knn.predict(x_test)

lm = LogisticRegression(multi_class='ovr', solver='liblinear').fit(x_train, y_train)
lm_pred = lm.predict(x_test)

print("Random Forst =",accuracy_score(y_te_random,y_test)*100)
print("SVM =",accuracy_score(y_te_svm,y_test)*100)
print("KNN =",accuracy_score(knn_pred,y_test)*100)
print("Logistic Regression =",accuracy_score(lm_pred,y_test)*100)

Random Forst = 76.38888888888889
SVM = 79.16666666666666
KNN = 75.0
Logistic Regression = 80.55555555555556


In [168]:
print(classification_report(y_test,y_te_random))
print(classification_report(y_test,lm_pred))


              precision    recall  f1-score   support

           1       0.74      0.69      0.71        36
           2       0.79      0.72      0.75        36
           3       0.77      0.75      0.76        36
           4       0.76      0.89      0.82        36

    accuracy                           0.76       144
   macro avg       0.76      0.76      0.76       144
weighted avg       0.76      0.76      0.76       144

              precision    recall  f1-score   support

           1       0.82      0.89      0.85        36
           2       0.87      0.75      0.81        36
           3       0.78      0.78      0.78        36
           4       0.76      0.81      0.78        36

    accuracy                           0.81       144
   macro avg       0.81      0.81      0.81       144
weighted avg       0.81      0.81      0.81       144



In [169]:
data_test_csp.shape

(288, 10)

In [170]:
y_val_random = randomforest.predict(data_test_csp)
y_val_svm = clf.predict(data_test_csp)
knn_val_pred = knn.predict(data_test_csp)
lm_val_pred = lm.predict(data_test_csp)

print("Random Forst =",accuracy_score(y_val_random,labels_test)*100)
print("SVM =",accuracy_score(y_val_svm,labels_test)*100)
print("KNN =",accuracy_score(knn_val_pred,labels_test)*100)
print("Logistic Regression =",accuracy_score(lm_val_pred,labels_test)*100)

Random Forst = 59.02777777777778
SVM = 59.02777777777778
KNN = 52.77777777777778
Logistic Regression = 65.97222222222221


# Save model

In [261]:
import joblib

# save
#joblib.dump(randomforest, "models/my_random_forest.joblib")



['my_random_forest.joblib']

In [262]:

# load model
loaded_rf = joblib.load("models/my_random_forest.joblib")

y_val_random = randomforest.predict(data_test_csp)
print("Random Forst =",accuracy_score(y_val_random,labels_test)*100)


Random Forst = 68.40277777777779
