# CyberBrain: Cybersecurity in BCI for Advanced Driver Assistance
## Milestone MS3: Framework to detect and measure the cyberattacks impact.
#### University of Murcia, Spain

In [1]:
import pickle
import mne
import numpy as np
import pandas as pd
import threading

## Data acquisition

In [None]:
!wget -O dataset/p300-umu https://univmurcia-my.sharepoint.com/:u:/g/personal/enriquetomas_um_es/EUqf9NlxBC9HudvMJ4FUvJMBcU4ngGDug5bobNka_p9FwQ?e=Rx1lc3

zsh:1: no matches found: https://univmurcia-my.sharepoint.com/:u:/g/personal/enriquetomas_um_es/EUqf9NlxBC9HudvMJ4FUvJMBcU4ngGDug5bobNka_p9FwQ?e=Rx1lc3


In [6]:
dataset="dataset/p300-umu"

In [7]:

with open(dataset, 'rb') as file:
    data = pickle.load(file)

In [None]:
from framework_acquisition import *
t = threading.Thread(name='framework_acquisition', target=acquire_signals(), args=(data,))
t.start()

## Data processing

In [8]:
subjects = {}
for i, d in enumerate(data):
    subjects[f'Subject {i}'] = d

In [9]:
subjects

{'Subject 0': <EpochsArray |  3982 events (all good), 0 - 0.585938 sec, baseline off, ~37.0 MB, data loaded,
  'neg': 3413
  'pos': 569>,
 'Subject 1': <EpochsArray |  3916 events (all good), 0 - 0.585938 sec, baseline off, ~36.4 MB, data loaded,
  'neg': 3362
  'pos': 554>,
 'Subject 2': <EpochsArray |  2053 events (all good), 0 - 0.585938 sec, baseline off, ~19.1 MB, data loaded,
  'neg': 1760
  'pos': 293>,
 'Subject 3': <EpochsArray |  6516 events (all good), 0 - 0.585938 sec, baseline off, ~60.5 MB, data loaded,
  'neg': 5589
  'pos': 927>,
 'Subject 4': <EpochsArray |  3396 events (all good), 0 - 0.585938 sec, baseline off, ~31.5 MB, data loaded,
  'neg': 2912
  'pos': 484>,
 'Subject 5': <EpochsArray |  3975 events (all good), 0 - 0.585938 sec, baseline off, ~36.9 MB, data loaded,
  'neg': 3404
  'pos': 571>,
 'Subject 6': <EpochsArray |  1163 events (all good), 0 - 0.585938 sec, baseline off, ~10.8 MB, data loaded,
  'neg': 871
  'pos': 292>,
 'Subject 7': <EpochsArray |  1174 

In [19]:
target = subjects['Subject 1']['pos']
nonTarget = subjects['Subject 1']['neg']

In [20]:
target_data = subjects['Subject 1']['pos'].get_data()
nontarget_data = subjects['Subject 1']['neg'].get_data()


In [21]:
target_data.shape

(554, 16, 76)

In [22]:
nontarget_data.shape

(3362, 16, 76)

In [23]:
alldata = X = np.concatenate([target_data, nontarget_data])

In [24]:
alldata.shape

(3916, 16, 76)

In [25]:
nontarget_data = nontarget_data[:593][:][:]

In [26]:
X, Y = [], []

X = np.concatenate([target_data, nontarget_data])
Y = np.concatenate([np.ones(target_data.shape[0]), np.zeros(nontarget_data.shape[0])])

In [27]:
Y.shape

(1147,)

In [28]:
X.shape

(1147, 16, 76)

## Data classification

In [None]:
from sklearn.model_selection import train_test_split

# Split dataset in training and testing
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.10, random_state=42,shuffle=True)

In [None]:
X_train.shape, X_test.shape

((1032, 16, 76), (115, 16, 76))

In [None]:
y_train.shape, y_test.shape

((1032,), (115,))

In [None]:
X.shape, Y.shape

((1147, 16, 76), (1147,))

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from mne.decoding import Vectorizer
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import classification_report

# Instancia validador cruzado
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Instancia clasificador
clf = make_pipeline(Vectorizer(), StandardScaler(), LogisticRegression(solver='liblinear', C=1, class_weight="balanced"))

# Proceso de validación cruzada
preds = np.empty(len(Y))
for train, test in cv.split(X, Y):
    clf.fit(X[train], Y[train]) # ajustar
    preds[test] = clf.predict(X[test]) #

# Info del proceso
target_names = ['NoTarget', 'Target']
report = classification_report(Y, preds, target_names=target_names)
print(report)

              precision    recall  f1-score   support

    NoTarget       0.78      0.77      0.78       593
      Target       0.76      0.76      0.76       554

    accuracy                           0.77      1147
   macro avg       0.77      0.77      0.77      1147
weighted avg       0.77      0.77      0.77      1147



In [None]:
from sklearn.model_selection import GridSearchCV

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

param_grid = {'C': [0.1,1,10], 'gamma': [0.1,0.01],'kernel': ['rbf', 'linear']}
#clf = SVC(kernel='linear', C=1, class_weight = "balanced")
svc = SVC()
clf = GridSearchCV(svc, param_grid)

nsamples, nx, ny = X.shape
d2_train_dataset = X.reshape((nsamples,nx*ny))

# Proceso de validación cruzada
preds = np.empty(len(Y))
for train, test in cv.split(d2_train_dataset, Y):
    clf.fit(d2_train_dataset[train], Y[train]) # ajustar
    preds[test] = clf.predict(d2_train_dataset[test]) #

# Info del proceso
target_names = ['NoTarget', 'Target']
report = classification_report(Y, preds, target_names=target_names)
print(report)

In [None]:
from sklearn.ensemble import RandomForestClassifier

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

clf = RandomForestClassifier(max_depth=2, random_state=0)

nsamples, nx, ny = X.shape
d2_train_dataset = X.reshape((nsamples,nx*ny))

# Proceso de validación cruzada
preds = np.empty(len(Y))
for train, test in cv.split(d2_train_dataset, Y):
    clf.fit(d2_train_dataset[train], Y[train]) # ajustar
    preds[test] = clf.predict(d2_train_dataset[test]) #

# Info del proceso
target_names = ['NoTarget', 'Target']
report = classification_report(Y, preds, target_names=target_names)
print(report)

              precision    recall  f1-score   support

    NoTarget       0.69      0.81      0.74       593
      Target       0.75      0.60      0.67       554

    accuracy                           0.71      1147
   macro avg       0.72      0.71      0.71      1147
weighted avg       0.72      0.71      0.71      1147



## PRUEBAS CON NO SUPERVISADOS

In [11]:
contamination_factor=0.05

In [12]:
from sklearn.model_selection import train_test_split

# Split dataset in training and testing
X_clean, X_noise = train_test_split(X, test_size=0.50, random_state=42,shuffle=False)

In [13]:
X_clean_train, X_clean_test = train_test_split(X_clean, test_size=0.10, random_state=42,shuffle=False)

In [19]:
from pyod.models.iforest import IForest

#clf = OCSVM(kernel='rbf',gamma=0.0001, nu=0.3, contamination=contamination_factor)
clf = IForest(random_state=42, contamination=contamination_factor)

nsamples, nx, ny = X_clean_train.shape
d2_train_clean_dataset = X_clean_train.reshape((nsamples,nx*ny))

nsamples, nx, ny = X_clean_test.shape
d2_test_clean_dataset = X_clean_test.reshape((nsamples,nx*ny))
# Model training
clf.fit(d2_train_clean_dataset)

IForest(behaviour='old', bootstrap=False, contamination=0.05,
    max_features=1.0, max_samples='auto', n_estimators=100, n_jobs=1,
    random_state=42, verbose=0)

## Clasificador entrenado sin ruido prediciendo datos sin ruido

In [20]:
pred=clf.predict(d2_test_clean_dataset)
unique_elements, counts_elements = np.unique(pred, return_counts=True)
print("\t",unique_elements,"    ",counts_elements)

	 [0 1]      [50  8]


In [21]:
from matplotlib import pyplot as plt

# max_noise = 0.000000001 # imperceptible a nivel de gráfica
max_noise = 60 # perceptible a nivel de gráfica
# Se crean muestras parametrizadas de una distribución normal (gaussiana) para generar ruido en la señal
noise = np.random.normal(0.0, max_noise, X_noise.shape)

print (noise[:50]) # muestra con los primeros 50 datos

[[[  -6.53015869   92.0860832   -46.75859684 ...   18.94242017
    -29.76866507  110.57891308]
  [ -18.51835213   -1.1854269   -10.956164   ...  -23.14063196
     23.56420454  -34.61911915]
  [  77.03099044  -23.18887769  -22.27294066 ...   60.01265066
     -0.63109637   -2.10043766]
  ...
  [ -19.48227968   63.5393721    39.51911301 ...   54.29562978
   -112.75854704   34.91757016]
  [ -89.01850594  126.14737807   88.88634331 ...  105.37971778
     68.27720412  -34.48569179]
  [  14.43809266  -37.70635697  -12.37470283 ...   -2.04012648
     -8.33414971  -51.98083302]]

 [[ -26.38269244  -17.61638872   34.72787825 ...  -36.93124892
    152.78673524   51.21041514]
  [ -42.98287436  -87.0475182   -40.19286665 ...   18.10124207
    -42.48757432   91.01048778]
  [ -30.29669154  -20.74235882  -46.10832565 ...   14.20207116
    -61.08407644   48.91134349]
  ...
  [  -3.55697494  -99.89629985   -8.63897478 ...  -21.87627399
    -27.13066509  -79.56213866]
  [  27.71137859  -10.83226059  -66.

In [22]:
X_noise = X_noise + noise

In [23]:
X_noise_train, X_noise_test = train_test_split(X_noise, test_size=0.10, random_state=42,shuffle=False)

In [26]:
from pyod.models.iforest import IForest

#clf = OCSVM(kernel='rbf',gamma=0.0001, nu=0.3, contamination=contamination_factor)
clfNoise = IForest(random_state=42, contamination=contamination_factor)


nsamples, nx, ny = X_noise_train.shape
d2_train_noise_dataset = X_noise_train.reshape((nsamples,nx*ny))

nsamples, nx, ny = X_noise_test.shape
d2_test_noise_dataset = X_noise_test.reshape((nsamples,nx*ny))

# Model training
clfNoise.fit(d2_train_noise_dataset)


IForest(behaviour='old', bootstrap=False, contamination=0.05,
    max_features=1.0, max_samples='auto', n_estimators=100, n_jobs=1,
    random_state=42, verbose=0)

## Clasificador entrenado con ruido prediciendo datos con ruido

In [28]:
pred=clfNoise.predict(d2_test_noise_dataset)
unique_elements, counts_elements = np.unique(pred, return_counts=True)
print("\t",unique_elements,"    ",counts_elements)

	 [0 1]      [53  5]


## JUNTAMOS CLEAN + RUIDO

In [47]:
dataset_test_concatenado = np.concatenate([X_clean_test, X_noise_test])

nsamples, nx, ny = dataset_test_concatenado.shape
dataset_test_concatenado = dataset_test_concatenado.reshape((nsamples,nx*ny))

## Resultados de clasificador entrenado sin ruido al predecir Clean+Ruido

In [49]:
pred=clf.predict(dataset_test_concatenado)
unique_elements, counts_elements = np.unique(pred, return_counts=True)
print("\t",unique_elements,"    ",counts_elements)

	 [0 1]      [50 66]


In [34]:
X_noise_test.shape

(58, 16, 76)

In [35]:
X_clean_test.shape

(58, 16, 76)

## Resultados de clasificador entrenado con ruido al predecir Clean+Ruido

In [33]:
pred=clfNoise.predict(dataset_test_concatenado)
unique_elements, counts_elements = np.unique(pred, return_counts=True)
print("\t",unique_elements,"    ",counts_elements)

	 [0 1]      [111   5]


## Entrenamiento con datos combinados

In [45]:
#clf = OCSVM(kernel='rbf',gamma=0.0001, nu=0.3, contamination=contamination_factor)
clfComb = IForest(random_state=42, contamination=contamination_factor)

dataset_train_concatenado = np.concatenate([X_clean_train, X_noise_train])

nsamples, nx, ny = dataset_train_concatenado.shape
dataset_train_concatenado = dataset_train_concatenado.reshape((nsamples,nx*ny))

# Model training
clfComb.fit(dataset_train_concatenado)

IForest(behaviour='old', bootstrap=False, contamination=0.05,
    max_features=1.0, max_samples='auto', n_estimators=100, n_jobs=1,
    random_state=42, verbose=0)

In [48]:
pred=clfComb.predict(dataset_test_concatenado)
unique_elements, counts_elements = np.unique(pred, return_counts=True)
print("\t",unique_elements,"    ",counts_elements)

	 [0 1]      [115   1]


## Pruebas con clasificador binario

In [83]:
# max_noise = 0.000000001 # imperceptible a nivel de gráfica
max_noise = 0.000000001 # perceptible a nivel de gráfica
# Se crean muestras parametrizadas de una distribución normal (gaussiana) para generar ruido en la señal
noise = np.random.normal(0.0, max_noise, target_data.shape)

## Insertamos ruido solo a target

In [84]:
X, Y = [], []

X = np.concatenate([target_data+noise, nontarget_data])
Y = np.concatenate([np.ones(target_data.shape[0]), np.zeros(nontarget_data.shape[0])])

In [85]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import classification_report

# Instancia validador cruzado
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

nsamples, nx, ny = X.shape
d2_train_dataset = X.reshape((nsamples,nx*ny))

# Instancia clasificador
clf = make_pipeline(StandardScaler(),RandomForestClassifier(max_depth=2, random_state=0))

# Proceso de validación cruzada
preds = np.empty(len(Y))
for train, test in cv.split(d2_train_dataset, Y):
    clf.fit(d2_train_dataset[train], Y[train]) # ajustar
    preds[test] = clf.predict(d2_train_dataset[test]) #

# Info del proceso
target_names = ['NoRuido', 'Ruido']
report = classification_report(Y, preds, target_names=target_names)
print(report)

              precision    recall  f1-score   support

     NoRuido       0.69      0.81      0.74       593
       Ruido       0.75      0.60      0.67       554

    accuracy                           0.71      1147
   macro avg       0.72      0.71      0.71      1147
weighted avg       0.72      0.71      0.71      1147



In [90]:
# max_noise = 0.000000001 # imperceptible a nivel de gráfica
max_noise = 1 # perceptible a nivel de gráfica
# Se crean muestras parametrizadas de una distribución normal (gaussiana) para generar ruido en la señal
noise = np.random.normal(0.0, max_noise, nontarget_data.shape)


## Probamos con non-target

In [91]:
X, Y = [], []

X = np.concatenate([target_data, nontarget_data+noise])
Y = np.concatenate([np.ones(target_data.shape[0]), np.zeros(nontarget_data.shape[0])])

In [92]:
# Instancia validador cruzado
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

nsamples, nx, ny = X.shape
d2_train_dataset = X.reshape((nsamples,nx*ny))

# Instancia clasificador
clf = make_pipeline(StandardScaler(),RandomForestClassifier(max_depth=2, random_state=0))

# Proceso de validación cruzada
preds = np.empty(len(Y))
for train, test in cv.split(d2_train_dataset, Y):
    clf.fit(d2_train_dataset[train], Y[train]) # ajustar
    preds[test] = clf.predict(d2_train_dataset[test]) #

# Info del proceso
target_names = ['NoRuido', 'Ruido']
report = classification_report(Y, preds, target_names=target_names)
print(report)

              precision    recall  f1-score   support

     NoRuido       0.99      0.99      0.99       593
       Ruido       0.99      0.99      0.99       554

    accuracy                           0.99      1147
   macro avg       0.99      0.99      0.99      1147
weighted avg       0.99      0.99      0.99      1147



In [93]:
X, Y = [], []

X = np.concatenate([target_data, nontarget_data])
Y = np.concatenate([np.zeros(X.shape[0])])

In [96]:
X.shape


(1147, 16, 76)

In [95]:
Y.shape

(1147,)

## Generamos ruido de forma intermitente, pares=SI impares = N0

In [144]:
cont = 0
max_noise = 0.001 # perceptible a nivel de gráfica
X_mix = []
Y_mix =[]
for i in X:
    if(cont%2 == 0):
        noise = np.random.normal(0.0, max_noise, i.shape)
        X_mix.append(i+noise)
        Y_mix.append(1)
    else:
        X_mix.append(i)
        Y_mix.append(0)
    cont+=1

In [145]:
X_mix = np.array(X_mix)
Y_mix =np.array(Y_mix)

In [149]:
# Instancia validador cruzado
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

nsamples, nx, ny = X_mix.shape
d2_train_dataset_mix = X_mix.reshape((nsamples,nx*ny))

# Instancia clasificador
clf = make_pipeline(StandardScaler(),RandomForestClassifier(max_depth=2, random_state=0))

# Proceso de validación cruzada
preds = np.empty(len(Y_mix))
for train, test in cv.split(d2_train_dataset_mix, Y_mix):
    clf.fit(d2_train_dataset_mix[train], Y_mix[train]) # ajustar
    preds[test] = clf.predict(d2_train_dataset_mix[test]) #

# Info del proceso
target_names = ['NoRuido', 'Ruido']
report = classification_report(Y_mix, preds, target_names=target_names)
print(report)

              precision    recall  f1-score   support

     NoRuido       0.49      0.48      0.48       573
       Ruido       0.49      0.50      0.50       574

    accuracy                           0.49      1147
   macro avg       0.49      0.49      0.49      1147
weighted avg       0.49      0.49      0.49      1147



## Comparamos con un no supervisado

In [146]:
X_mix_train, X_mix_test = train_test_split(X_mix, test_size=0.20, random_state=42,shuffle=False)

In [147]:

#clf = OCSVM(kernel='rbf',gamma=0.0001, nu=0.3, contamination=contamination_factor)
clfMix = IForest(random_state=42, contamination=contamination_factor)


nsamples, nx, ny = X_mix_train.shape
d2_train_mix_dataset = X_mix_train.reshape((nsamples,nx*ny))

nsamples, nx, ny = X_mix_test.shape
d2_test_mix_dataset = X_mix_test.reshape((nsamples,nx*ny))

# Model training
clfMix.fit(d2_train_mix_dataset)

IForest(behaviour='old', bootstrap=False, contamination=0.05,
    max_features=1.0, max_samples='auto', n_estimators=100, n_jobs=1,
    random_state=42, verbose=0)

In [148]:
pred=clf.predict(d2_test_mix_dataset)
unique_elements, counts_elements = np.unique(pred, return_counts=True)
print("\t",unique_elements,"    ",counts_elements)

	 [0]      [230]


## Generamos ruido de forma aleatoria

In [150]:
cont = 0
X_mix = []
Y_mix =[]
for i in X:
    if(cont%2 == 0):
        noise = np.random.normal(0.0, np.random.random_sample(), i.shape)
        X_mix.append(i+noise)
        Y_mix.append(1)
    else:
        X_mix.append(i)
        Y_mix.append(0)
    cont+=1

In [151]:
X_mix_train, X_mix_test = train_test_split(X_mix, test_size=0.20, random_state=42,shuffle=False)

In [152]:
X_mix = np.array(X_mix)
Y_mix =np.array(Y_mix)

In [153]:
# Instancia validador cruzado
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

nsamples, nx, ny = X_mix.shape
d2_train_dataset_mix = X_mix.reshape((nsamples,nx*ny))

# Instancia clasificador
clf = make_pipeline(StandardScaler(),RandomForestClassifier(max_depth=2, random_state=0))

# Proceso de validación cruzada
preds = np.empty(len(Y_mix))
for train, test in cv.split(d2_train_dataset_mix, Y_mix):
    clf.fit(d2_train_dataset_mix[train], Y_mix[train]) # ajustar
    preds[test] = clf.predict(d2_train_dataset_mix[test]) #

# Info del proceso
target_names = ['NoRuido', 'Ruido']
report = classification_report(Y_mix, preds, target_names=target_names)
print(report)

              precision    recall  f1-score   support

     NoRuido       0.72      0.94      0.82       573
       Ruido       0.91      0.64      0.75       574

    accuracy                           0.79      1147
   macro avg       0.82      0.79      0.78      1147
weighted avg       0.82      0.79      0.78      1147



In [154]:
X_mix_train, X_mix_test = train_test_split(X_mix, test_size=0.20, random_state=42,shuffle=False)

In [155]:
#clf = OCSVM(kernel='rbf',gamma=0.0001, nu=0.3, contamination=contamination_factor)
clfMix = IForest(random_state=42, contamination=contamination_factor)


nsamples, nx, ny = X_mix_train.shape
d2_train_mix_dataset = X_mix_train.reshape((nsamples,nx*ny))

nsamples, nx, ny = X_mix_test.shape
d2_test_mix_dataset = X_mix_test.reshape((nsamples,nx*ny))

# Model training
clfMix.fit(d2_train_mix_dataset)

IForest(behaviour='old', bootstrap=False, contamination=0.05,
    max_features=1.0, max_samples='auto', n_estimators=100, n_jobs=1,
    random_state=42, verbose=0)

In [156]:
pred=clf.predict(d2_test_mix_dataset)
unique_elements, counts_elements = np.unique(pred, return_counts=True)
print("\t",unique_elements,"    ",counts_elements)

	 [0 1]      [153  77]
