# CyberBrain: Cybersecurity in BCI for Advanced Driver Assistance
## Milestone MS3: Framework to detect and measure the cyberattacks impact.
#### University of Murcia, Spain

In [3]:
import pickle
import numpy as np
import threading
import time
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from mne.decoding import Vectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

## Data acquisition

In [4]:
dataset="dataset/p300-umu"

In [5]:
with open(dataset, 'rb') as file:
    data = pickle.load(file)

In [57]:
from framework_acquisition import *

def framework_acquisition():

    time.sleep(2)
    print("[+] Acquiring signals...")
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind((HOST, PORT))
        s.listen()
        conn, addr = s.accept()
        with open("rawdata.csv", "wb") as f:
            print(f"Connected by {addr}")
            while True:
                bytes_read = conn.recv(BUFFER_SIZE)
                if not bytes_read:
                    print("Finished file transfer")
                    break
                f.write(bytes_read)
        f.close()
        conn.close()
        print("File transfer complete")

t = threading.Thread(name='framework_acquisition', target=framework_acquisition)
t.start()
time.sleep(4)

[+] Acquiring signals...


## Data processing

In [6]:
subjects = {}
for i, d in enumerate(data):
    subjects[f'Subject {i}'] = d

In [None]:
subjects

{'Subject 0': <EpochsArray |  3982 events (all good), 0 - 0.585938 sec, baseline off, ~37.0 MB, data loaded,
  'neg': 3413
  'pos': 569>,
 'Subject 1': <EpochsArray |  3916 events (all good), 0 - 0.585938 sec, baseline off, ~36.4 MB, data loaded,
  'neg': 3362
  'pos': 554>,
 'Subject 2': <EpochsArray |  2053 events (all good), 0 - 0.585938 sec, baseline off, ~19.1 MB, data loaded,
  'neg': 1760
  'pos': 293>,
 'Subject 3': <EpochsArray |  6516 events (all good), 0 - 0.585938 sec, baseline off, ~60.5 MB, data loaded,
  'neg': 5589
  'pos': 927>,
 'Subject 4': <EpochsArray |  3396 events (all good), 0 - 0.585938 sec, baseline off, ~31.5 MB, data loaded,
  'neg': 2912
  'pos': 484>,
 'Subject 5': <EpochsArray |  3975 events (all good), 0 - 0.585938 sec, baseline off, ~36.9 MB, data loaded,
  'neg': 3404
  'pos': 571>,
 'Subject 6': <EpochsArray |  1163 events (all good), 0 - 0.585938 sec, baseline off, ~10.8 MB, data loaded,
  'neg': 871
  'pos': 292>,
 'Subject 7': <EpochsArray |  1174 

In [7]:
target = subjects['Subject 1']['pos']
nonTarget = subjects['Subject 1']['neg']

In [8]:
target_data = subjects['Subject 1']['pos'].get_data()
nontarget_data = subjects['Subject 1']['neg'].get_data()


# Target and non-target labels are balanced for correct classification

In [9]:
target_data.shape

(554, 16, 76)

In [10]:
nontarget_data.shape

(3362, 16, 76)

In [11]:
alldata = X = np.concatenate([target_data, nontarget_data])

In [12]:
alldata.shape

(3916, 16, 76)

In [13]:
nontarget_data = nontarget_data[:593][:][:]

In [14]:
X, Y = [], []

X = np.concatenate([target_data, nontarget_data])
Y = np.concatenate([np.ones(target_data.shape[0]), np.zeros(nontarget_data.shape[0])])

In [None]:
Y.shape

(1147,)

In [None]:
X.shape

(1147, 16, 76)

## Data storage layer

In [33]:
import pandas as pd

nsamples, nx, ny = X.shape
X_csv = X.reshape((nsamples,nx*ny))
pd.DataFrame(X_csv, Y).to_csv("data.csv")

In [34]:
from cryptography.fernet import Fernet

# key generation
key = Fernet.generate_key()

# string the key in a file
with open('password.key', 'wb') as passwordfile:
    passwordfile.write(key)

Encrypt the file using the key generated

Now we have an encrypted key and file to be encrypted. Now write code to encrypt this file:
- Open the file that contains the key.
- Initialize the Fernet object.
- Read the original file.
- Encrypt the file and store it.
- Then write the encrypted data

In [35]:
with open('password.key', 'rb') as passwordfile:
    key = passwordfile.read()

fernet = Fernet(key)

with open('data.csv', 'rb') as file:
    original = file.read()

encrypted = fernet.encrypt(original)

with open('data.csv.enc', 'wb') as encrypted_file:
    encrypted_file.write(encrypted)

## Data classification use case 1: P300 detection

First we are going to perform a test of how use case 1 should work for P300 detection. For this, we have created a simple binary classifier that allows the detection of P300.

Test with Logistic regression algorithm and cross validation.

In [15]:
def generateNoise(shape, maxNoise):
    return np.random.normal(0.0, maxNoise, shape)

In [32]:
# Cross-validator instance
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Classifier instance
clf = make_pipeline(Vectorizer(), StandardScaler(), LogisticRegression(solver='liblinear', C=1, class_weight="balanced"))

# Cross-validation process
preds = np.empty(len(Y))
predsNoiseLow = np.empty(len(Y))
predsNoiseMid = np.empty(len(Y))
predsNoiseHigh = np.empty(len(Y))

for train, test in cv.split(X, Y):
    clf.fit(X[train], Y[train]) # ajustar
    preds[test] = clf.predict(X[test])
    predsNoiseLow[test] = clf.predict(X[test]+generateNoise(X[test].shape, 0.001)) #
    predsNoiseMid[test] = clf.predict(X[test]+generateNoise(X[test].shape, 0.4)) #
    predsNoiseHigh[test] = clf.predict(X[test]+generateNoise(X[test].shape, 0.8)) #


# Process information
target_names = ['NoTarget', 'Target']
report = classification_report(Y, preds, target_names=target_names)
reportNoiseLow = classification_report(Y, predsNoiseLow, target_names=target_names)
reportNoiseMid = classification_report(Y, predsNoiseMid, target_names=target_names)
reportNoiseHigh = classification_report(Y, predsNoiseHigh, target_names=target_names)

print("Classification results without noise")
print(report)
print("Classification results with low noise")
print(reportNoiseLow)
print("Classification results with mid noise")
print(reportNoiseMid)
print("Classification results with high noise")
print(reportNoiseHigh)

Classification results without noise
              precision    recall  f1-score   support

    NoTarget       0.78      0.77      0.78       593
      Target       0.76      0.76      0.76       554

    accuracy                           0.77      1147
   macro avg       0.77      0.77      0.77      1147
weighted avg       0.77      0.77      0.77      1147

Classification results with low noise
              precision    recall  f1-score   support

    NoTarget       0.78      0.78      0.78       593
      Target       0.76      0.76      0.76       554

    accuracy                           0.77      1147
   macro avg       0.77      0.77      0.77      1147
weighted avg       0.77      0.77      0.77      1147

Classification results with mid noise
              precision    recall  f1-score   support

    NoTarget       0.66      0.65      0.66       593
      Target       0.63      0.65      0.64       554

    accuracy                           0.65      1147
   macro avg   

Test with Random Forest algorithm and cross validation.

In [36]:
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

clf = RandomForestClassifier(max_depth=2, random_state=0)

nsamples, nx, ny = X.shape
X_train_2d = X.reshape((nsamples,nx*ny))

# Cross-validation process
preds = np.empty(len(Y))
predsNoiseLow = np.empty(len(Y))
predsNoiseMid = np.empty(len(Y))
predsNoiseHigh = np.empty(len(Y))

for train, test in cv.split(X, Y):
    clf.fit(X_train_2d[train], Y[train]) # ajustar
    preds[test] = clf.predict(X_train_2d[test])
    predsNoiseLow[test] = clf.predict(X_train_2d[test]+generateNoise(X_train_2d[test].shape, 0.001)) #
    predsNoiseMid[test] = clf.predict(X_train_2d[test]+generateNoise(X_train_2d[test].shape, 1)) #
    predsNoiseHigh[test] = clf.predict(X_train_2d[test]+generateNoise(X_train_2d[test].shape, 3)) #

# Process information
target_names = ['NoTarget', 'Target']
report = classification_report(Y, preds, target_names=target_names)
reportNoiseLow = classification_report(Y, predsNoiseLow, target_names=target_names)
reportNoiseMid = classification_report(Y, predsNoiseMid, target_names=target_names)
reportNoiseHigh = classification_report(Y, predsNoiseHigh, target_names=target_names)

print("Classification results without noise")
print(report)
print("Classification results with low noise")
print(reportNoiseLow)
print("Classification results with mid noise")
print(reportNoiseMid)
print("Classification results with high noise")
print(reportNoiseHigh)

Classification results without noise
              precision    recall  f1-score   support

    NoTarget       0.69      0.81      0.74       593
      Target       0.75      0.60      0.67       554

    accuracy                           0.71      1147
   macro avg       0.72      0.71      0.71      1147
weighted avg       0.72      0.71      0.71      1147

Classification results with low noise
              precision    recall  f1-score   support

    NoTarget       0.69      0.82      0.75       593
      Target       0.75      0.60      0.67       554

    accuracy                           0.71      1147
   macro avg       0.72      0.71      0.71      1147
weighted avg       0.72      0.71      0.71      1147

Classification results with mid noise
              precision    recall  f1-score   support

    NoTarget       0.69      0.80      0.74       593
      Target       0.75      0.62      0.68       554

    accuracy                           0.71      1147
   macro avg   

It can be seen that the algorithms yield about 75% accuracy in classifying the P300. This could be improved by adjusting their hyperparameters, although this is not the objective. Noise is then applied to the signal until the previously trained classifier is unable to recognize the different classes.

In [16]:
from sklearn.model_selection import train_test_split

# Split dataset in 50% clean and 50% with noise
X_clean, X_noise = train_test_split(X, test_size=0.50, random_state=42,shuffle=False)

In [17]:
nsamples, nx, ny = X_clean.shape
X_clean_2d = X_clean.reshape((nsamples,nx*ny))

nsamples, nx, ny = X_noise.shape
X_noise_2d = X_noise.reshape((nsamples,nx*ny))

Next, we will create an unsupervised classification model to detect this noise.

In [18]:
contamination_factor = 0.05

A model based on the IForest algorithm is trained with noise-free data only.

In [19]:
from pyod.models.iforest import IForest

clf = IForest(random_state=42, contamination=contamination_factor)

# Model training
clf.fit(X_clean_2d)

IForest(behaviour='old', bootstrap=False, contamination=0.05,
    max_features=1.0, max_samples='auto', n_estimators=100, n_jobs=1,
    random_state=42, verbose=0)

A model based on the One-Class Support Vector Machine algorithm is trained with noise-free data only.

In [20]:
from pyod.models.ocsvm import OCSVM

clfOCSV = OCSVM(kernel='rbf',gamma=0.00001, contamination=contamination_factor)

# Model training
clfOCSV.fit(X_clean_2d)

OCSVM(cache_size=200, coef0=0.0, contamination=0.05, degree=3, gamma=1e-05,
   kernel='rbf', max_iter=-1, nu=0.5, shrinking=True, tol=0.001,
   verbose=False)

Classifier results are obtained with noise-free data.

In [25]:
lowNoise = generateNoise(X_noise_2d.shape, 0.001)
midNoise = generateNoise(X_noise_2d.shape, 0.4)
highNoise = generateNoise(X_noise_2d.shape, 0.8)

In [21]:
pred = clf.predict(X_noise_2d)
unique_elements, counts_elements = np.unique(pred, return_counts=True)
print("\t",unique_elements,"    ",counts_elements)

	 [0 1]      [540  34]


In [22]:
pred = clfOCSV.predict(X_noise_2d)
unique_elements, counts_elements = np.unique(pred, return_counts=True)
print("\t",unique_elements,"    ",counts_elements)

	 [0 1]      [548  26]


It can be seen how it identifies two different classes, this may be due to the differences in a signal in the p300 and non p300 stages. Next, we will add noise to the signal to see how the classifier behaves.

In [29]:
predLow = clf.predict(X_noise_2d + lowNoise)
predMid = clf.predict(X_noise_2d + midNoise)
predHigh = clf.predict(X_noise_2d + highNoise)

print("Predictions with low noise")
unique_elements, counts_elements = np.unique(predLow, return_counts=True)
print("\t",unique_elements,"    ",counts_elements)
print("Predictions with mid noise")
unique_elements, counts_elements = np.unique(predMid, return_counts=True)
print("\t",unique_elements,"    ",counts_elements)
print("Predictions with high noise")
unique_elements, counts_elements = np.unique(predHigh, return_counts=True)
print("\t",unique_elements,"    ",counts_elements)

Predictions with low noise
	 [0 1]      [540  34]
Predictions with mid noise
	 [0 1]      [143 431]
Predictions with high noise
	 [1]      [574]


In [30]:
predLow = clfOCSV.predict(X_noise_2d + lowNoise)
predMid = clfOCSV.predict(X_noise_2d + midNoise)
predHigh = clfOCSV.predict(X_noise_2d + highNoise)

print("Predictions with low noise")
unique_elements, counts_elements = np.unique(predLow, return_counts=True)
print("\t",unique_elements,"    ",counts_elements)
print("Predictions with mid noise")
unique_elements, counts_elements = np.unique(predMid, return_counts=True)
print("\t",unique_elements,"    ",counts_elements)
print("Predictions with high noise")
unique_elements, counts_elements = np.unique(predHigh, return_counts=True)
print("\t",unique_elements,"    ",counts_elements)

Predictions with low noise
	 [0 1]      [548  26]
Predictions with mid noise
	 [0 1]      [122 452]
Predictions with high noise
	 [1]      [574]


## Clasificador binario para ruido

In [37]:
X_train, X_test = train_test_split(X, test_size=0.20, random_state=42,shuffle=False)

In [42]:
cont = 0
X_mix = []
Y_mix =[]
for i in X_train:
    if(cont%2 == 0):
        noise = generateNoise(i.shape, 0.1)
        X_mix.append(i+noise)
        Y_mix.append(1)
    else:
        X_mix.append(i)
        Y_mix.append(0)
    cont+=1

In [43]:
clf = RandomForestClassifier(max_depth=2, random_state=0)

X_mix_train = np.array(X_mix)
Y_mix_train = np.array(Y_mix)

nsamples, nx, ny = X_mix_train.shape
X_mix_train_2d = X_mix_train.reshape((nsamples,nx*ny))

clf.fit(X_mix_train_2d, Y_mix_train)

RandomForestClassifier(max_depth=2, random_state=0)

In [44]:
cont = 0
X_mix_test = []
Y_mix_test =[]
for i in X_test:
    if(cont%2 == 0):
        noise = generateNoise(i.shape, 0.8)
        X_mix_test.append(i+noise)
        Y_mix_test.append(1)
    else:
        X_mix_test.append(i)
        Y_mix_test.append(0)
    cont+=1

In [45]:
X_mix_test = np.array(X_mix_test)
Y_mix_test = np.array(Y_mix_test)

nsamples, nx, ny = X_mix_test.shape
X_mix_test_2d = X_mix_test.reshape((nsamples,nx*ny))

pred = clf.predict(X_mix_test_2d)
target_names = ['NoNoise', 'Noise']
report = classification_report(Y_mix_test, pred, target_names=target_names)
print(report)

              precision    recall  f1-score   support

     NoNoise       0.71      0.50      0.59       115
       Noise       0.61      0.79      0.69       115

    accuracy                           0.65       230
   macro avg       0.66      0.65      0.64       230
weighted avg       0.66      0.65      0.64       230

