# Train XGBoost classifier using the eef2vec embeddings

In [53]:
from eeg2vec.models.eeg2vec import EEG2Vec

import numpy as np
import torch
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import cohen_kappa_score

from pathlib import Path
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.signal import butter, lfilter
import pandas as pd
import pickle

In [54]:
ROOT_PATH = Path("train/")
training_data = [(np.load(ROOT_PATH / f"data_{i}.npy"),np.load(ROOT_PATH / f"target_{i}.npy")) for i in range(4)]

In [55]:
def butter_bandpass(lowcut, highcut, fs, order=5):
    return butter(order, [lowcut, highcut], fs=fs, btype='band')

def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    b, a = butter_bandpass(lowcut, highcut, fs, order=order)
    y = lfilter(b, a, data)
    return y

In [56]:
# Apply bandpass filter to the data
training_data_filtered = [(butter_bandpass_filter(data, 0.1, 18, 100), target) for data, target in training_data]

In [57]:
# First we need to get the point that maps to a label

def reshape_array_into_windows(x, sample_rate, window_duration_in_seconds):
    """
    Reshape the data into an array of shape (C, T, window) where 'window' contains
    the points corresponding to 'window_duration' seconds of data.

    Parameters:
    x (numpy array): The input data array.
    sample_rate (int): The number of samples per second.
    window_duration_in_seconds (float): The duration of each window in seconds.

    Returns:
    reshaped_x (numpy array): The reshaped array with shape (C, T, window).
    """
    # Calculate the number of samples in one window
    window_size = int(window_duration_in_seconds * sample_rate)
    
    # Ensure the total length of x is a multiple of window_size
    total_samples = x.shape[-1]
    if total_samples % window_size != 0:
        # Truncate or pad x to make it divisible by window_size
        x = x[..., :total_samples - (total_samples % window_size)]
    # Reshape x into (C, T, window)
    reshaped_x = x.reshape(x.shape[0], -1, window_size)

    return reshaped_x

In [58]:
# We first load and reshape all the data
all_data = []
all_targets = []
# We need to have
# data of Shape: [num_samples, num_channels (5), sequence_length]
# labels of Shape: [num_samples, 5]

for data, target in training_data_filtered:
    reshaped_data = reshape_array_into_windows(data, 250, 2)
    reshaped_data = reshaped_data.transpose(1, 0, 2)
    target = target.reshape(-1, 5)
    all_data.append(reshaped_data)
    all_targets.append(target)

all_data = np.concatenate(all_data, axis=0)
all_targets = np.concatenate(all_targets, axis=0)

In [59]:
print(all_data.shape)

(52351, 5, 500)


In [60]:
data, labels = all_data, all_targets

In [61]:
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.3, random_state=42)

In [62]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((36645, 5, 500), (15706, 5, 500), (36645, 5), (15706, 5))

In [63]:
np.unique(X_train), np.unique(X_test), np.unique(y_train), np.unique(y_test)

(array([-7364947.32675632, -7347991.75072583, -7133418.38183685, ...,
         5642409.53231173,  5789571.47749782,  5793749.64445628]),
 array([-3798270.10160029, -3751978.25820647, -3736336.11508017, ...,
         4510525.88312207,  4514229.7227241 ,  4519604.36777344]),
 array([0, 1], dtype=int64),
 array([0, 1], dtype=int64))

## Compute embeddings

In [71]:
## Load eeg2vecmodel
d_model = 8
n_heads = 2
n_layers = 2
dim_feedforward = 1
conv_size = 8
out_size = 8
kernel_size = 2
kernel_size_2 = 2
eeg2vec_model = EEG2Vec(d_model, n_heads, n_layers, dim_feedforward, conv_size, out_size, kernel_size, kernel_size_2)
model_path = "eeg2vec/data/saved_models/eeg2vec_8_2_2_1_28nov_10000points.pth"
eeg2vec_model.load_state_dict(torch.load(model_path))

  eeg2vec_model.load_state_dict(torch.load(model_path))


<All keys matched successfully>

In [72]:
# Cuda if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [73]:
# Compute embeddings for xgboost training data
eeg2vec_model.to(device)
eeg2vec_model.eval()
training_embeddings = []
for i in range(0, len(X_train), 100):
    x = torch.tensor(X_train[i:i+100], dtype=torch.float32).to(device)
    with torch.no_grad():
        training_embeddings.append(eeg2vec_model(x).cpu().numpy())
training_embeddings = np.concatenate(training_embeddings, axis=0)


In [74]:
training_embeddings.shape

(36645, 248, 8)

In [75]:
# Compute embeddings for xgboost test data
test_embeddings = []
for i in range(0, len(X_test), 100):
    x = torch.tensor(X_test[i:i+100], dtype=torch.float32).to(device)
    with torch.no_grad():
        test_embeddings.append(eeg2vec_model(x).cpu().numpy())
test_embeddings = np.concatenate(test_embeddings, axis=0)


In [76]:
test_embeddings.shape

(15706, 248, 8)

In [88]:
np.unique(test_embeddings)

array([-1.2661096 , -1.2661095 , -1.2661093 , ...,  0.20249058,
        0.20249073,  0.20249139], dtype=float32)

In [79]:
training_embeddings = training_embeddings.reshape(training_embeddings.shape[0], -1)
test_embeddings = test_embeddings.reshape(test_embeddings.shape[0], -1)

In [80]:
training_embeddings.shape, test_embeddings.shape

((36645, 1984), (15706, 1984))

## Train XGBOOST model

In [81]:
params = {
    'max_depth': 4,
    'n_estimators': 50,
    'learning_rate': 0.15,
    'subsample': 0.6,
    'colsample_bytree': 0.8,
    'min_child_weight': 3,
    'gamma': 0.1,
    'objective': 'binary:logistic',
    'alpha': 0.1,
    'lambda': 1.0
}

xgb_model = MultiOutputClassifier(xgb.XGBClassifier(**params))

In [82]:
xgb_model.fit(training_embeddings, y_train)

In [83]:
# Save the model
pickle.dump(xgb_model, open("xgb_model.pkl", "wb"))

## Evaluate classification

In [84]:
predictions = xgb_model.predict(training_embeddings)

accuracy = accuracy_score(y_train, predictions)
f1 = f1_score(y_train, predictions, average='weighted')
print(f"Training accuracy: {accuracy}")
print(f"Training F1 score: {f1}")

fla_predictions = predictions.flatten()
fla_y_train = y_train.flatten()
print(classification_report(fla_y_train, fla_predictions))

cohen_kappa_score(fla_y_train, fla_predictions)

Training accuracy: 0.5286942284076954
Training F1 score: 0.7553226043201056
              precision    recall  f1-score   support

           0       0.65      0.36      0.46     74015
           1       0.67      0.87      0.76    109210

    accuracy                           0.66    183225
   macro avg       0.66      0.61      0.61    183225
weighted avg       0.66      0.66      0.64    183225



0.24655684532098154

In [85]:
np.unique(predictions, return_counts=True)

(array([0, 1]), array([ 40538, 142687], dtype=int64))

In [86]:
predictions = xgb_model.predict(test_embeddings)


accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy:.2f}')
# F1 score
f1 = f1_score(y_test, predictions, average='weighted')
print(f'F1 Score: {f1:.2f}')


Accuracy: 0.51
F1 Score: 0.74


In [87]:
# flatten the predictions and the true labels
y_test_flat = y_test.flatten()
predictions_flat = predictions.flatten()

# Cohen's Kappa
kappa = cohen_kappa_score(y_test_flat, predictions_flat)
print(f'Cohen\'s Kappa: {kappa:.2f}')


Cohen's Kappa: 0.18


In [50]:
# Count proportion of each class
print(np.unique(y_test, return_counts=True))

print(np.unique(predictions, return_counts=True))

(array([0, 1], dtype=int64), array([31279, 47251], dtype=int64))
(array([0, 1]), array([11114, 67416], dtype=int64))


In [88]:
print(np.unique(y_train, return_counts=True))

(array([0, 1], dtype=int64), array([ 74015, 109210], dtype=int64))


In [None]:
## With 8,2,5,2 -> Clearly overfitting

## Train knn classification

In [130]:
# Flatten the data and the targets
training_embeddings = training_embeddings.reshape(training_embeddings.shape[0], -1)
test_embeddings = test_embeddings.reshape(test_embeddings.shape[0], -1)
print(training_embeddings.shape, test_embeddings.shape)

y_test = y_test.reshape(-1, 5)
y_train = y_train.reshape(-1, 5)
print(y_train.shape, y_test.shape)

(36645, 3992) (15706, 3992)
(36645, 5) (15706, 5)


In [131]:
# Train a 5 knn classifiers

knn_models = [KNeighborsClassifier(n_neighbors=5) for _ in range(5)]
for i, model in enumerate(knn_models):
    model.fit(training_embeddings, y_train[:, i])


In [132]:
# Evaluate the models
predictions = np.stack([model.predict(test_embeddings) for model in knn_models], axis=1)
predictions.shape


(15706, 5)

In [133]:
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy:.2f}')
# F1 score
f1 = f1_score(y_test, predictions, average='weighted')
print(f'F1 Score: {f1:.2f}')


Accuracy: 0.52
F1 Score: 0.65


## Create submission

In [89]:
ROOT_TEST_PATH = Path("test/")
submission_data = [np.load(ROOT_TEST_PATH / f"data_{i}.npy")for i in range(4,6)]

In [90]:
submission_data_filtered = [butter_bandpass_filter(data, 0.1, 18, 100) for data in submission_data]
# We first load and reshape all the data
all_data = []
# We need to have
# data of Shape: [num_samples, num_channels (5), sequence_length]
# labels of Shape: [num_samples, 5]

for data in submission_data_filtered:
    reshaped_data = reshape_array_into_windows(data, 250, 2)
    reshaped_data = reshaped_data.transpose(1, 0, 2)
    all_data.append(reshaped_data)

submission_data = all_data


In [91]:
len(submission_data)

2

In [92]:
# Compute embeddings for xgboost submission data
submission_embeddings = []
for data in submission_data:
    submission_embedding = []
    for i in range(0, len(data), 100):
        x = torch.tensor(data[i:i+100], dtype=torch.float32).to(device)
        with torch.no_grad():
            submission_embedding.append(eeg2vec_model(x).cpu().numpy())
    submission_embedding = np.concatenate(submission_embedding, axis=0)
    submission_embedding = submission_embedding.reshape(submission_embedding.shape[0], -1)
    submission_embeddings.append(submission_embedding)

In [93]:
submission_embeddings[0].shape

(13204, 1984)

In [100]:
# prediction with xgboost
predictions_xgboost = {}
predictions_knn = {}
for i, submission_embedding in enumerate(submission_embeddings):
    predictions_xgboost[i+4] = xgb_model.predict(submission_embedding)

In [101]:
# reshape the predictions
for i in range(4, 6):
    predictions_xgboost[i] = predictions_xgboost[i].reshape(5, -1)

In [102]:
# Count proportion of each class
print("XGBoost")
for key, value in predictions_xgboost.items():
    print(f"Submission {key}")
    print(np.unique(value, return_counts=True))

print("KNN")
for key, value in predictions_knn.items():
    print(f"Submission {key}")
    print(np.unique(value, return_counts=True))

XGBoost
Submission 4
(array([0, 1]), array([31639, 34381], dtype=int64))
Submission 5
(array([0, 1]), array([23068, 23527], dtype=int64))
KNN


In [103]:
def format_array_to_target_format(array, record_number):
    assert isinstance(record_number, int)
    assert isinstance(array, np.ndarray)
    assert len(array.shape) == 2
    assert array.shape[0] == 5
    assert set(np.unique(array)) == {0, 1}
    formatted_target = []
    for i in range(array.shape[0]):
        channel_encoding = (i + 1) * 100000
        record_number_encoding = record_number * 1000000
        for j in range(array.shape[1]):
            formatted_target.append(
                {
                    "identifier": record_number_encoding + channel_encoding + j,
                    "target": array[i, j],
                }
            )
    return formatted_target

In [105]:
results = []
for record_number in predictions_xgboost.keys():
    formatted_preds = format_array_to_target_format(predictions_xgboost[record_number],record_number)
    results.extend(formatted_preds)
df = pd.DataFrame(results)
df.to_csv("submission_eeg2vec.csv",index = False)

In [86]:
## Compare training data with submission data
import pandas as pd

# Compare submission data to training data
train_stats = pd.DataFrame(training_data_filtered).describe()
submission_stats = pd.DataFrame(submission_data_filtered).describe()
print(train_stats)
print(submission_stats)



ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (2, 5) + inhomogeneous part.