In [1]:
from eeg2vec.train.train import train
from eeg2vec.data_loader import get_dataloader
from eeg2vec.models.eeg2vec import EEG2Vec
from eeg2vec.contrastive_loss import ContrastiveLoss

import numpy as np
import torch

In [2]:
## First let's load the training data
from pathlib import Path
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.signal import butter, lfilter
import pandas as pd

# Every training record i lives under train/ as a (data_i.npy, target_i.npy) pair;
# records 0..3 are loaded eagerly into a list of (signal, target) tuples.
ROOT_PATH = Path("train/")
training_data = [
    (
        np.load(ROOT_PATH / f"data_{i}.npy"),
        np.load(ROOT_PATH / f"target_{i}.npy"),
    )
    for i in range(4)
]


In [3]:
# Shape of the first record's signal array (recorded output: 5 rows by ~7.7M samples).
print(training_data[0][0].shape)

(5, 7712740)


In [4]:
def butter_bandpass(lowcut, highcut, fs, order=5):
    """
    Design a Butterworth band-pass filter.

    Parameters:
    lowcut (float): Lower edge of the pass band, in the same units as ``fs``.
    highcut (float): Upper edge of the pass band, in the same units as ``fs``.
    fs (float): Sampling frequency of the signal the filter will be applied to.
    order (int): Order passed to ``scipy.signal.butter``.

    Returns:
    tuple: Whatever ``scipy.signal.butter`` returns by default, i.e. the
    numerator/denominator polynomial pair ``(b, a)``.
    """
    pass_band = [lowcut, highcut]
    coefficients = butter(order, pass_band, btype='band', fs=fs)
    return coefficients

def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    """
    Band-pass filter ``data`` along its last axis with a Butterworth filter.

    The filter is designed and applied as cascaded second-order sections
    (SOS) rather than as a single (b, a) transfer function. The polynomial
    form loses precision for higher orders and for band edges that are tiny
    relative to the sampling rate (e.g. 0.1 Hz at 250 Hz), which can produce
    an inaccurate or unstable filter; the SOS form avoids that.

    Parameters:
    data (array-like): Signal to filter; filtering runs along the last axis.
    lowcut (float): Lower edge of the pass band, in the same units as ``fs``.
    highcut (float): Upper edge of the pass band, in the same units as ``fs``.
    fs (float): Sampling frequency of ``data``.
    order (int): Butterworth order passed to ``scipy.signal.butter``.

    Returns:
    numpy array: The filtered signal, same shape as ``data``.
    """
    # Local import: the notebook's import cell only brings in butter/lfilter.
    from scipy.signal import sosfilt

    sos = butter(order, [lowcut, highcut], fs=fs, btype='band', output='sos')
    return sosfilt(sos, data)

In [5]:
# First we need to get the point that maps to a label

def reshape_array_into_windows(x, sample_rate, window_duration_in_seconds):
    """
    Reshape the data into an array of shape (C, T, window) where 'window' contains
    the points corresponding to 'window_duration' seconds of data.

    Windows are consecutive and non-overlapping. Trailing samples that do not
    fill a complete window are dropped (the data is truncated, never padded).

    Parameters:
    x (numpy array): The input data array; windows are cut along the last axis.
        Must have at least 2 dimensions.
    sample_rate (int): The number of samples per second.
    window_duration_in_seconds (float): The duration of each window in seconds.

    Returns:
    reshaped_x (numpy array): The reshaped array with shape (C, T, window).
        T is 0 when x is shorter than one window.

    Raises:
    ValueError: If the window would contain no samples (non-positive size), or
        if x has fewer than 2 dimensions.
    """
    # Calculate the number of samples in one window
    window_size = int(window_duration_in_seconds * sample_rate)
    if window_size <= 0:
        # Without this guard a zero window size surfaces as ZeroDivisionError
        # and a negative one as an obscure reshape error.
        raise ValueError(
            "window_duration_in_seconds * sample_rate must be at least 1 sample, "
            f"got {window_size}"
        )
    if np.ndim(x) < 2:
        raise ValueError("x must have at least 2 dimensions (channels, samples)")

    # Keep only the samples that fill whole windows
    n_windows = x.shape[-1] // window_size
    x = x[..., :n_windows * window_size]

    # Reshape x into (C, T, window)
    reshaped_x = x.reshape(x.shape[0], -1, window_size)

    return reshaped_x

In [6]:
# We first load and reshape all the data.
# Goal:
#   data   of shape [num_samples, num_channels (5), sequence_length]
#   labels of shape [num_samples, 5]
all_data = []
all_targets = []

for data, target in training_data:
    # (C, samples) -> (C, T, 500) -> (T, C, 500): one row per 2 s window at 250 Hz.
    reshaped_data = reshape_array_into_windows(data, 250, 2)
    reshaped_data = reshaped_data.transpose(1, 0, 2)

    # NOTE(review): the target files appear to be channel-major, (5, n_windows),
    # like the submission format — TODO confirm against the files on disk.
    # A channel-major array must be transposed: reshape(-1, 5) would instead
    # group 5 consecutive windows of one channel into each row and so
    # misalign labels with windows.
    if target.ndim == 2 and target.shape[0] == 5 and target.shape[1] != 5:
        target = target.T
    else:
        # Already (n_windows, 5) or flat: keep the original behaviour.
        target = target.reshape(-1, 5)

    if target.shape[0] != reshaped_data.shape[0]:
        raise ValueError(
            f"record has {reshaped_data.shape[0]} windows but {target.shape[0]} label rows"
        )

    all_data.append(reshaped_data)
    all_targets.append(target)

all_data = np.concatenate(all_data, axis=0)
all_targets = np.concatenate(all_targets, axis=0)


In [7]:
# Sanity check: the data and target arrays should share the same leading (window) dimension.
print(all_data.shape)
print(all_targets.shape)

(52351, 5, 500)
(52351, 5)


In [12]:
# Restrict the experiment to the first 5000 windows and their labels.
data, labels = all_data[:5000], all_targets[:5000]

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the windows for evaluation.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
)

# Halve what remains: one part is meant for the embedding model, the other for XGBoost.
(
    X_train_embeddings,
    X_train_xgboost,
    y_train_embeddings,
    y_train_xgboost,
) = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.5,
    random_state=42,
)


In [16]:
# Confirm that the embedding split's data and labels have matching lengths.
print(X_train_embeddings.shape, y_train_embeddings.shape)


(2000, 5, 500) (2000, 5)


In [17]:
# Build the loader that feeds the encoder's training loop.
# NOTE(review): this uses all_data/all_targets, not the X_train_embeddings/y_train_embeddings
# split created above, so the encoder also sees the windows later used as X_test and as
# all_data[7000:20000] for evaluation — confirm whether that is intended.
data_loader = get_dataloader(all_data, all_targets, batch_size=100, shuffle=True) 

In [22]:
# Instantiate the encoder that will be trained.
# NOTE(review): the reload cell further down builds EEG2Vec(64, 2, 50, 2); the third
# argument differs (5 here, 50 there) — confirm which configuration the saved checkpoint expects.
model = EEG2Vec(64, 2, 5, 2)



In [23]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [24]:
# Record the PyTorch build used for this run.
print(torch.__version__)

2.5.1+cu118


In [25]:
# Move the model to the selected device (CUDA if available) and train it.
# NOTE(review): 100 is presumably the number of epochs (the log prints "Epoch 1/100") —
# confirm against train()'s signature.
model = model.to(device)
train(model, data_loader, 100, device)

Epoch 1/100 completed.


KeyboardInterrupt: 

In [None]:
# Save the model weights (the state_dict only, not the module object itself).
torch.save(model.state_dict(), "eeg2vec/data/saved_models/eeg2vec_1_400windows.pth")

In [12]:
# Rebuild the architecture and restore the saved weights.
# NOTE(review): the training cell above builds EEG2Vec(64, 2, 5, 2); the third argument
# differs (50 here) — confirm it matches the checkpoint being loaded.
model = EEG2Vec(64, 2, 50, 2)
model.load_state_dict(
    torch.load(
        "eeg2vec/data/saved_models/eeg2vec_1_400windows.pth",
        # The freshly built model lives on the CPU; mapping there also lets a
        # checkpoint saved from a CUDA model load on a machine without a GPU.
        map_location="cpu",
        weights_only=True,
    )
)
model.eval()



EEG2Vec(
  (cnn_encoder): CNNEncoder(
    (conv_layers): Sequential(
      (0): Conv1d(5, 32, kernel_size=(5,), stride=(1,))
      (1): ReLU()
      (2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (3): Conv1d(32, 64, kernel_size=(5,), stride=(1,))
      (4): ReLU()
    )
  )
  (transformer_encoder): TransformerEncoder(
    (transformer_encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-49): 50 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
          )
          (linear1): Linear(in_features=64, out_features=2, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=2, out_features=64, bias=True)
          (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropou

In [20]:
# Embed the XGBoost-training windows and the held-out windows with the encoder.
# Inference only: evaluation mode, no gradient tracking, results returned as NumPy arrays.
model.eval()
with torch.no_grad():
    training_embeddings = model(
        torch.tensor(X_train_xgboost, dtype=torch.float32)
    ).cpu().numpy()
    test_embeddings = model(
        torch.tensor(X_test, dtype=torch.float32)
    ).cpu().numpy()

In [21]:
# Shape of the encoder output for the XGBoost training split.
training_embeddings.shape

(400, 244, 64)

In [22]:
import xgboost as xgb
from sklearn.multioutput import MultiOutputClassifier

# Booster settings shared by every per-output classifier.
params = {
    'objective': 'binary:logistic',  # binary target per output ('multi:softmax' would be the multi-class choice)
    'eval_metric': 'logloss',        # logarithmic loss
    'learning_rate': 0.1,            # step size shrinkage
    'max_depth': 6,                  # maximum tree depth
    'subsample': 0.8,                # fraction of samples drawn per tree
    'colsample_bytree': 0.8,         # fraction of features drawn per tree
    'lambda': 1,                     # L2 regularization term
    'alpha': 0,                      # L1 regularization term
}

# Flatten each window's embedding into one feature vector per sample.
training_embeddings = training_embeddings.reshape(len(training_embeddings), -1)

# One XGBoost classifier is fitted per label column.
model_xgb = MultiOutputClassifier(xgb.XGBClassifier(**params))
model_xgb.fit(training_embeddings, y_train_xgboost)


In [23]:
# Save the fitted classifier. The context manager closes the file handle
# (and flushes the data) even if pickling fails part-way.
import pickle
with open("eeg2vec/data/saved_models/xgboost_1_400windows.pkl", "wb") as model_file:
    pickle.dump(model_xgb, model_file)

In [11]:
import pickle
# NOTE: unpickling runs arbitrary code stored in the file — only load files this
# notebook (or another trusted source) produced.
with open("eeg2vec/data/saved_models/xgboost_1_400windows.pkl", "rb") as model_file:
    model_xgb = pickle.load(model_file)

In [24]:
# Evaluate the classifier on the held-out split.
from sklearn.metrics import accuracy_score, f1_score

# Flatten each window's embedding into one feature vector, as was done for training.
test_embeddings = test_embeddings.reshape(len(test_embeddings), -1)
predictions = model_xgb.predict(test_embeddings)

accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy:.2f}')

# F1 score, averaged over the label columns weighted by their support.
f1 = f1_score(y_test, predictions, average='weighted')
print(f'F1 Score: {f1:.2f}')

Accuracy: 0.93
F1 Score: 0.98


In [30]:
# Second evaluation set: windows 7000..19999, outside the first 5000 used for the splits.
test_data, test_targets = all_data[7000:20000], all_targets[7000:20000]

In [31]:
# Release cached, currently unused GPU memory before the larger evaluation pass.
torch.cuda.empty_cache()

In [32]:
from sklearn.metrics import accuracy_score, f1_score

# Embed the evaluation windows in chunks of 1000 so one forward pass never has to
# hold the whole set, then classify the flattened embeddings.
model = model.to(device)
model.eval()
embedding_batches = []
with torch.no_grad():
    for i in range(0, len(test_data), 1000):
        batch = torch.tensor(test_data[i:i+1000], dtype=torch.float32).to(device)
        embedding_batches.append(model(batch))
    # Concatenate once: calling torch.cat inside the loop would re-copy the growing
    # result on every iteration.
    all_embeddings = torch.cat(embedding_batches, dim=0)
    embeddings = all_embeddings.reshape(all_embeddings.shape[0], -1).cpu().numpy()
predictions = model_xgb.predict(embeddings)

accuracy = accuracy_score(test_targets, predictions)
print(f'Accuracy: {accuracy:.2f}')
# F1 score
f1 = f1_score(test_targets, predictions, average='weighted')
print(f'F1 Score: {f1:.2f}')

Accuracy: 0.78
F1 Score: 0.89


In [19]:
# Shape of the concatenated encoder outputs produced by the evaluation loop.
all_embeddings.shape

torch.Size([1000, 244, 64])

In [None]:
# Test records 4 and 5 are kept in a dict keyed by record number,
# because each record is processed independently.
ROOT_TEST_PATH = Path("test/")
test_data = {
    i: np.load(ROOT_TEST_PATH / f"data_{i}.npy")
    for i in (4, 5)
}

def compute_predictions_on_record(data,model,model_xgb, batch_size=1000):
    """
    Predict the labels of every 2-second window of one raw record.

    Steps: band-pass filter (0.1-18 Hz, order 4, 250 Hz sampling), cut into
    500-sample windows, embed each window with ``model``, flatten the
    embeddings and classify them with ``model_xgb``.

    NOTE(review): the training cells in this notebook build their windows from
    unfiltered data, whereas this function filters first — confirm that the
    saved encoder and classifier were trained with the same preprocessing.

    Parameters:
    data (numpy array): Raw record of shape (channels, samples).
    model: Encoder module; it is moved to the global ``device`` and switched
        to evaluation mode.
    model_xgb: Fitted classifier exposing ``predict``.
    batch_size (int): Number of windows embedded per forward pass, to bound
        memory use on long records.

    Returns:
    Whatever ``model_xgb.predict`` returns for the flattened embeddings, with
    one row per window.

    Raises:
    ValueError: If the record is shorter than one window.
    """
    filtered_data = butter_bandpass_filter(data, 0.1, 18, 250, 4)
    reshaped_data = reshape_array_into_windows(filtered_data, 250, 2)

    # (channels, windows, samples) -> (windows, channels, samples)
    reshaped_data = reshaped_data.transpose(1, 0, 2)
    n_windows = reshaped_data.shape[0]
    if n_windows == 0:
        raise ValueError("record is shorter than one window; nothing to predict")

    model = model.to(device)
    model.eval()

    embedding_batches = []
    # Inference only. Disabling autograd here (rather than relying on the
    # caller) keeps .numpy() valid and avoids storing activations.
    with torch.no_grad():
        for start in range(0, n_windows, batch_size):
            batch = torch.tensor(
                reshaped_data[start:start + batch_size], dtype=torch.float32
            ).to(device)
            batch_embeddings = model(batch)
            # Flatten each window's embedding and move it off the device right away.
            embedding_batches.append(
                batch_embeddings.reshape(batch_embeddings.shape[0], -1).cpu().numpy()
            )

    embeddings = np.concatenate(embedding_batches, axis=0)
    predictions = model_xgb.predict(embeddings)
    return predictions

def format_array_to_target_format(array, record_number):
    """
    Flatten a (5, T) binary prediction array into submission rows.

    Each element array[i, j] becomes a dict with:
      - "identifier": record_number * 1_000_000 + (i + 1) * 100_000 + j
      - "target": array[i, j]
    Rows are emitted channel by channel (all j for i = 0, then i = 1, ...).

    Parameters:
    array (numpy array): 2-D array with exactly 5 rows whose values are all 0 or 1.
        An array containing only zeros or only ones is accepted.
    record_number (int): Record id encoded into the identifier.

    Returns:
    list of dict: One {"identifier", "target"} dict per element of ``array``.

    Raises:
    AssertionError: If any of the input checks fails.
    """
    assert isinstance(record_number, int)
    assert isinstance(array, np.ndarray)
    assert len(array.shape) == 2
    assert array.shape[0] == 5
    # Subset rather than equality: a record predicted as all-0 (or all-1) is valid.
    assert set(np.unique(array)) <= {0, 1}
    formatted_target = []
    # The record offset is the same for every row, so compute it once.
    record_number_encoding = record_number * 1000000
    for i in range(array.shape[0]):
        channel_encoding = (i + 1) * 100000
        for j in range(array.shape[1]):
            formatted_target.append(
                {
                    "identifier": record_number_encoding + channel_encoding + j,
                    "target": array[i, j],
                }
            )
    return formatted_target
    


In [43]:
# Shape of test record 4's signal array.
test_data[4].shape

(5, 6602015)

In [None]:
# Predict every test record and write all rows to a single submission file.
results = []
for record_number, data in test_data.items():
    with torch.no_grad():
        preds = compute_predictions_on_record(data,model,model_xgb)
    # The multi-output classifier returns one row per window, (n_windows, 5),
    # while the formatter requires one row per channel, (5, n_windows).
    # Without the transpose its shape assertion fails.
    formatted_preds = format_array_to_target_format(np.asarray(preds).T, record_number)
    results.extend(formatted_preds)
df = pd.DataFrame(results)
df.to_csv("submission.csv",index = False)

: 