# XGBoost with EEG spectrograms and Central Chain

In comparison to `xgboost_eeg_spectrograms.ipynb`, this notebook also includes the Central Chain features from the EEG spectrograms.

For each EEG ID, this produces 3120 (3\*400 + 3\*640) features.

In [21]:
DATA_PREPARATION_VOTE_METHOD = "max_vote_window" # "max_vote_window" or "sum_and_normalize". Decides how to aggregate the predictions of the overlapping windows
EEG_SPECTROGRAMS_TYPE = "cwt"

In [22]:
import os
import sys
import warnings
import gc
import pathlib

if bool(os.environ.get("KAGGLE_URL_BASE", "")):
  import sys
  # running on kaggle
  sys.path.insert(0, "/kaggle/input/hsm-source-files")
else:
  # running locally
  sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "..", "..", "..")))

import torch.nn as nn
import pandas as pd
import numpy as np
import xgboost as xgb

import torch

from src.utils.utils import get_raw_data_dir, get_processed_data_dir, get_submission_csv_path, set_seeds, get_models_save_path, get_eeg_spectrogram_path
from src.utils.constants import Constants
from src.datasets.eeg_processor import EEGDataProcessor
from src.utils.k_folds_creator import KFoldCreator
from src.utils.eeg_spectrogram_creator import EEGSpectrogramGenerator

from tqdm import tqdm

set_seeds(42)

In [23]:
DATA_PATH = get_raw_data_dir()
TRAIN_EEG_SPECTROGRAMS_PATH = get_eeg_spectrogram_path("train", "hsm-eeg-spectrograms")
TEST_EEG_SPECTROGRAMS_PATH = get_eeg_spectrogram_path("test", "hsm-eeg-spectrograms")

processor = EEGDataProcessor(raw_data_path=DATA_PATH, processed_data_path=get_processed_data_dir())
train_df = processor.process_data(vote_method=DATA_PREPARATION_VOTE_METHOD, skip_npy=True)

test_df = pd.read_csv(DATA_PATH / "test.csv")

kl_score = nn.KLDivLoss(reduction="batchmean")

Processor initialized.
Raw data path: '/home/david/git/aicomp/data'
Processed data path: '/home/david/git/aicomp/data/processed'
Starting EEG Data Processing Pipeline
Skipping NumPy file creation as requested.
Using 'max_vote_window' vote aggregation strategy.

Processed train data saved to '/home/david/git/aicomp/data/processed/train_processed.csv'.
Shape of the final dataframe: (17089, 12)

Pipeline finished successfully!


## Load Spectrogram Files into Memory

In [24]:
kaggle_spectrograms_dir = DATA_PATH / "train_spectrograms"
kaggle_spectrogram_files = list(kaggle_spectrograms_dir.glob("*.parquet"))
print(f"Found {len(kaggle_spectrogram_files)} train Kaggle spectrogram files to load into memory")

def get_kaggle_spectrogram_content(spectrogram_file: pathlib.Path):
  eeg_id = int(spectrogram_file.stem.split("_")[-1])
  content = pd.read_parquet(file)
  content = content.drop(columns=["time"]).values
  return eeg_id, content

kaggle_spectrograms = {}
for file in tqdm(kaggle_spectrogram_files):
  eeg_id, content = get_kaggle_spectrogram_content(file)
  kaggle_spectrograms[eeg_id] = content

gc.collect()
print("Loaded all train Kaggle spectrograms into memory")

Found 11138 train Kaggle spectrogram files to load into memory


100%|██████████| 11138/11138 [06:33<00:00, 28.31it/s]


Loaded all train Kaggle spectrograms into memory


In [25]:
eeg_spectrograms_dir = TRAIN_EEG_SPECTROGRAMS_PATH / EEG_SPECTROGRAMS_TYPE
eeg_spectrogram_files = list(eeg_spectrograms_dir.glob("*.npy"))
print(f"Found {len(eeg_spectrogram_files)} train EEG spectrogram files to load into memory")

def get_eeg_spectrogram_content(spectrogram_file: pathlib.Path):
  spectrogram_id = int(spectrogram_file.stem.split("_")[-1])
  content = np.load(file)
  return spectrogram_id, content

eeg_spectrograms = {}
for file in tqdm(eeg_spectrogram_files):
  eeg_id, content = get_eeg_spectrogram_content(file)
  eeg_spectrograms[eeg_id] = content

gc.collect()
print("Loaded all train EEG spectrograms into memory")

Found 17089 train EEG spectrogram files to load into memory


100%|██████████| 17089/17089 [02:57<00:00, 96.52it/s] 

Loaded all train EEG spectrograms into memory





## Feature Engineering

We need features for the XGBoost model.
For this, we take the mean, min and max over time for all of the 400 spectrogram frequencies (100 per chain).
We take the middle 10 minutes of all spectrograms.
For each EEG ID, this produces 1200 features.

In [26]:
KAGGLE_SPECTROGRAM_FREQUENCY_COUNT = 400 # each kaggle spectrogram has 400 frequency bins
EEG_SPECTROGRAM_FREQUENCY_COUNT = 5*128 # each spectrogram has 640 frequency bins (including the central chain)

FEATURES = [f"spec_mean_freq_{x}_10m" for x in range(KAGGLE_SPECTROGRAM_FREQUENCY_COUNT)]
FEATURES += [f"spec_min_freq_{x}_10m" for x in range(KAGGLE_SPECTROGRAM_FREQUENCY_COUNT)]
FEATURES += [f"spec_max_freq_{x}_10m" for x in range(KAGGLE_SPECTROGRAM_FREQUENCY_COUNT)]
FEATURES += [f"eeg_mean_freq_{x}_10s" for x in range(EEG_SPECTROGRAM_FREQUENCY_COUNT)]
FEATURES += [f"spec_min_freq_{x}_10s" for x in range(EEG_SPECTROGRAM_FREQUENCY_COUNT)]
FEATURES += [f"spec_max_freq_{x}_10s" for x in range(EEG_SPECTROGRAM_FREQUENCY_COUNT)]
data = np.zeros((len(train_df), len(FEATURES)))

def extract_kaggle_spectrogram_features(time_window):
  average_frequencies = np.nanmean(time_window, axis=0) # average over time
  min_frequencies = np.nanmin(time_window, axis=0) # min over time
  max_frequencies = np.nanmax(time_window, axis=0) # max over time
  return average_frequencies, min_frequencies, max_frequencies

def extract_train_kaggle_spectrogram_features(row, all_spectrograms):
  spectrogram_id = int(row["spectrogram_id"])
  middle_offset = (row["min_offset"] + row["max_offset"]) // 2 # this the middle between the least spectrogram offset and greatest spectogram offset
  row_index = int(middle_offset // 2) # each spectrogram row corresponds to 2s, so we divide by 2 to get the row index
  window = np.array(all_spectrograms[spectrogram_id][row_index:row_index+300,:])
  average_frequencies, min_frequencies, max_frequencies = extract_kaggle_spectrogram_features(window)
  return average_frequencies, min_frequencies, max_frequencies

def extract_eeg_spectrogram_features(row, all_spectrograms):
  eeg_id = int(row["eeg_id"])
  eeg_spectrogram = all_spectrograms[eeg_id]
  reshaped_eeg_spectrogram = np.zeros((640,256), dtype="float32")
  for j in range(5): # reshape the spectrogram by stacking the 5 channels vertically ([128,256,4] -> [640,256])
    reshaped_eeg_spectrogram[128*j:128*(j+1),] = eeg_spectrogram[:,:,j]
  reshaped_eeg_spectrogram = reshaped_eeg_spectrogram.T # transpose to have time on rows and frequencies on columns ([256,640])

  # get the central 10 seconds
  pixels_per_second = 256 / 50  # 5.12 pixels per second
  duration_pixels = int(10 * pixels_per_second)  # 51 pixels for 10 seconds
  center_pixel = 256 // 2  # 128
  start_idx = center_pixel - duration_pixels // 2  # 128 - 25 = 103
  end_idx = start_idx + duration_pixels  # 103 + 51 = 154
  window = reshaped_eeg_spectrogram[start_idx:end_idx,:] # extract the central 10s window

  average_frequencies, min_frequencies, max_frequencies = extract_kaggle_spectrogram_features(window)
  return average_frequencies, min_frequencies, max_frequencies

for i in tqdm(range(len(train_df)), total=len(train_df)):
  row = train_df.iloc[i]
  average_features_kaggle, min_features_kaggle, max_frequencies_kaggle = extract_train_kaggle_spectrogram_features(row, kaggle_spectrograms)
  average_features_eeg, min_features_eeg, max_frequencies_eeg = extract_eeg_spectrogram_features(row, eeg_spectrograms)

  data[i,:KAGGLE_SPECTROGRAM_FREQUENCY_COUNT] = average_features_kaggle
  data[i,KAGGLE_SPECTROGRAM_FREQUENCY_COUNT:2*KAGGLE_SPECTROGRAM_FREQUENCY_COUNT] = min_features_kaggle
  data[i,2*KAGGLE_SPECTROGRAM_FREQUENCY_COUNT:3*KAGGLE_SPECTROGRAM_FREQUENCY_COUNT] = max_frequencies_kaggle
  data[i,3*KAGGLE_SPECTROGRAM_FREQUENCY_COUNT:3*KAGGLE_SPECTROGRAM_FREQUENCY_COUNT+EEG_SPECTROGRAM_FREQUENCY_COUNT] = average_features_eeg
  data[i,3*KAGGLE_SPECTROGRAM_FREQUENCY_COUNT+EEG_SPECTROGRAM_FREQUENCY_COUNT:3*KAGGLE_SPECTROGRAM_FREQUENCY_COUNT+2*EEG_SPECTROGRAM_FREQUENCY_COUNT] = min_features_eeg
  data[i,3*KAGGLE_SPECTROGRAM_FREQUENCY_COUNT+2*EEG_SPECTROGRAM_FREQUENCY_COUNT:3*KAGGLE_SPECTROGRAM_FREQUENCY_COUNT+3*EEG_SPECTROGRAM_FREQUENCY_COUNT] = max_frequencies_eeg

100%|██████████| 17089/17089 [02:07<00:00, 134.55it/s]


In [27]:
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
train_df[FEATURES] = data

gc.collect()

train_df.head()

Unnamed: 0,eeg_id,spectrogram_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,...,spec_max_freq_630_10s,spec_max_freq_631_10s,spec_max_freq_632_10s,spec_max_freq_633_10s,spec_max_freq_634_10s,spec_max_freq_635_10s,spec_max_freq_636_10s,spec_max_freq_637_10s,spec_max_freq_638_10s,spec_max_freq_639_10s
0,568657,789577333,20654,Other,0.0,0.0,0.25,0.0,0.166667,0.583333,...,0.696763,0.706355,0.720342,0.730382,0.739936,0.745822,0.749742,0.751327,0.762009,0.768977
1,582999,1552638400,20230,LPD,0.0,0.857143,0.0,0.071429,0.0,0.071429,...,0.879343,0.890854,0.90805,0.924625,0.940498,0.955845,0.970025,0.981525,0.990747,0.996359
2,642382,14960202,5955,Other,0.0,0.0,0.0,0.0,0.0,1.0,...,0.816606,0.828744,0.841835,0.85763,0.868275,0.877941,0.885635,0.886907,0.891141,0.892733
3,751790,618728447,38549,GPD,0.0,0.0,1.0,0.0,0.0,0.0,...,0.665133,0.633162,0.623576,0.657301,0.664624,0.682423,0.676099,0.677395,0.675104,0.663505
4,778705,52296320,40955,Other,0.0,0.0,0.0,0.0,0.0,1.0,...,0.58962,0.584764,0.583061,0.576891,0.560683,0.563658,0.560212,0.582349,0.599415,0.608775


## Train XGBoost Model

In [28]:
N_SPLITS = 5

In [29]:
fold_creator = KFoldCreator(n_splits=N_SPLITS, seed=Constants.SEED)
train_folds_df = fold_creator.create_folds(
    df=train_df, stratify_col="expert_consensus", group_col="patient_id"
)

In [30]:
all_oof = []
all_true = []
targets_dict = {"Seizure":0, "LPD":1, "GPD":2, "LRDA":3, "GRDA":4, "Other":5}

models_save_path = get_models_save_path() / "xgboost" / "spectrogram_means" / DATA_PREPARATION_VOTE_METHOD
models_save_path.mkdir(parents=True, exist_ok=True)

for fold in range(N_SPLITS):
    fold_train_df = train_folds_df[train_folds_df["fold"] != fold].reset_index(drop=True)
    fold_valid_df = train_folds_df[train_folds_df["fold"] == fold].reset_index(drop=True)

    print("=" * 40)
    print(f"FOLD {fold}")
    print(f"Train size: {len(fold_train_df)}, Valid size: {len(fold_valid_df)}")
    print("=" * 30)

    X_train = fold_train_df[FEATURES]
    y_train = fold_train_df["expert_consensus"].map(targets_dict)
    
    X_valid = fold_valid_df[FEATURES]
    y_valid = fold_valid_df["expert_consensus"].map(targets_dict)

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)

    params = {
        "objective": "multi:softprob",
        "num_class": len(Constants.TARGETS),
        "device": "cuda",
        "tree_method": "hist",
        "eval_metric": "mlogloss",
        "seed": Constants.SEED,
    }

    evals = [(dvalid, "eval")]
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=300,
        evals=evals,
        verbose_eval=100,
        early_stopping_rounds=10,
    )
    
    model.save_model(models_save_path / f"fold_{fold}.json")

    oof = model.predict(dvalid)
    all_oof.extend(oof)

    all_true.extend(fold_valid_df[Constants.TARGETS].values)

    del X_train, y_train, X_valid, y_valid, dtrain, dvalid, oof
    gc.collect()

all_oof = np.array(all_oof)
all_true = np.array(all_true)

FOLD 0
Train size: 13022, Valid size: 4067
[0]	eval-mlogloss:1.61136
[33]	eval-mlogloss:1.28431
FOLD 1
Train size: 13431, Valid size: 3658
[0]	eval-mlogloss:1.57259
[50]	eval-mlogloss:1.02083
FOLD 2
Train size: 13708, Valid size: 3381
[0]	eval-mlogloss:1.57379
[49]	eval-mlogloss:1.07309
FOLD 3
Train size: 14464, Valid size: 2625
[0]	eval-mlogloss:1.58194
[36]	eval-mlogloss:1.14563
FOLD 4
Train size: 13731, Valid size: 3358
[0]	eval-mlogloss:1.57938
[36]	eval-mlogloss:1.08854


## CV Score

In [31]:
all_oof_tensor = torch.tensor(all_oof, dtype=torch.float32)
all_true_tensor = torch.tensor(all_true, dtype=torch.float32)

kl_score = nn.KLDivLoss(reduction="batchmean")
score = kl_score(all_oof_tensor.log(), all_true_tensor).item()

print(f"OOF KL Score: {score}")

OOF KL Score: 0.9043303728103638


## Create EEG Spectrograms for Test Set

In [32]:
test_eeg_save_path = TEST_EEG_SPECTROGRAMS_PATH / EEG_SPECTROGRAMS_TYPE
test_eeg_save_path.mkdir(parents=True, exist_ok=True)

spectrogram_creator = EEGSpectrogramGenerator([EEG_SPECTROGRAMS_TYPE])

In [33]:
test_eeg_ids = test_df["eeg_id"].unique()
print(f"Generating spectrograms for {len(test_eeg_ids)} test EEG recordings")

test_eeg_spectrograms = {}
for i, eeg_id in enumerate(tqdm(test_eeg_ids)):
  eeg = pd.read_parquet(DATA_PATH / "test_eegs" / f"{eeg_id}.parquet")
  spectrogram = spectrogram_creator.generate(eeg)

  assert len(spectrogram) == 1, "Expected only one spectrogram type"
  test_eeg_spectrograms[eeg_id] = spectrogram[EEG_SPECTROGRAMS_TYPE]

Generating spectrograms for 1 test EEG recordings


100%|██████████| 1/1 [00:02<00:00,  2.20s/it]


## Infer on Test and create Submission

In [34]:
del train_df
gc.collect()

0

In [35]:
test_spectrograms_dir = DATA_PATH / "test_spectrograms"
test_spectrogram_files = list(test_spectrograms_dir.glob("*.parquet"))
print(f"Found {len(test_spectrogram_files)} test spectrogram files to load into memory")

test_spectrograms = {}
for file in tqdm(test_spectrogram_files):
  eeg_id, content = get_kaggle_spectrogram_content(file)
  test_spectrograms[eeg_id] = content

gc.collect()
print("Loaded all test spectrograms into memory")

Found 1 test spectrogram files to load into memory


100%|██████████| 1/1 [00:00<00:00, 17.50it/s]

Loaded all test spectrograms into memory





In [36]:
test_data = np.zeros((len(test_df), len(FEATURES)))

def extract_test_spectrogram_features(row, all_spectrograms):
  # this differs from train because all test spectrograms are exactly 10 minutes long, so we don't need to extract the center window
  spectrogram_id = int(row["spectrogram_id"])
  content = np.array(all_spectrograms[spectrogram_id][:])
  average_frequencies, min_frequencies, max_frequencies = extract_kaggle_spectrogram_features(content)
  return average_frequencies, min_frequencies, max_frequencies

for i in tqdm(range(len(test_df)), total=len(test_df)):
  row = test_df.iloc[i]
  average_features_kaggle, min_features_kaggle, max_frequencies_kaggle = extract_test_spectrogram_features(row, test_spectrograms)
  average_features_eeg, min_features_eeg, max_frequencies_eeg = extract_eeg_spectrogram_features(row, test_eeg_spectrograms)

  test_data[i,:KAGGLE_SPECTROGRAM_FREQUENCY_COUNT] = average_features_kaggle
  test_data[i,KAGGLE_SPECTROGRAM_FREQUENCY_COUNT:2*KAGGLE_SPECTROGRAM_FREQUENCY_COUNT] = min_features_kaggle
  test_data[i,2*KAGGLE_SPECTROGRAM_FREQUENCY_COUNT:3*KAGGLE_SPECTROGRAM_FREQUENCY_COUNT] = max_frequencies_kaggle
  test_data[i,3*KAGGLE_SPECTROGRAM_FREQUENCY_COUNT:3*KAGGLE_SPECTROGRAM_FREQUENCY_COUNT+EEG_SPECTROGRAM_FREQUENCY_COUNT] = average_features_eeg
  test_data[i,3*KAGGLE_SPECTROGRAM_FREQUENCY_COUNT+EEG_SPECTROGRAM_FREQUENCY_COUNT:3*KAGGLE_SPECTROGRAM_FREQUENCY_COUNT+2*EEG_SPECTROGRAM_FREQUENCY_COUNT] = min_features_eeg
  test_data[i,3*KAGGLE_SPECTROGRAM_FREQUENCY_COUNT+2*EEG_SPECTROGRAM_FREQUENCY_COUNT:3*KAGGLE_SPECTROGRAM_FREQUENCY_COUNT+3*EEG_SPECTROGRAM_FREQUENCY_COUNT] = max_frequencies_eeg

100%|██████████| 1/1 [00:00<00:00, 111.78it/s]


In [37]:
test_df[FEATURES] = test_data

del test_data
del test_spectrograms
gc.collect()

test_df.head()

Unnamed: 0,spectrogram_id,eeg_id,patient_id,spec_mean_freq_0_10m,spec_mean_freq_1_10m,spec_mean_freq_2_10m,spec_mean_freq_3_10m,spec_mean_freq_4_10m,spec_mean_freq_5_10m,spec_mean_freq_6_10m,...,spec_max_freq_630_10s,spec_max_freq_631_10s,spec_max_freq_632_10s,spec_max_freq_633_10s,spec_max_freq_634_10s,spec_max_freq_635_10s,spec_max_freq_636_10s,spec_max_freq_637_10s,spec_max_freq_638_10s,spec_max_freq_639_10s
0,853520,3911565283,6885,16.864132,19.120565,18.342468,13.408634,8.0575,4.890133,3.460633,...,0.855498,0.844325,0.8336,0.825578,0.818686,0.825944,0.840221,0.861681,0.882156,0.89733


In [38]:
test_preds = []

for fold in range(N_SPLITS):
  print("=" * 40)
  print(f"Predicting fold {fold}")
  print("=" * 40)

  X_train = test_df[FEATURES]
  dtest = xgb.DMatrix(X_train)

  model = xgb.Booster()
  model.load_model(models_save_path / f"fold_{fold}.json")

  preds = model.predict(dtest)
  test_preds.append(preds)

test_preds = np.mean(test_preds, axis=0)
print(f"Test predictions shape: {test_preds.shape}")

Predicting fold 0
Predicting fold 1
Predicting fold 2
Predicting fold 3
Predicting fold 4
Test predictions shape: (1, 6)


In [39]:
# sanity check: all predictions should sum to 1
assert np.allclose(test_preds.sum(axis=1), 1.0)

In [40]:
submission = pd.DataFrame({"eeg_id": test_df["eeg_id"]})
submission[Constants.TARGETS] = test_preds

submission.to_csv(get_submission_csv_path(), index=False)