# XGBoost

In [1]:
# Strategy handed to EEGDataProcessor.process_data(vote_method=...) and used as the
# last component of the model save directory.
# Allowed values: "max_vote_window" or "sum_and_normalize".
# Decides how to aggregate the predictions of the overlapping windows.
# NOTE(review): the saved output of the processing cell reports 'max_vote_window',
# which does not match the value set here -- re-run to confirm the outputs are current.
DATA_PREPARATION_VOTE_METHOD = "sum_and_normalize"

In [2]:
import os
import sys
import warnings
import gc
import pathlib

# Put the project sources at the front of sys.path so the `src.*` imports below resolve.
# A non-empty KAGGLE_URL_BASE means we are running on Kaggle; otherwise we are running
# locally and use the directory three levels above the current working directory.
source_root = (
  "/kaggle/input/hsm-source-files"
  if os.environ.get("KAGGLE_URL_BASE", "")
  else os.path.abspath(os.path.join(os.getcwd(), "..", "..", ".."))
)
sys.path.insert(0, source_root)

import torch.nn as nn
import pandas as pd
import numpy as np
import xgboost as xgb

import torch

from src.utils.utils import get_raw_data_dir, get_processed_data_dir, get_submission_csv_path, set_seeds, get_models_save_path
from src.utils.constants import Constants
from src.datasets.eeg_processor import EEGDataProcessor
from src.utils.k_folds_creator import KFoldCreator

from tqdm import tqdm

set_seeds(42)

2025-10-08 19:43:41,132 :: root :: INFO :: Initialising Utils
2025-10-08 19:43:41,134 :: root :: INFO :: Initialising Datasets


In [3]:
# Root of the raw data; later cells build the spectrogram directories from it.
DATA_PATH = get_raw_data_dir()

# Build the processed training table with the configured vote aggregation.
# skip_npy=True is passed through to the processor unchanged.
processor = EEGDataProcessor(
  raw_data_path=DATA_PATH,
  processed_data_path=get_processed_data_dir(),
)
train_df = processor.process_data(
  vote_method=DATA_PREPARATION_VOTE_METHOD,
  skip_npy=True,
)

# Test metadata; one row per test sample.
test_df = pd.read_csv(DATA_PATH / "test.csv")

# KL-divergence criterion (it is created again in the CV score cell before use).
kl_score = nn.KLDivLoss(reduction="batchmean")

Processor initialized.
Raw data path: '/home/david/git/aicomp/data'
Processed data path: '/home/david/git/aicomp/data/processed'
Starting EEG Data Processing Pipeline
Skipping NumPy file creation as requested.
Using 'max_vote_window' vote aggregation strategy.

Processed train data saved to '/home/david/git/aicomp/data/processed/train_processed.csv'.
Shape of the final dataframe: (17089, 12)

Pipeline finished successfully!


## Load Spectrogram Files into Memory

In [4]:
# Collect every parquet file of the train spectrogram directory.
spectrograms_dir = DATA_PATH / "train_spectrograms"
spectrogram_files = [*spectrograms_dir.glob("*.parquet")]
print(f"Found {len(spectrogram_files)} train spectrogram files to load into memory")

def get_spectrogram_content(spectrogram_file: pathlib.Path):
  """Read one spectrogram parquet file and return its id and values.

  Args:
    spectrogram_file: Path of the parquet file to read. The spectrogram id is
      parsed as an int from the file stem: the text after the last "_", or the
      whole stem when it contains no "_".

  Returns:
    A ``(spectrogram_id, content)`` tuple, where ``content`` holds the file's
    values as a NumPy array with the "time" column dropped.
  """
  spectrogram_id = int(spectrogram_file.stem.split("_")[-1])
  # Read the path that was passed in. The earlier version read the global loop
  # variable `file`, so it only worked when the caller's loop variable had
  # exactly that name.
  content = pd.read_parquet(spectrogram_file)
  content = content.drop(columns=["time"]).values
  return spectrogram_id, content

# In-memory cache of all train spectrograms, keyed by integer spectrogram id.
spectrograms = {}
for file in tqdm(spectrogram_files):
  loaded_id, loaded_values = get_spectrogram_content(file)
  spectrograms[loaded_id] = loaded_values

gc.collect()
print("Loaded all train spectrograms into memory")

Found 11138 train spectrogram files to load into memory


100%|██████████| 11138/11138 [05:45<00:00, 32.21it/s]


Loaded all train spectrograms into memory


## Feature Engineering

We need tabular features for the XGBoost model.
For this, we take the mean, min and max over time of each of the 400 spectrogram frequency columns (100 per chain).
These statistics are computed on the middle 10 minutes (300 rows of 2 seconds each) of each spectrogram.
For each EEG ID, this produces 3 × 400 = 1200 features.

In [5]:
FREQUENCY_COUNT = 400 # number of frequency bins (columns) in each spectrogram

# Feature column names, grouped by statistic: all means first, then mins, then maxes.
FEATURES = [
  *(f"spec_mean_freq_{x}" for x in range(FREQUENCY_COUNT)),
  *(f"spec_min_freq_{x}" for x in range(FREQUENCY_COUNT)),
  *(f"spec_max_freq_{x}" for x in range(FREQUENCY_COUNT)),
]

# One row per training sample and one column per feature; filled by the loop below.
data = np.zeros(shape=(len(train_df), len(FEATURES)))

def extract_spectrogram_features(ten_minute_window):
  """Reduce a spectrogram window to per-column mean, min and max.

  Args:
    ten_minute_window: 2-D array reduced along axis 0. Callers in this notebook
      pass spectrogram rows (up to 300 rows, i.e. 10 minutes) by frequency columns.

  Returns:
    Tuple ``(mean, min, max)``, each computed along axis 0.
  """
  return (
    ten_minute_window.mean(axis=0),
    ten_minute_window.min(axis=0),
    ten_minute_window.max(axis=0),
  )

def extract_train_spectrogram_features(row, all_spectrograms):
  """Compute the mean/min/max features for one training row.

  Args:
    row: Row providing "spectrogram_id", "min_offset" and "max_offset".
    all_spectrograms: Mapping from integer spectrogram id to its 2-D value array.

  Returns:
    The ``(mean, min, max)`` tuple produced by extract_spectrogram_features
    for the selected window.
  """
  spec_key = int(row["spectrogram_id"])
  # Midpoint between the smallest and the greatest spectrogram offset of this row.
  offset_midpoint = (row["min_offset"] + row["max_offset"]) // 2
  # Each spectrogram row corresponds to 2s, so halving the offset gives the row index.
  first_row = int(offset_midpoint // 2)
  # Take up to 300 rows (10 minutes) starting at that row, across all columns.
  selected_rows = np.array(all_spectrograms[spec_key][first_row:first_row+300,:])
  return extract_spectrogram_features(selected_rows)

for sample_idx in tqdm(range(len(train_df)), total=len(train_df)):
  sample_stats = extract_train_spectrogram_features(train_df.iloc[sample_idx], spectrograms)

  # Row layout is [mean | min | max], FREQUENCY_COUNT columns for each statistic.
  for block_idx, stat_values in enumerate(sample_stats):
    block_start = block_idx * FREQUENCY_COUNT
    data[sample_idx, block_start:block_start + FREQUENCY_COUNT] = stat_values

100%|██████████| 17089/17089 [00:03<00:00, 4953.69it/s]


In [6]:
# Silence pandas' PerformanceWarning for the rest of the session before adding
# all feature columns to the frame in a single assignment.
warnings.filterwarnings(action="ignore", category=pd.errors.PerformanceWarning)
train_df[FEATURES] = data

# The feature buffer and the raw spectrogram cache are not used after this point.
del data, spectrograms
gc.collect()

train_df.head()

Unnamed: 0,eeg_id,spectrogram_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,...,spec_max_freq_390,spec_max_freq_391,spec_max_freq_392,spec_max_freq_393,spec_max_freq_394,spec_max_freq_395,spec_max_freq_396,spec_max_freq_397,spec_max_freq_398,spec_max_freq_399
0,568657,789577333,20654,Other,0.0,0.0,0.25,0.0,0.166667,0.583333,...,688.390015,658.429993,635.539978,628.02002,629.719971,626.849976,609.75,598.27002,559.200012,547.330017
1,582999,1552638400,20230,LPD,0.0,0.857143,0.0,0.071429,0.0,0.071429,...,0.63,0.89,1.25,1.58,1.72,1.63,1.41,1.16,0.81,0.83
2,642382,14960202,5955,Other,0.0,0.0,0.0,0.0,0.0,1.0,...,35.959999,40.369999,37.049999,36.740002,38.330002,41.080002,56.959999,51.360001,40.66,38.490002
3,751790,618728447,38549,GPD,0.0,0.0,1.0,0.0,0.0,0.0,...,0.43,0.32,0.37,0.36,0.38,0.38,0.37,0.31,0.23,0.32
4,778705,52296320,40955,Other,0.0,0.0,0.0,0.0,0.0,1.0,...,89517.703125,86578.101562,79598.1875,70372.320312,59542.519531,48824.449219,39700.550781,32905.851562,28676.740234,27125.720703


## Train XGBoost Model

In [7]:
# Number of cross-validation folds; one model is trained and saved per fold,
# and the same count of saved models is loaded and averaged at inference.
N_SPLITS = 5

In [8]:
# create_folds is asked to stratify on "expert_consensus" and to group on "patient_id";
# the training loop below selects rows through the resulting "fold" column.
fold_creator = KFoldCreator(seed=Constants.SEED, n_splits=N_SPLITS)
train_folds_df = fold_creator.create_folds(
    group_col="patient_id",
    stratify_col="expert_consensus",
    df=train_df,
)

In [9]:
# Integer class id for each consensus label, used as the hard training target.
targets_dict = {"Seizure":0, "LPD":1, "GPD":2, "LRDA":3, "GRDA":4, "Other":5}

# One model file per fold is written below this directory.
models_save_path = get_models_save_path() / "xgboost" / "spectrogram_means" / DATA_PREPARATION_VOTE_METHOD
models_save_path.mkdir(parents=True, exist_ok=True)

# Out-of-fold predictions and the matching vote targets, appended fold by fold
# so that both stay in the same row order.
all_oof = []
all_true = []

for fold in range(N_SPLITS):
    in_valid_fold = train_folds_df["fold"] == fold
    fold_train_df = train_folds_df[~in_valid_fold].reset_index(drop=True)
    fold_valid_df = train_folds_df[in_valid_fold].reset_index(drop=True)

    print("=" * 40)
    print(f"FOLD {fold}")
    print(f"Train size: {len(fold_train_df)}, Valid size: {len(fold_valid_df)}")
    print("=" * 30)

    # Features come from the engineered columns; labels are the mapped consensus class.
    dtrain = xgb.DMatrix(
        fold_train_df[FEATURES],
        label=fold_train_df["expert_consensus"].map(targets_dict),
    )
    dvalid = xgb.DMatrix(
        fold_valid_df[FEATURES],
        label=fold_valid_df["expert_consensus"].map(targets_dict),
    )

    params = {
        "objective": "multi:softprob",
        "num_class": len(Constants.TARGETS),
        "device": "cuda",
        "tree_method": "hist",
        "eval_metric": "mlogloss",
        "seed": Constants.SEED,
    }

    # The validation fold is the only evaluation set, so it drives early stopping.
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=300,
        evals=[(dvalid, "eval")],
        verbose_eval=100,
        early_stopping_rounds=10,
    )

    model.save_model(models_save_path / f"fold_{fold}.json")

    # Per-class probabilities for the held-out rows, plus their vote columns.
    all_oof.extend(model.predict(dvalid))
    all_true.extend(fold_valid_df[Constants.TARGETS].values)

    del dtrain, dvalid
    gc.collect()

all_oof = np.array(all_oof)
all_true = np.array(all_true)

FOLD 0
Train size: 13022, Valid size: 4067
[0]	eval-mlogloss:1.64093
[25]	eval-mlogloss:1.37529
FOLD 1
Train size: 13431, Valid size: 3658
[0]	eval-mlogloss:1.58046
[27]	eval-mlogloss:1.13886
FOLD 2
Train size: 13708, Valid size: 3381
[0]	eval-mlogloss:1.60073
[35]	eval-mlogloss:1.22203
FOLD 3
Train size: 14464, Valid size: 2625
[0]	eval-mlogloss:1.60919
[32]	eval-mlogloss:1.28704
FOLD 4
Train size: 13731, Valid size: 3358
[0]	eval-mlogloss:1.60742
[25]	eval-mlogloss:1.25664


## CV Score

In [10]:
# Convert both arrays to float32 tensors for the torch criterion.
all_oof_tensor, all_true_tensor = (
    torch.tensor(values, dtype=torch.float32) for values in (all_oof, all_true)
)

# The predictions are passed as logarithms and the vote columns as the target,
# averaged with the "batchmean" reduction.
kl_score = nn.KLDivLoss(reduction="batchmean")
oof_log_probs = torch.log(all_oof_tensor)
score = kl_score(oof_log_probs, all_true_tensor).item()

print(f"OOF KL Score: {score}")

OOF KL Score: 1.0002787113189697


## Infer on Test and create Submission

In [11]:
# Drop the feature-augmented training frame before building the test features.
# NOTE(review): train_folds_df, fold_train_df and fold_valid_df are never deleted in this
# notebook, so training data may still be held in memory after this cell -- confirm intent.
del train_df
gc.collect()

0

In [12]:
# Collect every parquet file of the test spectrogram directory.
test_spectrograms_dir = DATA_PATH / "test_spectrograms"
test_spectrogram_files = [*test_spectrograms_dir.glob("*.parquet")]
print(f"Found {len(test_spectrogram_files)} test spectrogram files to load into memory")

# In-memory cache of all test spectrograms, keyed by integer spectrogram id.
test_spectrograms = {}
for file in tqdm(test_spectrogram_files):
  loaded_id, loaded_values = get_spectrogram_content(file)
  test_spectrograms[loaded_id] = loaded_values

gc.collect()
print("Loaded all test spectrograms into memory")

Found 1 test spectrogram files to load into memory


100%|██████████| 1/1 [00:00<00:00, 32.75it/s]

Loaded all test spectrograms into memory





In [13]:
# Feature buffer for the test rows: one row per test_df entry, one column per name in FEATURES.
test_data = np.zeros((len(test_df), len(FEATURES)))

def extract_test_spectrogram_features(row, all_spectrograms):
  """Compute the mean/min/max features for one test row.

  Args:
    row: Row providing "spectrogram_id".
    all_spectrograms: Mapping from integer spectrogram id to its 2-D value array.

  Returns:
    The ``(mean, min, max)`` tuple produced by extract_spectrogram_features
    for the whole spectrogram.
  """
  # Unlike the train variant, no centre window is cut out: per the original note,
  # all test spectrograms are exactly 10 minutes long, so every row is used.
  spec_key = int(row["spectrogram_id"])
  full_spectrogram = np.array(all_spectrograms[spec_key][:])
  return extract_spectrogram_features(full_spectrogram)

for sample_idx in tqdm(range(len(test_df)), total=len(test_df)):
  sample_stats = extract_test_spectrogram_features(test_df.iloc[sample_idx], test_spectrograms)

  # Row layout is [mean | min | max], FREQUENCY_COUNT columns for each statistic.
  for block_idx, stat_values in enumerate(sample_stats):
    block_start = block_idx * FREQUENCY_COUNT
    test_data[sample_idx, block_start:block_start + FREQUENCY_COUNT] = stat_values

100%|██████████| 1/1 [00:00<00:00, 1530.21it/s]


In [14]:
# Add all feature columns to the test frame in a single assignment.
test_df[FEATURES] = test_data

# The feature buffer and the raw test spectrogram cache are not used after this point.
del test_data, test_spectrograms
gc.collect()

test_df.head()

Unnamed: 0,spectrogram_id,eeg_id,patient_id,spec_mean_freq_0,spec_mean_freq_1,spec_mean_freq_2,spec_mean_freq_3,spec_mean_freq_4,spec_mean_freq_5,spec_mean_freq_6,...,spec_max_freq_390,spec_max_freq_391,spec_max_freq_392,spec_max_freq_393,spec_max_freq_394,spec_max_freq_395,spec_max_freq_396,spec_max_freq_397,spec_max_freq_398,spec_max_freq_399
0,853520,3911565283,6885,16.864132,19.120565,18.342468,13.408634,8.0575,4.890133,3.460633,...,0.58,0.59,0.59,0.73,0.48,0.41,0.6,0.6,0.61,0.6


In [15]:
# The test features are the same for every fold, so build the DMatrix once
# instead of once per fold.
X_test = test_df[FEATURES]
dtest = xgb.DMatrix(X_test)

# One (n_test_rows, n_classes) probability array per fold model.
test_preds = []

for fold in range(N_SPLITS):
  print("=" * 40)
  print(f"Predicting fold {fold}")
  print("=" * 40)

  # Load the model saved for this fold by the training loop.
  model = xgb.Booster()
  model.load_model(models_save_path / f"fold_{fold}.json")

  test_preds.append(model.predict(dtest))

# Ensemble by averaging the class probabilities of all fold models.
test_preds = np.mean(test_preds, axis=0)
print(f"Test predictions shape: {test_preds.shape}")

Predicting fold 0
Predicting fold 1
Predicting fold 2
Predicting fold 3
Predicting fold 4
Test predictions shape: (1, 6)


In [16]:
# Sanity check: the averaged class probabilities of every test row should sum to 1
# (compared with np.allclose's default tolerances).
assert np.allclose(test_preds.sum(axis=1), 1.0)

In [17]:
# Submission layout: the eeg_id column followed by one probability column per
# entry of Constants.TARGETS, in the order the model outputs its classes.
submission = test_df[["eeg_id"]].copy()
submission[Constants.TARGETS] = test_preds

submission.to_csv(get_submission_csv_path(), index=False)