# XGBoost

https://www.kaggle.com/code/cdeotte/catboost-starter-lb-0-60?scriptVersionId=158772898

In [1]:
DATA_PREPARATION_VOTE_METHOD = "sum_and_normalize" # "max_vote_window" or "sum_and_normalize". Decides how to aggregate the predictions of the overlapping windows

In [94]:
import os
import warnings
import gc
import pathlib

if bool(os.environ.get("KAGGLE_URL_BASE", "")):
  import sys
  # running on kaggle
  sys.path.insert(0, "/kaggle/input/hsm-source-files")
else:
  # running locally
  import notebook_setup

import torch.nn as nn
import pandas as pd
import numpy as np
import xgboost as xgb

import torch

from src.utils.utils import get_raw_data_dir, get_processed_data_dir, get_submission_csv_path, running_in_kaggle
from src.utils.constants import Constants
from src.datasets.eeg_processor import EEGDataProcessor
from src.utils.k_folds_creator import KFoldCreator

from tqdm import tqdm

In [3]:
DATA_PATH = get_raw_data_dir()

processor = EEGDataProcessor(raw_data_path=DATA_PATH, processed_data_path=get_processed_data_dir())
train_df = processor.process_data(vote_method=DATA_PREPARATION_VOTE_METHOD, skip_npy=True)

test_df = pd.read_csv(DATA_PATH / "test.csv")

kl_score = nn.KLDivLoss(reduction="batchmean")

Processor initialized.
Raw data path: '/home/david/git/aicomp/data'
Processed data path: '/home/david/git/aicomp/data/processed'
Starting EEG Data Processing Pipeline
Skipping NumPy file creation as requested.
Using 'sum_and_normalize' vote aggregation strategy with spectrogram info.

Processed train data saved to '/home/david/git/aicomp/data/processed/train_processed.csv'.
Shape of the final dataframe: (17089, 12)

Pipeline finished successfully!


## Feature Engineering

We need features for the XGBoost model.
For this, we take the mean over time for all of the 400 spectrogram frequencies.
We take the middle 10 minutes of all spectrograms.
For each EEG ID, this produces 400 features.

## Load Spectrogram Files into Memory

In [19]:
spectrograms_dir = DATA_PATH / "train_spectrograms"
spectrogram_files = list(spectrograms_dir.glob("*.parquet"))
print(f"Found {len(spectrogram_files)} spectrogram files to load into memory")

def get_spectrogram_content(spectromgram_file: pathlib.Path):
  spectrogram_id = int(file.stem.split("_")[-1])
  content = pd.read_parquet(file)
  content = content.drop(columns=["time"]).values
  return spectrogram_id, content

spectrograms = {}
for file in tqdm(spectrogram_files):
  spectrogram_id, content = get_spectrogram_content(file)
  spectrograms[spectrogram_id] = content

gc.collect()
print("Loaded all spectrograms into memory")

Found 11138 spectrogram files to load into memory


100%|██████████| 11138/11138 [06:14<00:00, 29.74it/s]

Loaded all spectrograms into memory





In [95]:
FEATURES = [f'freq_{x}' for x in range(400)]
data = np.zeros((len(train_df),len(FEATURES)))

def extract_spectrogram_features(row):
  spectrogram_id = int(row["spectrogram_id"])
  middle_offset = (row['min_offset'] + row['max_offset']) // 2 # this the middle between the least spectrogram offset and greatest spectogram offset
  row_index = int(middle_offset // 2) # each spectrogram row corresponds to 2s, so we divide by 2 to get the row index
  average_frequencies = np.array(spectrograms[spectrogram_id][row_index:row_index+300,:] ).mean(axis=0) # average over 300 rows (10 minutes)
  return average_frequencies

for i in tqdm(range(len(train_df)), total=len(train_df)):
  row = train_df.iloc[i]
  data[i,:] = extract_spectrogram_features(row)

  0%|          | 0/17089 [00:00<?, ?it/s]


NameError: name 'spectrograms' is not defined

In [38]:
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
train_df[FEATURES] = data

del data
del spectrograms
gc.collect()

train_df.head()

NameError: name 'data' is not defined

## Train XGBoost Model

In [40]:
N_SPLITS = 5

In [41]:
fold_creator = KFoldCreator(n_splits=N_SPLITS, seed=Constants.SEED)
train_folds_df = fold_creator.create_folds(
    df=train_df, stratify_col="expert_consensus", group_col="patient_id"
)

In [None]:
# warnings.filterwarnings("ignore", message="Falling back to prediction using DMatrix due to mismatched devices")

all_oof = []
all_true = []
TARS = {'Seizure':0, 'LPD':1, 'GPD':2, 'LRDA':3, 'GRDA':4, 'Other':5}

for fold in range(N_SPLITS):
    fold_train_df = train_folds_df[train_folds_df["fold"] != fold].reset_index(drop=True)
    fold_valid_df = train_folds_df[train_folds_df["fold"] == fold].reset_index(drop=True)

    print("=" * 40)
    print(f"FOLD {fold}")
    print(f"Train size: {len(fold_train_df)}, Valid size: {len(fold_valid_df)}")
    print("=" * 30)

    X_train = fold_train_df[FEATURES]
    y_train = fold_train_df["expert_consensus"].map(TARS)
    
    X_valid = fold_valid_df[FEATURES]
    y_valid = fold_valid_df["expert_consensus"].map(TARS)

    # Create DMatrix objects
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)

    # Set parameters
    params = {
        "objective": "multi:softprob",
        "num_class": len(Constants.TARGETS),
        "device": "cuda",
        "tree_method": "hist",
        "eval_metric": "mlogloss",
        "seed": Constants.SEED,
    }

    # Train model
    evals = [(dvalid, "eval")]
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=300,
        evals=evals,
        verbose_eval=100,
        early_stopping_rounds=10,
    )
    
    model.save_model(f"XGB_fold_{fold}.json")

    # Predict probabilities
    oof = model.predict(dvalid)
    all_oof.extend(oof)

    all_true.extend(fold_valid_df[Constants.TARGETS].values)

    del X_train, y_train, X_valid, y_valid, dtrain, dvalid, oof
    gc.collect()

all_oof = np.array(all_oof)
all_true = np.array(all_true)

FOLD 0
Train size: 13755, Valid size: 3334
[0]	eval-mlogloss:1.63018
[26]	eval-mlogloss:1.35269
FOLD 1
Train size: 13151, Valid size: 3938
[0]	eval-mlogloss:1.60886
[27]	eval-mlogloss:1.32410
FOLD 2
Train size: 13422, Valid size: 3667
[0]	eval-mlogloss:1.62823
[22]	eval-mlogloss:1.32224
FOLD 3
Train size: 14356, Valid size: 2733
[0]	eval-mlogloss:1.61536
[31]	eval-mlogloss:1.23553
FOLD 4
Train size: 13672, Valid size: 3417
[0]	eval-mlogloss:1.64184
[20]	eval-mlogloss:1.42018


## CV Score

In [93]:
all_oof_tensor = torch.tensor(all_oof, dtype=torch.float32)
all_true_tensor = torch.tensor(all_true, dtype=torch.float32)

kl_score = nn.KLDivLoss(reduction="batchmean")
score = kl_score(all_oof_tensor.log(), all_true_tensor).item()

print(f"OOF KL Score: {score}")

OOF KL Score: 1.055870532989502


## Infer on Test and create Submission

In [None]:
del train_df
gc.collect()