# Baseline Support Vector Machine with Mean Spectrogram Features

In [1]:
# How the predictions of overlapping windows are aggregated during data
# preparation: either "max_vote_window" or "sum_and_normalize".
DATA_PREPARATION_VOTE_METHOD = "sum_and_normalize"

# When True, the training phase is skipped and only evaluation with existing
# checkpoints is run.
SKIP_TRAIN = False

# Kaggle dataset ID under which existing model checkpoints are stored.
# Set to None to train a new model on Kaggle.
EXISTING_CHECKPOINT_KAGGLE_DATASET_ID = "hsm-models"

In [2]:
import os
import sys
import warnings
import gc
import pathlib

# Put the project sources first on sys.path so that the `src.*` imports resolve.
# A non-empty KAGGLE_URL_BASE environment variable is taken to mean "running on Kaggle".
on_kaggle = bool(os.environ.get("KAGGLE_URL_BASE", ""))
if on_kaggle:
  # running on kaggle
  source_root = "/kaggle/input/hsm-source-files"
else:
  # running locally: three directory levels above the current working directory
  source_root = os.path.abspath(os.path.join(os.getcwd(), "..", "..", ".."))
sys.path.insert(0, source_root)

import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

import torch
import joblib

from src.utils.utils import get_raw_data_dir, get_processed_data_dir, get_submission_csv_path, set_seeds, get_models_save_path, running_in_kaggle
from src.utils.constants import Constants
from src.datasets.eeg_processor import EEGDataProcessor
from src.utils.k_folds_creator import KFoldCreator

from tqdm import tqdm

# Fix the random seeds through the project helper (presumably for reproducibility).
# NOTE(review): later cells pass Constants.SEED to KFoldCreator and SVC — confirm
# it matches the literal 42 used here.
set_seeds(42)

# Silence pandas PerformanceWarning for the rest of the notebook.
warnings.simplefilter("ignore", category=pd.errors.PerformanceWarning)

2025-10-13 17:14:19,778 :: root :: INFO :: Initialising Utils
2025-10-13 17:14:19,781 :: root :: INFO :: Initialising Datasets


In [4]:
# Training never happens on Kaggle: there the notebook only evaluates existing
# checkpoints, so the flag is forced on whenever running_in_kaggle() is truthy.
if not SKIP_TRAIN:
  SKIP_TRAIN = running_in_kaggle()
print(f"SKIP_TRAIN={SKIP_TRAIN}")

SKIP_TRAIN=False


In [5]:
# Root directory of the raw data.
DATA_PATH = get_raw_data_dir()

# Build the processed training dataframe with the configured vote aggregation.
# skip_npy=True: the pipeline log reports that NumPy file creation is skipped.
processor = EEGDataProcessor(
  raw_data_path=DATA_PATH,
  processed_data_path=get_processed_data_dir(),
)
train_df = processor.process_data(
  vote_method=DATA_PREPARATION_VOTE_METHOD,
  skip_npy=True,
)

# Test metadata (one row per test sample).
test_df = pd.read_csv(DATA_PATH / "test.csv")

# KL-divergence criterion with "batchmean" reduction.
kl_score = nn.KLDivLoss(reduction="batchmean")

# Checkpoints are stored under <models root>/svm/spectrogram_means/<vote method>.
models_root = get_models_save_path(EXISTING_CHECKPOINT_KAGGLE_DATASET_ID)
models_save_path = models_root / "svm" / "spectrogram_means" / DATA_PREPARATION_VOTE_METHOD
models_save_path.mkdir(parents=True, exist_ok=True)
print("Using models save path:", models_save_path)

Processor initialized.
Raw data path: '/home/david/git/aicomp/data'
Processed data path: '/home/david/git/aicomp/data/processed'
Starting EEG Data Processing Pipeline
Skipping NumPy file creation as requested.
Using 'sum_and_normalize' vote aggregation strategy with spectrogram info.

Processed train data saved to '/home/david/git/aicomp/data/processed/train_processed.csv'.
Shape of the final dataframe: (17089, 12)

Pipeline finished successfully!
Using models save path: /home/david/git/aicomp/models/svm/spectrogram_means/sum_and_normalize


## Load Spectrogram Files into Memory

In [6]:
def get_spectrogram_content(spectrogram_file: pathlib.Path):
  """Read one spectrogram parquet file.

  The spectrogram id is parsed from the file name: the text after the last
  "_" of the stem (or the whole stem when it contains no "_") is converted
  to int.

  Args:
    spectrogram_file: path of the parquet file to read.

  Returns:
    Tuple ``(spectrogram_id, values)`` where ``values`` is the file content
    without its "time" column, as a NumPy array.
  """
  id_token = spectrogram_file.stem.rsplit("_", 1)[-1]
  spectrogram_id = int(id_token)
  frame = pd.read_parquet(spectrogram_file)
  return spectrogram_id, frame.drop(columns=["time"]).values

In [7]:
if not SKIP_TRAIN:
  # Collect every train spectrogram parquet file.
  spectrograms_dir = DATA_PATH / "train_spectrograms"
  spectrogram_files = [*spectrograms_dir.glob("*.parquet")]
  print(f"Found {len(spectrogram_files)} train spectrogram files to load into memory")

  # Map spectrogram id -> spectrogram values; get_spectrogram_content yields
  # exactly these (id, values) pairs, one per file.
  spectrograms = dict(map(get_spectrogram_content, tqdm(spectrogram_files)))

  gc.collect()
  print("Loaded all train spectrograms into memory")

Found 11138 train spectrogram files to load into memory


100%|██████████| 11138/11138 [06:46<00:00, 27.42it/s]

Loaded all train spectrograms into memory





## Feature Engineering

We only use the mean of each frequency bin (100 per chain => 400 in total) as features.
We take the middle 10 minutes of all spectrograms.

In [8]:
# Each spectrogram has 400 frequency bins.
FREQUENCY_COUNT = 400

# One feature column name per frequency bin.
FEATURES = [f"spec_mean_freq_{bin_index}" for bin_index in range(FREQUENCY_COUNT)]

def extract_spectrogram_features(ten_minute_window):
  """Return the mean over axis 0 (the rows) of a spectrogram window.

  Args:
    ten_minute_window: array whose rows are time steps and whose columns are
      frequency bins (callers pass a 300-row window for 10 minutes).

  Returns:
    The per-column mean of the window.
  """
  return ten_minute_window.mean(axis=0)

In [9]:
if not SKIP_TRAIN:
  def extract_train_spectrogram_features(row, all_spectrograms):
    """Compute the mean-frequency features for one training row.

    Args:
      row: training row providing "spectrogram_id", "min_offset" and "max_offset".
      all_spectrograms: mapping from spectrogram id to its 2D value array.

    Returns:
      Per-frequency means of the 300 spectrogram rows that start at the row
      index derived from the midpoint of the two offsets.
    """
    spectrogram = all_spectrograms[int(row["spectrogram_id"])]
    # midpoint between the smallest and the largest spectrogram offset
    offset_midpoint = (row["min_offset"] + row["max_offset"]) // 2
    # each spectrogram row corresponds to 2s, so halving the offset gives the row index
    first_row = int(offset_midpoint // 2)
    last_row = first_row + 300
    return extract_spectrogram_features(np.array(spectrogram[first_row:last_row, :]))

  # One feature row per training sample, filled in dataframe order.
  n_train_rows = len(train_df)
  data = np.zeros((n_train_rows, len(FEATURES)))
  for i in tqdm(range(n_train_rows), total=n_train_rows):
    data[i, :] = extract_train_spectrogram_features(train_df.iloc[i], spectrograms)

100%|██████████| 17089/17089 [00:02<00:00, 5847.91it/s]


In [10]:
if not SKIP_TRAIN:
  # Attach the extracted features as one dataframe column per entry of FEATURES.
  train_df[FEATURES] = data

# Preview the dataframe. This has to be the cell's last top-level expression:
# an expression nested inside the `if` body above is never rendered by the notebook.
train_df.head()

## Train SVM Model

In [11]:
# Number of cross-validation folds (and of per-fold checkpoints).
N_SPLITS = 5

# Class label -> integer class index used as the SVM target.
targets_dict = {
    label: class_index
    for class_index, label in enumerate(["Seizure", "LPD", "GPD", "LRDA", "GRDA", "Other"])
}

In [12]:
if not SKIP_TRAIN:
    # Folds are stratified on the consensus label and grouped by patient
    # (presumably so one patient never appears in two folds — verify in KFoldCreator).
    fold_options = {"stratify_col": "expert_consensus", "group_col": "patient_id"}
    fold_creator = KFoldCreator(seed=Constants.SEED, n_splits=N_SPLITS)
    train_folds_df = fold_creator.create_folds(df=train_df, **fold_options)

In [13]:
def fill_nan_with_mean(X, col_means=None):
    """Replace NaNs in ``X`` with per-column means.

    Args:
        X: array-like of shape (n_samples, n_features).
        col_means: optional per-column fill values. When None (the default),
            the NaN-ignoring column means of ``X`` itself are used.

    Returns:
        A new float array without NaNs. Columns for which no mean is available
        (every value is NaN, e.g. a NaN in a single-row input) are filled with
        0 instead of being left as NaN. Infinite values are mapped to large
        finite numbers by ``np.nan_to_num``.
    """
    X = np.asarray(X, dtype=float)
    if col_means is None:
        with warnings.catch_warnings():
            # nanmean warns ("Mean of empty slice") on all-NaN columns; that
            # case is handled explicitly below.
            warnings.simplefilter("ignore", category=RuntimeWarning)
            col_means = np.nanmean(X, axis=0)
    # A NaN mean would leave the NaNs in place, so fall back to 0 for it.
    col_means = np.nan_to_num(np.asarray(col_means, dtype=float))
    filled = np.where(np.isnan(X), col_means, X)
    return np.nan_to_num(filled)

In [14]:
if not SKIP_TRAIN:
    # Out-of-fold class probabilities and the matching ground-truth target rows,
    # accumulated fold by fold in the same order.
    all_oof = []
    all_true = []

    # predict_proba orders its columns by model.classes_, so every fold must have
    # been trained on the full set of class indices.
    # NOTE(review): assumes Constants.TARGETS lists the vote columns in the same
    # order as targets_dict — confirm.
    expected_classes = sorted(targets_dict.values())

    for fold in range(N_SPLITS):
        fold_train_df = train_folds_df[train_folds_df["fold"] != fold].reset_index(drop=True)
        fold_valid_df = train_folds_df[train_folds_df["fold"] == fold].reset_index(drop=True)

        print("=" * 40)
        print(f"FOLD {fold}")
        print(f"Train size: {len(fold_train_df)}, Valid size: {len(fold_valid_df)}")
        print("=" * 30)

        X_train = fold_train_df[FEATURES].values
        y_train = fold_train_df["expert_consensus"].map(targets_dict).values

        X_valid = fold_valid_df[FEATURES].values

        # NaNs are imputed per split, each with its own column means.
        X_train = fill_nan_with_mean(X_train)
        X_valid = fill_nan_with_mean(X_valid)

        # scale features: statistics are fitted on the training split only
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_valid_scaled = scaler.transform(X_valid)

        # train SVM with probability estimates
        model = SVC(
            kernel='rbf',
            C=1.0,
            gamma='scale',
            probability=True,
            random_state=Constants.SEED,
            verbose=False,
            cache_size=1000 # 1 GB
        )

        print("Training SVM...")
        model.fit(X_train_scaled, y_train)

        # Fail loudly (and before saving a checkpoint) if this fold's training
        # split missed a class: the probability columns would otherwise be
        # misaligned with the target columns.
        assert list(model.classes_) == expected_classes, (
            f"Fold {fold}: SVC was trained on classes {list(model.classes_)}, "
            f"expected {expected_classes}"
        )

        # Persist model and scaler together; inference needs both per fold.
        joblib.dump(model, models_save_path / f"fold_{fold}_model.pkl")
        joblib.dump(scaler, models_save_path / f"fold_{fold}_scaler.pkl")

        print("Predicting on validation set...")
        oof = model.predict_proba(X_valid_scaled)
        all_oof.extend(oof)

        all_true.extend(fold_valid_df[Constants.TARGETS].values)

        # Release the per-fold arrays before the next fold is materialised.
        del X_train, y_train, X_valid, X_train_scaled, X_valid_scaled, oof
        gc.collect()

    all_oof = np.array(all_oof)
    all_true = np.array(all_true)

FOLD 0
Train size: 13755, Valid size: 3334
Training SVM...
Predicting on validation set...
FOLD 1
Train size: 13151, Valid size: 3938
Training SVM...
Predicting on validation set...
FOLD 2
Train size: 13422, Valid size: 3667
Training SVM...
Predicting on validation set...
FOLD 3
Train size: 14356, Valid size: 2733
Training SVM...
Predicting on validation set...
FOLD 4
Train size: 13672, Valid size: 3417
Training SVM...
Predicting on validation set...


## CV Score

In [15]:
if not SKIP_TRAIN:
  # KLDivLoss takes log-probabilities as input and probabilities as target;
  # "batchmean" divides the summed divergence by the number of rows.
  kl_score = nn.KLDivLoss(reduction="batchmean")

  all_oof_tensor = torch.tensor(all_oof, dtype=torch.float32)
  all_true_tensor = torch.tensor(all_true, dtype=torch.float32)
  log_predictions = torch.log(all_oof_tensor)

  score = kl_score(log_predictions, all_true_tensor).item()

  print(f"OOF KL Score: {score}")

OOF KL Score: 1.2697938680648804


## Infer on Test and create Submission

In [16]:
# Collect every test spectrogram parquet file.
test_spectrograms_dir = DATA_PATH / "test_spectrograms"
test_spectrogram_files = [*test_spectrograms_dir.glob("*.parquet")]
print(f"Found {len(test_spectrogram_files)} test spectrogram files to load into memory")

# Map spectrogram id -> spectrogram values; get_spectrogram_content yields
# exactly these (id, values) pairs, one per file.
test_spectrograms = dict(map(get_spectrogram_content, tqdm(test_spectrogram_files)))

gc.collect()
print("Loaded all test spectrograms into memory")

Found 1 test spectrogram files to load into memory


100%|██████████| 1/1 [00:00<00:00, 29.41it/s]

Loaded all test spectrograms into memory





In [17]:
def extract_test_spectrogram_features(row, all_spectrograms):
  """Compute the mean-frequency features for one test row.

  Unlike the train variant, the whole spectrogram is averaged: per the
  original author's note, all test spectrograms are exactly 10 minutes long,
  so no center window has to be extracted.

  Args:
    row: test row providing "spectrogram_id".
    all_spectrograms: mapping from spectrogram id to its 2D value array.

  Returns:
    Per-frequency means over all rows of the spectrogram.
  """
  spectrogram = all_spectrograms[int(row["spectrogram_id"])]
  return extract_spectrogram_features(np.array(spectrogram[:]))

# One feature row per test sample, filled in dataframe order.
n_test_rows = len(test_df)
test_data = np.zeros((n_test_rows, len(FEATURES)))
for i in tqdm(range(n_test_rows), total=n_test_rows):
  test_data[i, :] = extract_test_spectrogram_features(test_df.iloc[i], test_spectrograms)

100%|██████████| 1/1 [00:00<00:00, 1612.57it/s]


In [18]:
# Attach the extracted test features as one dataframe column per entry of FEATURES.
# Note: this mutates test_df in place.
test_df[FEATURES] = test_data
# Preview the result (rendered because it is the cell's last expression).
test_df.head()

Unnamed: 0,spectrogram_id,eeg_id,patient_id,spec_mean_freq_0,spec_mean_freq_1,spec_mean_freq_2,spec_mean_freq_3,spec_mean_freq_4,spec_mean_freq_5,spec_mean_freq_6,...,spec_mean_freq_390,spec_mean_freq_391,spec_mean_freq_392,spec_mean_freq_393,spec_mean_freq_394,spec_mean_freq_395,spec_mean_freq_396,spec_mean_freq_397,spec_mean_freq_398,spec_mean_freq_399
0,853520,3911565283,6885,16.864132,19.120565,18.342468,13.408634,8.0575,4.890133,3.460633,...,0.088567,0.086333,0.083633,0.084067,0.081933,0.082867,0.084267,0.082633,0.083967,0.081533


In [19]:
# The imputed test features are identical for every fold, so build them once
# instead of once per fold.
X_test = fill_nan_with_mean(test_df[FEATURES].values)

# One (n_test_rows, n_classes) probability matrix per fold.
fold_preds = []

for fold in range(N_SPLITS):
  print("=" * 40)
  print(f"Predicting fold {fold}")
  print("=" * 40)

  # NOTE(security): joblib.load unpickles the files, which can execute arbitrary
  # code — only load checkpoints from a trusted source.
  scaler = joblib.load(models_save_path / f"fold_{fold}_scaler.pkl")
  model = joblib.load(models_save_path / f"fold_{fold}_model.pkl")

  # Each fold's model is paired with the scaler that was fitted alongside it.
  X_test_scaled = scaler.transform(X_test)

  fold_preds.append(model.predict_proba(X_test_scaled))

# Ensemble: average the class probabilities over all folds.
test_preds = np.mean(fold_preds, axis=0)
print(f"Test predictions shape: {test_preds.shape}")

Predicting fold 0
Predicting fold 1
Predicting fold 2
Predicting fold 3
Predicting fold 4
Test predictions shape: (1, 6)


In [20]:
# Sanity check: the fold-averaged class probabilities of every test row must
# sum to 1 (within np.allclose's default tolerances).
assert np.allclose(test_preds.sum(axis=1), 1.0)

In [21]:
# Submission layout: the eeg_id column followed by one probability column per
# entry of Constants.TARGETS.
submission = test_df[["eeg_id"]].copy()
submission[Constants.TARGETS] = test_preds

# Write the file without the dataframe index.
submission_csv_path = get_submission_csv_path()
submission.to_csv(submission_csv_path, index=False)