In [11]:
EXISTING_CHECKPOINT_KAGGLE_DATASET_ID = "hsm-models"
DATA_PREPARATION_VOTE_METHOD = "max_vote_window" # "max_vote_window" or "sum_and_normalize". Decides how to aggregate the predictions of the overlapping windows
DATA_SOURCE = "cv" # "cv" or "test"

In [12]:
import os
import sys

from tqdm import tqdm
import torch

import numpy as np
import pandas as pd
import torch.nn.functional as F
from torch.utils.data import DataLoader


if bool(os.environ.get("KAGGLE_URL_BASE", "")):
  import sys
  # running on kaggle
  sys.path.insert(0, "/kaggle/input/hsm-source-files")
else:
  # running locally
  sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "..", "..", "..")))

from src.utils.utils import get_raw_data_dir, get_processed_data_dir, get_submission_csv_path, get_models_save_path, set_seeds
from src.utils.constants import Constants

from src.datasets.multi_spectrogram import MultiSpectrogramDataset
from src.datasets.eeg_dataset_montage import EEGDatasetMontage
from src.models.base_cnn import BaseCNN
from src.models.gru_convolution_attention import NodeAttentionModel
from src.utils.eeg_spectrogram_creator import EEGSpectrogramGenerator
from src.datasets.eeg_processor import EEGDataProcessor

set_seeds(Constants.SEED)

In [13]:
def create_eeg_spectrograms(eeg_spectrograms_path, raw_eegs_path, data_df):
  os.makedirs(eeg_spectrograms_path, exist_ok=True)
  existing_specs = len(list(eeg_spectrograms_path.glob("*.npy")))

  eeg_ids = data_df["eeg_id"].unique()
  if existing_specs >= len(eeg_ids):
    print("EEG Spectrograms already created.")
    return
  else:
    spectrogram_creator = EEGSpectrogramGenerator(["cwt"])
    for eeg_id in tqdm(eeg_ids, desc="Generating EEG Spectrograms"):
        eeg_path = os.path.join(raw_eegs_path, f"{eeg_id}.parquet")
        eeg = pd.read_parquet(eeg_path)
        spectrograms = spectrogram_creator.generate(eeg)
        np.save(eeg_spectrograms_path / f"{eeg_id}.npy", spectrograms['cwt']) 

In [14]:
DATA_PATH = get_raw_data_dir()

if DATA_SOURCE == "cv":
  EEG_SPECT_PATH = get_processed_data_dir() / "eeg_spectrograms" / "train" / "cwt"
  processor = EEGDataProcessor(raw_data_path=DATA_PATH, processed_data_path=get_processed_data_dir())
  data_df = processor.process_data(vote_method=DATA_PREPARATION_VOTE_METHOD, skip_parquet=True)
else:
  EEG_SPECT_PATH = get_processed_data_dir() / "eeg_spectrograms" / "test" / "cwt"
  data_df = pd.read_csv(DATA_PATH / "test.csv")
  create_eeg_spectrograms(EEG_SPECT_PATH, DATA_PATH / "test_eegs", data_df)

data_df.head()

Processor initialized.
Raw data path: '/home/david/git/aicomp/data'
Processed data path: '/home/david/git/aicomp/data/processed'
Starting EEG Data Processing Pipeline
Skipping Parquet file creation as requested.
Using 'max_vote_window' vote aggregation strategy.

Processed train data saved to '/home/david/git/aicomp/data/processed/train_processed.csv'.
Shape of the final dataframe: (17089, 12)

Pipeline finished successfully!


Unnamed: 0,eeg_id,spectrogram_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,min_offset,max_offset
0,568657,789577333,20654,Other,0.0,0.0,0.25,0.0,0.166667,0.583333,0.0,16.0
1,582999,1552638400,20230,LPD,0.0,0.857143,0.0,0.071429,0.0,0.071429,0.0,38.0
2,642382,14960202,5955,Other,0.0,0.0,0.0,0.0,0.0,1.0,1008.0,1032.0
3,751790,618728447,38549,GPD,0.0,0.0,1.0,0.0,0.0,0.0,908.0,908.0
4,778705,52296320,40955,Other,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [15]:
multi_spect_config = {
      "batch_size": 64,
      "num_workers": 8,
      "pretrained_model_name": "inception_v3",
      "target_size": 6,
      "img_size": (128, 256), 
      "dropout_p": 0.1,
      "image_alignment": "stacked",
      "augmentations": []
}

multi_spect_dataset = test_dataset = MultiSpectrogramDataset(
  df=data_df, 
  targets=Constants.TARGETS, 
  data_path=DATA_PATH, 
  img_size=multi_spect_config["img_size"], 
  eeg_spec_path=EEG_SPECT_PATH, 
  mode='test' if DATA_SOURCE == "test" else 'train',
  apply_augmentations=multi_spect_config["augmentations"],
)

multi_spect_model = BaseCNN(
  multi_spect_config["pretrained_model_name"],
  pretrained=False,
  num_classes=multi_spect_config["target_size"],
  dropout_p=multi_spect_config["dropout_p"],
  image_alignment=multi_spect_config["image_alignment"]
)

In [16]:
gru_conv_montage_config = {
  "batch_size": 32,
  "num_workers": 8,
  "num_nodes": 20,
  "node_embed_size": 256,
  "hidden_size": 256,
  "num_layers": 1,
  "target_size": 6,
  "num_cnn_blocks": 3,
  "dropout": 0.4,
  "downsample_factor": 4,
  "augmentations": []
}

gru_conv_montage_dataset = EEGDatasetMontage(
  df=data_df,
  data_path=DATA_PATH,
  mode='test' if DATA_SOURCE == "test" else 'val',
  downsample_factor=gru_conv_montage_config["downsample_factor"],
  augmentations=gru_conv_montage_config["augmentations"]
  )

gru_conv_montage_model = NodeAttentionModel(
  num_nodes=gru_conv_montage_config["num_nodes"],
  node_embed_size=gru_conv_montage_config["node_embed_size"],
  hidden_size=gru_conv_montage_config["hidden_size"],
  num_layers=gru_conv_montage_config["num_layers"],
  num_classes=gru_conv_montage_config["target_size"],
  num_cnn_blocks=gru_conv_montage_config["num_cnn_blocks"],
  dropout=gru_conv_montage_config["dropout"]
)

In [17]:
model_configs = [
  {
    "identifier": "multi-spect-cnn",
    "config": multi_spect_config,
    "dataloader": DataLoader(multi_spect_dataset, batch_size=multi_spect_config["batch_size"], shuffle=False, num_workers=multi_spect_config["num_workers"]),
    "model": multi_spect_model,
    "model_checkpoints_dir": get_models_save_path(EXISTING_CHECKPOINT_KAGGLE_DATASET_ID) / "multi_spec_cnn" / "inception_v3" / DATA_PREPARATION_VOTE_METHOD,

  },
  # {
  #   "identifier": "gru_conv_montage",
  #   "config": gru_conv_montage_config,
  #   "dataloader": DataLoader(gru_conv_montage_dataset, batch_size=gru_conv_montage_config["batch_size"], shuffle=False, num_workers=gru_conv_montage_config["num_workers"]),
  #   "model": gru_conv_montage_model,
  #   "model_checkpoints_dir": get_models_save_path(EXISTING_CHECKPOINT_KAGGLE_DATASET_ID) / "multi_spec_cnn" / "inception_v3" / DATA_PREPARATION_VOTE_METHOD,
  # }
]

In [18]:
def get_ensemble_score(all_model_predictions):
  # simple average ensemble
  return np.mean(all_model_predictions, axis=0)

In [23]:
def run_ensemble_inference():
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  print(f"Using device: {device}")

  all_model_predictions = []  # Store predictions from each model architecture

  for config in model_configs:
    print(f"\n========== Loading {config['identifier']} ==========")

    model_checkpoints_dir = config["model_checkpoints_dir"]
    model_paths = [os.path.join(model_checkpoints_dir, f'best_model_fold{i}.pth') for i in range(5)]
    data_loader = config["dataloader"]

    # Get predictions from all folds for this model
    fold_predictions = []
    for i, path in enumerate(model_paths):
      print(f"\n========== Inferencing with Fold {i} Model ==========")
      if not os.path.exists(path):
          print(f"Model file not found: {path}. Skipping this fold.")
          continue
      
      model = config['model']
      model.load_state_dict(torch.load(path, map_location=device))
      model.to(device)
      model.eval()

      current_fold_preds = []
      with torch.no_grad():
          for x in tqdm(data_loader, desc=f"{config['identifier']} Fold {i}"):
              # if the dataloader returns a tuple (inputs, targets), unpack it
              if isinstance(x, (list, tuple)):
                  x = x[0]

              x = x.to(device)
              outputs = model(x)
              probs = F.softmax(outputs, dim=1).cpu().numpy()
              current_fold_preds.append(probs)
            
      fold_predictions.append(np.concatenate(current_fold_preds))

      # Average across folds for this model
      model_avg = np.mean(fold_predictions, axis=0)
      all_model_predictions.append(model_avg)
      print(f"Completed {config['identifier']}: {model_avg.shape}")

  # Simple average across all models
  print("\n========== Combining Model Predictions ==========")
  final_predictions = get_ensemble_score(all_model_predictions)
    
  submission = pd.DataFrame({"eeg_id": data_df["eeg_id"]})
  submission[Constants.TARGETS] = final_predictions
  submission.to_csv(get_submission_csv_path(), index=False)

  return submission

In [24]:
submission = run_ensemble_inference()

Using device: cuda




multi-spect-cnn Fold 0: 100%|██████████| 268/268 [02:21<00:00,  1.90it/s]


Completed multi-spect-cnn: (17089, 6)



multi-spect-cnn Fold 1: 100%|██████████| 268/268 [02:20<00:00,  1.90it/s]


Completed multi-spect-cnn: (17089, 6)



multi-spect-cnn Fold 2: 100%|██████████| 268/268 [02:16<00:00,  1.97it/s]


Completed multi-spect-cnn: (17089, 6)



multi-spect-cnn Fold 3: 100%|██████████| 268/268 [02:18<00:00,  1.94it/s]


Completed multi-spect-cnn: (17089, 6)



multi-spect-cnn Fold 4: 100%|██████████| 268/268 [02:18<00:00,  1.94it/s]


Completed multi-spect-cnn: (17089, 6)



In [28]:
# calculate KL divergence score if using cv data
if DATA_SOURCE == "cv":
  kl_loss_fn = torch.nn.KLDivLoss(reduction='batchmean')
  true_labels = torch.tensor(data_df[Constants.TARGETS].values, dtype=torch.float32)
  pred_labels = torch.tensor(submission[Constants.TARGETS].values, dtype=torch.float32)
  kl_score = kl_loss_fn(torch.log(pred_labels + 1e-8), true_labels).item()
  print(f"KL Divergence Score on CV Data: {kl_score}")

KL Divergence Score on CV Data: 0.42844244837760925
