# Baselines

In [46]:
# Which baseline's test predictions are written to the submission CSV: "mean" or "uniform".
BASELINE_TO_SAVE = "uniform"
# Passed to the data preparation step as `vote_method`: "max_vote_window" or "sum_and_normalize".
# Decides how the votes of overlapping windows are aggregated.
DATA_PREPARATION_VOTE_METHOD = "sum_and_normalize"

# Fail fast on a typo: the save cells only react to these two values, so any other
# setting would let the notebook finish without ever writing a submission file.
if BASELINE_TO_SAVE not in ("mean", "uniform"):
  raise ValueError(f"BASELINE_TO_SAVE must be 'mean' or 'uniform', got {BASELINE_TO_SAVE!r}")

In [47]:
import os

# Environment bootstrap: a non-empty KAGGLE_URL_BASE environment variable is treated
# as "running on Kaggle"; anything else is treated as a local run.
if bool(os.environ.get("KAGGLE_URL_BASE", "")):
  import sys
  # running on kaggle: put the uploaded source dataset first on the import path
  # (presumably so the `src.*` imports below resolve from it -- confirm dataset layout)
  sys.path.insert(0, "/kaggle/input/hsm-source-files")
else:
  # running locally: imported for its side effects only
  # NOTE(review): presumably notebook_setup configures the import path -- confirm
  import notebook_setup

import torch.nn as nn
import torch
import pandas as pd

from src.utils.utils import get_raw_data_dir, get_processed_data_dir, get_submission_csv_path, running_in_kaggle
from src.utils.constants import Constants
from src.datasets.eeg_processor import EEGDataProcessor

In [48]:
# Locations of the raw files and of the preprocessed training table.
DATA_PATH = get_raw_data_dir()
PROCESSED_TRAIN_DATA_PATH = get_processed_data_dir() / "train_processed.csv"

if not running_in_kaggle():
  # Local run: the preprocessed training table already exists on disk, just load it.
  train_df = pd.read_csv(PROCESSED_TRAIN_DATA_PATH)
else:
  # Kaggle run: the preprocessed table is not available, so build it from the raw data.
  processor = EEGDataProcessor(
    raw_data_path=DATA_PATH,
    processed_data_path=get_processed_data_dir(),
  )
  train_df = processor.process_data(
    vote_method=DATA_PREPARATION_VOTE_METHOD,
    skip_npy=True,
  )

test_df = pd.read_csv(DATA_PATH / "test.csv")

# Metric used to score every baseline below.
kl_score = nn.KLDivLoss(reduction="batchmean")

In [49]:
# Preview the first rows of the training table.
train_df.head()

Unnamed: 0,eeg_id,spectrogram_id,min_offset,max_offset,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,568657,789577333,0.0,16.0,20654,Other,0.0,0.0,0.25,0.0,0.166667,0.583333
1,582999,1552638400,0.0,38.0,20230,LPD,0.0,0.857143,0.0,0.071429,0.0,0.071429
2,642382,14960202,1008.0,1032.0,5955,Other,0.0,0.0,0.0,0.0,0.0,1.0
3,751790,618728447,908.0,908.0,38549,GPD,0.0,0.0,1.0,0.0,0.0,0.0
4,778705,52296320,0.0,0.0,40955,Other,0.0,0.0,0.0,0.0,0.0,1.0


In [50]:
# Preview the first rows of the test table.
test_df.head()

Unnamed: 0,spectrogram_id,eeg_id,patient_id
0,853520,3911565283,6885


## Uniform Baseline

This baseline assigns equal probabilities (1/6 each) to all six possible classes.

In [51]:
# Every target class receives the same probability: 1 / (number of target classes).
num_classes = len(Constants.TARGETS)
uniform_predictions = [1 / num_classes for _ in range(num_classes)]
uniform_predictions

[0.16666666666666666,
 0.16666666666666666,
 0.16666666666666666,
 0.16666666666666666,
 0.16666666666666666,
 0.16666666666666666]

In [52]:
# One uniform probability row per training sample, with the target columns named up front.
uniform_train_predictions = pd.DataFrame(
  [uniform_predictions] * len(train_df),
  columns=Constants.TARGETS,
)
# Insert the ids positionally (as a plain array). Inserting the Series itself would align
# on index labels, which silently yields NaN / misplaced ids whenever train_df does not
# carry a default 0..n-1 index (e.g. if the processing step returns a filtered frame).
uniform_train_predictions.insert(0, Constants.EEG_ID_COL, train_df[Constants.EEG_ID_COL].to_numpy())

uniform_train_predictions

Unnamed: 0,eeg_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,568657,0.166667,0.166667,0.166667,0.166667,0.166667,0.166667
1,582999,0.166667,0.166667,0.166667,0.166667,0.166667,0.166667
2,642382,0.166667,0.166667,0.166667,0.166667,0.166667,0.166667
3,751790,0.166667,0.166667,0.166667,0.166667,0.166667,0.166667
4,778705,0.166667,0.166667,0.166667,0.166667,0.166667,0.166667
...,...,...,...,...,...,...,...
17084,4293354003,0.166667,0.166667,0.166667,0.166667,0.166667,0.166667
17085,4293843368,0.166667,0.166667,0.166667,0.166667,0.166667,0.166667
17086,4294455489,0.166667,0.166667,0.166667,0.166667,0.166667,0.166667
17087,4294858825,0.166667,0.166667,0.166667,0.166667,0.166667,0.166667


In [53]:
# Score the uniform baseline against the training targets.
# KLDivLoss takes log-probabilities as input and plain probabilities as target.
preds = torch.tensor(uniform_train_predictions[Constants.TARGETS].to_numpy())
targets = torch.tensor(train_df[Constants.TARGETS].to_numpy())
score = kl_score(torch.log(preds), targets).item()

print(f"Uniform prediction KL score on entire train set: {score:.6f}")

Uniform prediction KL score on entire train set: 1.456325


In [54]:
# One uniform probability row per row of the test table, with the id column placed first.
uniform_test_predictions = pd.DataFrame([uniform_predictions for _ in range(len(test_df))])
uniform_test_predictions.insert(
  loc=0,
  column=Constants.EEG_ID_COL,
  value=test_df[Constants.EEG_ID_COL],
)
uniform_test_predictions.columns = [Constants.EEG_ID_COL, *Constants.TARGETS]

uniform_test_predictions

Unnamed: 0,eeg_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,3911565283,0.166667,0.166667,0.166667,0.166667,0.166667,0.166667


In [55]:
# Write the uniform baseline to the submission file only when it is the selected baseline.
if BASELINE_TO_SAVE == "uniform":
  submission_path = get_submission_csv_path()
  uniform_test_predictions.to_csv(submission_path, index=False)

## Mean Baseline

This baseline predicts, for every instance, the per-class mean of the training labels (the average vote distribution over the training set).

In [56]:
# Column-wise average of the training target columns.
mean_predictions = train_df.loc[:, Constants.TARGETS].mean(axis=0)
mean_predictions

seizure_vote    0.152810
lpd_vote        0.142456
gpd_vote        0.104062
lrda_vote       0.065407
grda_vote       0.114851
other_vote      0.420413
dtype: float64

In [57]:
# Broadcast the per-class means to one row per training sample. A dict of scalars plus an
# explicit index lets pandas do the broadcasting, instead of assembling the frame from
# len(train_df) duplicated Series objects (same values, far less construction work).
mean_train_predictions = pd.DataFrame(
  mean_predictions.to_dict(),
  index=pd.RangeIndex(len(train_df)),
)
# Insert the ids positionally (as a plain array). Inserting the Series itself would align
# on index labels, which silently yields NaN / misplaced ids whenever train_df does not
# carry a default 0..n-1 index.
mean_train_predictions.insert(0, Constants.EEG_ID_COL, train_df[Constants.EEG_ID_COL].to_numpy())
mean_train_predictions.columns = [Constants.EEG_ID_COL] + Constants.TARGETS
mean_train_predictions

Unnamed: 0,eeg_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,568657,0.15281,0.142456,0.104062,0.065407,0.114851,0.420413
1,582999,0.15281,0.142456,0.104062,0.065407,0.114851,0.420413
2,642382,0.15281,0.142456,0.104062,0.065407,0.114851,0.420413
3,751790,0.15281,0.142456,0.104062,0.065407,0.114851,0.420413
4,778705,0.15281,0.142456,0.104062,0.065407,0.114851,0.420413
...,...,...,...,...,...,...,...
17084,4293354003,0.15281,0.142456,0.104062,0.065407,0.114851,0.420413
17085,4293843368,0.15281,0.142456,0.104062,0.065407,0.114851,0.420413
17086,4294455489,0.15281,0.142456,0.104062,0.065407,0.114851,0.420413
17087,4294858825,0.15281,0.142456,0.104062,0.065407,0.114851,0.420413


In [58]:
# Score the mean baseline against the training targets.
# KLDivLoss takes log-probabilities as input and plain probabilities as target.
preds = torch.tensor(mean_train_predictions[Constants.TARGETS].to_numpy())
targets = torch.tensor(train_df[Constants.TARGETS].to_numpy())
score = kl_score(torch.log(preds), targets).item()

print(f"Mean prediction KL score on entire train set: {score:.6f}")

Mean prediction KL score on entire train set: 1.255924


In [59]:
# One copy of the training-mean row per row of the test table, with the id column placed first.
mean_test_predictions = pd.DataFrame([mean_predictions for _ in range(len(test_df))])
mean_test_predictions.insert(
  loc=0,
  column=Constants.EEG_ID_COL,
  value=test_df[Constants.EEG_ID_COL],
)
mean_test_predictions.columns = [Constants.EEG_ID_COL, *Constants.TARGETS]
mean_test_predictions

Unnamed: 0,eeg_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,3911565283,0.15281,0.142456,0.104062,0.065407,0.114851,0.420413


In [60]:
# Write the mean baseline to the submission file only when it is the selected baseline.
if BASELINE_TO_SAVE == "mean":
  submission_path = get_submission_csv_path()
  mean_test_predictions.to_csv(submission_path, index=False)