# Create Folds

This notebook creates a stratified, grouped k-fold split of the dataset and saves the dataset, with a fold assignment for each row, as a new CSV file.

Grouped k-fold keeps all rows from the same patient in a single fold, so no patient appears in both the training and validation sets.
Furthermore, each split is stratified based on the `expert_consensus` column to ensure a balanced distribution of classes across folds (or, as balanced as possible given the constraints of grouping by patient).

In [None]:
import sys
import os

# Make the project's `src` package importable before the `src.*` imports below.
# A non-empty KAGGLE_URL_BASE environment variable is treated as "running on Kaggle".
if os.environ.get("KAGGLE_URL_BASE"):
    # Running on Kaggle: the sources are read from this input dataset directory.
    src_root = "/kaggle/input/hsm-source-files"
else:
    # Running locally: use the parent of the current working directory.
    src_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Prepend so this location takes precedence over anything already on sys.path.
sys.path.insert(0, src_root)

import pandas as pd

from pathlib import Path
from src.utils.utils import get_raw_data_dir, get_processed_data_dir
from src.utils.k_folds_creator import KFoldCreator

ModuleNotFoundError: No module named 'src'

In [2]:
# Cross-validation settings: number of folds and the random seed.
SEED = 42
SPLITS_N = 5

# Data locations: the directory returned by get_raw_data_dir(), the processed
# training table read by this notebook, and the CSV this notebook writes.
DATA_PATH = get_raw_data_dir()
PROCESSED_TRAIN_DATA_PATH = get_processed_data_dir() / "train_processed.csv"
OUTPUT_PATH = get_processed_data_dir() / "train_folds.csv"

In [3]:
# Read the processed training table and preview its first five rows.
train_df = pd.read_csv(filepath_or_buffer=PROCESSED_TRAIN_DATA_PATH)
train_df.head(n=5)

Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,568657,0,0.0,789577333,0,0.0,1825637311,20654,Other,0.0,0.0,0.25,0.0,0.166667,0.583333
1,582999,0,0.0,1552638400,0,0.0,1722186807,20230,LPD,0.0,0.857143,0.0,0.071429,0.0,0.071429
2,642382,0,0.0,14960202,12,1008.0,3254468733,5955,Other,0.0,0.0,0.0,0.0,0.0,1.0
3,751790,0,0.0,618728447,4,908.0,2898467035,38549,GPD,0.0,0.0,1.0,0.0,0.0,0.0
4,778705,0,0.0,52296320,0,0.0,3255875127,40955,Other,0.0,0.0,0.0,0.0,0.0,1.0


In [4]:
# Build the fold assignment: stratify on the consensus label, group on the patient.
# The returned frame carries a `fold` column, which the checks below rely on.
fold_creator = KFoldCreator(n_splits=SPLITS_N, seed=SEED)
train_folds_df = fold_creator.create_folds(
    df=train_df, stratify_col="expert_consensus", group_col="patient_id"
)

# Verify the grouping guarantee before anything is saved: every patient must
# land in exactly one fold, otherwise validation data would leak into training.
folds_per_patient = train_folds_df.groupby("patient_id")["fold"].nunique()
assert (folds_per_patient <= 1).all(), (
    "Patient leakage: at least one patient_id is assigned to more than one fold"
)

# Row count per (fold, class), transposed so the table fits on screen.
train_folds_df.groupby(["fold", "expert_consensus"])[["eeg_id"]].count().T

fold,0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4
expert_consensus,GPD,GRDA,LPD,LRDA,Other,Seizure,GPD,GRDA,LPD,LRDA,Other,Seizure,GPD,GRDA,LPD,LRDA,Other,Seizure,GPD,GRDA,LPD,LRDA,Other,Seizure,GPD,GRDA,LPD,LRDA,Other,Seizure
eeg_id,436,453,733,271,1541,633,583,481,395,169,1611,419,229,368,583,156,1549,496,316,279,334,216,1023,457,248,263,541,126,1548,632


In [6]:
# Write the fold-annotated table without the DataFrame index, then report where it went.
train_folds_df.to_csv(path_or_buf=OUTPUT_PATH, index=False)
print(f"Train folds data saved to {OUTPUT_PATH}")

Train folds data saved to /home/david/git/aicomp/data/processed/train_folds.csv
