In [2]:
import sys
import os
import gc
import copy
import yaml
import random
import shutil
from time import time
import typing as tp
import pandas as pd
from pathlib import Path

In [16]:
class CFG:
    """Notebook-wide configuration: data locations, scratch directories, CV settings.

    Evaluating this class body has a side effect: the scratch directories
    under ``ROOT / "tmp"`` are created if they do not exist yet.
    """

    ROOT = Path.cwd()
    # INPUT = ROOT / "input"
    # OUTPUT = ROOT / "output"
    # SRC = ROOT / "src"

    # These stay plain strings (DATA keeps its trailing slash) because other
    # cells build file names by string concatenation, e.g. DATA + "train.csv".
    DATA = '/data/hms/'
    # DATA already ends with "/", so do not prepend another one here.
    TRAIN_SPEC = DATA + "train_spectrograms"
    TEST_SPEC = DATA + "test_spectrograms"

    # Scratch area for the per-label spectrogram slices written by later cells.
    TMP = ROOT / "tmp"
    TRAIN_SPEC_SPLIT = TMP / "train_spectrograms_split"
    TEST_SPEC_SPLIT = TMP / "test_spectrograms_split"
    TMP.mkdir(exist_ok=True)
    TRAIN_SPEC_SPLIT.mkdir(parents=True, exist_ok=True)
    TEST_SPEC_SPLIT.mkdir(parents=True, exist_ok=True)

    RANDOM_SEED = 1086
    # Misspelled legacy name, kept because existing cells reference it.
    RANDAM_SEED = RANDOM_SEED
    CLASSES = ["seizure_vote", "lpd_vote", "gpd_vote", "lrda_vote", "grda_vote", "other_vote"]
    N_CLASSES = len(CLASSES)
    FOLD_METHOD = 'StratifiedGroupKFold'
    FOLDS = [0, 1, 2, 3, 4]
    N_FOLDS = len(FOLDS)


cfg = CFG()

In [9]:
from sklearn.model_selection import StratifiedGroupKFold

In [14]:
train = pd.read_csv(cfg.DATA + "train.csv")
# Convert the per-class vote counts into per-row fractions by dividing each
# row by its vote total (a row whose votes sum to 0 would become NaN).
train[cfg.CLASSES] /= train[cfg.CLASSES].sum(axis=1).values[:, None]
if cfg.FOLD_METHOD == 'StratifiedGroupKFold':
    # NOTE: I used the first spectrogram_sub_id for each spectrogram_id in order to train model faster. (https://www.kaggle.com/code/ttahara/hms-hbac-resnet34d-baseline-training)
    train = train.groupby("spectrogram_id").head(1).reset_index(drop=True)
    sgkf = StratifiedGroupKFold(n_splits=cfg.N_FOLDS, shuffle=True, random_state=cfg.RANDAM_SEED)
    train["fold"] = -1

    # Stratify on the consensus label while keeping every patient inside a
    # single fold. reset_index(drop=True) above makes index labels equal to
    # row positions, so the positional val_idx from split() is valid for .loc.
    for fold_id, (_, val_idx) in enumerate(
        sgkf.split(train, y=train["expert_consensus"], groups=train["patient_id"])
    ):
        train.loc[val_idx, "fold"] = fold_id

    # to_csv does not create missing directories, so make sure the target exists.
    fold_dir = Path("./fold_dataset")
    fold_dir.mkdir(parents=True, exist_ok=True)
    train.to_csv(fold_dir / "train_folds.csv", index=False)

In [31]:
import tqdm
import numpy as np

SyntaxError: invalid syntax (555273170.py, line 2)

In [42]:
# Display the first five rows of `train` as this cell's output.
train.head(n=5)

Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,fold
0,1628180742,0,0.0,353733,0,0.0,127492639,42516,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,4
1,2277392603,0,0.0,924234,0,0.0,1978807404,30539,GPD,0.0,0.0,0.454545,0.0,0.090909,0.454545,3
2,722738444,0,0.0,999431,0,0.0,557980729,56885,LRDA,0.0,0.0625,0.0,0.875,0.0,0.0625,1
3,387987538,0,0.0,1084844,0,0.0,4099147263,4264,LRDA,0.0,0.0,0.0,1.0,0.0,0.0,4
4,2175806584,0,0.0,1219001,0,0.0,1963161945,23435,Seizure,1.0,0.0,0.0,0.0,0.0,0.0,3


In [43]:
# Cut one fixed-width window out of each spectrogram per labelled row and
# store it as <label_id>.npy in the scratch directory.
SECONDS_PER_SPEC_ROW = 2  # a label offset in seconds is halved to get a row index
WINDOW_ROWS = 300         # number of spectrogram rows kept per saved window

for spec_id, df in train.groupby("spectrogram_id"):
    spec = pd.read_parquet(cfg.TRAIN_SPEC + '/' + f"{spec_id}.parquet")

    # Zero-fill missing values, drop the first column and transpose so that the
    # remaining columns become axis 0 and the parquet rows become axis 1.
    # Original author's note: (Hz, Time) = (400, 300) -- presumably the shape
    # after the window slice below; TODO confirm the 400 frequency bins.
    spec_arr = spec.fillna(0).values[:, 1:].T.astype("float32")

    # Use an explicit 64-bit integer: plain `int` is 32-bit on some platforms
    # (Windows with NumPy < 2) and label_id values can exceed 2**31 - 1.
    offsets_and_labels = df[
        ["spectrogram_label_offset_seconds", "label_id"]
    ].astype("int64").values

    for spec_offset, label_id in offsets_and_labels:
        spec_offset = spec_offset // SECONDS_PER_SPEC_ROW
        # NOTE: if the window runs past the end of the recording, the slice is
        # silently shorter than WINDOW_ROWS.
        split_spec_arr = spec_arr[:, spec_offset: spec_offset + WINDOW_ROWS]
        np.save(cfg.TRAIN_SPEC_SPLIT / f"{label_id}.npy", split_spec_arr)