In [4]:
import os
import sys
import warnings
import gc
import pathlib

# Put the directory holding the project's `src` package at the front of
# sys.path; this has to run before the `from src...` imports further down.
if os.environ.get("KAGGLE_URL_BASE", ""):
  # running on kaggle
  sys.path.insert(0, "/kaggle/input/hsm-source-files")
else:
  # running locally: use the parent of the current working directory
  sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "..")))

import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

import torch
import joblib
import librosa

from src.utils.utils import get_raw_data_dir, get_processed_data_dir, get_submission_csv_path, set_seeds, get_models_save_path, running_in_kaggle
from src.utils.constants import Constants
from src.datasets.eeg_processor import EEGDataProcessor
from src.utils.k_folds_creator import KFoldCreator

from tqdm import tqdm

# Fix the seed through the project helper from src.utils.utils.
# NOTE(review): presumably seeds every RNG the pipeline uses - confirm in the helper.
set_seeds(42)

# Ignore warnings of category pd.errors.PerformanceWarning.
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)

Reference notebook for building spectrograms from EEG signals: https://www.kaggle.com/code/cdeotte/how-to-make-spectrogram-from-eeg/notebook

In [9]:
# Raw data directory, resolved by the project helper.
DATA_PATH = get_raw_data_dir()
print(f"Data path: {DATA_PATH}")

# Spectrograms are written below <processed data dir>/eeg_spectrograms/train.
SAVE_PATH = get_processed_data_dir() / "eeg_spectrograms"
TRAIN_SAVE_PATH = SAVE_PATH / "train"
os.makedirs(TRAIN_SAVE_PATH, exist_ok=True)

# Distinct eeg_id values from train.csv, so each recording is processed once.
train_df = pd.read_csv(DATA_PATH / "train.csv")
train_eeg_ids = pd.unique(train_df["eeg_id"])

Data path: /home/david/git/aicomp/data


In [12]:
# Electrode chains keyed by chain name. Within a chain, each electrode is
# paired with the one that follows it when signal differences are computed.
_CHAIN_ELECTRODES = {
  "LL": ["Fp1", "F7", "T3", "T5", "O1"],
  "LP": ["Fp1", "F3", "C3", "P3", "O1"],
  "RP": ["Fp2", "F8", "T4", "T6", "O2"],
  "RR": ["Fp2", "F4", "C4", "P4", "O2"],
  "CZ": ["Fz", "Cz", "Pz"],
}

# Parallel lists: ELECTRODES_PER_CHAIN[i] belongs to CHAIN_NAMES[i].
CHAIN_NAMES = list(_CHAIN_ELECTRODES.keys())
ELECTRODES_PER_CHAIN = list(_CHAIN_ELECTRODES.values())

The spectrograms will be fed into a CNN image model, which needs input dimensions that are multiples of 32. I therefore chose 128x256, because it is close to the 100x300 dimensions of Kaggle's spectrograms.

In [20]:
def load_middle_50_seconds_of_eeg(data_path, eeg_id):
  """Read one EEG parquet file and return its central 50-second window.

  Args:
    data_path: Directory containing the EEG files; must support the `/`
      operator (e.g. a pathlib.Path).
    eeg_id: Recording identifier; the file read is `<eeg_id>.parquet`.

  Returns:
    DataFrame slice holding the middle 10,000 rows of the recording. A
    recording shorter than that is returned whole.
  """
  window_samples = 10_000  # 10,000 samples = 50 seconds at 200 Hz

  eeg = pd.read_parquet(data_path / f"{eeg_id}.parquet")
  # Clamp at 0: a negative start would make iloc count from the end and
  # return only the tail of a short recording.
  middle = max((len(eeg) - window_samples) // 2, 0)
  return eeg.iloc[middle:middle + window_samples]

def fill_nan_with_mean(X):
  """Return a copy of X with each NaN replaced by the mean of its column.

  Means are taken along axis 0 and ignore NaNs. A column made up entirely of
  NaNs has no mean and is filled with 0 instead. As with any call to
  np.nan_to_num, infinities are also replaced by large finite values.

  Args:
    X: Array-like of numbers; a 1-D input is treated as a single column.

  Returns:
    New array with the same shape as X; the input is not modified.
  """
  with warnings.catch_warnings():
    # nanmean warns ("Mean of empty slice") for all-NaN columns; that case is
    # handled explicitly right below.
    warnings.simplefilter("ignore", category=RuntimeWarning)
    col_means = np.nanmean(X, axis=0)

  # Fall back to 0 where no mean exists, so no NaN survives the fill.
  col_means = np.where(np.isnan(col_means), 0.0, col_means)
  return np.nan_to_num(X, nan=col_means)

def create_mel_spectrogram(pair_difference):
  """Turn one 1-D signal into a scaled log-mel spectrogram image.

  Args:
    pair_difference: 1-D signal (difference of two electrode signals).
      The hop length is len(pair_difference) // 256, so the signal needs at
      least 256 samples.

  Returns:
    float32 array with 128 rows (mel bands) and a width trimmed down to a
    multiple of 32 columns.
  """
  # raw spectrogram
  mel_spec = librosa.feature.melspectrogram(
    y=pair_difference,
    sr=200, # sampling frequency is 200Hz
    hop_length=len(pair_difference)//256, # produces image with width = len(x)/hop_length
    n_fft=1024, # controls vertical resolution and quality of spectrogram
    n_mels=128, # number of mel bands, corresponds to height of spectrogram
    fmin=0, # min frequency
    fmax=20, # max frequency
    win_length=128) # window size, controls horizontal resolution and quality of spectrogram

  # log transform and cut the width down to a multiple of 32
  width = (mel_spec.shape[1]//32)*32
  mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max).astype(np.float32)[:,:width]

  # standardize to [-1,1] (power_to_db with ref=np.max gives values <= 0 dB;
  # the lower bound relies on librosa's default 80 dB floor)
  mel_spec_db = (mel_spec_db+40)/40

  # Hand the image back to the caller; without this the function yields None.
  return mel_spec_db


def spectrogram_from_eeg(data_path, eeg_id):
  """Build the multi-chain spectrogram image for one EEG recording.

  For every electrode chain, the signals of neighbouring electrodes are
  subtracted, each difference is converted to a mel spectrogram, and the
  spectrograms of the chain are averaged into one image channel.

  Args:
    data_path: Directory containing the EEG parquet files.
    eeg_id: Identifier of the recording to process.

  Returns:
    float32 array of shape (128, 256, len(CHAIN_NAMES)).
  """
  eeg = load_middle_50_seconds_of_eeg(data_path, eeg_id)

  # output image has height=128, width=256, channels=5 (for 5 chains)
  spectrogram = np.zeros((128,256,len(CHAIN_NAMES)), dtype="float32")
  for chain_index in range(len(CHAIN_NAMES)):
    electrodes = ELECTRODES_PER_CHAIN[chain_index]
    pair_difference_num = len(electrodes) - 1

    for first, second in zip(electrodes[:-1], electrodes[1:]):
      # Work on plain ndarrays: the spectrogram step expects an array, not a Series.
      pair_difference = eeg[first].to_numpy() - eeg[second].to_numpy()

      # NaN filling belongs on the signal itself, not on the electrode names.
      pair_difference = fill_nan_with_mean(pair_difference)
      # A signal that is NaN throughout has no mean to fill with; zero it.
      pair_difference = np.nan_to_num(pair_difference, nan=0.0)

      mel_spec_db = create_mel_spectrogram(pair_difference)
      spectrogram[:,:,chain_index] += mel_spec_db

    # average the spectrogram differences for this chain
    spectrogram[:,:,chain_index] /= pair_difference_num

  return spectrogram

In [21]:
# Build one spectrogram per training EEG recording and store it as
# <eeg_id>.npy in TRAIN_SAVE_PATH.
train_eeg_dir = DATA_PATH / "train_eegs"  # loop-invariant, so built once
for eeg_id in tqdm(train_eeg_ids):
  spectrogram = spectrogram_from_eeg(train_eeg_dir, eeg_id)
  out_path = TRAIN_SAVE_PATH / f"{eeg_id}.npy"
  np.save(out_path, spectrogram)

  1%|          | 97/17089 [00:15<44:39,  6.34it/s]  


KeyboardInterrupt: 