In [None]:
import pandas as pd, numpy as np, os
import matplotlib.pyplot as plt
import os
from collections import Counter
import glob

# Load the training metadata table, then show its dimensions and leading rows.
train = pd.read_csv('../data/train.csv')
print(train.shape)
display(train.head())


(106800, 15)


Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,1628180742,0,0.0,353733,0,0.0,127492639,42516,Seizure,3,0,0,0,0,0
1,1628180742,1,6.0,353733,1,6.0,3887563113,42516,Seizure,3,0,0,0,0,0
2,1628180742,2,8.0,353733,2,8.0,1142670488,42516,Seizure,3,0,0,0,0,0
3,1628180742,3,18.0,353733,3,18.0,2718991173,42516,Seizure,3,0,0,0,0,0
4,1628180742,4,24.0,353733,4,24.0,3080632009,42516,Seizure,3,0,0,0,0,0


In [2]:
# Read a single recording only to discover the channel (column) names.
df = pd.read_parquet('../data/train_eegs/1000913311.parquet')
FEATS = df.columns
print(f'There are {len(FEATS)} raw eeg features')
print(list(FEATS))

# Channel name -> positional index of that channel in FEATS.
FEAT2IDX = {name: idx for idx, name in enumerate(FEATS)}


There are 20 raw eeg features
['Fp1', 'F3', 'C3', 'P3', 'F7', 'T3', 'T5', 'O1', 'Fz', 'Cz', 'Pz', 'Fp2', 'F4', 'C4', 'P4', 'F8', 'T4', 'T6', 'O2', 'EKG']


# Extract Middle 50 Seconds and Convert to numpy

In [3]:
def eeg_from_parquet(parquet_path, display=False):
    """Load one EEG parquet file and return its middle 10,000 rows as an array.

    Only the columns listed in the module-level ``FEATS`` are read. Within the
    extracted window, NaNs in a channel are replaced by that channel's mean
    (``np.nan_to_num`` also maps +/-inf to large finite values); a channel that
    is entirely NaN is set to zero.

    Parameters
    ----------
    parquet_path : str or os.PathLike
        Location of the parquet file to read.
    display : bool, default False
        When true, also draw every channel on one matplotlib figure, each
        trace shifted vertically so they do not overlap. Note that inside this
        function the name shadows IPython's ``display``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(10_000, len(FEATS))`` with numpy's default float
        dtype; column ``j`` holds channel ``FEATS[j]``.

    Raises
    ------
    ValueError
        If the file contains fewer than 10,000 rows.
    """
    n_samples = 10_000

    # EXTRACT MIDDLE 50 SECONDS
    eeg = pd.read_parquet(parquet_path, columns=FEATS)
    rows = len(eeg)
    if rows < n_samples:
        # A negative start would make iloc slice from the end of the frame;
        # a 1-row result would then be silently broadcast over a whole column.
        raise ValueError(
            f'{parquet_path} has only {rows} rows; '
            f'at least {n_samples} are required'
        )
    start = (rows - n_samples) // 2
    eeg = eeg.iloc[start:start + n_samples]

    if display:
        plt.figure(figsize=(10, 5))
    # Running vertical shift used only when plotting.
    plot_offset = 0

    data = np.zeros((n_samples, len(FEATS)))
    for j, col in enumerate(FEATS):

        x = eeg[col].values.astype('float32')
        if np.isnan(x).all():
            # Nothing to average; checking first avoids nanmean's
            # "Mean of empty slice" RuntimeWarning.
            x[:] = 0
        else:
            x = np.nan_to_num(x, nan=np.nanmean(x))

        data[:, j] = x

        if display:
            # Stack traces downward: shift by this trace's max, then by its min.
            if j != 0:
                plot_offset += x.max()
            plt.plot(range(n_samples), x - plot_offset, label=col)
            plot_offset -= x.min()

    if display:
        plt.legend()
        # File name up to its first dot, e.g. '.../123.parquet' -> '123'.
        name = os.path.basename(os.fspath(parquet_path)).split('.')[0]
        plt.title(f'EEG {name}', size=16)
        plt.show()

    return data

In [None]:
import numpy as np
import os

def eeg_generator(eeg_ids, path):
    """Generator to read Parquet files one by one.

    Parameters
    ----------
    eeg_ids : iterable
        Identifiers; each one is expected to have a file ``<eeg_id>.parquet``
        inside ``path``.
    path : str or os.PathLike
        Directory holding the parquet files. A trailing separator is optional.

    Yields
    ------
    tuple
        ``(eeg_id, data)`` where ``data`` is whatever ``eeg_from_parquet``
        returns for that file.
    """
    for eeg_id in eeg_ids:
        # os.path.join inserts the separator only when `path` lacks one.
        file_path = os.path.join(path, f'{eeg_id}.parquet')
        yield eeg_id, eeg_from_parquet(file_path)

# Conversion settings. EEG_IDS comes from the `train` frame loaded earlier;
# set CREATE_EEGS to False to skip the (slow) conversion step below.
RAW_PATH = '../data/train_eegs/'
PROCESSED_PATH = '../data/eegs_npy/'
EEG_IDS = train.eeg_id.unique()
CREATE_EEGS = True

if CREATE_EEGS:
    os.makedirs(PROCESSED_PATH, exist_ok=True)
    print(f"Converting {len(EEG_IDS)} Parquet files to NumPy format...")

    for n_done, (eeg_id, data) in enumerate(eeg_generator(EEG_IDS, RAW_PATH)):
        # Progress marker after every 100 files (nothing is printed at 0).
        if n_done and n_done % 100 == 0:
            print(f"{n_done}, ", end='')

        # One .npy file per EEG, named after its eeg_id.
        np.save(f'{PROCESSED_PATH}{eeg_id}.npy', data)

    print("\nDone. All EEG files have been saved as individual .npy files.")

Converting 17089 Parquet files to NumPy format...
100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000, 3100, 3200, 3300, 3400, 3500, 3600, 3700, 3800, 3900, 4000, 4100, 4200, 4300, 4400, 4500, 4600, 4700, 4800, 4900, 5000, 5100, 5200, 5300, 5400, 5500, 5600, 5700, 5800, 5900, 6000, 6100, 6200, 6300, 6400, 6500, 6600, 6700, 6800, 6900, 7000, 7100, 7200, 7300, 7400, 7500, 7600, 7700, 7800, 7900, 8000, 8100, 8200, 8300, 8400, 8500, 8600, 8700, 8800, 8900, 9000, 9100, 9200, 9300, 9400, 9500, 9600, 9700, 9800, 9900, 10000, 10100, 10200, 10300, 10400, 10500, 10600, 10700, 10800, 10900, 11000, 11100, 11200, 11300, 11400, 11500, 11600, 11700, 11800, 11900, 12000, 12100, 12200, 12300, 12400, 12500, 12600, 12700, 12800, 12900, 13000, 13100, 13200, 13300, 13400, 13500, 13600, 13700, 13800, 13900, 14000, 14100, 14200, 14300, 14400, 14500, 14600, 14700, 14800, 14900, 15000, 15100, 1

# Deduplication

We sum the votes across all rows that share the same `eeg_id`, then normalize each set of totals so that the six vote fractions sum to 1.

In [5]:
# Reload the label table; the vote columns are taken to be its last six columns.
df = pd.read_csv('../data/train.csv')
TARGETS = df.columns[-6:]
TARS = {'Seizure':0, 'LPD':1, 'GPD':2, 'LRDA':3, 'GRDA':4, 'Other':5}
TARS2 = {idx: label for label, idx in TARS.items()}

by_eeg = df.groupby('eeg_id')

# One row per eeg_id, seeded with the first patient_id of each group.
train = by_eeg[['patient_id']].first()

# Total the votes per eeg_id, then rescale each row so its six values sum to 1.
y_data = by_eeg[TARGETS].sum().values
y_data = y_data / y_data.sum(axis=1, keepdims=True)
for j, col in enumerate(TARGETS):
    train[col] = y_data[:, j]

# The label is the expert_consensus of the first row in each group.
train['target'] = by_eeg['expert_consensus'].first()

# Restore eeg_id as a column and keep only ids present in EEG_IDS.
train = train.reset_index()
train = train[train['eeg_id'].isin(EEG_IDS)]
print('Train Data with unique eeg_id shape:', train.shape )
train.head()

Train Data with unique eeg_id shape: (17089, 9)


Unnamed: 0,eeg_id,patient_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,target
0,568657,20654,0.0,0.0,0.25,0.0,0.166667,0.583333,Other
1,582999,20230,0.0,0.857143,0.0,0.071429,0.0,0.071429,LPD
2,642382,5955,0.0,0.0,0.0,0.0,0.0,1.0,Other
3,751790,38549,0.0,0.0,1.0,0.0,0.0,0.0,GPD
4,778705,40955,0.0,0.0,0.0,0.0,0.0,1.0,Other


In [6]:
# Count rows per eeg_id, then tally those counts: a single entry "1" means
# every eeg_id occurs exactly once.
rows_per_eeg = train.groupby('eeg_id').size()
rows_per_eeg.value_counts()

1    17089
Name: count, dtype: int64

In [7]:
# Sanity check: every converted .npy array should have the same number of rows.
PROCESSED_PATH = '../data/eegs_npy/'
npy_files = glob.glob(os.path.join(PROCESSED_PATH, '*.npy'))

if not npy_files:
    print(f"Warning: No .npy files found in the directory: {PROCESSED_PATH}")
else:
    # mmap_mode='r' memory-maps each file instead of loading it, so the shape
    # is obtained without reading the full array into memory.
    lengths = [np.load(f, mmap_mode='r').shape[0] for f in npy_files]
    length_counts = Counter(lengths)

    if len(length_counts) == 1:
        the_length = next(iter(length_counts))
        print(f"All {len(npy_files)} files have the same length: {the_length}.")
    else:
        print("Warning! Not all files have the same length.")
        print("Distribution of lengths found:")
        for length, count in length_counts.items():
            print(f"  - Length {length}: Found in {count} files.")

All 17089 files have the same length: 10000.
