The goal of this notebook is to transform the raw training data (the EEG recordings referenced by train.csv) into a single CSV file of features that can be used to train the machine learning models.

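All models take as input datasets named in the form "train_n_samples_fft_0_to_10_hz_consensus_1.0_balanced.csv". To generate a dataset of size n, set NUM_ROWS = n. Note that when BALANCE is enabled, NUM_ROWS is rounded down to a multiple of the number of classes (6), so the n in the generated file name may be slightly smaller than the value requested, and samples with missing EEG values are dropped from the file. For each model, ensure TRAIN_PATH is set to the local path of the generated dataset.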

In [1]:
import numpy as np
import pandas as pd
import random
from tqdm import tqdm
import time

In [2]:
# Locations of the raw inputs, relative to this notebook
EEG_PATH = '../train_eegs/'
SPEC_PATH = '../train_spectrograms/'

# Channel names, in the order the transform functions read the EEG columns (by position)
CHANNELS = ['Fp1', 'F3', 'C3', 'P3', 'F7', 'T3', 'T5', 'O1', 'Fz', 'Cz', 'Pz', 'Fp2', 'F4', 'C4',
 'P4', 'F8', 'T4', 'T6', 'O2', 'EKG']

# Parameters for generating the output
MIN_FREQUENCY = 0
MAX_FREQUENCY = 10
# The factor 6 must stay in sync with the hard-coded 6 used to slice the rfft output in
# row_transform_fft, otherwise the header count and the row length will disagree.
# NOTE(review): the transform uses a 2000-sample window at 200 samples/s, so rfft bins are
# spaced 0.1 Hz apart; 6 bins per "Hz" therefore spans roughly 0-6 Hz rather than
# MIN_FREQUENCY-MAX_FREQUENCY Hz - confirm whether 10 bins per Hz was intended.
NUM_FREQUENCIES = (MAX_FREQUENCY - MIN_FREQUENCY) * 6
NUM_ROWS = 2000

# Determine if expert agreement is required for data to be included and if so what proportion of experts must agree
ONLY_CONSENSUS = True
CONSENSUS_PROP = 1.0

# Determine if the output classes should be balanced (corrects for when the occurrences of the classes are disproportionate)
# Only works when transforming a subset of the overall data
BALANCE = True

# Format of the data - options are 'raw' for the raw data collapsed into vectors or 'fft' for the Fourier transform
FORMAT = 'fft'
POOL_SIZE = 5 # For use only when the format is raw where values are averaged over group

# A random seed is used to ensure that datasets generated with the same parameters will be the same.
# Both generators are seeded: `random` drives the row shuffle, while pandas' `.sample`
# (used for class balancing) draws from NumPy's global generator when no random_state is given.
SEED = 7
random.seed(SEED)
np.random.seed(SEED)

In [3]:
# Function to generate an empty dataset with the correct headers
def create_empty_dataset(format):
    """
    Build a zero-row DataFrame whose columns match the requested output format.

    'fft': for every channel in CHANNELS, one "var_<channel>" column followed by
           NUM_FREQUENCIES "amp_<channel>_<j>" columns and NUM_FREQUENCIES
           "phase_<channel>_<j>" columns.
    'raw': for every channel in CHANNELS, 2000 // POOL_SIZE columns named
           "<channel>_<t>" (2000 samples = 200 per second for 10 seconds, pooled).
    In both cases a final "label" column is appended.

    The layout is meant to follow the project's spec file (not part of this notebook).

    Raises:
        ValueError: if format is neither 'fft' nor 'raw'.
    """

    if format == 'fft':
        columns = []
        for name in CHANNELS:
            # Variance first, then all amplitudes, then all phases for this channel
            columns.append(f"var_{name}")
            columns += [f"amp_{name}_{k}" for k in range(NUM_FREQUENCIES)]
            columns += [f"phase_{name}_{k}" for k in range(NUM_FREQUENCIES)]
    elif format == 'raw':
        # One column per pooled time bin, grouped channel by channel
        columns = [
            f"{name}_{k}"
            for name in CHANNELS
            for k in range(2000 // POOL_SIZE)
        ]
    else:
        raise ValueError("Format is not valid")

    columns.append("label")

    return pd.DataFrame(columns=columns)

In [4]:
# Function to generate the output values
def row_transform_fft(row, min_frequency, max_frequency):
    """
    Convert a single sample from the train set into a flat row of Fourier features.

    The EEG recording named by ``row.eeg_id`` is read from EEG_PATH and the 10 s
    window starting 20 s after ``row.eeg_label_offset_seconds`` is extracted
    (200 samples per second, i.e. 2000 samples).

    New format is:
        For each channel (the first len(CHANNELS) columns of the recording, by position):
            (1) the variance of the window,
            (2) the rfft amplitudes for bins [6*min_frequency, 6*max_frequency),
            (3) the rfft phases for the same bins.
        The ground truth label (``row.expert_consensus``) as the last element.

    Args:
        row: one row of train.csv; must expose eeg_id, eeg_label_offset_seconds
            and expert_consensus as attributes.
        min_frequency: lower bound used to select rfft bins (multiplied by 6).
        max_frequency: upper bound used to select rfft bins (multiplied by 6).

    Returns:
        The feature list described above, or [] if the window is shorter than
        2000 samples or contains any NaN, so the caller can count it as removed.
    """

    sample_rate = 200        # samples per second
    window_start_s = 20      # seconds after the label offset where the window begins
    window_length_s = 10     # seconds covered by the window
    # Must match the factor used for NUM_FREQUENCIES when the headers are built.
    # NOTE(review): with a 2000-sample window at 200 samples/s, rfft bin k sits at
    # k / 10 Hz, so 6 bins per unit covers less than min..max Hz - confirm intent.
    bins_per_hz = 6

    # Read in the EEG
    eeg_full = pd.read_parquet(f'{EEG_PATH}{row.eeg_id}.parquet')

    # Get the middle 10s that the diagnosis comes from
    eeg_offset = int(row.eeg_label_offset_seconds)
    window_samples = window_length_s * sample_rate
    start = (eeg_offset + window_start_s) * sample_rate
    eeg_10s = eeg_full.iloc[start:start + window_samples]

    # A truncated window would change the rfft bin spacing (or crash on an empty
    # signal), so treat it as incomplete data just like a window containing NaN.
    if len(eeg_10s) < window_samples:
        return []
    if eeg_10s.isna().to_numpy().any():
        return []

    first_bin = bins_per_hz * min_frequency
    last_bin = bins_per_hz * max_frequency

    new_row = []

    # Iterate through all the channels (selected by column position)
    for i in range(len(CHANNELS)):

        # Get the data for the specific channel
        signal = eeg_10s.iloc[:, i]

        # Perform DFT using numpy's rfft function
        dft_result = np.fft.rfft(signal)
        amplitude = np.abs(dft_result)
        phase = np.angle(dft_result)

        # Add to the row
        new_row.append(np.var(signal))
        new_row.extend(amplitude[first_bin:last_bin])
        new_row.extend(phase[first_bin:last_bin])

    new_row.append(row.expert_consensus)

    return new_row


In [5]:
# Function to generate the output values
def row_transform_raw(row, pool_size=1):
    """
    Convert a single sample from the train set into a flat row of pooled raw values.

    The EEG recording named by ``row.eeg_id`` is read from EEG_PATH and the 10 s
    window starting 20 s after ``row.eeg_label_offset_seconds`` is extracted
    (200 samples per second, i.e. 2000 samples).

    New format is:
        For each channel (the first len(CHANNELS) columns of the recording, by position),
        2000 // pool_size values, each the mean of pool_size consecutive samples
        (any remainder at the end of the window is dropped).
        The ground truth label (``row.expert_consensus``) as the last element.

    Args:
        row: one row of train.csv; must expose eeg_id, eeg_label_offset_seconds
            and expert_consensus as attributes.
        pool_size: number of consecutive samples averaged into one output value.

    Returns:
        The feature list described above, or [] if the window is shorter than
        2000 samples or contains any NaN, so the caller can count it as removed.

    Raises:
        ValueError: if pool_size is smaller than 1.
    """

    if pool_size < 1:
        raise ValueError("pool_size must be a positive integer")

    sample_rate = 200                    # samples per second
    window_samples = 10 * sample_rate    # 10 s window

    # Read in the EEG
    eeg_full = pd.read_parquet(f'{EEG_PATH}{row.eeg_id}.parquet')

    # Get the middle 10s that the diagnosis comes from
    eeg_offset = int(row.eeg_label_offset_seconds)
    start = (eeg_offset + 20) * sample_rate
    eeg_10s = eeg_full.iloc[start:start + window_samples]

    # A truncated window would produce means over empty slices (NaN) further down,
    # so reject it up front together with windows that already contain NaN.
    if len(eeg_10s) < window_samples:
        return []
    if eeg_10s.isna().to_numpy().any():
        return []

    num_bins = window_samples // pool_size

    new_row = []

    # Iterate through all the channels (selected by column position)
    for i in range(len(CHANNELS)):

        # Work on the plain array so the pooling is positional, independent of the
        # DataFrame's index labels
        values = eeg_10s.iloc[:, i].to_numpy()

        # Average every block of pool_size consecutive samples in one vectorized step
        pooled = values[:num_bins * pool_size].reshape(num_bins, pool_size).mean(axis=1)
        new_row.extend(pooled)

    new_row.append(row.expert_consensus)

    return new_row

In [6]:
# Read in the train dataset
train = pd.read_csv('../train.csv')

# Empty frame that only carries the output headers; create_empty_dataset also rejects an unknown FORMAT
train_reformat = create_empty_dataset(FORMAT)

rows_removed = 0

VOTE_COLUMNS = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']

# Only select the rows where there is consensus if this is a condition
if ONLY_CONSENSUS:
    votes = train[VOTE_COLUMNS]
    total_votes = votes.sum(axis=1)
    max_votes = votes.max(axis=1)

    # Share of votes won by the most popular label must reach CONSENSUS_PROP
    consensus_rows = max_votes / total_votes >= CONSENSUS_PROP

    train = train[consensus_rows]

# Randomize if not processing full dataset
if NUM_ROWS != len(train):

    if BALANCE:
        print("Balancing classes")

        class_sizes = train.groupby('expert_consensus').size()
        num_classes = len(class_sizes)

        if num_classes == 0:
            raise ValueError("No rows available to build a balanced dataset")

        # Update the number of rows to be a multiple of the number of classes
        NUM_ROWS = NUM_ROWS - NUM_ROWS % num_classes

        samples_per_class = NUM_ROWS // num_classes

        least_class_samples = class_sizes.min()

        # Ensure it is possible to make a balanced dataset
        if samples_per_class > least_class_samples:
            raise ValueError(f"The smallest class has only {least_class_samples} samples, so the maximum size of a balanced dataset is {least_class_samples*num_classes}")

        # Randomly sample from each class to create a balanced dataset.
        # The random_state is drawn from the `random` module so that the draw is
        # reproducible for a given random.seed(...); without it pandas falls back to
        # NumPy's global generator.
        train = train.groupby('expert_consensus').sample(
            n=samples_per_class,
            random_state=random.randrange(2**32),
        )

        # Quick sanity check: every class contributes exactly samples_per_class rows
        assert (train['expert_consensus'].value_counts() == samples_per_class).all()

    print(f"len: {len(train)}, rows: {NUM_ROWS}")
    random_indices = random.sample(range(len(train)), NUM_ROWS)

    # Create a new DataFrame with the random subset
    train = train.iloc[random_indices]

    print("\nSelecting a random subset of data to avoid problems arising from correlation between samples in same session")

# Iterate through all rows, collecting the transformed rows in a list; the DataFrame is
# built once afterwards because appending row by row copies the whole frame every time
processed_rows = []

with tqdm(total=NUM_ROWS, desc="Processing", unit=" iterations", ncols=100) as pbar:
    for r in range(NUM_ROWS): # train.shape[0]

        if FORMAT == 'fft':
            new_row = row_transform_fft(train.iloc[r], MIN_FREQUENCY, MAX_FREQUENCY)
        elif FORMAT == 'raw':
            new_row = row_transform_raw(train.iloc[r], POOL_SIZE)
        else:
            raise ValueError("Invalid format provided")

        # An empty row signals that the sample was rejected (e.g. NaN issues)
        if len(new_row) != 0:
            processed_rows.append(new_row)
        else:
            rows_removed += 1

        pbar.update(1)

train_reformat = pd.DataFrame(processed_rows, columns=train_reformat.columns)

print("\nSamples successfully processed: {}".format(len(train_reformat)))
print("Samples removed due to incomplete data: {}".format(rows_removed))

assert len(train_reformat) + rows_removed == NUM_ROWS, "Row mismatch"

# Write the transformed dataset to CSV; the file name encodes the generation parameters

if FORMAT == 'fft':
    format_tag = f'{FORMAT}_{MIN_FREQUENCY}_to_{MAX_FREQUENCY}'
else:
    format_tag = f'{FORMAT}'

extra_info = ''
if ONLY_CONSENSUS:
    extra_info = '_consensus_{}'.format(CONSENSUS_PROP)
if BALANCE:
    extra_info += '_balanced'

# NOTE(review): the "_hz" suffix is also emitted for the 'raw' format; kept as-is because
# downstream TRAIN_PATH settings rely on this naming scheme.
train_reformat.to_csv(f'train_{NUM_ROWS}_samples_{format_tag}_hz{extra_info}.csv', index=False)

Balancing classes
len: 1998, rows: 1998

Selecting a random subset of data to avoid problems arising from correlation between samples in same session


Processing: 100%|██████████████████████████████████████| 1998/1998 [06:22<00:00,  5.22 iterations/s]



Samples successfully processed: 1989
Samples removed due to incomplete data: 9
