The goal of this file is to transform the data in the original file and output a csv with this data that can be used to train the machine learning model

In [6]:
import numpy as np
import pandas as pd
import random
from tqdm import tqdm
import time

In [7]:
EEG_PATH = '../train_eegs/'
SPEC_PATH = '../train_spectrograms/'
CHANNELS = ['Fp1', 'F3', 'C3', 'P3', 'F7', 'T3', 'T5', 'O1', 'Fz', 'Cz', 'Pz', 'Fp2', 'F4', 'C4',
 'P4', 'F8', 'T4', 'T6', 'O2', 'EKG']

# Parameters for generating the output
MIN_FREQUENCY = 0
MAX_FREQUENCY = 10
NUM_FREQUENCIES = (MAX_FREQUENCY - MIN_FREQUENCY) * 6
NUM_ROWS = 15000

# Determine if expert agreement is required for data to be included and if so what proportion of experts must agree
ONLY_CONSENSUS = True
CONSENSUS_PROP = 1.0 

# Determine if the output classes should be balanced (corrects for when the occurances of the classes are disproportionate)
# Only works when transforming a subset of the overall data
BALANCE = True

In [8]:
# Function to generate an empty dataset with the correct headers
def create_empty_dataset():
    # Create empty new database with the appropriate headers
    headers = []
    for channel in CHANNELS:
        headers.append("var_" + channel)
        for j in range(NUM_FREQUENCIES):
            headers.append("amp_" + channel + "_" + str(j))
        for j in range(NUM_FREQUENCIES):
            headers.append("phase_" + channel + "_" + str(j))
    headers.append("label")

    new_dataframe = pd.DataFrame(columns = headers)

    return new_dataframe

In [9]:
# Function to generate the output values
def row_transform(row, min_frequency, max_frequency):
    """
    Takes as input a single sample from the train set and returns a reformatted row
    New format is: 
        For each cell, the (1) variance, (2) amplitudes, (3) phases
        The ground truth label
    """

    new_row = []

    # Read in the EEG
    eeg_full = pd.read_parquet(f'{EEG_PATH}{row.eeg_id}.parquet')

    # Get the middle 10s that the diagnosis comes from
    eeg_offset = int( row.eeg_label_offset_seconds )
    eeg_10s = eeg_full.iloc[(eeg_offset+20)*200:(eeg_offset+30)*200]

    if np.any(np.isnan(eeg_10s)):
        return []

    # Construct the interval
    t = np.linspace(0, 10, eeg_10s.size)

    # Iterate through all the channels
    for i in range(len(CHANNELS)):

        # Get the data for the specific channel
        signal = eeg_10s.iloc[:, i]

        # Perform DFT using numpy's rfft function
        dft_result = np.fft.rfft(signal)
        amplitude = np.abs(dft_result)
        phase = np.angle(dft_result)

        # Add to the row
        new_row.append(np.var(signal))
        new_row.extend(amplitude[6*min_frequency:6*max_frequency])
        new_row.extend(phase[6*min_frequency:6*max_frequency])

    new_row.append(row.expert_consensus)

    return new_row


In [10]:
# Read in the train dataset
train = pd.read_csv('../train.csv')

print(train.head)

train_reformat = create_empty_dataset()

rows_removed = 0

# Only select the rows where there is consensus if this is a condition
if ONLY_CONSENSUS:
    total = train[['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']].sum(axis=1)
    max = train[['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']].max(axis=1)

    consensus_rows = max / total >= CONSENSUS_PROP

    train = train[consensus_rows]

# Randomize if not processing full dataset
if NUM_ROWS != len(train):

    if BALANCE:
        g = train.groupby('expert_consensus')

        # Check that the input makes sense for balancing
        if NUM_ROWS % 6 != 0:
            print("Please choose a multiple of 6 as the number of samples")
            quit()
        
        samples_per_class = NUM_ROWS // 6

        least_class_samples = g.size().min()
        print(f"Least Samples: {least_class_samples}")

        # Ensure it is possible to make a balanced dataset
        if samples_per_class > least_class_samples:
            print(f"The smallest class has only {least_class_samples} samples, so the maximum size of a balanced dataset is {least_class_samples*6}")
            quit()

        print(f"old: {g.size().min()}, new: {samples_per_class}")

        # Randomly sample from each class to create a balanced dataset
        train = pd.DataFrame(g.apply(lambda x: x.sample(samples_per_class).reset_index(drop=True)))

        # Quick sanity check
        assert((train['expert_consensus'] == 'Seizure').sum() == samples_per_class)

    random_indices = random.sample(range(len(train)), NUM_ROWS)

    # Create a new DataFrame with the random subset
    train = train.iloc[random_indices]

    print("\nSelecting a random subset of data to avoid problems arising from correlation between samples in same session")

# Iterate through all rows
with tqdm(total=NUM_ROWS, desc="Processing", unit=" iterations", ncols=100) as pbar:
    for r in range(NUM_ROWS): # train.shape[0]
        new_row = row_transform(train.iloc[r], MIN_FREQUENCY, MAX_FREQUENCY)

        # Check if NaN issues
        if len(new_row) != 0:
            train_reformat.loc[len(train_reformat)] = new_row
        else:
            rows_removed += 1

        pbar.update(1)
        
print("\nSamples successfully processed: {}".format(len(train_reformat)))
print("Samples removed due to incomplete data: {}".format(rows_removed))

assert len(train_reformat) + rows_removed == NUM_ROWS, "Row mismatch"

# Convert DataFrame B to CSV
extra_info = ''
if ONLY_CONSENSUS:
    extra_info = '_consensus_{}'.format(CONSENSUS_PROP)
if BALANCE:
    extra_info += '_balanced'

train_reformat.to_csv('train_{}_samples_{}_to_{}_hz{}.csv'.format(NUM_ROWS, MIN_FREQUENCY, MAX_FREQUENCY, extra_info), index=False)

<bound method NDFrame.head of             eeg_id  eeg_sub_id  eeg_label_offset_seconds  spectrogram_id  \
0       1628180742           0                       0.0          353733   
1       1628180742           1                       6.0          353733   
2       1628180742           2                       8.0          353733   
3       1628180742           3                      18.0          353733   
4       1628180742           4                      24.0          353733   
...            ...         ...                       ...             ...   
106795   351917269           6                      12.0      2147388374   
106796   351917269           7                      14.0      2147388374   
106797   351917269           8                      16.0      2147388374   
106798   351917269           9                      18.0      2147388374   
106799   351917269          10                      20.0      2147388374   

        spectrogram_sub_id  spectrogram_label_offset_seco

Processing:  10%|███▌                                 | 1456/15000 [03:13<29:22,  7.68 iterations/s]