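This notebook transforms the raw training EEG recordings into a table of per-channel features (the variance plus the leading DFT amplitudes and phases of the labelled 10-second window) and writes that table to a CSV file that can be used to train the machine learning model.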

In [9]:
import numpy as np
import pandas as pd
import random

In [2]:
# Directory the per-recording EEG parquet files are read from (one file per eeg_id)
EEG_PATH = '../train_eegs/'
# Directory of the spectrogram files
SPEC_PATH = '../train_spectrograms/'

# Channel names, in the order the feature columns are generated
CHANNELS = [
    'Fp1', 'F3', 'C3', 'P3', 'F7', 'T3', 'T5', 'O1',
    'Fz', 'Cz', 'Pz',
    'Fp2', 'F4', 'C4', 'P4', 'F8', 'T4', 'T6', 'O2',
    'EKG',
]

# Number of DFT amplitude columns (and of DFT phase columns) created per channel
NUM_FREQUENCIES = 20

In [7]:
# Build the empty feature table that defines the output column layout
def create_empty_dataset():
    """
    Return an empty DataFrame whose columns are the feature headers.

    For every channel in CHANNELS the columns are, in order:
        "var_<channel>",
        "amp_<channel>_<j>"   for j in range(NUM_FREQUENCIES),
        "phase_<channel>_<j>" for j in range(NUM_FREQUENCIES).
    A final "label" column closes the list.

    Side effect: prints the header list and its length.
    """
    column_names = []
    for name in CHANNELS:
        column_names.append("var_" + name)
        column_names += ["amp_" + name + "_" + str(k) for k in range(NUM_FREQUENCIES)]
        column_names += ["phase_" + name + "_" + str(k) for k in range(NUM_FREQUENCIES)]
    column_names.append("label")

    print(column_names)
    print(len(column_names))

    return pd.DataFrame(columns = column_names)

In [3]:
# Function to generate the output values
def row_transform(row):
    """
    Takes as input a single sample from the train set and returns a reformatted row.

    Parameters:
        row: A record of the train set exposing `eeg_id`,
            `eeg_label_offset_seconds` and `expert_consensus` as attributes.

    Returns:
        list: For each channel in CHANNELS, (1) the variance, (2) the first
        NUM_FREQUENCIES DFT amplitudes and (3) the first NUM_FREQUENCIES DFT
        phases of the labelled 10s window, followed by the ground truth label.
        An empty list is returned when the window is incomplete or contains
        NaN values, so the caller can skip the sample.
    """
    # Rows per second used to convert the offsets (in seconds) into row positions
    sample_rate = 200
    # Bounds, in seconds after the label offset, of the 10s that the diagnosis comes from
    window_start_s, window_end_s = 20, 30

    # Read in the EEG
    eeg_full = pd.read_parquet(f'{EEG_PATH}{row.eeg_id}.parquet')

    # Get the middle 10s that the diagnosis comes from
    eeg_offset = int(row.eeg_label_offset_seconds)
    first_sample = (eeg_offset + window_start_s) * sample_rate
    last_sample = (eeg_offset + window_end_s) * sample_rate
    eeg_10s = eeg_full.iloc[first_sample:last_sample]

    # A truncated window would change the meaning of each DFT bin (and an empty
    # one cannot be transformed at all), so treat it as incomplete data
    if len(eeg_10s) < last_sample - first_sample:
        return []

    if np.any(np.isnan(eeg_10s)):
        return []

    new_row = []

    # Iterate through all the channels; selecting by name keeps every value
    # under the header that create_empty_dataset builds from the same names
    for channel in CHANNELS:

        # Get the data for the specific channel
        signal = eeg_10s[channel]

        # Perform DFT using numpy's rfft function
        dft_result = np.fft.rfft(signal)
        amplitude = np.abs(dft_result)
        phase = np.angle(dft_result)

        # Add to the row, keeping as many bins as there are amp_/phase_ headers
        new_row.append(np.var(signal))
        new_row.extend(amplitude[:NUM_FREQUENCIES])
        new_row.extend(phase[:NUM_FREQUENCIES])

    # Add the ground truth
    new_row.append(row.expert_consensus)

    return new_row


In [22]:
# Read in the train dataset
train = pd.read_csv('../train.csv')

# The empty template supplies the output column headers
train_reformat = create_empty_dataset()

print(train_reformat)

expected_width = len(train_reformat.columns)

# Pick the number of rows to read in
num_rows = len(train)
rows_removed = 0

# Transformed rows are collected in a plain list and turned into a DataFrame
# once at the end; enlarging a DataFrame one row at a time is very slow
feature_rows = []

# Iterate through all rows
print("Processing {} raw samples\n".format(num_rows))

for r in range(num_rows): # train.shape[0]
    new_row = row_transform(train.iloc[r])

    # An empty list means row_transform rejected the sample (e.g. NaN issues)
    if len(new_row) == 0:
        rows_removed += 1
    elif len(new_row) != expected_width:
        # Fail fast instead of letting a short row be padded silently
        raise ValueError(
            "Sample {} produced {} values, expected {}".format(r, len(new_row), expected_width)
        )
    else:
        feature_rows.append(new_row)

    if r % 100 == 0:
        print("Done with " + str(r) + " iterations.")

# dtype=object stores the values as they are instead of re-inferring column dtypes
train_reformat = pd.DataFrame(feature_rows, columns=train_reformat.columns, dtype=object)

print("\nSamples successfully processed: {}".format(len(train_reformat)))
print("Samples removed due to incomplete data: {}".format(rows_removed))

assert len(train_reformat) + rows_removed == num_rows, "Row mismatch"

# Write the reformatted dataset to CSV (the file name carries the raw sample count)
train_reformat.to_csv('reformatted_train_{}_samples.csv'.format(num_rows), index=False)

['var_Fp1', 'amp_Fp1_0', 'amp_Fp1_1', 'amp_Fp1_2', 'amp_Fp1_3', 'amp_Fp1_4', 'amp_Fp1_5', 'amp_Fp1_6', 'amp_Fp1_7', 'amp_Fp1_8', 'amp_Fp1_9', 'amp_Fp1_10', 'amp_Fp1_11', 'amp_Fp1_12', 'amp_Fp1_13', 'amp_Fp1_14', 'amp_Fp1_15', 'amp_Fp1_16', 'amp_Fp1_17', 'amp_Fp1_18', 'amp_Fp1_19', 'phase_Fp1_0', 'phase_Fp1_1', 'phase_Fp1_2', 'phase_Fp1_3', 'phase_Fp1_4', 'phase_Fp1_5', 'phase_Fp1_6', 'phase_Fp1_7', 'phase_Fp1_8', 'phase_Fp1_9', 'phase_Fp1_10', 'phase_Fp1_11', 'phase_Fp1_12', 'phase_Fp1_13', 'phase_Fp1_14', 'phase_Fp1_15', 'phase_Fp1_16', 'phase_Fp1_17', 'phase_Fp1_18', 'phase_Fp1_19', 'var_F3', 'amp_F3_0', 'amp_F3_1', 'amp_F3_2', 'amp_F3_3', 'amp_F3_4', 'amp_F3_5', 'amp_F3_6', 'amp_F3_7', 'amp_F3_8', 'amp_F3_9', 'amp_F3_10', 'amp_F3_11', 'amp_F3_12', 'amp_F3_13', 'amp_F3_14', 'amp_F3_15', 'amp_F3_16', 'amp_F3_17', 'amp_F3_18', 'amp_F3_19', 'phase_F3_0', 'phase_F3_1', 'phase_F3_2', 'phase_F3_3', 'phase_F3_4', 'phase_F3_5', 'phase_F3_6', 'phase_F3_7', 'phase_F3_8', 'phase_F3_9', 'phase_F

In [5]:
def save_random_subset_to_csv(input_df, output_csv, subset_size, seed=None):
    """
    Takes a Pandas DataFrame, generates a new DataFrame with a random subset of the rows,
    and saves this new DataFrame to a CSV file.

    Parameters:
        input_df (pandas.DataFrame): Input DataFrame.
        output_csv (str): Path to the output CSV file.
        subset_size (int): Number of rows to include in the random subset.
        seed (int, optional): Seed for the row selection. When given, the same
            rows are chosen on every call; when None (the default) the shared
            state of the `random` module is used, as before.

    Raises:
        ValueError: If subset_size is negative or larger than the DataFrame.
    """
    # Check if subset size is valid
    if subset_size < 0:
        raise ValueError("Subset size must not be negative")
    if subset_size > len(input_df):
        raise ValueError("Subset size is larger than the DataFrame size")

    # A private generator keeps a seeded call from disturbing the global random state
    sampler = random if seed is None else random.Random(seed)

    # Generate random (distinct) row positions for the subset
    random_indices = sampler.sample(range(len(input_df)), subset_size)

    # Create a new DataFrame with the random subset
    random_subset_df = input_df.iloc[random_indices]

    # Save the random subset DataFrame to a CSV file
    random_subset_df.to_csv(output_csv, index=False)



In [6]:
# NOTE(review): this cell repeats the transform/export cell that appears earlier
# in the notebook, and its saved output is a NameError for create_empty_dataset.
# It only works after the cells defining create_empty_dataset and row_transform
# have been run; consider keeping a single copy.

# Read in the train dataset
train = pd.read_csv('../train.csv')

# The empty template supplies the output column headers
train_reformat = create_empty_dataset()

print(train_reformat)

expected_width = len(train_reformat.columns)

# Pick the number of rows to read in
num_rows = len(train)
rows_removed = 0

# Transformed rows are collected in a plain list and turned into a DataFrame
# once at the end; enlarging a DataFrame one row at a time is very slow
feature_rows = []

# Iterate through all rows
print("Processing {} raw samples\n".format(num_rows))

for r in range(num_rows): # train.shape[0]
    new_row = row_transform(train.iloc[r])

    # An empty list means row_transform rejected the sample (e.g. NaN issues)
    if len(new_row) == 0:
        rows_removed += 1
    elif len(new_row) != expected_width:
        # Fail fast instead of letting a short row be padded silently
        raise ValueError(
            "Sample {} produced {} values, expected {}".format(r, len(new_row), expected_width)
        )
    else:
        feature_rows.append(new_row)

    if r % 100 == 0:
        print("Done with " + str(r) + " iterations.")

# dtype=object stores the values as they are instead of re-inferring column dtypes
train_reformat = pd.DataFrame(feature_rows, columns=train_reformat.columns, dtype=object)

print("\nSamples successfully processed: {}".format(len(train_reformat)))
print("Samples removed due to incomplete data: {}".format(rows_removed))

assert len(train_reformat) + rows_removed == num_rows, "Row mismatch"

# Write the reformatted dataset to CSV (the file name carries the raw sample count)
train_reformat.to_csv('reformatted_train_{}_samples.csv'.format(num_rows), index=False)

NameError: name 'create_empty_dataset' is not defined

In [10]:
# Full reformatted dataset to sample from, and the number of rows to keep
TRAIN_REFORMAT_PATH = '../augmented_train/reformatted_train_106800_samples.csv'
SAMPLES = 1000

# Load the complete feature table
train_reformat_full = pd.read_csv(TRAIN_REFORMAT_PATH)

# Write a random SAMPLES-row subset to a CSV in the working directory
save_random_subset_to_csv(
    input_df=train_reformat_full,
    output_csv=f'reformatted_train_{SAMPLES}_random_samples.csv',
    subset_size=SAMPLES,
)

In [6]:
# Checking out anomalies: print the labelled 10s window of two specific
# samples and report whether it contains NaN readings.
# Relies on `train` having been loaded by an earlier cell.
for anomaly_index in (205, 206):
    row = train.iloc[anomaly_index]

    eeg_full = pd.read_parquet(f'{EEG_PATH}{row.eeg_id}.parquet')

    # Get the middle 10s that the diagnosis comes from
    eeg_offset = int( row.eeg_label_offset_seconds )
    eeg_10s = eeg_full.iloc[(eeg_offset+20)*200:(eeg_offset+30)*200]

    print(row)
    print(eeg_10s)

    # Same NaN test that row_transform uses to reject a sample
    contains_non_numeric = np.any(np.isnan(eeg_10s))

    if contains_non_numeric:
        print("The array contains non-numeric values.")
    else:
        print("The array contains only numeric values.")


eeg_id                              3630961636
eeg_sub_id                                   0
eeg_label_offset_seconds                   0.0
spectrogram_id                         8430915
spectrogram_sub_id                           0
spectrogram_label_offset_seconds           0.0
label_id                            2402782492
patient_id                               48861
expert_consensus                         Other
seizure_vote                                 0
lpd_vote                                     0
gpd_vote                                     0
lrda_vote                                    0
grda_vote                                    3
other_vote                                  10
Name: 205, dtype: object
            Fp1         F3     C3     P3         F7    T3         T5  \
4000  11.170000 -14.360000 -14.36 -13.03  31.370001 -4.79 -53.169998   
4001   6.910000 -15.680000 -14.36 -13.29  32.700001 -2.13 -50.509998   
4002  14.620000 -12.230000 -14.89 -12.76  34.560001 -0