In [1]:
import pandas as pd
import numpy as np
import os
import time
import shutil
import Data_Preprocessing_Raw as DPR
import Data_Preprocessing_Metrics as DPM
import Data_Preprocessing_Spectrogram as DPS

In [2]:
def Perform_Filtering_Plus_NaN_Removal(set_file_path, obs_rate, low_freq, high_freq, num_electrodes):
    filtered_df_sample, channel_names = DPR.load_filtered_eeg_data_using_mne(set_file_path, obs_rate, low_freq, high_freq, num_electrodes)
    filtered_df_sample, channel_names = DPR.remove_high_nan_rows_cols(filtered_df_sample, channel_names, threshold=0.4)
    return filtered_df_sample, channel_names 

def Create_Metric_And_Detail_Data_For_Application(num_electrodes, thresholds, sequence_lengths, num_int, window, num_std, obs_rate, subject_number, filtered_df_sample, channel_names):

    time1 = time.perf_counter()

    int_min, int_max = 0, filtered_df_sample.shape[1] 
    df_sample = DPR.transform_dataset_for_visualization(filtered_df_sample, 'interval', channel_names, interval_start=int_min, interval_end=int_max, obs_rate=obs_rate)

    time2 = time.perf_counter()
    print(f'Execution time for raw preprocess: {time2 - time1:.2f} seconds. The shape of raw filtered data: {df_sample.shape}')
    
    df_sample = DPM.create_sign_difference_column(df_sample)
    
    time3 = time.perf_counter()
    print(f'Execution time for sign_diff function: {time3 - time2:.2f} seconds')
    
    df_sample = DPM.mark_threshold_sequences(df_sample, thresholds, sequence_lengths)
    
    time4 = time.perf_counter()
    print(f'Execution time for threshold function: {time4 - time3:.2f} seconds')
            
    time5 = time.perf_counter()
    print(f'Execution time for sentiment function: {time5 - time4:.2f} seconds')
    
    df_sample = DPM.calculate_envelope_diff(df_sample, window, num_std)
    
    time6 = time.perf_counter()
    print(f'Execution time for envelope fuinction: {time6 - time5:.2f} seconds')

    columns = ['y', 'sign_change', 'abs_diff', 'envelope'] + [col for col in df_sample.columns if "SEQ" in col]
    
    print('filtered data for subject: {subject_number} is done')
    aggregated_df = DPM.aggregate_for_specific_columns(df_sample, num_int, columns, num_electrodes, obs_rate)
    
    time7 = time.perf_counter()
    print(f'Execution time for aggregation df creation: {time7 - time6:.2f} seconds.')

    aggregated_df = aggregated_df[['Electrode'] + columns + ['x_start', 'time_hms', 'MSE', 'ME', 'Slope', 'mean', 'var', 'std', 'median', 'range']]
    
    print(f'The shape of raw aggregated data: {aggregated_df.shape}')

    df_sample['x_interval'] = pd.cut(df_sample['x'], bins=3000)

    DPM.create_parquet_partitioned_data(df_sample, subject_number)
    del df_sample

    time8 = time.perf_counter()
    print(f'Execution time for filtered_df saving: {time8 - time7:.2f} seconds.')

    DPM.create_aggregated_data_parquet(aggregated_df, subject_number)
    del aggregated_df
    
    time9 = time.perf_counter()
    print(f'Execution time for agg_df saving: {time9 - time8:.2f} seconds.')


def Create_Spectrogram_Data_For_Application(subject, obs_rate, data, channel_names):

    time2 = time.perf_counter()
    
    data_spectro = DPS.create_spectrogram_dataframe_all_electrodes(data, obs_rate, channel_names) #data_spectro

    time3 = time.perf_counter()
    print(f'Execution time for agg_df saving: {time3 - time2:.2f} seconds.')
    
    directory_path = f"parquet_partitioned_spectrogram_{subject}"

    if os.path.exists(directory_path):
        shutil.rmtree(directory_path)

    data_spectro.to_parquet(directory_path, partition_cols=['Electrode'], compression='gzip')

    #data_spectro.to_parquet(f"parquet_frequency_{subject}_test", compression='gzip')

    time4 = time.perf_counter()
    print(f'Execution time for agg_df saving: {time4 - time3:.2f} seconds.')
    
    del data_spectro

In [3]:
directory_path = 'C:\\Users\\ander\\OneDrive\\Dokumenter\\10thSemesterThesis'
set_file_name = 'sub-001_ses-001_task-sleep_acq-PSG_eeg.set'
set_file_path = os.path.join(directory_path, set_file_name)
low_freq, high_freq, obs_rate = 0.1, 100, 250
num_electrodes = 24
subject = '003'
thresholds = [0.00001, 0.000005]
sequence_lengths = [1, 5]
num_int = 3000
window, num_std = 10, 1.5

filtered_df_sample, channel_names = Perform_Filtering_Plus_NaN_Removal(set_file_path, obs_rate, low_freq, high_freq, num_electrodes)

Create_Metric_And_Detail_Data_For_Application(num_electrodes, thresholds, sequence_lengths, num_int, window, num_std, obs_rate, subject, filtered_df_sample, channel_names)

Create_Spectrogram_Data_For_Application(subject, obs_rate, filtered_df_sample, channel_names)

Reading C:\Users\ander\OneDrive\Dokumenter\10thSemesterThesis\sub-001_ses-001_task-sleep_acq-PSG_eeg.fdt
Reading 0 ... 14255559  =      0.000 ... 28511.118 secs...
NOTE: pick_channels() is a legacy function. New code should use inst.pick(...).
Filtering raw data in 1 contiguous segment
Setting up band-pass filter from 0.1 - 1e+02 Hz

FIR filter parameters
---------------------
Designing a one-pass, zero-phase, non-causal bandpass filter:
- Windowed time-domain design (firwin) method
- Hamming window with 0.0194 passband ripple and 53 dB stopband attenuation
- Lower passband edge: 0.10
- Lower transition bandwidth: 0.10 Hz (-6 dB cutoff frequency: 0.05 Hz)
- Upper passband edge: 100.00 Hz
- Upper transition bandwidth: 25.00 Hz (-6 dB cutoff frequency: 112.50 Hz)
- Filter length: 16501 samples (33.002 s)



[Parallel(n_jobs=1)]: Done  17 tasks      | elapsed:    6.7s


Original NaN count: 7127780
Original shape data: (24, 7127780), original shape channel_names: 24
Columns to delete: 0, Rows to delete: 1
Indices of rows to be deleted: [7]
New data shape post cleaning: (23, 7127780), New channel_name shape post cleaning: 23
Execution time for raw preprocess: 59.68 seconds. The shape of raw filtered data: (163938940, 4)
Execution time for sign_diff function: 2.44 seconds
Execution time for threshold function: 32.84 seconds
Execution time for sentiment function: 0.02 seconds
Execution time for envelope fuinction: 8.33 seconds
filtered data for subject: {subject_number} is done


  agg_df = df.groupby(['Electrode', 'x_interval'])[columns].sum().reset_index()
  agg_df[['MSE', 'ME', 'Slope']] = df.groupby(['Electrode', 'x_interval']).apply(calculate_metrics).reset_index()[['MSE', 'ME', 'Slope']]
  stats_df = df.groupby(['Electrode', 'x_interval'])['y'].agg(['var', 'mean', 'std', 'median', 'min', 'max']).reset_index()
  agg_df['time_hms'] = pd.to_datetime(agg_df['x_start'] / obs_rate, unit='s').dt.floor('S')


Execution time for aggregation df creation: 414.51 seconds.
The shape of raw aggregated data: (69000, 19)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DF['Electrode'] = DF['Electrode'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DF['y'] = DF['y'].astype('float32')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  DF['x'] = DF['x'].astype('int32')


In [None]:
'''
Input for preprocessing

- Insert own file path for the .set or .fdt file containing EEG data.
- Insert own name for  the given subject.
- Adjust the other parameters based on preference.

window, num_std:                   Envelope window and std multiplier
num_int:                           Number of aggregated intervals.
low_freq, high_freq, obs_rate:     Ban-pass filter parameters and sample rate (measurements per second) 
sequence_lengths and thresholds:   For the sequences exceeding the thresholds metric.
num_electrodes:                    How many electrodes from the subject should be preprocessed.
'''


set_file_path = ''
low_freq, high_freq, obs_rate = 0.1, 100, 250
num_electrodes = 24
subject = ''
thresholds = [0.00001, 0.000005]
sequence_lengths = [1, 5]
num_int = 3000
window, num_std = 10, 1.5

filtered_df_sample, channel_names = Perform_Filtering_Plus_NaN_Removal(set_file_path, obs_rate, low_freq, high_freq, num_electrodes)
Create_Metric_And_Detail_Data_For_Application(num_electrodes, thresholds, sequence_lengths, num_int, window, num_std, obs_rate, subject, filtered_df_sample, channel_names)
Create_Spectrogram_Data_For_Application(subject, obs_rate, filtered_df_sample, channel_names)

In [None]:
#s = ['001', '002']
#set_file_paths = [os.path.join(directory_path, f'sub-{sn}', f'ses-001', f'eeg', f'sub-{sn}_ses-001_task-sleep_eeg.set') for sn in s] + [set_file_path]
#set_file_paths = [set_file_path] 