In [1]:
#!pip install mne scipy
#!pip install pandas numpy openpyxl
#!pip install tsfresh
#!pip install PyWavelets

In [3]:
import os
import numpy as np
import scipy.signal as signal
import mne

def process_all_eeg_data() -> dict:
    """
    Process all .bdf EEG files in the current directory, applying filters and extracting data from
    channels A15 (O1), A16 (Oz), and A17 (O2).

    Returns
    -------
    dict
        A dictionary containing processed EEG data and header information for each file.
    """
    # Get a list of all .bdf files in the current directory
    files = [f for f in os.listdir('.') if f.endswith('.bdf')]
    if not files:
        raise FileNotFoundError("No BDF files found in the current directory")
    
    # Initialize the results dictionary
    results = {}
    
    # Loop over each file
    for filename in files:
        full_file_path = os.path.join(os.getcwd(), filename)
        
        # Read the raw EEG data using MNE
        raw = mne.io.read_raw_bdf(full_file_path, preload=True)
        hdr = raw.info
        
        # Select data from channels A15 (O1), A16 (Oz), and A17 (O2)
        channels_select = ['A15', 'A16', 'A17']
        missing_channels = [ch for ch in channels_select if ch not in hdr['ch_names']]
        if missing_channels:
            raise ValueError(f"Selected channels {missing_channels} not found in the data")
        
        channel_indices = [hdr['ch_names'].index(ch) for ch in channels_select]
        EEG_data = raw.get_data(picks=channel_indices).T  # Shape: (n_samples, n_channels)
        
        # Filter EEG Data
        Fs = hdr['sfreq']  # Sampling frequency
        
        # Bandpass filter parameters (2 to 80 Hz)
        Fc_BP = [2, 80]  # Bandpass frequency range
        Wn_BP = [f / (Fs / 2) for f in Fc_BP]  # Normalize by Nyquist frequency
        
        # Create and apply bandpass filter (6th order zero-phase Butterworth IIR)
        B_BP, A_BP = signal.butter(3, Wn_BP, btype='bandpass')
        EEG_filtered_BP = signal.filtfilt(B_BP, A_BP, EEG_data, axis=0)
        
        # Band stop filter parameters (48 to 52 Hz)
        Fc_BS = [48, 52]  # Band stop frequency range
        Wn_BS = [f / (Fs / 2) for f in Fc_BS]  # Normalize by Nyquist frequency
        
        # Create and apply band stop filter (6th order zero-phase Butterworth IIR)
        B_BS, A_BS = signal.butter(3, Wn_BS, btype='bandstop')
        EEG_filtered = signal.filtfilt(B_BS, A_BS, EEG_filtered_BP, axis=0)
        
        # Extract prefix before underscore from the filename
        underscore_index = filename.find('_')
        if underscore_index == -1:
            raise ValueError(f"Filename format error, no underscore found in {filename}")
        key = filename[:underscore_index]
        
        # Store results in the dictionary
        results[key] = {
            'data': EEG_filtered,      # Filtered data for channels A15, A16, A17
            'channels': channels_select,  # List of channel names
            'header': hdr
        }
        
        # Display a message indicating successful processing
        print(f"Data for file {filename} processed successfully")
    
    return results


In [4]:
results = process_all_eeg_data()

Extracting EDF parameters from c:\Users\WERPELGA\OneDrive - Danone\Desktop\UoA\2024.1&2\Python Gabe\A1_Full_Block.bdf...
BDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 739327  =      0.000 ...   361.000 secs...
Data for file A1_Full_Block.bdf processed successfully
Extracting EDF parameters from c:\Users\WERPELGA\OneDrive - Danone\Desktop\UoA\2024.1&2\Python Gabe\A3_Full_Block.bdf...
BDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 757759  =      0.000 ...   370.000 secs...
Data for file A3_Full_Block.bdf processed successfully
Extracting EDF parameters from c:\Users\WERPELGA\OneDrive - Danone\Desktop\UoA\2024.1&2\Python Gabe\A4_Full_Block.bdf...
BDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 782335  =      0.000 ...   382.000 secs...
Data for file A4_Full_Block.bdf processed successfully
Extracting EDF parameters from c:\Users\WERPELGA\One

In [5]:
import numpy as np
import pandas as pd

def segment_eeg_data_new(results: dict, cohort_file: str = 'Cohort.xlsx') -> dict:
    """
    Segments EEG data into predefined sections (EC, EO, LC, RC, DEC, NDEC) based on cohort information,
    removing the first 2 seconds from each section.

    Parameters
    ----------
    results : dict
        Dictionary containing the raw EEG data and header information for each key (participant).
    cohort_file : str, optional
        Path to the Excel file containing cohort information (default is 'Cohort.xlsx').

    Returns
    -------
    dict
        Dictionary containing segmented EEG data for each participant.
    """
    # Read the cohort information from an Excel file
    cohort_table = pd.read_excel(cohort_file)
    # Segment Duration (in seconds)
    segment_duration = 10  # Original segment duration in seconds
    skip_duration = 2      # Duration to skip at the start of each segment (2 seconds)

    # Initialize the segmented results dictionary
    segmented_data = {}

    # Iterate through each key in the results dictionary
    for key, result in results.items():
        data = result['data']  # Data shape: (n_samples, n_channels)
        hdr = result['header']

        # Find the matching row in the cohort table
        cohort_row = cohort_table[cohort_table['Cohort'] == key]
        
        if cohort_row.empty:
            raise ValueError(f"Cohort information not found for {key}")

        # Define the sample rate and calculate sample counts
        Fs = hdr['sfreq']  # Sampling frequency
        samples_per_segment = int(segment_duration * Fs)
        samples_to_skip = int(skip_duration * Fs)
        effective_samples_per_segment = samples_per_segment - samples_to_skip
        n_channels = data.shape[1]  # Number of channels (should be 3: O1, Oz, O2)

        # Initialize segments with zeros
        EC = np.zeros((effective_samples_per_segment, n_channels))
        EO = np.zeros((effective_samples_per_segment, n_channels))
        LC = np.zeros((effective_samples_per_segment, n_channels))
        RC = np.zeros((effective_samples_per_segment, n_channels))
        DEC = np.zeros((effective_samples_per_segment, n_channels))
        NDEC = np.zeros((effective_samples_per_segment, n_channels))

        # Fill segments with data if available, skipping the first 2 seconds
        # EC segment
        segment_start = 0
        segment_end = samples_per_segment
        if data.shape[0] >= segment_end:
            EC = data[segment_start + samples_to_skip : segment_end, :]
        else:
            print(f"Not enough data for EC segment in {key}")

        # EO segment
        segment_start = samples_per_segment
        segment_end = 2 * samples_per_segment
        if data.shape[0] >= segment_end:
            EO = data[segment_start + samples_to_skip : segment_end, :]
        else:
            print(f"Not enough data for EO segment in {key}")

        # LC segment
        segment_start = 2 * samples_per_segment
        segment_end = 3 * samples_per_segment
        if data.shape[0] >= segment_end:
            LC = data[segment_start + samples_to_skip : segment_end, :]
        else:
            print(f"Not enough data for LC segment in {key}")

        # RC segment
        segment_start = 3 * samples_per_segment
        segment_end = 4 * samples_per_segment
        if data.shape[0] >= segment_end:
            RC = data[segment_start + samples_to_skip : segment_end, :]
        else:
            print(f"Not enough data for RC segment in {key}")

        # Apply conditions based on cohort table
        if cohort_row['LC'].values[0] == 'DEC':
            # Assign 'DEC' to LC and 'NDEC' to RC
            DEC = LC
            NDEC = RC
        elif cohort_row['RC'].values[0] == 'DEC':
            # Assign 'DEC' to RC and 'NDEC' to LC
            DEC = RC
            NDEC = LC
        else:
            # If neither LC nor RC is 'DEC', assign NDEC accordingly
            NDEC = LC
            # Optionally handle cases where DEC is not specified
            DEC = RC  # Or set DEC to zeros if appropriate

        # Store the segmented data and 'LinesDifference' in the results dictionary
        segmented_data[key] = {
            'header': hdr,
            'EC': EC,
            'EO': EO,
            'DEC': DEC,
            'NDEC': NDEC,
            'LinesDifference': cohort_row['LinesDifference'].values[0]
        }

    return segmented_data


In [6]:
segmented_data = segment_eeg_data_new(results)

In [7]:
import pandas as pd
import numpy as np
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

def prepare_time_series_by_section(segmented_data, cohort_table):
    """
    Prepares a DataFrame suitable for tsfresh from segmented EEG data for all sections (EC, EO, DEC, NDEC).

    Parameters
    ----------
    segmented_data : dict
        The dictionary containing segmented EEG data for each participant.
    cohort_table : pd.DataFrame
        DataFrame containing cohort information (including labels for Amblyopia/Control).

    Returns
    -------
    pd.DataFrame, pd.Series
        A DataFrame where each row represents a time-series sample with columns 'id', 'time', 'O1', 'Oz', 'O2',
        and a Series with group labels indexed by 'id'.
    """
    data_list = []
    labels_list = []

    # Loop through each participant's data
    for key, value in segmented_data.items():
        # Find the matching cohort row
        cohort_row = cohort_table[cohort_table['Cohort'] == key]
        if cohort_row.empty:
            continue

        # Assign label based on the first letter of the 'Cohort' column (Amblyopia = 1, Control = 0)
        label = 1 if key.startswith('A') else 0

        # Get channel names; default to ['O1', 'Oz', 'O2'] if not available
        channels = value.get('channels', ['O1', 'Oz', 'O2'])

        # For each section (EC, EO, DEC, NDEC)
        for section in ['EC', 'EO', 'DEC', 'NDEC']:
            section_data = value[section]  # Shape: (n_samples, n_channels)

            # Create a DataFrame for this section
            n_samples = section_data.shape[0]
            df = pd.DataFrame({
                'id': f"{key}_{section}",
                'time': np.arange(n_samples)
            })

            # Add each channel's data as a column
            for idx, channel_name in enumerate(channels):
                df[channel_name] = section_data[:, idx]

            # Append to data list
            data_list.append(df)

            # Append label for this 'id' (participant_section)
            labels_list.append({'id': f"{key}_{section}", 'label': label})

    # Concatenate all data into a single DataFrame
    time_series_df = pd.concat(data_list, ignore_index=True)

    # Create a labels DataFrame and convert to a Series indexed by 'id'
    labels_df = pd.DataFrame(labels_list).drop_duplicates(subset='id')
    labels_series = labels_df.set_index('id')['label']

    # Return the time-series data and corresponding labels
    return time_series_df, labels_series

# Load your cohort table (must include 'Cohort' column)
cohort_table = pd.read_excel('Cohort.xlsx')

# Prepare the time series DataFrame and labels
time_series_df, labels = prepare_time_series_by_section(segmented_data, cohort_table)


In [8]:
# Assuming your dataframe is named 'df'
time_series_df_0 = time_series_df[(time_series_df['time'] >= 0) & (time_series_df['time'] <= 1637)]
time_series_df_1 = time_series_df[(time_series_df['time'] >= 1638) & (time_series_df['time'] <= 3276)]
time_series_df_2 = time_series_df[(time_series_df['time'] >= 3276) & (time_series_df['time'] <= 4914)]
time_series_df_3 = time_series_df[(time_series_df['time'] >= 4914) & (time_series_df['time'] <= 6552)]
time_series_df_4 = time_series_df[(time_series_df['time'] >= 6552) & (time_series_df['time'] <= 8190)]
time_series_df_5 = time_series_df[(time_series_df['time'] >= 8190) & (time_series_df['time'] <= 9828)]
time_series_df_6 = time_series_df[(time_series_df['time'] >= 9828) & (time_series_df['time'] <= 11466)]
time_series_df_7 = time_series_df[(time_series_df['time'] >= 11466) & (time_series_df['time'] <= 13104)]
time_series_df_8 = time_series_df[(time_series_df['time'] >= 13104) & (time_series_df['time'] <= 14742)]
time_series_df_9 = time_series_df[(time_series_df['time'] >= 14742) & (time_series_df['time'] <= 16380)]

In [9]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
import gc
from tsfresh import extract_features
from tsfresh.feature_extraction import ComprehensiveFCParameters, EfficientFCParameters
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import settings
import pickle

# Define the function to process data in chunks using ComprehensiveFCParameters
def process_in_chunks(time_series_df, N):
    # Get unique IDs
    unique_ids = time_series_df['id'].unique()
    
    # Split the unique IDs into chunks of size N
    chunks = [unique_ids[i:i + N] for i in range(0, len(unique_ids), N)]
    
    # Initialize an empty list to store the results
    results = []
    
    # Process each chunk
    for chunk in chunks:
        # Filter the DataFrame to include only the IDs in the current chunk
        chunk_df = time_series_df[time_series_df['id'].isin(chunk)]
        
        # Extract features for the current chunk using ComprehensiveFCParameters
        extracted_features_chunk = extract_features(
            chunk_df,
            column_id='id',
            column_sort='time',
            default_fc_parameters=ComprehensiveFCParameters(),  # Use ComprehensiveFCParameters() for more features
            n_jobs=4,  # Adjust based on your CPU cores
            # Since data is in wide format, we do not need to specify column_kind and column_value
        )
        
        # Impute missing values in the extracted features
        impute(extracted_features_chunk)
        
        # Append the extracted features to the results list
        results.append(extracted_features_chunk)
        
        # Clear memory
        del chunk_df, extracted_features_chunk
        gc.collect()
    
    # Concatenate all the results into a single DataFrame
    final_result = pd.concat(results)
    
    return final_result


In [55]:
# 1. Put all DataFrames into a List
time_series_dfs = [
    time_series_df_0,
    time_series_df_1,
    time_series_df_2,
    time_series_df_3,
    time_series_df_4,
    time_series_df_5,
    time_series_df_6,
    time_series_df_7,
    time_series_df_8,
    time_series_df_9
]

# Set the chunk size N (adjust based on your memory constraints)
N = 10  # Smaller chunk size to manage memory usage

# Initialize a set to store all selected feature names
all_selected_feature_names = set()

# Loop over each DataFrame and extract features
for idx, ts_df in enumerate(time_series_dfs):
    print(f"Processing DataFrame {idx}...")
    
    # 2. Extract features using the process_in_chunks function
    extracted_features = process_in_chunks(ts_df, N)
    
    # Clean the extracted features
    extracted_features_clean = extracted_features.replace([np.inf, -np.inf], np.nan).dropna(axis=1)
    
    # Ensure that the labels are aligned with the extracted features
    # Assuming 'labels' is a Series with 'id' as the index
    labels_aligned = labels.loc[extracted_features_clean.index]
    
    # Check if we have enough data to split
    if len(extracted_features_clean) < 2:
        print(f"Not enough data in DataFrame {idx} for training and testing. Skipping...")
        continue
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        extracted_features_clean,
        labels_aligned,
        test_size=0.2,
        random_state=42,
        stratify=labels_aligned  # Ensure stratified sampling
    )
    
    # Perform feature selection
    selector = SelectKBest(f_classif, k=10)  # Adjust 'k' as needed
    X_train_selected = selector.fit_transform(X_train, y_train)
    
    # Get the names of the selected features
    selected_feature_names = extracted_features_clean.columns[selector.get_support()]
    
    # Add the selected features to the set
    all_selected_feature_names.update(selected_feature_names)
    
    # Optionally, you can proceed to train models on the selected features here
    # For brevity, this step is omitted

# 3. Combine All Selected Features
print(f"Total unique selected features from all DataFrames: {len(all_selected_feature_names)}")
print(all_selected_feature_names)


Processing DataFrame 0...


Feature Extraction: 100%|██████████| 15/15 [00:30<00:00,  2.01s/it]
 'Oz__autocorrelation__lag_2' 'Oz__autocorrelation__lag_3'
 'Oz__autocorrelation__lag_4' 'Oz__autocorrelation__lag_5'
 'Oz__autocorrelation__lag_6' 'Oz__autocorrelation__lag_7'
 'Oz__autocorrelation__lag_8' 'Oz__autocorrelation__lag_9'
 'Oz__query_similarity_count__query_None__threshold_0.0'
 'O2__autocorrelation__lag_0' 'O2__autocorrelation__lag_1'
 'O2__autocorrelation__lag_2' 'O2__autocorrelation__lag_3'
 'O2__autocorrelation__lag_4' 'O2__autocorrelation__lag_5'
 'O2__autocorrelation__lag_6' 'O2__autocorrelation__lag_7'
 'O2__autocorrelation__lag_8' 'O2__autocorrelation__lag_9'
 'O2__query_similarity_count__query_None__threshold_0.0'
 'O1__autocorrelation__lag_0' 'O1__autocorrelation__lag_1'
 'O1__autocorrelation__lag_2' 'O1__autocorrelation__lag_3'
 'O1__autocorrelation__lag_4' 'O1__autocorrelation__lag_5'
 'O1__autocorrelation__lag_6' 'O1__autocorrelation__lag_7'
 'O1__autocorrelation__lag_8' 'O1__autocorrelation_

Processing DataFrame 1...


Feature Extraction: 100%|██████████| 15/15 [00:26<00:00,  1.80s/it]
 'Oz__autocorrelation__lag_2' 'Oz__autocorrelation__lag_3'
 'Oz__autocorrelation__lag_4' 'Oz__autocorrelation__lag_5'
 'Oz__autocorrelation__lag_6' 'Oz__autocorrelation__lag_7'
 'Oz__autocorrelation__lag_8' 'Oz__autocorrelation__lag_9'
 'Oz__query_similarity_count__query_None__threshold_0.0'
 'O2__autocorrelation__lag_0' 'O2__autocorrelation__lag_1'
 'O2__autocorrelation__lag_2' 'O2__autocorrelation__lag_3'
 'O2__autocorrelation__lag_4' 'O2__autocorrelation__lag_5'
 'O2__autocorrelation__lag_6' 'O2__autocorrelation__lag_7'
 'O2__autocorrelation__lag_8' 'O2__autocorrelation__lag_9'
 'O2__query_similarity_count__query_None__threshold_0.0'
 'O1__autocorrelation__lag_0' 'O1__autocorrelation__lag_1'
 'O1__autocorrelation__lag_2' 'O1__autocorrelation__lag_3'
 'O1__autocorrelation__lag_4' 'O1__autocorrelation__lag_5'
 'O1__autocorrelation__lag_6' 'O1__autocorrelation__lag_7'
 'O1__autocorrelation__lag_8' 'O1__autocorrelation_

Processing DataFrame 2...


Feature Extraction: 100%|██████████| 15/15 [00:32<00:00,  2.19s/it]
 'O1__autocorrelation__lag_2' 'O1__autocorrelation__lag_3'
 'O1__autocorrelation__lag_4' 'O1__autocorrelation__lag_5'
 'O1__autocorrelation__lag_6' 'O1__autocorrelation__lag_7'
 'O1__autocorrelation__lag_8' 'O1__autocorrelation__lag_9'
 'O1__query_similarity_count__query_None__threshold_0.0'
 'Oz__autocorrelation__lag_0' 'Oz__autocorrelation__lag_1'
 'Oz__autocorrelation__lag_2' 'Oz__autocorrelation__lag_3'
 'Oz__autocorrelation__lag_4' 'Oz__autocorrelation__lag_5'
 'Oz__autocorrelation__lag_6' 'Oz__autocorrelation__lag_7'
 'Oz__autocorrelation__lag_8' 'Oz__autocorrelation__lag_9'
 'Oz__query_similarity_count__query_None__threshold_0.0'
 'O2__autocorrelation__lag_0' 'O2__autocorrelation__lag_1'
 'O2__autocorrelation__lag_2' 'O2__autocorrelation__lag_3'
 'O2__autocorrelation__lag_4' 'O2__autocorrelation__lag_5'
 'O2__autocorrelation__lag_6' 'O2__autocorrelation__lag_7'
 'O2__autocorrelation__lag_8' 'O2__autocorrelation_

Processing DataFrame 3...


Feature Extraction: 100%|██████████| 15/15 [00:29<00:00,  1.97s/it]
 'O1__autocorrelation__lag_2' 'O1__autocorrelation__lag_3'
 'O1__autocorrelation__lag_4' 'O1__autocorrelation__lag_5'
 'O1__autocorrelation__lag_6' 'O1__autocorrelation__lag_7'
 'O1__autocorrelation__lag_8' 'O1__autocorrelation__lag_9'
 'O1__query_similarity_count__query_None__threshold_0.0'
 'Oz__autocorrelation__lag_0' 'Oz__autocorrelation__lag_1'
 'Oz__autocorrelation__lag_2' 'Oz__autocorrelation__lag_3'
 'Oz__autocorrelation__lag_4' 'Oz__autocorrelation__lag_5'
 'Oz__autocorrelation__lag_6' 'Oz__autocorrelation__lag_7'
 'Oz__autocorrelation__lag_8' 'Oz__autocorrelation__lag_9'
 'Oz__query_similarity_count__query_None__threshold_0.0'
 'O2__autocorrelation__lag_0' 'O2__autocorrelation__lag_1'
 'O2__autocorrelation__lag_2' 'O2__autocorrelation__lag_3'
 'O2__autocorrelation__lag_4' 'O2__autocorrelation__lag_5'
 'O2__autocorrelation__lag_6' 'O2__autocorrelation__lag_7'
 'O2__autocorrelation__lag_8' 'O2__autocorrelation_

Processing DataFrame 4...


Feature Extraction: 100%|██████████| 15/15 [00:27<00:00,  1.81s/it]
 'O2__autocorrelation__lag_2' 'O2__autocorrelation__lag_3'
 'O2__autocorrelation__lag_4' 'O2__autocorrelation__lag_5'
 'O2__autocorrelation__lag_6' 'O2__autocorrelation__lag_7'
 'O2__autocorrelation__lag_8' 'O2__autocorrelation__lag_9'
 'O2__query_similarity_count__query_None__threshold_0.0'
 'O1__autocorrelation__lag_0' 'O1__autocorrelation__lag_1'
 'O1__autocorrelation__lag_2' 'O1__autocorrelation__lag_3'
 'O1__autocorrelation__lag_4' 'O1__autocorrelation__lag_5'
 'O1__autocorrelation__lag_6' 'O1__autocorrelation__lag_7'
 'O1__autocorrelation__lag_8' 'O1__autocorrelation__lag_9'
 'O1__query_similarity_count__query_None__threshold_0.0'
 'Oz__autocorrelation__lag_0' 'Oz__autocorrelation__lag_1'
 'Oz__autocorrelation__lag_2' 'Oz__autocorrelation__lag_3'
 'Oz__autocorrelation__lag_4' 'Oz__autocorrelation__lag_5'
 'Oz__autocorrelation__lag_6' 'Oz__autocorrelation__lag_7'
 'Oz__autocorrelation__lag_8' 'Oz__autocorrelation_

Processing DataFrame 5...


Feature Extraction: 100%|██████████| 15/15 [00:28<00:00,  1.89s/it]
 'O2__autocorrelation__lag_2' 'O2__autocorrelation__lag_3'
 'O2__autocorrelation__lag_4' 'O2__autocorrelation__lag_5'
 'O2__autocorrelation__lag_6' 'O2__autocorrelation__lag_7'
 'O2__autocorrelation__lag_8' 'O2__autocorrelation__lag_9'
 'O2__query_similarity_count__query_None__threshold_0.0'
 'O1__autocorrelation__lag_0' 'O1__autocorrelation__lag_1'
 'O1__autocorrelation__lag_2' 'O1__autocorrelation__lag_3'
 'O1__autocorrelation__lag_4' 'O1__autocorrelation__lag_5'
 'O1__autocorrelation__lag_6' 'O1__autocorrelation__lag_7'
 'O1__autocorrelation__lag_8' 'O1__autocorrelation__lag_9'
 'O1__query_similarity_count__query_None__threshold_0.0'
 'Oz__autocorrelation__lag_0' 'Oz__autocorrelation__lag_1'
 'Oz__autocorrelation__lag_2' 'Oz__autocorrelation__lag_3'
 'Oz__autocorrelation__lag_4' 'Oz__autocorrelation__lag_5'
 'Oz__autocorrelation__lag_6' 'Oz__autocorrelation__lag_7'
 'Oz__autocorrelation__lag_8' 'Oz__autocorrelation_

Processing DataFrame 6...


Feature Extraction: 100%|██████████| 15/15 [00:32<00:00,  2.18s/it]
 'O2__autocorrelation__lag_2' 'O2__autocorrelation__lag_3'
 'O2__autocorrelation__lag_4' 'O2__autocorrelation__lag_5'
 'O2__autocorrelation__lag_6' 'O2__autocorrelation__lag_7'
 'O2__autocorrelation__lag_8' 'O2__autocorrelation__lag_9'
 'O2__query_similarity_count__query_None__threshold_0.0'
 'O1__autocorrelation__lag_0' 'O1__autocorrelation__lag_1'
 'O1__autocorrelation__lag_2' 'O1__autocorrelation__lag_3'
 'O1__autocorrelation__lag_4' 'O1__autocorrelation__lag_5'
 'O1__autocorrelation__lag_6' 'O1__autocorrelation__lag_7'
 'O1__autocorrelation__lag_8' 'O1__autocorrelation__lag_9'
 'O1__query_similarity_count__query_None__threshold_0.0'
 'Oz__autocorrelation__lag_0' 'Oz__autocorrelation__lag_1'
 'Oz__autocorrelation__lag_2' 'Oz__autocorrelation__lag_3'
 'Oz__autocorrelation__lag_4' 'Oz__autocorrelation__lag_5'
 'Oz__autocorrelation__lag_6' 'Oz__autocorrelation__lag_7'
 'Oz__autocorrelation__lag_8' 'Oz__autocorrelation_

Processing DataFrame 7...


Feature Extraction: 100%|██████████| 15/15 [00:47<00:00,  3.14s/it]
 'O1__autocorrelation__lag_2' 'O1__autocorrelation__lag_3'
 'O1__autocorrelation__lag_4' 'O1__autocorrelation__lag_5'
 'O1__autocorrelation__lag_6' 'O1__autocorrelation__lag_7'
 'O1__autocorrelation__lag_8' 'O1__autocorrelation__lag_9'
 'O1__query_similarity_count__query_None__threshold_0.0'
 'Oz__autocorrelation__lag_0' 'Oz__autocorrelation__lag_1'
 'Oz__autocorrelation__lag_2' 'Oz__autocorrelation__lag_3'
 'Oz__autocorrelation__lag_4' 'Oz__autocorrelation__lag_5'
 'Oz__autocorrelation__lag_6' 'Oz__autocorrelation__lag_7'
 'Oz__autocorrelation__lag_8' 'Oz__autocorrelation__lag_9'
 'Oz__query_similarity_count__query_None__threshold_0.0'
 'O2__autocorrelation__lag_0' 'O2__autocorrelation__lag_1'
 'O2__autocorrelation__lag_2' 'O2__autocorrelation__lag_3'
 'O2__autocorrelation__lag_4' 'O2__autocorrelation__lag_5'
 'O2__autocorrelation__lag_6' 'O2__autocorrelation__lag_7'
 'O2__autocorrelation__lag_8' 'O2__autocorrelation_

Processing DataFrame 8...


Feature Extraction: 100%|██████████| 15/15 [00:36<00:00,  2.46s/it]
 'O1__autocorrelation__lag_2' 'O1__autocorrelation__lag_3'
 'O1__autocorrelation__lag_4' 'O1__autocorrelation__lag_5'
 'O1__autocorrelation__lag_6' 'O1__autocorrelation__lag_7'
 'O1__autocorrelation__lag_8' 'O1__autocorrelation__lag_9'
 'O1__query_similarity_count__query_None__threshold_0.0'
 'Oz__autocorrelation__lag_0' 'Oz__autocorrelation__lag_1'
 'Oz__autocorrelation__lag_2' 'Oz__autocorrelation__lag_3'
 'Oz__autocorrelation__lag_4' 'Oz__autocorrelation__lag_5'
 'Oz__autocorrelation__lag_6' 'Oz__autocorrelation__lag_7'
 'Oz__autocorrelation__lag_8' 'Oz__autocorrelation__lag_9'
 'Oz__query_similarity_count__query_None__threshold_0.0'
 'O2__autocorrelation__lag_0' 'O2__autocorrelation__lag_1'
 'O2__autocorrelation__lag_2' 'O2__autocorrelation__lag_3'
 'O2__autocorrelation__lag_4' 'O2__autocorrelation__lag_5'
 'O2__autocorrelation__lag_6' 'O2__autocorrelation__lag_7'
 'O2__autocorrelation__lag_8' 'O2__autocorrelation_

Processing DataFrame 9...


Feature Extraction: 100%|██████████| 15/15 [00:31<00:00,  2.12s/it]
 'O2__autocorrelation__lag_2' 'O2__autocorrelation__lag_3'
 'O2__autocorrelation__lag_4' 'O2__autocorrelation__lag_5'
 'O2__autocorrelation__lag_6' 'O2__autocorrelation__lag_7'
 'O2__autocorrelation__lag_8' 'O2__autocorrelation__lag_9'
 'O2__query_similarity_count__query_None__threshold_0.0'
 'O1__autocorrelation__lag_0' 'O1__autocorrelation__lag_1'
 'O1__autocorrelation__lag_2' 'O1__autocorrelation__lag_3'
 'O1__autocorrelation__lag_4' 'O1__autocorrelation__lag_5'
 'O1__autocorrelation__lag_6' 'O1__autocorrelation__lag_7'
 'O1__autocorrelation__lag_8' 'O1__autocorrelation__lag_9'
 'O1__query_similarity_count__query_None__threshold_0.0'
 'Oz__autocorrelation__lag_0' 'Oz__autocorrelation__lag_1'
 'Oz__autocorrelation__lag_2' 'Oz__autocorrelation__lag_3'
 'Oz__autocorrelation__lag_4' 'Oz__autocorrelation__lag_5'
 'Oz__autocorrelation__lag_6' 'Oz__autocorrelation__lag_7'
 'Oz__autocorrelation__lag_8' 'Oz__autocorrelation_

Total unique selected features from all DataFrames: 50


ValueError: columns cannot be a set

In [63]:
all_selected_feature_names

{'O1__ar_coefficient__coeff_2__k_10',
 'O1__ar_coefficient__coeff_3__k_10',
 'O1__ar_coefficient__coeff_4__k_10',
 'O1__ar_coefficient__coeff_5__k_10',
 'O1__ar_coefficient__coeff_6__k_10',
 'O1__ar_coefficient__coeff_7__k_10',
 'O1__ar_coefficient__coeff_8__k_10',
 'O1__ar_coefficient__coeff_9__k_10',
 'O1__augmented_dickey_fuller__attr_"usedlag"__autolag_"AIC"',
 'O1__fft_coefficient__attr_"angle"__coeff_40',
 'O1__fft_coefficient__attr_"angle"__coeff_41',
 'O1__fft_coefficient__attr_"angle"__coeff_57',
 'O1__fft_coefficient__attr_"angle"__coeff_6',
 'O1__fft_coefficient__attr_"angle"__coeff_63',
 'O1__fft_coefficient__attr_"angle"__coeff_76',
 'O1__fft_coefficient__attr_"angle"__coeff_87',
 'O1__fft_coefficient__attr_"imag"__coeff_40',
 'O1__fft_coefficient__attr_"imag"__coeff_41',
 'O1__fft_coefficient__attr_"imag"__coeff_63',
 'O1__fft_coefficient__attr_"real"__coeff_10',
 'O1__fft_coefficient__attr_"real"__coeff_11',
 'O1__fft_coefficient__attr_"real"__coeff_3',
 'O1__fft_coeffic

In [10]:
time_series_df

Unnamed: 0,id,time,O1,Oz,O2
0,A1_EC,0,0.000010,-1.500819e-06,-0.000006
1,A1_EC,1,0.000010,-1.018319e-06,-0.000005
2,A1_EC,2,0.000010,-6.288356e-07,-0.000004
3,A1_EC,3,0.000010,-3.342138e-07,-0.000003
4,A1_EC,4,0.000010,-1.304634e-07,-0.000002
...,...,...,...,...,...
851963,C1_NDEC,16379,-0.000003,-3.512712e-06,-0.000004
851964,C1_NDEC,16380,-0.000003,-2.912297e-06,-0.000004
851965,C1_NDEC,16381,-0.000003,-2.468121e-06,-0.000003
851966,C1_NDEC,16382,-0.000003,-2.206402e-06,-0.000003


In [15]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
import numpy as np
import gc
from tsfresh import extract_features
from tsfresh.feature_extraction import EfficientFCParameters, MinimalFCParameters, ComprehesiveFCParameters
from tsfresh.utilities.dataframe_functions import impute
from sklearn.metrics import confusion_matrix

# 1. Create a Custom fc_parameters Dictionary
# Load EfficientFCParameters
efficient_fc_parameters = EfficientFCParameters()

# Create custom fc_parameters with the specified feature calculators
custom_fc_parameters = {
    'fft_coefficient': [
        {'coeff': coeff, 'attr': attr}
        for coeff in range(100)  # Adjust the range as needed
        for attr in ['real', 'imag', 'abs', 'angle']
    ],
    'ar_coefficient': [
        {'k': 10, 'coeff': coeff} for coeff in range(1, 11)
    ],
    'augmented_dickey_fuller': [
        {'attr': 'teststat'},
        {'attr': 'pvalue'},
        {'attr': 'usedlag'}
    ],
    'large_standard_deviation': [
        {'r': 0.2}
    ],
    'number_peaks': [
        {'n': n} for n in [1, 5]
    ],
    'fourier_entropy': [
        {'bins': 10}
    ],
    'ratio_beyond_r_sigma': [
        {'r': 1}
    ]
}

# Append MinimalFCParameters to custom_fc_parameters
minimal_fc_parameters = MinimalFCParameters()
custom_fc_parameters.update(minimal_fc_parameters)

# 2. Define the function to process data in chunks using the custom fc_parameters
def process_in_chunks(time_series_df, N, fc_parameters):
    # Get unique IDs
    unique_ids = time_series_df['id'].unique()
    
    # Split the unique IDs into chunks of size N
    chunks = [unique_ids[i:i + N] for i in range(0, len(unique_ids), N)]
    
    # Initialize an empty list to store the results
    results = []
    
    # Process each chunk
    for chunk in chunks:
        # Filter the DataFrame to include only the IDs in the current chunk
        chunk_df = time_series_df[time_series_df['id'].isin(chunk)]
        
        # Extract features for the current chunk using the custom fc_parameters
        extracted_features_chunk = extract_features(
            chunk_df,
            column_id='id',
            column_sort='time',
            default_fc_parameters=fc_parameters,
            n_jobs=4,
        )
        
        # Impute missing values in the extracted features
        impute(extracted_features_chunk)
        
        # Append the extracted features to the results list
        results.append(extracted_features_chunk)
        
        # Clear memory
        del chunk_df, extracted_features_chunk
        gc.collect()
    
    # Concatenate all the results into a single DataFrame
    final_result = pd.concat(results)
    
    return final_result

# 3. Set the chunk size N (adjust based on your memory constraints)
N = 1  # Adjust based on your memory constraints

# 4. Extract features using the process_in_chunks function with custom_fc_parameters
extracted_features = process_in_chunks(time_series_df, N, custom_fc_parameters)

# 5. Clean the extracted features
extracted_features_clean = extracted_features.replace([np.inf, -np.inf], np.nan).dropna(axis=1)

# 6. Ensure that the labels are aligned with the extracted features
labels_aligned = labels.loc[extracted_features_clean.index]

# 7. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    extracted_features_clean,
    labels_aligned,
    test_size=0.25,
    random_state=42,
    stratify=labels_aligned
)

# 8. Define classifiers and their parameter grids
classifiers = {
    'Random Forest': {
        'pipeline': Pipeline([
            ('selector', SelectKBest(f_classif, k=50)),
            ('classifier', RandomForestClassifier(random_state=42))
        ]),
        'param_grid': {
            'classifier__n_estimators': [100, 200, 500],
            'classifier__max_depth': [None, 10, 20, 30],
            'classifier__min_samples_split': [2, 5, 10],
        }
    },
    'Gradient Boosting': {
        'pipeline': Pipeline([
            ('selector', SelectKBest(f_classif, k=50)),
            ('classifier', GradientBoostingClassifier(random_state=42))
        ]),
        'param_grid': {
            'classifier__n_estimators': [100, 200],
            'classifier__learning_rate': [0.01, 0.1],
            'classifier__max_depth': [3, 5],
            'classifier__min_samples_split': [2, 5],
        }
    },
    'Neural Network': {
        'pipeline': Pipeline([
            ('scaler', StandardScaler()),
            ('selector', SelectKBest(f_classif, k=50)),
            ('classifier', MLPClassifier(random_state=42, max_iter=1000))
        ]),
        'param_grid': {
            'classifier__hidden_layer_sizes': [(50,), (100,), (50, 50)],
            'classifier__activation': ['tanh', 'relu'],
            'classifier__solver': ['adam', 'sgd'],
            'classifier__alpha': [0.0001, 0.001],
            'classifier__learning_rate': ['constant', 'adaptive'],
        }
    },
    'Logistic Regression': {
        'pipeline': Pipeline([
            ('scaler', StandardScaler()),
            ('selector', SelectKBest(f_classif, k=50)),
            ('classifier', LogisticRegression(random_state=42, max_iter=5000))
        ]),
        'param_grid': {
            'classifier__penalty': ['l1', 'l2'],
            'classifier__C': [0.01, 0.1, 1, 10, 100],
            'classifier__solver': ['liblinear'],
        }
    },
    'Support Vector Machine': {
        'pipeline': Pipeline([
            ('scaler', StandardScaler()),
            ('selector', SelectKBest(f_classif, k=50)),
            ('classifier', SVC(random_state=42))
        ]),
        'param_grid': [
            {
                'classifier__kernel': ['linear'],
                'classifier__C': [0.1, 1, 10, 100]
            },
            {
                'classifier__kernel': ['rbf'],
                'classifier__C': [0.1, 1, 10, 100],
                'classifier__gamma': ['scale', 'auto']
            },
            {
                'classifier__kernel': ['poly'],
                'classifier__C': [0.1, 1, 10],
                'classifier__degree': [2, 3],
                'classifier__gamma': ['scale', 'auto']
            }
        ]
    },
    'XGBoost': {
        'pipeline': Pipeline([
            ('selector', SelectKBest(f_classif, k=50)),
            ('classifier', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'))
        ]),
        'param_grid': {
            'classifier__n_estimators': [50, 100, 200],
            'classifier__max_depth': [3, 5, 7],
            'classifier__learning_rate': [0.01, 0.1, 0.2],
            'classifier__subsample': [0.5, 0.7, 1.0],
            'classifier__colsample_bytree': [0.5, 0.7, 1.0],
        }
    }
}
# 9. Create the directory 'ml_models' if it doesn't exist
output_dir = 'ml_models'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    
# 9. Loop through each classifier, perform grid search, and evaluate
for name, classifier_info in classifiers.items():
    print(f"\nTraining and evaluating {name}...")
    pipeline = classifier_info['pipeline']
    param_grid = classifier_info['param_grid']
    
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_pipeline = grid_search.best_estimator_
    y_pred = best_pipeline.predict(X_test)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_percentage = round(accuracy * 100, 2)
    
    # Print the best parameters found by GridSearchCV
    print(f"Best parameters for {name}: {grid_search.best_params_}")
    
    # Evaluate the model
    print(f"Classification Report for {name}:")
    print(classification_report(y_test, y_pred))
    
    cm = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix for {name}:")
    print(cm)
    
    # For models that provide feature importances, display them
    classifier_step = best_pipeline.named_steps['classifier']
    selector_step = best_pipeline.named_steps['selector']
    selected_feature_indices = selector_step.get_support(indices=True)
    selected_feature_names = extracted_features_clean.columns[selected_feature_indices]
    
    if hasattr(classifier_step, 'feature_importances_'):
        important_features = pd.DataFrame({
            'Feature': selected_feature_names,
            'Importance': classifier_step.feature_importances_
        }).sort_values(by='Importance', ascending=False)
        print(f"Important features for {name}:")
        print(important_features)
    elif hasattr(classifier_step, 'coef_'):
        # For linear models like SVM with linear kernel
        if hasattr(classifier_step, 'kernel') and classifier_step.kernel == 'linear':
            importance = np.abs(classifier_step.coef_[0])
            important_features = pd.DataFrame({
                'Feature': selected_feature_names,
                'Importance': importance
            }).sort_values(by='Importance', ascending=False)
            print(f"Important features for {name}:")
            print(important_features)
        elif isinstance(classifier_step, MLPClassifier):
            # For Neural Network, feature importances are not directly available
            print(f"{name} does not provide feature importances directly.")
        else:
            print(f"{name} does not provide feature importances directly.")
    else:
        print(f"{name} does not provide feature importances directly.")
    
     # Save the model to a .pkl file
    model_filename = f"{output_dir}/{name}_accuracy_{accuracy_percentage}%.pkl"
    with open(model_filename, 'wb') as file:
        pickle.dump(best_pipeline, file)
    print(f"Model saved to {model_filename}")


Feature Extraction: 100%|██████████| 3/3 [00:03<00:00,  1.21s/it]
Feature Extraction: 100%|██████████| 3/3 [00:04<00:00,  1.34s/it]
Feature Extraction: 100%|██████████| 3/3 [00:05<00:00,  1.77s/it]
Feature Extraction: 100%|██████████| 3/3 [00:04<00:00,  1.60s/it]
Feature Extraction: 100%|██████████| 3/3 [00:04<00:00,  1.47s/it]
Feature Extraction: 100%|██████████| 3/3 [00:04<00:00,  1.40s/it]
Feature Extraction: 100%|██████████| 3/3 [00:04<00:00,  1.37s/it]
Feature Extraction: 100%|██████████| 3/3 [00:04<00:00,  1.38s/it]
Feature Extraction: 100%|██████████| 3/3 [00:04<00:00,  1.48s/it]
Feature Extraction: 100%|██████████| 3/3 [00:04<00:00,  1.41s/it]
Feature Extraction: 100%|██████████| 3/3 [00:04<00:00,  1.37s/it]
Feature Extraction: 100%|██████████| 3/3 [00:04<00:00,  1.42s/it]
Feature Extraction: 100%|██████████| 3/3 [00:04<00:00,  1.41s/it]
Feature Extraction: 100%|██████████| 3/3 [00:04<00:00,  1.36s/it]
Feature Extraction: 100%|██████████| 3/3 [00:04<00:00,  1.37s/it]
Feature Ex


Training and evaluating Random Forest...


  f = msb / msw


Best parameters for Random Forest: {'classifier__max_depth': None, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.80      0.67      0.73         6
           1       0.75      0.86      0.80         7

    accuracy                           0.77        13
   macro avg       0.78      0.76      0.76        13
weighted avg       0.77      0.77      0.77        13

Confusion Matrix for Random Forest:
[[4 2]
 [1 6]]
Important features for Random Forest:
                                              Feature  Importance
4           O1__fft_coefficient__attr_"abs"__coeff_26    0.068097
26  O1__augmented_dickey_fuller__attr_"usedlag"__a...    0.050751
25                  O1__ar_coefficient__coeff_9__k_10    0.046889
8          O1__fft_coefficient__attr_"imag"__coeff_53    0.045571
13        O1__fft_coefficient__attr_"angle"__coeff_64    0.038205
39         

  f = msb / msw


Best parameters for Gradient Boosting: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}
Classification Report for Gradient Boosting:
              precision    recall  f1-score   support

           0       0.67      0.67      0.67         6
           1       0.71      0.71      0.71         7

    accuracy                           0.69        13
   macro avg       0.69      0.69      0.69        13
weighted avg       0.69      0.69      0.69        13

Confusion Matrix for Gradient Boosting:
[[4 2]
 [2 5]]
Important features for Gradient Boosting:
                                              Feature    Importance
4           O1__fft_coefficient__attr_"abs"__coeff_26  4.110902e-01
39         O2__fft_coefficient__attr_"real"__coeff_33  2.463395e-01
48        O2__fft_coefficient__attr_"angle"__coeff_92  8.115756e-02
36          O2__fft_coefficient__attr_"abs"__coeff_20  6.491618e-02
16        O1__fft_co

  f = msb / msw


Best parameters for Neural Network: {'classifier__activation': 'tanh', 'classifier__alpha': 0.0001, 'classifier__hidden_layer_sizes': (100,), 'classifier__learning_rate': 'constant', 'classifier__solver': 'sgd'}
Classification Report for Neural Network:
              precision    recall  f1-score   support

           0       0.71      0.83      0.77         6
           1       0.83      0.71      0.77         7

    accuracy                           0.77        13
   macro avg       0.77      0.77      0.77        13
weighted avg       0.78      0.77      0.77        13

Confusion Matrix for Neural Network:
[[5 1]
 [2 5]]
Neural Network does not provide feature importances directly.
Model saved to ml_models/Neural Network_accuracy_76.92%.pkl

Training and evaluating Logistic Regression...


  f = msb / msw


Best parameters for Logistic Regression: {'classifier__C': 1, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.71      0.83      0.77         6
           1       0.83      0.71      0.77         7

    accuracy                           0.77        13
   macro avg       0.77      0.77      0.77        13
weighted avg       0.78      0.77      0.77        13

Confusion Matrix for Logistic Regression:
[[5 1]
 [2 5]]
Logistic Regression does not provide feature importances directly.
Model saved to ml_models/Logistic Regression_accuracy_76.92%.pkl

Training and evaluating Support Vector Machine...


  f = msb / msw


Best parameters for Support Vector Machine: {'classifier__C': 10, 'classifier__degree': 2, 'classifier__gamma': 'scale', 'classifier__kernel': 'poly'}
Classification Report for Support Vector Machine:
              precision    recall  f1-score   support

           0       0.50      0.50      0.50         6
           1       0.57      0.57      0.57         7

    accuracy                           0.54        13
   macro avg       0.54      0.54      0.54        13
weighted avg       0.54      0.54      0.54        13

Confusion Matrix for Support Vector Machine:
[[3 3]
 [3 4]]
Support Vector Machine does not provide feature importances directly.
Model saved to ml_models/Support Vector Machine_accuracy_53.85%.pkl

Training and evaluating XGBoost...
Best parameters for XGBoost: {'classifier__colsample_bytree': 0.7, 'classifier__learning_rate': 0.2, 'classifier__max_depth': 3, 'classifier__n_estimators': 50, 'classifier__subsample': 1.0}
Classification Report for XGBoost:
            

  _data = np.array(data, dtype=dtype, copy=copy,
  f = msb / msw
Parameters: { "use_label_encoder" } are not used.

