In [1]:
#!pip install mne scipy

In [2]:
#!pip install pandas numpy openpyxl

In [3]:
#!pip install tsfresh

In [4]:
#!pip install PyWavelets

In [7]:
import os
import numpy as np
import scipy.signal as signal
import mne

def process_all_eeg_data() -> dict:
    """
    Process all .bdf EEG files in the current directory, applying filters and extracting relevant data.
    
    Returns
    -------
    dict
        A dictionary containing processed EEG data and header information for each file.
    """
    # Get a list of all .bdf files in the current directory
    files = [f for f in os.listdir('.') if f.endswith('.bdf')]
    if not files:
        raise FileNotFoundError("No BDF files found in the current directory")
    
    # Initialize the results dictionary
    results = {}
    
    # Loop over each file
    for filename in files:
        full_file_path = os.path.join(os.getcwd(), filename)
        
        # Read the raw EEG data using MNE
        raw = mne.io.read_raw_bdf(full_file_path, preload=True)
        hdr = raw.info
        
        # Select data from the occipital channel 'Oz' (assuming 'A16' is the label for Oz)
        channel_select = 'A16'
        if channel_select not in hdr['ch_names']:
            raise ValueError(f"Selected channel {channel_select} not found in the data")
        
        channel_index = hdr['ch_names'].index(channel_select)
        EEG_Oz1 = raw.get_data(picks=[channel_index]).T
        
        # Filter EEG Data
        Fs = hdr['sfreq']  # Sampling frequency
        
        # Bandpass filter parameters (2 to 80 Hz)
        Fc_BP = [2, 80]  # Bandpass frequency range
        Wn_BP = [f / (Fs / 2) for f in Fc_BP]  # Normalize by Nyquist frequency
        
        # Create and apply bandpass filter (6th order zero-phase Butterworth IIR)
        B_BP, A_BP = signal.butter(3, Wn_BP, btype='bandpass')
        EEG_Oz_filtered_BP = signal.filtfilt(B_BP, A_BP, EEG_Oz1, axis=0)
        
        # Band stop filter parameters (48 to 52 Hz)
        Fc_BS = [48, 52]  # Band stop frequency range
        Wn_BS = [f / (Fs / 2) for f in Fc_BS]  # Normalize by Nyquist frequency
        
        # Create and apply band stop filter (6th order zero-phase Butterworth IIR)
        B_BS, A_BS = signal.butter(3, Wn_BS, btype='bandstop')
        EEG_Oz_filtered = signal.filtfilt(B_BS, A_BS, EEG_Oz_filtered_BP, axis=0)
        
        # Extract prefix before underscore from the filename
        underscore_index = filename.find('_')
        if underscore_index == -1:
            raise ValueError(f"Filename format error, no underscore found in {filename}")
        key = filename[:underscore_index]
        
        # Store results in the dictionary
        results[key] = {
            'data': EEG_Oz_filtered,
            'header': hdr
        }
        
        # Display a message indicating successful processing
        print(f"Data for file {filename} processed successfully")
    
    return results


In [8]:
results = process_all_eeg_data()

Extracting EDF parameters from c:\Users\WERPELGA\OneDrive - Danone\Desktop\UoA\2024.1&2\Python Gabe\A1_Full_Block.bdf...
BDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 739327  =      0.000 ...   361.000 secs...
Data for file A1_Full_Block.bdf processed successfully
Extracting EDF parameters from c:\Users\WERPELGA\OneDrive - Danone\Desktop\UoA\2024.1&2\Python Gabe\A3_Full_Block.bdf...
BDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 757759  =      0.000 ...   370.000 secs...
Data for file A3_Full_Block.bdf processed successfully
Extracting EDF parameters from c:\Users\WERPELGA\OneDrive - Danone\Desktop\UoA\2024.1&2\Python Gabe\A4_Full_Block.bdf...
BDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 782335  =      0.000 ...   382.000 secs...
Data for file A4_Full_Block.bdf processed successfully
Extracting EDF parameters from c:\Users\WERPELGA\One

In [9]:
import numpy as np
import pandas as pd

def segment_eeg_data_new(results: dict, cohort_file: str = 'Cohort.xlsx') -> dict:
    """
    Segments EEG data into predefined sections (EC, EO, LC, RC, DEC, NDEC) based on cohort information.
    
    Parameters
    ----------
    results : dict
        Dictionary containing the raw EEG data and header information for each key (participant).
    cohort_file : str, optional
        Path to the Excel file containing cohort information (default is 'Cohort.xlsx').

    Returns
    -------
    dict
        Dictionary containing segmented EEG data for each participant.
    """
    # Read the cohort information from an Excel file
    cohort_table = pd.read_excel(cohort_file)
    # Segment Duration (in seconds)
    segment_duration = 10  # seconds

    # Initialize the segmented results dictionary
    segmented_data = {}

    # Iterate through each key in the results dictionary
    for key, result in results.items():
        data = result['data']
        hdr = result['header']

        # Find the matching row in the cohort table
        cohort_row = cohort_table[cohort_table['Cohort'] == key]
        
        if cohort_row.empty:
            raise ValueError(f"Cohort information not found for {key}")

        # Define the duration and sample rate
        samples_per_segment = int(segment_duration * hdr['sfreq'])

        # Initialize segments with zeros
        EC = np.zeros(samples_per_segment)
        EO = np.zeros(samples_per_segment)
        LC = np.zeros(samples_per_segment)
        RC = np.zeros(samples_per_segment)
        DEC = np.zeros(samples_per_segment)
        NDEC = np.zeros(samples_per_segment)

        # Fill segments with data if available
        if len(data) >= samples_per_segment:
            EC = data[:samples_per_segment]
        if len(data) >= 2 * samples_per_segment:
            EO = data[samples_per_segment:2 * samples_per_segment]
        if len(data) >= 3 * samples_per_segment:
            LC = data[2 * samples_per_segment:3 * samples_per_segment]
        if len(data) >= 4 * samples_per_segment:
            RC = data[3 * samples_per_segment:4 * samples_per_segment]

        # Apply conditions based on cohort table
        if cohort_row['LC'].values[0] == 'DEC':
            # Swap 'LC' with 'DEC' condition if applicable
            DEC = LC
            NDEC = RC
        if cohort_row['RC'].values[0] == 'DEC':
            # Apply 'DEC' condition to 'RC'
            DEC = RC
            NDEC = LC

        # Store the segmented data and 'Lines Differences' in the results dictionary
        segmented_data[key] = {
            'header': hdr,
            'EC': EC,
            'EO': EO,
            'DEC': DEC,
            'NDEC': NDEC,
            'LinesDifference': cohort_row['LinesDifference'].values[0]
        }

    return segmented_data


In [10]:
segmented_data = segment_eeg_data_new(results)

In [11]:
import pandas as pd
import numpy as np
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.feature_extraction import MinimalFCParameters, ComprehensiveFCParameters
from scipy.stats import ttest_ind
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

def prepare_time_series_by_section(segmented_data, cohort_table):
    """
    Prepares a DataFrame suitable for tsfresh from segmented EEG data for all sections (EC, EO, DEC, NDEC).
    
    Parameters
    ----------
    segmented_data : dict
        The dictionary containing segmented EEG data for each participant.
    cohort_table : pd.DataFrame
        DataFrame containing cohort information (including labels for Amblyopia/Control).
    
    Returns
    -------
    pd.DataFrame, pd.Series
        A DataFrame where each row represents a time-series sample, and a Series with group labels.
    """
    data_list = []
    labels = []

    # Loop through each participant's data
    for key, value in segmented_data.items():
        # Find the matching cohort row
        cohort_row = cohort_table[cohort_table['Cohort'] == key]
        if cohort_row.empty:
            continue

        # Assign label based on the first letter of the 'Cohort' column (Amblyopia = 1, Control = 0)
        label = 1 if key.startswith('A') else 0

        # For each section (EC, EO, DEC, NDEC)
        for section in ['EC', 'EO', 'DEC', 'NDEC']:
            section_data = np.squeeze(value[section])  # Ensure the data is 1-dimensional

            # Create a DataFrame for each section
            df = pd.DataFrame({
                'id': [f"{key}_{section}"] * len(section_data),  # Unique ID for participant and section
                'time': np.arange(len(section_data)),  # Time step (sample number)
                'value': section_data  # The EEG data
            })

            # Append to list
            data_list.append(df)
            labels.append(label)

    # Concatenate all data into a single DataFrame
    time_series_df = pd.concat(data_list, ignore_index=True)

    # Return the time-series data and corresponding labels
    return time_series_df, pd.Series(labels)

# Load your cohort table (must include 'Cohort' column)
cohort_table = pd.read_excel('Cohort.xlsx')

# Prepare the time series DataFrame and labels
time_series_df, labels = prepare_time_series_by_section(segmented_data, cohort_table)


In [12]:
time_series_df

Unnamed: 0,id,time,value
0,A1_EC,0,0.000016
1,A1_EC,1,0.000015
2,A1_EC,2,0.000013
3,A1_EC,3,0.000011
4,A1_EC,4,0.000010
...,...,...,...
1064955,C1_NDEC,20475,-0.000004
1064956,C1_NDEC,20476,-0.000003
1064957,C1_NDEC,20477,-0.000002
1064958,C1_NDEC,20478,-0.000002


In [13]:
import pandas as pd

# Save time_series_df as CSV
time_series_df.to_csv('time_series_df.csv', index=False)

# Save labels as CSV
labels.to_csv('labels.csv', index=False, header=True)

# Optionally, save labels as Pickle (preserves Python object types)
# labels.to_pickle('labels.pkl')


In [14]:
import pandas as pd

# Read time_series_df from CSV
time_series_df = pd.read_csv('time_series_df.csv')

# Read labels from CSV
labels = pd.read_csv('labels.csv', squeeze=True)  # Use squeeze=True to load it as a Series if it's a single column

# Optionally, read labels from Pickle (preserves Python object types)
# labels = pd.read_pickle('labels.pkl')

TypeError: read_csv() got an unexpected keyword argument 'squeeze'

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFCParameters

# Extract features using tsfresh with minimal settings
extracted_features = extract_features(time_series_df, column_id='id', column_sort='time',
                                      default_fc_parameters=MinimalFCParameters())

# Drop any columns with NaN or infinite values
extracted_features_clean = extracted_features.replace([np.inf, -np.inf], np.nan).dropna(axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(extracted_features_clean, labels, test_size=0.3, random_state=42)

# Select the most important features using ANOVA F-test
selector = SelectKBest(f_classif, k=10)  # Adjust 'k' to select the top k important features
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Train a Random Forest Classifier to identify the most important features
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Define parameter grid for GridSearchCV (for Random Forest)
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}

# Perform grid search to find the best parameters for Random Forest
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_selected, y_train)

# Print the best parameters found by GridSearchCV
print(f"Best parameters: {grid_search.best_params_}")

# Use the best estimator from GridSearchCV to predict and evaluate the model
best_clf = grid_search.best_estimator_
y_pred = best_clf.predict(X_test_selected)

# Evaluate the model
print(classification_report(y_test, y_pred))

# Identify and display the top selected features with importance
selected_feature_names = extracted_features_clean.columns[selector.get_support()]
important_features = pd.DataFrame({
    'Feature': selected_feature_names,
    'Importance': best_clf.feature_importances_
}).sort_values(by='Importance', ascending=False)

print(important_features)


Feature Extraction: 100%|██████████| 18/18 [00:03<00:00,  4.80it/s]
  f = msb / msw


Best parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}
              precision    recall  f1-score   support

           0       0.12      0.17      0.14         6
           1       0.38      0.30      0.33        10

    accuracy                           0.25        16
   macro avg       0.25      0.23      0.24        16
weighted avg       0.28      0.25      0.26        16

                     Feature  Importance
8    value__absolute_maximum    0.214460
7             value__maximum    0.168308
9             value__minimum    0.159271
4  value__standard_deviation    0.146875
6    value__root_mean_square    0.134352
0          value__sum_values    0.128472
1              value__median    0.048262
2                value__mean    0.000000
3              value__length    0.000000
5            value__variance    0.000000


1. Classifier Performance:
The Random Forest model's classification performance is summarized in the precision, recall, f1-score, and support columns for both classes:

- Class 0 (Control): The model predicted this class with a precision of 0.12, a recall of 0.17, and an F1-score of 0.14.
- Class 1 (Amblyopia): The model predicted this class with a precision of 0.38, a recall of 0.30, and an F1-score of 0.33.
- Overall Accuracy: The model's overall accuracy is 0.25 (25%), which indicates that the model didn't perform well in distinguishing between Amblyopia and Control participants.

Key Metrics:

Precision: Measures how many of the predicted positive results are true positives.
- For class 0 (Control), only 12% of the instances predicted as Control were correct.
- For class 1 (Amblyopia), 38% of the instances predicted as Amblyopia were correct.

Recall: Measures how many actual positive instances were correctly predicted.
- For class 0, 17% of the actual Control instances were correctly identified.
- For class 1, 30% of the actual Amblyopia instances were correctly identified.

F1-Score: A harmonic mean of precision and recall, giving a better sense of the balance between the two metrics. In both classes, the F1-scores are relatively low, especially for Control.

This low performance (accuracy 25%) suggests that the model struggled to differentiate between the two groups based on the extracted features. This could be due to various reasons, such as insufficient or irrelevant features, a small dataset, or an imbalance between the groups.

2. Feature Importance:
This table ranks the extracted features by their importance, as determined by the Random Forest classifier:

Feature	Importance
- 8    value__absolute_maximum    0.214460
- 7             value__maximum    0.168308
- 9             value__minimum    0.159271
- 4  value__standard_deviation    0.146875
- 6    value__root_mean_square    0.134352
- 0          value__sum_values    0.128472
- 1              value__median    0.048262
- 2                value__mean    0.000000
- 3              value__length    0.000000
- 5            value__variance    0.000000

Most Important Features:

- value__absolute_maximum: This feature, which represents the absolute maximum value in the time series, was the most important in distinguishing between Amblyopia and Control participants (importance = 0.214460).

Observations:

- Feature Importance Distribution: The features' importance values are skewed, with the top 6 features contributing most to the model's predictions, while others (such as mean, length, and variance) contributed very little or nothing.

Interpretation:
- Low Accuracy: The classifier struggled to differentiate between the Amblyopia and Control groups, possibly because the extracted features don't capture the necessary information to distinguish between these groups, or the dataset might be too small or imbalanced.
- Feature Importance: Features like absolute_maximum, maximum, and minimum seem to provide the most information for distinguishing between Amblyopia and Control participants. These might represent extreme fluctuations or peak characteristics in the EEG signals.

Next Steps:
- Feature Engineering: Consider extracting additional or more complex features using tsfresh or other methods to capture more meaningful aspects of the EEG data.
- Balanced Dataset: Ensure that the dataset is balanced between Amblyopia and Control groups to avoid skewed performance metrics.
- Cross-Validation: Use cross-validation to get a more robust estimate of model performance.
- Other Models: Try different classifiers (e.g., Support Vector Machines or Gradient Boosting) to see if they perform better.
- Hyperparameter Tuning: Tuning the Random Forest classifier (e.g., adjusting the number of trees or maximum depth) could potentially improve the model's accuracy.
Let me know if you need further insights or improvements!