The EEG data are in EDF (`.edf`) format and were downloaded from https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0188629. Fifteen minutes of EEG data were recorded from all subjects during an eyes-closed resting-state condition. Data were acquired at a sampling frequency of 250 Hz using the standard 10–20 EEG montage with 19 EEG channels: Fp1, Fp2, F7, F3, Fz, F4, F8, T3, C3, Cz, C4, T4, T5, P3, Pz, P4, T6, O1, O2. The reference electrode was placed at FCz.
Save the zip file to Google Drive (this notebook reads it from `/content/drive/MyDrive/Data_files/dataverse_files.zip`) before following the steps below, which pre-process all 28 files in the dataset.

The notebook follows this basic pre-processing pipeline:
1.   Loading the data
2.   Filtering
3.   Re-referencing
4.   ICA
5.   Epoching
6.   Autoreject
7.   Saving the data in drive





In [None]:
!pip install mne

Collecting mne
  Downloading mne-1.9.0-py3-none-any.whl.metadata (20 kB)
Downloading mne-1.9.0-py3-none-any.whl (7.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mne
Successfully installed mne-1.9.0


In [None]:
!pip install neurokit2

Collecting neurokit2
  Downloading neurokit2-0.2.10-py2.py3-none-any.whl.metadata (37 kB)
Downloading neurokit2-0.2.10-py2.py3-none-any.whl (693 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m693.1/693.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: neurokit2
Successfully installed neurokit2-0.2.10


In [None]:
!pip install autoreject

Collecting autoreject
  Downloading autoreject-0.4.3-py3-none-any.whl.metadata (6.3 kB)
Collecting h5io>=0.2.4 (from mne[hdf5]>=1.0->autoreject)
  Downloading h5io-0.2.4-py3-none-any.whl.metadata (3.9 kB)
Collecting pymatreader (from mne[hdf5]>=1.0->autoreject)
  Downloading pymatreader-1.0.0-py3-none-any.whl.metadata (1.5 kB)
Collecting xmltodict (from pymatreader->mne[hdf5]>=1.0->autoreject)
  Downloading xmltodict-0.14.2-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading autoreject-0.4.3-py3-none-any.whl (29 kB)
Downloading h5io-0.2.4-py3-none-any.whl (17 kB)
Downloading pymatreader-1.0.0-py3-none-any.whl (9.3 kB)
Downloading xmltodict-0.14.2-py2.py3-none-any.whl (10.0 kB)
Installing collected packages: xmltodict, pymatreader, h5io, autoreject
Successfully installed autoreject-0.4.3 h5io-0.2.4 pymatreader-1.0.0 xmltodict-0.14.2


In [None]:
from glob import glob
import mne
mne.set_log_level('error')
import numpy as np
import matplotlib.pyplot as plt
import neurokit2 as nk
import autoreject

In [None]:
import zipfile

# Extract the EDF files into the current working directory using the standard
# library. Unlike a bare `!unzip`, this silently overwrites files that already
# exist, so the cell can be re-run without stopping at an interactive prompt.
with zipfile.ZipFile('/content/drive/MyDrive/Data_files/dataverse_files.zip') as archive:
    archive.extractall()

Archive:  /content/drive/MyDrive/Data_files/dataverse_files.zip
  inflating: h01.edf                 
  inflating: h02.edf                 
  inflating: h03.edf                 
  inflating: h04.edf                 
  inflating: h05.edf                 
  inflating: h06.edf                 
  inflating: h07.edf                 
  inflating: h08.edf                 
  inflating: h09.edf                 
  inflating: h10.edf                 
  inflating: h11.edf                 
  inflating: h12.edf                 
  inflating: h13.edf                 
  inflating: h14.edf                 
  inflating: s01.edf                 
  inflating: s02.edf                 
  inflating: s03.edf                 
  inflating: s04.edf                 
  inflating: s05.edf                 
  inflating: s06.edf                 
  inflating: s07.edf                 
  inflating: s08.edf                 
  inflating: s09.edf                 
  inflating: s10.edf                 
  inflating: s11.edf    

In [None]:
# Sort the matches: glob() returns files in arbitrary, filesystem-dependent
# order, and the per-subject group ids built later in the notebook follow the
# order of this list.
all_file_path = sorted(glob('*.edf'))
print(len(all_file_path))

28


In [None]:
# Peek at the first discovered file name.
all_file_path[0]

's03.edf'

In [None]:
# The group is encoded in the first character of the file name
# ('h' = healthy control, 's' = patient). Match on the prefix rather than on
# "letter appears anywhere", so a name containing both letters cannot end up
# in both lists. glob('*.edf') yields bare file names, so the prefix is the
# first character of each entry.
healthy_file_path = [path for path in all_file_path if path.startswith('h')]
patient_file_path = [path for path in all_file_path if path.startswith('s')]
print(len(healthy_file_path), len(patient_file_path))

14 14


In [None]:
# Peek at the first healthy-control file name.
healthy_file_path[0]

'h01.edf'

In [None]:
def read_data(file_path):
  """Load one EDF recording and return its cleaned epochs as a NumPy array.

  Pipeline, in order:
    1. Load the EDF file and attach the 'standard_1020' montage.
    2. Band-pass filter between 0.5 and 45 Hz.
    3. Re-reference with the defaults of ``set_eeg_reference()``.
    4. Fit a 10-component ICA and exclude components flagged as EOG, ECG,
       EMG or saccade related.
    5. Cut the cleaned signal into 5 s fixed-length epochs with 1 s overlap.
    6. Repair / drop bad epochs with AutoReject.

  Parameters
  ----------
  file_path : str
      Path to a single ``.edf`` recording.

  Returns
  -------
  numpy.ndarray
      Result of ``Epochs.get_data()`` on the AutoReject output; indexed as
      (epoch, channel, time point).
  """
  # Loading, montage, filtering and re-referencing (all in place on `raw`).
  raw = mne.io.read_raw_edf(file_path, preload=True)
  raw.set_montage('standard_1020')
  raw.filter(l_freq=0.5, h_freq=45)
  raw.set_eeg_reference()

  # Fit ICA. No separate copy of the recording is made: the un-cleaned signal
  # is never used again inside this function, so a copy only costs memory.
  ica = mne.preprocessing.ICA(n_components=10, random_state=42)
  ica.fit(raw)

  # Identify artifact components from four detectors.
  eog_indices, _ = ica.find_bads_eog(raw, ch_name=['Fp1', 'Fp2'], threshold=2)
  ecg_indices = _find_bad_ecg_indices(ica, raw, threshold=0.5)
  emg_indices = _find_bad_emg_indices(ica, raw, threshold=0.5)
  saccade_indices = _find_saccade_indices(ica, raw, threshold=0.1)

  # De-duplicate and sort so the excluded list has a deterministic order.
  artifact_indices = sorted(
      set(eog_indices + ecg_indices + emg_indices + saccade_indices))

  # Remove the flagged components from the signal.
  ica.exclude = artifact_indices
  cleaned_data = ica.apply(raw, exclude=ica.exclude)

  # Create epochs from cleaned data.
  epochs = mne.make_fixed_length_epochs(cleaned_data, duration=5, overlap=1)
  epochs.load_data()

  # AutoReject: fit on the epochs, then transform them. The reject log is not
  # used by this pipeline, so it is not requested.
  ar = autoreject.AutoReject(n_interpolate=[1, 2, 3, 4], random_state=11,
                             n_jobs=1, verbose=False)
  ar.fit(epochs)
  epochs_ar = ar.transform(epochs)

  return epochs_ar.get_data()


def _components_above(ica, raw, target, threshold):
  """Return indices of ICA components whose absolute score against ``target`` exceeds ``threshold``."""
  scores = ica.score_sources(raw, target=target)
  return [idx for idx, score in enumerate(scores) if abs(score) > threshold]


def _find_bad_ecg_indices(ica, raw, threshold):
  """Flag ICA components that score above ``threshold`` against R-peaks detected on Fp1."""
  # Fp1 is used as a surrogate channel for cardiac activity (no ECG channel is read here).
  surrogate = raw.get_data(picks='Fp1')[0]
  peak_signals, _ = nk.ecg_peaks(surrogate, sampling_rate=raw.info['sfreq'])
  # The 'ECG_R_Peaks' column is converted to a NumPy array and used as the scoring target.
  r_peak_target = peak_signals['ECG_R_Peaks'].to_numpy()
  return _components_above(ica, raw, r_peak_target, threshold)


def _find_bad_emg_indices(ica, raw, threshold):
  """Flag ICA components that score above ``threshold`` against an EMG-style amplitude envelope of Fp1."""
  # Fp1 is used as a proxy for muscle activity.
  proxy = raw.get_data(picks='Fp1')[0]
  proxy_cleaned = nk.emg_clean(proxy, sampling_rate=raw.info['sfreq'])
  envelope = nk.emg_amplitude(proxy_cleaned)
  return _components_above(ica, raw, envelope, threshold)


def _find_saccade_indices(ica, raw, threshold):
  """Flag ICA components that score above ``threshold`` against EOG peaks found on the F7/F8 mean."""
  frontal_mean = raw.get_data(picks=['F8', 'F7']).mean(axis=0)
  peak_samples = nk.eog_findpeaks(frontal_mean, sampling_rate=raw.info['sfreq'])
  # Build an indicator signal spanning the whole recording: 1 at each detected peak, 0 elsewhere.
  indicator = np.zeros(raw.n_times)
  indicator[peak_samples] = 1
  return _components_above(ica, raw, indicator, threshold)

In [None]:
# Run the pre-processing pipeline on a single recording before processing the whole dataset.
sample_data=read_data(healthy_file_path[0])

In [None]:
sample_data.shape # (number of epochs, number of channels, number of time points in each epoch)

(231, 19, 1250)

In [None]:
%%capture
control_epochs_array = [read_data(i) for i in healthy_file_path]
patient_epochs_array = [read_data(i) for i in patient_file_path]

In [None]:
# Shape of the first control subject's epochs array.
control_epochs_array[0].shape

(231, 19, 1250)

In [None]:
# One label per epoch, grouped per subject: 0 for controls, 1 for patients.
control_epochs_labels = [[0] * len(subject_epochs) for subject_epochs in control_epochs_array]
patient_epochs_labels = [[1] * len(subject_epochs) for subject_epochs in patient_epochs_array]
print(len(control_epochs_labels), len(patient_epochs_labels))

14 14


In [None]:
# Controls first, then patients; arrays and labels are combined in the same order.
epochs_array = [*control_epochs_array, *patient_epochs_array]
epochs_labels = [*control_epochs_labels, *patient_epochs_labels]
print(len(epochs_array), len(epochs_labels))

28 28


In [None]:
# Per-subject group ids: every epoch of subject k is tagged with k (its position in epochs_array).
group_list = [
    [subject_idx] * len(subject_epochs)
    for subject_idx, subject_epochs in enumerate(epochs_array)
]
len(group_list)

28

In [None]:
# Flatten the per-subject lists into single arrays aligned along the epoch axis.
data_array = np.concatenate(epochs_array, axis=0)
label_array = np.concatenate(epochs_labels)
group_array = np.concatenate(group_list)
print(data_array.shape, label_array.shape, group_array.shape)

(6843, 19, 1250) (6843,) (6843,)


In [None]:
# Persist the epoch data, labels and group ids to Google Drive as .npy files.
arrays_by_destination = {
    '/content/drive/My Drive/data_array.npy': data_array,
    '/content/drive/My Drive/label_array.npy': label_array,
    '/content/drive/My Drive/group_array.npy': group_array,
}
for destination, array_to_save in arrays_by_destination.items():
    np.save(destination, array_to_save)