# **0. Download Dataset**

[dataset : doi:10.18150/repod.0107441](https://repod.icm.edu.pl/dataset.xhtml?persistentId=doi:10.18150/repod.0107441)

In [1]:
!wget -q https://repod.icm.edu.pl/api/datasets/251/versions/59/files/download?format=original -O data.zip
!unzip data.zip

Archive:  data.zip
  inflating: h01.edf                 
  inflating: h02.edf                 
  inflating: h03.edf                 
  inflating: h04.edf                 
  inflating: h05.edf                 
  inflating: h06.edf                 
  inflating: h07.edf                 
  inflating: h08.edf                 
  inflating: h09.edf                 
  inflating: h10.edf                 
  inflating: h11.edf                 
  inflating: h12.edf                 
  inflating: h13.edf                 
  inflating: h14.edf                 
  inflating: s01.edf                 
  inflating: s02.edf                 
  inflating: s03.edf                 
  inflating: s04.edf                 
  inflating: s05.edf                 
  inflating: s06.edf                 
  inflating: s07.edf                 
  inflating: s08.edf                 
  inflating: s09.edf                 
  inflating: s10.edf                 
  inflating: s11.edf                 
  inflating: s12.edf           

# **1. Download and Install Dependencies**

In [2]:
!pip install mne

import mne
from glob import glob
import numpy as np
import matplotlib.pyplot as plt
import os
from scipy import stats

Collecting mne
  Downloading mne-1.7.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: mne
Successfully installed mne-1.7.0


# **2. Load and Preprocess Data**

In [3]:
# Read Data
# glob() returns matches in arbitrary, filesystem-dependent order, so the lists are
# sorted to keep the epoch order (and the seeded train/test split built on it later)
# identical from run to run and machine to machine.
h_path = sorted(glob("./h*.edf"))      # Healthy
s_path = sorted(glob("./s*.edf"))      # Schizophrenic

len(h_path), len(s_path)

(14, 14)

In [4]:
def read_data(path, duration=5, overlap=1, l_freq=0.1, h_freq=50):
    '''
        Function to Read Data from Path and Apply Simple Preprocessing

        The recording is loaded into memory, re-referenced to the average,
        band-pass filtered and finally cut into fixed-length epochs.

        inputs:
            path     ==> path to the data
            duration ==> epoch length passed to mne.make_fixed_length_epochs (default 5)
            overlap  ==> overlap between consecutive epochs, same unit as duration (default 1)
            l_freq   ==> lower edge of the filter pass-band (default 0.1)
            h_freq   ==> upper edge of the filter pass-band (default 50)

        outputs:
            output ==> epoch/trial array as returned by Epochs.get_data()
    '''

    data = mne.io.read_raw(path, preload=True, verbose=False)                     # Read Data (preloaded so it can be filtered)
    data = data.set_eeg_reference('average', verbose=False)                       # Apply Referencing
    data = data.filter(l_freq=l_freq, h_freq=h_freq, n_jobs=-1, verbose=False)    # Filter Data

    # Separate Epochs
    epoch = mne.make_fixed_length_epochs(data, duration=duration, overlap=overlap, verbose=False)

    return epoch.get_data(verbose=False)

In [5]:
## Epoch every recording and stack the per-file epoch arrays along the first axis
h_data = np.vstack(list(map(read_data, h_path)))   # healthy recordings
s_data = np.vstack(list(map(read_data, s_path)))   # schizophrenic recordings

# Show both array shapes as the cell output
h_data.shape, s_data.shape

((3251, 19, 1250), (3950, 19, 1250))

In [6]:
## One label per epoch: 0 for the healthy group, 1 for the schizophrenic group
h_label = np.zeros(h_data.shape[0])
s_label = np.ones(s_data.shape[0])

# Show both label-vector shapes as the cell output
h_label.shape, s_label.shape

((3251,), (3950,))

In [7]:
## Join both groups: healthy epochs first, schizophrenic epochs after them,
## with the labels concatenated in the same order so rows stay aligned
X = np.concatenate([h_data, s_data], axis=0)
Y = np.concatenate([h_label, s_label], axis=0)

# Show the merged shapes as the cell output
X.shape, Y.shape

((7201, 19, 1250), (7201,))

## **2.1 PreProcessing**

In [8]:
from scipy.fft import fft

def get_features(data):
    """
    Summarise an array with thirteen descriptive statistics.

    Each statistic is reduced over the last axis of `data`; the thirteen
    results are then joined along their own last axis in this order:
    mean, standard deviation, peak-to-peak, variance, minimum, maximum,
    index of minimum, index of maximum, mean square, root mean square,
    sum of absolute first differences, skewness, kurtosis.

    Inputs:
        data (ndarray) ==> Input data array (statistics are taken along axis -1).

    Outputs:
        ndarray ==> Concatenated array of statistical features.
    """

    last = -1

    # Computed once and shared by the mean-square and RMS features
    squared = data ** 2
    mean_square = np.mean(squared, axis=last)

    # Total sample-to-sample variation along the last axis
    step_sizes = np.abs(np.diff(data, axis=last))

    collected = (
        np.mean(data, axis=last),          # Mean
        np.std(data, axis=last),           # Standard Deviation
        np.ptp(data, axis=last),           # Peak-to-Peak (Range)
        np.var(data, axis=last),           # Variance
        np.min(data, axis=last),           # Minimum
        np.max(data, axis=last),           # Maximum
        np.argmin(data, axis=last),        # Index of Minimum
        np.argmax(data, axis=last),        # Index of Maximum
        mean_square,                       # Mean Square
        np.sqrt(mean_square),              # Root Mean Square (RMS)
        np.sum(step_sizes, axis=last),     # Absolute Differences Sum
        stats.skew(data, axis=last),       # Skewness
        stats.kurtosis(data, axis=last),   # Kurtosis
    )

    return np.concatenate(collected, axis=last)

def feature_extractor(data):
    """
    Extract comprehensive time domain and frequency domain features from EEG data.

    The same set of statistics (see `get_features`) is computed twice: once on
    the raw signal and once on the magnitude of its FFT restricted to the first
    half of the bins. Both feature vectors are joined along the last axis.

    Inputs:
        data : ndarray
            Input EEG data in the shape (channels, samples). Arrays with extra
            leading axes, e.g. (epochs, channels, samples), are also accepted;
            the FFT and all statistics always run along the last axis.
    Outputs:
        ndarray  :
            Concatenated array of time domain and frequency domain features extracted from the input data.
            Features include statistical descriptors computed across each channel in both domains.
            For (channels, samples) input the result is 1-D; leading axes of
            the input are preserved in front of the feature axis.
    Raises:
        ValueError : if `data` has fewer than two dimensions.
    """

    data = np.asarray(data)
    if data.ndim < 2:
        raise ValueError(
            f"feature_extractor expects at least 2 dimensions (..., channels, samples), got {data.ndim}"
        )

    # Time Domain Features
    time_domain_features = get_features(data)

    # Frequency Domain Features
    # Keep only the first half of the FFT bins (the positive frequencies). Slicing with
    # `...` on the last axis keeps this correct for inputs with leading batch axes.
    half = data.shape[-1] // 2
    fft_data = np.abs(fft(data, axis=-1)[..., :half])
    frequency_domain_features = get_features(fft_data)

    # Concatenate time domain and frequency domain features along the feature axis
    return np.concatenate([time_domain_features, frequency_domain_features], axis=-1)

In [9]:
# Build the feature matrix: one row of extracted features per epoch in X
x = np.vstack([feature_extractor(epoch) for epoch in X])

# Show the feature-matrix shape as the cell output
x.shape

(7201, 494)

# **3. Classification**

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [13]:
# Split data into training and test sets (seeded so the split is repeatable)
# NOTE(review): the split is made over individual epochs, so overlapping epochs cut from
# the same recording can end up on both sides of the split -- the accuracy below is
# probably optimistic. Consider a subject-wise split; TODO confirm.
x_train, x_test, Y_train, Y_test = train_test_split(x, Y, test_size=0.2, random_state=42)

# Create a pipeline with StandardScaler, PCA, and KNN classifier.
# PCA is seeded too: with the default svd_solver='auto' it can select the randomized
# solver for data of this size (depending on the scikit-learn version), which would
# otherwise let the projection -- and the reported accuracy -- change between runs.
Pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=28, random_state=42)),
    ('clf', KNeighborsClassifier(n_neighbors=13))
])

# Fit the pipeline on the training data
Pipe.fit(x_train, Y_train)

# Predict on the test data
Y_pred = Pipe.predict(x_test)

# Calculate accuracy (fraction of test rows classified correctly)
accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8056904927133934
