# Neurocheck data exploration

## Imports

In [14]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, cross_val_score, train_test_split, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from scipy import stats


## MEFAR dataset

Load the dataset:

In [2]:
df = pd.read_csv('../raw_data/MEFAR_preprocessed/MEFAR_MID.csv')
df.columns

Index(['BVP', 'EDA', 'TEMP', 'AccX', 'AccY', 'AccZ', 'HR', ' Delta', ' Theta',
       ' Alpha1', ' Alpha2', ' Beta1', ' Beta2', ' Gamma1', ' Gamma2',
       ' Attention', ' Meditation', 'class'],
      dtype='object')

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 923298 entries, 0 to 923297
Data columns (total 18 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   BVP          923298 non-null  float64
 1   EDA          923298 non-null  float64
 2   TEMP         923298 non-null  float64
 3   AccX         923298 non-null  float64
 4   AccY         923298 non-null  float64
 5   AccZ         923298 non-null  float64
 6   HR           923298 non-null  float64
 7    Delta       923298 non-null  float64
 8    Theta       923298 non-null  float64
 9    Alpha1      923298 non-null  float64
 10   Alpha2      923298 non-null  float64
 11   Beta1       923298 non-null  float64
 12   Beta2       923298 non-null  float64
 13   Gamma1      923298 non-null  float64
 14   Gamma2      923298 non-null  float64
 15   Attention   923298 non-null  float64
 16   Meditation  923298 non-null  float64
 17  class        923298 non-null  float64
dtypes: float64(18)
memory us

The MEFAR-MID preprocessed data has already been scaled and missing values were completed, so no need for preprocessing of this type.

Define features X and target y:

In [3]:
# 'X' to include only the EEG columns
features = [' Delta', ' Theta', ' Alpha1', ' Alpha2', ' Beta1', ' Beta2', ' Gamma1', ' Gamma2']
X = df[features]
y = df['class']

X.shape

(923298, 8)

Random sample from dataset (to run models evaluation quicker):

In [4]:
X_sample = X.sample(n=100_000,random_state=42)
y_sample = y.loc[X_sample.index]

Train/test split:

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)

In [6]:
X_train.shape

(80000, 8)

## Models

### Baseline model: Logistic Regression

In [18]:
# Baseline: Logistic Regression
log_reg = LogisticRegression()

In [8]:
# Cross-validation
cv_results = cross_validate(log_reg, X_train, y_train, cv=10)

cv_results['test_score'].mean()

np.float64(0.5130625)

In [9]:
#Train baseline model
log_reg.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [10]:
#Evaluate baseline model
baseline = log_reg.score(X_test, y_test)
baseline

0.51265

Baseline for simple Logistic Regression model gives 51.26% accuracy.

### K Nearest Neighbors Classification

In [11]:
# KNN Classifier
knn = KNeighborsClassifier(n_neighbors=5)

In [12]:
# Cross-validation
cv_results = cross_validate(knn, X_train, y_train, cv=10)

cv_results['test_score'].mean()

np.float64(0.7941875)

In [23]:
# KNN model tuning
params = {
    'n_neighbors': stats.randint(1,50)
}

knn = KNeighborsClassifier()

random_search = RandomizedSearchCV(knn, params, n_iter=100, cv=5, n_jobs=-1)

random_search.fit(X_train, y_train)

0,1,2
,estimator,KNeighborsClassifier()
,param_distributions,{'n_neighbors': <scipy.stats....t 0x12ebbab60>}
,n_iter,100
,scoring,
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,n_neighbors,1
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [24]:
random_search.best_score_

np.float64(0.9556875)

In [25]:
#KNN Best n_neighbors
random_search.best_params_

{'n_neighbors': 1}

In [26]:
#Best KNN model
knn_best = random_search.best_estimator_

In [27]:
# Train best KNN
knn_best.fit(X_train, y_train)

0,1,2
,n_neighbors,1
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [28]:
#Evaluate best KNN
knn_best.score(X_test, y_test)

0.9739

KNN has a score of 97.39%. Is this too good? Needs testing with the full dataset to confirm.

### 1D Convolutional Neural Network

Loading session map for preprocessing:

In [20]:
session_map = pd.read_csv('../raw_data/Session_Map.csv')
session_map.head(3)

Unnamed: 0,session_id,start_index,end_index
0,sub01_morning,0,1777
1,sub01_evening,1777,3679
2,sub02_morning,3679,5635


Time window preprocessing for Deep Learning:

In [None]:
# Define sampling and windowing parameters
sampling_rate = 32                     # MEFAR data has 32 samples per second
window_seconds = 10                    # Each window covers 10 seconds
window_size = sampling_rate * window_seconds  # 320 samples per window
stride = int(window_size * 0.5)        # 50% overlap between windows → 5s step

# Lists to store results
X_windows, y_windows, session_ids = [], [], []

# Loop through each session defined in session_map
for _, row in session_map.iterrows():
    session_id = row['session_id']               # Session identifier
    start = int(row['start_index'])              # Start index of session in full dataset
    end = int(row['end_index'])                  # End index of session

    # Extract the features and labels for this session
    X_session = X.iloc[start:end].values         # Shape: (session_length, num_features)
    y_session = y.iloc[start:end].values         # Shape: (session_length,)

    # Slide window through session using defined stride
    for i in range(0, len(X_session) - window_size + 1, stride):
        # Extract the feature window and corresponding label window
        window = X_session[i:i + window_size]          # Shape: (320, num_features)
        label_window = y_session[i:i + window_size]    # Shape: (320,)

        # ✅ Assign label based on majority vote in the 10s window
        # If more than 50% of samples in the window are "1" (fatigued), label the window as fatigued
        majority_label = int(label_window.mean() > 0.5)

        # Store results
        X_windows.append(window)
        y_windows.append(majority_label)
        session_ids.append(session_id)

# Convert lists to numpy arrays for modeling
X_windows = np.array(X_windows)       # Shape: (num_windows, 320, num_features)
y_windows = np.array(y_windows)       # Shape: (num_windows,)
session_ids = np.array(session_ids)   # Shape: (num_windows,)

# Optional: Normalize each window individually using min-max scaling
# This ensures each feature in each window is scaled to [0, 1] based on its own min/max
X_norm = np.array([MinMaxScaler().fit_transform(window) for window in X_windows])