# Preprocessing

### Imports


In [3]:
import numpy as np
import pandas as pd
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import (
    GroupKFold,
    GroupShuffleSplit,
    cross_val_score,
    train_test_split,
)
from sklearn.preprocessing import MinMaxScaler

In [4]:
# Path to the MEFAR_MID CSV that holds the signal columns and the 'class' label.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run on another machine without editing it.
file_path = '/home/cdennis51/code/Cdennis51/Neurocheck/raw_data/mental_fatigue/MEFAR_preprocessed/MEFAR_preprocessed/MEFAR_MID.csv'
# Read the whole table into memory; later cells slice it by row position.
df = pd.read_csv(file_path)

### Drop and assign columns 

In [5]:
# Show the header exactly as read from the CSV; the printout reveals that
# several names carry a leading space.
print(list(df.columns))
# Remove surrounding whitespace so columns can be addressed by their clean names.
df.columns = [name.strip() for name in df.columns]

['BVP', 'EDA', 'TEMP', 'AccX', 'AccY', 'AccZ', 'HR', ' Delta', ' Theta', ' Alpha1', ' Alpha2', ' Beta1', ' Beta2', ' Gamma1', ' Gamma2', ' Attention', ' Meditation', 'class']


In [6]:
# Load the session map, which defines the start/end rows for each participant-session.
# The windowing cell reads its 'session_id', 'start_index' and 'end_index' columns.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
session_map = pd.read_csv('/home/cdennis51/code/Cdennis51/Neurocheck/raw_data/mental_fatigue/Session_Map.csv')


In [7]:
# Alternative kept for reference (disabled): restrict the inputs to the columns
# listed in eeg_columns instead of using every non-target column.
#eeg_columns = ['Delta', 'Theta', 'Alpha1', 'Alpha2', 'Beta1', 'Beta2', 'Gamma1', 'Gamma2']
#X_eeg = df[eeg_columns].copy()
#y = df['class']

# Every column except the target becomes a model input; copies are taken so
# later edits to X / y cannot write back into df.
target_column = 'class'
features = df.columns.drop(target_column)
X = df.loc[:, features].copy()
y = df.loc[:, target_column].copy()

## Creating the windows

In [8]:
# Windowing parameters, all expressed in rows of the data table.
sampling_rate = 32                             # rows per second of recording
window_seconds = 10                            # length of one window in seconds
window_size = window_seconds * sampling_rate   # 10 s of data -> 320 rows per window
stride = window_size // 2                      # 50% overlap -> the window advances 5 s at a time

In [9]:
# Slide a fixed-size window within each session so that no window mixes
# rows from different subjects/sessions.

X_windows, y_windows, session_ids = [], [], []

# Walk the three session-map columns in parallel rather than via DataFrame.iterrows():
# iterrows() packs each row into a single-dtype Series, which can silently turn
# integer ids/indices into floats when the map holds mixed numeric columns.
session_bounds = zip(
    session_map['session_id'],
    session_map['start_index'],
    session_map['end_index'],
)

for session_id, start, end in session_bounds:
    start, end = int(start), int(end)

    # Positional slice covering rows start .. end-1.
    # NOTE(review): this treats end_index as exclusive; confirm against how
    # Session_Map.csv was built, otherwise the last row of each session is dropped.
    X_session = X.iloc[start:end].to_numpy()
    y_session = y.iloc[start:end].to_numpy()

    # The last admissible window start is len - window_size; sessions shorter
    # than one window contribute nothing.
    for i in range(0, len(X_session) - window_size + 1, stride):
        stop = i + window_size
        X_windows.append(X_session[i:stop])
        y_windows.append(y_session[stop - 1])  # label taken from the end of the window
        session_ids.append(session_id)         # remembered for group-aware splitting

# Convert to numpy arrays for modelling: X_windows is (n_windows, window_size, n_features).
X_windows = np.array(X_windows)
y_windows = np.array(y_windows)
session_ids = np.array(session_ids)

In [10]:
# Rescale each window on its own: a fresh MinMaxScaler is fitted per window, so
# every feature column is min-max scaled using only that window's values.
# Open question (author): per-window normalisation may be redundant here;
# compare results with and without it, and remove it if it does not help.
normalised_windows = []
for window in X_windows:
    normalised_windows.append(MinMaxScaler().fit_transform(window))
X_norm = np.array(normalised_windows)

In [11]:
# Sanity check: total number of rows in X, followed by the window length and
# the stride (both measured in rows).
print(len(X))
print("Window size:", window_size)
print("Stride:", stride)

923298
Window size: 320
Stride: 160


In [None]:
# Hold out 20% for testing with GroupShuffleSplit, grouping by session id.
# Grouping keeps all windows of a session on the same side of the split, so
# the 50%-overlapping windows cannot leak between train and test.
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(
    splitter.split(X_norm, y_windows, groups=session_ids)
)

X_train = X_norm[train_idx]
X_test = X_norm[test_idx]
y_train = y_windows[train_idx]
y_test = y_windows[test_idx]

In [None]:
# Baseline: logistic regression on flattened windows, scored with 5-fold
# cross-validation grouped by session (GroupKFold keeps each session in one fold).
# The scikit-learn names used here are imported in the notebook's import cell.
cv = GroupKFold(n_splits=5)
model = LogisticRegression(max_iter=1000)

# Flatten the 3D windowed data (windows, time steps, features) to 2D
# (windows, time steps * features), as the linear model expects one row per sample.
n_samples, time_steps, n_feats = X_norm.shape
X_flat = X_norm.reshape(n_samples, time_steps * n_feats)

# NOTE(review): the scores are computed on all windows, including those placed in
# the held-out test split; cross-validate on the training windows only if the
# test split is meant to stay unseen.
cv_scores = cross_val_score(model, X_flat, y_windows, groups=session_ids, cv=cv)
cv_scores


array([0.62626263, 0.63888889, 0.65306122, 0.63265306, 0.62244898])