## 00. Imports

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import xgboost as xgb
import lightgbm as lgb
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import MinMaxScaler

## 01. Loading the Preprocessed MEFAR Dataset and Setting the X & y Variables

In [2]:
# Read the preprocessed MEFAR "MID" CSV from disk into a DataFrame.
MEFAR_MID = pd.read_csv(
    filepath_or_buffer='../raw_data//MEFAR_preprocessed/MEFAR_MID.csv'
)


In [4]:
# Separate the target column ('class') from the remaining columns.
y_all = MEFAR_MID['class']
X_all = MEFAR_MID.drop('class', axis=1)

In [5]:
# Feature subset with the BVP/EDA/TEMP/accelerometer/HR columns removed;
# the remaining columns form the EEG-only feature set.
X_EEG = X_all.drop(
    columns=['BVP', 'EDA', 'TEMP', 'AccX', 'AccY', 'AccZ', 'HR'],
)

In [6]:
# Preview the first five rows of the full feature table.
X_all.head(n=5)

Unnamed: 0,BVP,EDA,TEMP,AccX,AccY,AccZ,HR,Delta,Theta,Alpha1,Alpha2,Beta1,Beta2,Gamma1,Gamma2,Attention,Meditation
0,-0.150426,0.026215,0.778824,-0.32549,0.019608,0.388235,0.357959,0.019138,0.082949,0.017362,0.020516,0.001465,0.002844,0.021253,0.044376,0.484848,0.090909
1,-0.145506,0.026215,0.778824,-0.333333,-0.003922,0.388235,0.357959,0.019138,0.082949,0.017362,0.020516,0.001465,0.002844,0.021253,0.044376,0.484848,0.090909
2,-0.135576,0.026215,0.778824,-0.34902,-0.035294,0.333333,0.357959,0.019138,0.082949,0.017362,0.020516,0.001465,0.002844,0.021253,0.044376,0.484848,0.090909
3,-0.121047,0.026215,0.778824,-0.380392,-0.027451,0.356863,0.357959,0.019138,0.082949,0.017362,0.020516,0.001465,0.002844,0.021253,0.044376,0.484848,0.090909
4,-0.103369,0.026215,0.778824,-0.411765,-0.011765,0.411765,0.357959,0.019138,0.082949,0.017362,0.020516,0.001465,0.002844,0.021253,0.044376,0.484848,0.090909


In [7]:
# Preview the first five rows of the EEG-only feature table.
X_EEG.head(n=5)

Unnamed: 0,Delta,Theta,Alpha1,Alpha2,Beta1,Beta2,Gamma1,Gamma2,Attention,Meditation
0,0.019138,0.082949,0.017362,0.020516,0.001465,0.002844,0.021253,0.044376,0.484848,0.090909
1,0.019138,0.082949,0.017362,0.020516,0.001465,0.002844,0.021253,0.044376,0.484848,0.090909
2,0.019138,0.082949,0.017362,0.020516,0.001465,0.002844,0.021253,0.044376,0.484848,0.090909
3,0.019138,0.082949,0.017362,0.020516,0.001465,0.002844,0.021253,0.044376,0.484848,0.090909
4,0.019138,0.082949,0.017362,0.020516,0.001465,0.002844,0.021253,0.044376,0.484848,0.090909


## 02. Creating Time-Window Data

In [8]:
# Windowing configuration used by the time-window cells below.
# numpy (np) and MinMaxScaler are already imported in the imports cell at the
# top of the notebook, so they are not re-imported here.
sampling_rate = 32                            # samples per second
window_seconds = 10                           # duration of one window, in seconds
window_size = sampling_rate * window_seconds  # 10 s of data = 320 rows per window
stride = window_size // 2                     # 50% overlap = 160-row (5 s) shift


In [None]:
# Read the session map CSV; the windowing cells below read its
# 'session_id', 'start_index' and 'end_index' columns.
session_map = pd.read_csv(
    filepath_or_buffer='../raw_data/mental_fatigue/Session_Map.csv'
)

In [9]:

# Build overlapping fixed-length windows, one session at a time, so that no
# window spans two sessions.
#
# Reads (from earlier cells): X_all, y_all, session_map, window_size, stride.
# Produces: X_windows, y_windows, session_ids (one entry per window).
#
# NOTE: this cell previously read `X` and `y`, which no earlier cell defines
# (only X_all, y_all and X_EEG exist), so it failed on a fresh kernel.
# X_all / y_all are used here; swap X_all for X_EEG to window the EEG-only
# features instead. TODO confirm which feature set is intended.
X_windows, y_windows, session_ids = [], [], []

for _, row in session_map.iterrows():  # Iterate over each session
    session_id = row['session_id']
    start = int(row['start_index'])
    end = int(row['end_index'])

    # iloc slicing excludes row `end` itself.
    # NOTE(review): confirm end_index in Session_Map is meant to be exclusive.
    X_session = X_all.iloc[start:end].values
    y_session = y_all.iloc[start:end].values

    # Sessions shorter than window_size produce an empty range and are skipped;
    # trailing rows that do not fill a whole window are dropped.
    for i in range(0, len(X_session) - window_size + 1, stride):
        window = X_session[i:i + window_size]
        label_window = y_session[i:i + window_size]

        # Majority vote label: 1 only if strictly more than half of the rows
        # are 1 (an exact tie gives 0). Assumes labels are 0/1 - TODO confirm.
        majority_label = int(label_window.mean() > 0.5)

        X_windows.append(window)
        y_windows.append(majority_label)
        session_ids.append(session_id)

# Convert to numpy arrays
X_windows = np.array(X_windows)
y_windows = np.array(y_windows)
session_ids = np.array(session_ids)

# # Normalize per window (optional, test impact)
# X_norm = np.array([MinMaxScaler().fit_transform(window) for window in X_windows])

NameError: name 'session_map' is not defined

In [None]:
# NOTE(review): this cell repeats the parameter setup and the windowing loop
# from the preceding cells and additionally computes X_norm; consider keeping
# only one copy of the windowing logic.
#
# NOTE: this cell previously read `X` and `y`, which no earlier cell defines
# (only X_all, y_all and X_EEG exist), so it failed on a fresh kernel.
# X_all / y_all are used here; swap X_all for X_EEG to window the EEG-only
# features instead. TODO confirm which feature set is intended.

# Define sampling and windowing parameters
sampling_rate = 32                     # MEFAR data has 32 samples per second
window_seconds = 10                    # Each window covers 10 seconds
window_size = sampling_rate * window_seconds  # 320 samples per window
stride = window_size // 2              # 50% overlap between windows -> 160-row (5s) step

# Lists to store results
X_windows, y_windows, session_ids = [], [], []

# Loop through each session defined in session_map
for _, row in session_map.iterrows():
    session_id = row['session_id']               # Session identifier
    start = int(row['start_index'])              # Start index of session in full dataset
    end = int(row['end_index'])                  # End index of session

    # Extract the features and labels for this session.
    # iloc slicing excludes row `end` itself.
    # NOTE(review): confirm end_index in Session_Map is meant to be exclusive.
    X_session = X_all.iloc[start:end].values     # Shape: (session_length, num_features)
    y_session = y_all.iloc[start:end].values     # Shape: (session_length,)

    # Slide window through session using defined stride.
    # Sessions shorter than window_size produce an empty range and are skipped;
    # trailing rows that do not fill a whole window are dropped.
    for i in range(0, len(X_session) - window_size + 1, stride):
        # Extract the feature window and corresponding label window
        window = X_session[i:i + window_size]          # Shape: (320, num_features)
        label_window = y_session[i:i + window_size]    # Shape: (320,)

        # Assign label based on majority vote in the 10s window.
        # If more than 50% of samples in the window are "1" (fatigued), label the
        # window as fatigued; an exact 50/50 tie is labelled 0.
        # Assumes labels are 0/1 - TODO confirm.
        majority_label = int(label_window.mean() > 0.5)

        # Store results
        X_windows.append(window)
        y_windows.append(majority_label)
        session_ids.append(session_id)

# Convert lists to numpy arrays for modeling
X_windows = np.array(X_windows)       # Shape: (num_windows, 320, num_features)
y_windows = np.array(y_windows)       # Shape: (num_windows,)
session_ids = np.array(session_ids)   # Shape: (num_windows,)

# Optional: Normalize each window individually using min-max scaling.
# A fresh scaler is fitted per window, so each feature is scaled to [0, 1]
# based on its own min/max within that window only.
X_norm = np.array([MinMaxScaler().fit_transform(window) for window in X_windows])