In [1]:
# Imports
import os, re, pathlib
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.decomposition import PCA
from gaussiannb_model import GaussianNBTF, save_model

In [2]:
# Configuration
data_root = r"/path/to/your/brainwave_readings/"
skip_dirs = { 'Group1-8channels' }
use_pca = True
pca_components = 32  # set None to disable or adjust
var_smoothing = 1e-9  # GNB variance smoothing

In [3]:
# Load all EEG files recursively into a list of DataFrames
core_dir = pathlib.Path(data_root)
dfs = []
for item in core_dir.rglob('*.txt'):
    try:
        if set(item.parts).isdisjoint(skip_dirs):
            df = pd.read_csv(item, sep=',', header=4, on_bad_lines='skip')
            df['src_filename'] = str(item)
            dfs.append(df)
    except Exception as e:
        print(f'Failed to read {item}: {e}')
print(f'Read {len(dfs)} files')
eeg_data = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
print(eeg_data.shape)

Read 0 files
(0, 0)


In [4]:
# Robust label normalization from filenames
assert not eeg_data.empty, 'No data loaded. Check data_root.'
src = eeg_data['src_filename'].astype(str).str.lower()
src_norm = src.str.replace(r'[\s_\-]+', '', regex=True)
labels = pd.Series('', index=eeg_data.index)
def assign_where(patterns, value, labels, src_norm):
    m = pd.Series(False, index=src_norm.index)
    for pat in patterns:
        m = m | src_norm.str.contains(pat)
    cond = (labels == '') & m
    return labels.where(~cond, other=value)
labels = assign_where(['backward','backwards'], 'backward', labels, src_norm)
labels = assign_where(['fowward','forward'], 'forward', labels, src_norm)
labels = assign_where(['landing'], 'landing', labels, src_norm)
labels = assign_where(['left'], 'left', labels, src_norm)
labels = assign_where(['right'], 'right', labels, src_norm)
labels = assign_where(['takeoff','takeoff'], 'takeoff', labels, src_norm)
eeg_data['label_txt'] = labels.astype(str)
print(eeg_data.groupby(['label_txt'])['src_filename'].count())

label_txt
backward    223058
forward     226254
landing     218567
left        215514
right       223213
takeoff     222235
Name: src_filename, dtype: int64


In [5]:
# Build feature matrix X and label vector y
df = eeg_data.copy()
df = df[df['label_txt'].astype(str).str.len() > 0]
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
drop_cols = [c for c in ['Sample Index'] if c in num_cols]
feature_cols = [c for c in num_cols if c not in drop_cols]
feat = df[feature_cols].copy()
feat = feat.replace([np.inf, -np.inf], np.nan)
feat = feat.dropna(axis=1, how='all')
med = feat.median(numeric_only=True)
feat = feat.fillna(med)
std = feat.std(numeric_only=True)
keep = std[std > 0].index.tolist()
feat = feat[keep]
X = feat.to_numpy(dtype=np.float32)
y_cat = df['label_txt'].astype('category')
y = y_cat.cat.codes.to_numpy(dtype=np.int32)
label_names = list(y_cat.cat.categories)
print('X shape:', X.shape, 'classes:', len(label_names))

X shape: (1328841, 26) classes: 6


In [6]:
# Train/test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# z-score standardization (fit on train only)
mu = X_train.mean(axis=0)
sd = X_train.std(axis=0)
sd[sd == 0] = 1.0
X_train_z = (X_train - mu) / sd
X_test_z  = (X_test  - mu) / sd
# Optional PCA
pca_components_ = None
pca_mean_ = None
if use_pca and pca_components is not None:
    n_comp = int(min(pca_components, X_train_z.shape[1]))
    pca = PCA(n_components=n_comp, whiten=True, random_state=42)
    X_train_z = pca.fit_transform(X_train_z)
    X_test_z = pca.transform(X_test_z)
    pca_components_ = pca.components_.astype(np.float32)
    pca_mean_ = pca.mean_.astype(np.float32)
print('Train/Test shapes:', X_train_z.shape, X_test_z.shape)

Train/Test shapes: (1063072, 26) (265769, 26)


In [None]:
# Train Gaussian Naive Bayes (TensorFlow)
model = GaussianNBTF(var_smoothing=var_smoothing)
model.fit(X_train_z, y_train)
y_pred = model.predict(X_test_z)
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred, target_names=label_names, zero_division=0))

Accuracy: 0.3192
Confusion matrix:
 [[ 9863  1383  9492   727 22360   787]
 [ 3060 13227  7079   575 14465  6845]
 [ 1686  4566 19206   973 10753  6529]
 [ 1866  2546 10737  9355 17433  1166]
 [  552  1507 12045  3940 24993  1606]
 [ 1435  1334 12526  2707 18243  8202]]

Classification report:
               precision    recall  f1-score   support

    backward       0.53      0.22      0.31     44612
     forward       0.54      0.29      0.38     45251
     landing       0.27      0.44      0.33     43713
        left       0.51      0.22      0.30     43103
       right       0.23      0.56      0.33     44643
     takeoff       0.33      0.18      0.24     44447

    accuracy                           0.32    265769
   macro avg       0.40      0.32      0.32    265769
weighted avg       0.40      0.32      0.32    265769


Classification report:
               precision    recall  f1-score   support

    backward       0.53      0.22      0.31     44612
     forward       0.54    

In [None]:
# Save trained model with preprocessing metadata
meta = {
    'mu': mu.astype(np.float32),
    'sd': sd.astype(np.float32),
    'label_names': label_names,  # list[str]
    'feature_cols': feature_cols,  # list[str]
    'kept_feature_cols': keep,  # list[str]
    'pca_components': pca_components_ if pca_components_ is not None else None,
    'pca_mean': pca_mean_ if pca_mean_ is not None else None,
}
out_path = 'gaussiannb_trained.pth'
save_model(out_path, model, meta)
print(f'Saved trained model to {out_path}')

Saved trained model to gaussiannb_trained.pth


In [None]:
# Reload model module to ensure latest save_model is in scope
import importlib, gaussiannb_model
importlib.reload(gaussiannb_model)
from gaussiannb_model import GaussianNBTF, save_model