### Dimensionality Reduction and Feature Selection  
  
The first thing done below is running a model with a feature set that includes 9 different feature types and 401 overall features. The goal is going to be to reduce that number and improve performance. One possibility is signal averaging. This can be done by region or using specific electrode chains defined by the EEG montage used for generating the spectrograms which were provided with this data. Another possibility is using PCA to reduce 401 columns down substantially while retaining a dataset that explains the strong majority of the variance in the dataset.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import fastparquet, pyarrow
import mne
from mne.decoding import Scaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, recall_score
from utils import *

In [3]:
mne.set_log_level('WARNING')

In [4]:
metadata = pd.read_csv('by_patient.csv')

In [5]:
other_metadata = activity_df(metadata, 'Other', 'expert_consensus')
seizure_metadata = activity_df(metadata, 'Seizure', 'expert_consensus')
gpd_metadata = activity_df(metadata, 'GPD', 'expert_consensus')
lpd_metadata = activity_df(metadata, 'LPD', 'expert_consensus')
grda_metadata = activity_df(metadata, 'GRDA', 'expert_consensus')
lrda_metadata = activity_df(metadata, 'LRDA', 'expert_consensus')
activity_df_list = [other_metadata, seizure_metadata, gpd_metadata, lpd_metadata, grda_metadata, lrda_metadata]

In [6]:
y = get_yvals(2000)['activity']

### Features Included  
  
- Frequency Band Power  
- Hjorth Complexity Time and Frequency  
- Hjorth Mobility Time and Frequency  
- Wavelet Coef Energy  
- Higuchi  
- Zero Crossings  
- Spectral Slope

In [6]:
def get_scaled_features(stored_data):
    X = pd.read_csv(stored_data)
    cols = X.columns
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    return pd.DataFrame(X, columns = cols)

In [7]:
df_list = ['band_pow_df.csv', 'time_comp_df.csv', 'freq_comp_df.csv',
           'time_mob_df.csv', 'spectral_mob_df.csv', 'coef_energy_df.csv', 'higuchi_fd_set.csv', 
           'zero_xing_df.csv', 'spectslope_df.csv']
X = pd.DataFrame()
for df in df_list:
    X = pd.concat([X, get_scaled_features(df)], axis = 1)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42)

### Full Feature Set Results  
  
These results are the baseline for dimensionality reduction and feature selection work. The testing accuracy with the full feature set with all of the feature types listed above included is 76%.

In [9]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
training_yhat = knn.predict(X_train)
testing_yhat = knn.predict(X_test)
print('Full Set Training')
print(classification_report(y_train, training_yhat))
print('Full Set Testing')
print(classification_report(y_test, testing_yhat))
full_set_acc = accuracy_score(y_test, testing_yhat)
full_set_macro_f1 = f1_score(y_test, testing_yhat, average = 'macro')
full_set_macro_prec = precision_score(y_test, testing_yhat, average = 'macro')
full_set_macro_rec = recall_score(y_test, testing_yhat, average = 'macro')
full_set_f1 = f1_score(y_test, testing_yhat, average = None)
full_set_prec = precision_score(y_test, testing_yhat, average = None)
full_set_rec = recall_score(y_test, testing_yhat, average = None)

Full Set Training
              precision    recall  f1-score   support

         GPD       0.86      0.94      0.90      1794
        GRDA       0.84      0.89      0.87      1819
         LPD       0.80      0.90      0.85      1800
        LRDA       0.88      0.91      0.89      1795
       Other       0.84      0.74      0.78      1808
     Seizure       0.90      0.72      0.80      1784

    accuracy                           0.85     10800
   macro avg       0.85      0.85      0.85     10800
weighted avg       0.85      0.85      0.85     10800

Full Set Testing
              precision    recall  f1-score   support

         GPD       0.79      0.89      0.84       206
        GRDA       0.72      0.79      0.75       181
         LPD       0.69      0.83      0.76       200
        LRDA       0.81      0.85      0.83       205
       Other       0.72      0.60      0.65       192
     Seizure       0.85      0.62      0.71       216

    accuracy                           0.7

### Signal Averaging: Bipolar Montage

In [10]:
sub_eeg = load_preprocess(metadata, 0, names, montage_chains, 1, None,
                      bandpass = True, notch = False, reref = True)

In [22]:
info = mne.create_info(
    sub_eeg.columns.to_list(),
    ch_types=(["eeg"]*(len(sub_eeg.columns))),
    sfreq=200
)
raw = mne.io.RawArray(
    sub_eeg.to_numpy().T,
    info
)


In [1]:
def average_signals(raw, names, montage_chains):
    sub_eeg = pd.DataFrame(raw.get_data(), index = raw.ch_names).transpose()
    averages = []
    for chain in montage_chains:
        signal_sum = 0
        for i in chain:
            signal_sum += sub_eeg[i]
        avg_signal = signal_sum / 4
        averages.append(avg_signal)
    avg_eeg = pd.DataFrame(averages, index = names).transpose()
    info = mne.create_info(
        avg_eeg.columns.to_list(),
        ch_types=(["eeg"]*(len(avg_eeg.columns))),
        sfreq=200
    )
    raw = mne.io.RawArray(
        avg_eeg.to_numpy().T,
        info
    )
    return raw

In [26]:
raw.get_data().shape

(5, 10000)

In [10]:
activity_indexes = pd.read_csv('activity_indexes.csv')

In [11]:
index_lists = [activity_indexes[col] for col in activity_indexes.columns]

### Attempt with Band Power Features

In [12]:
n = ['LL','LP','RP','RR', 'Central']

mc = [['Fp1','F7','T3','T5','O1'],
      ['Fp1','F3','C3','P3','O1'],
      ['Fp2','F8','T4','T6','O2'],
      ['Fp2','F4','C4','P4','O2'], 
      ['Fz', 'Cz', 'Pz']]

In [13]:
montage_bandpower = full_band_df(activity_df_list, index_lists, n, mc, 1, None, 
                                bandpass = True, notch = False, reref = True)
montage_bandpower.to_csv('montage_bandpower.csv', index = None)

In [7]:
X = pd.read_csv('montage_bandpower.csv')

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42)

In [16]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
training_yhat = knn.predict(X_train)
testing_yhat = knn.predict(X_test)
print('Montage Averaging Training')
print(classification_report(y_train, training_yhat))
print('Montage Averaging Testing')
print(classification_report(y_test, testing_yhat))
montage_acc = accuracy_score(y_test, testing_yhat)
montage_macro_f1 = f1_score(y_test, testing_yhat, average = 'macro')
montage_macro_prec = precision_score(y_test, testing_yhat, average = 'macro')
montage_macro_rec = recall_score(y_test, testing_yhat, average = 'macro')
montage_f1 = f1_score(y_test, testing_yhat, average = None)
montage_prec = precision_score(y_test, testing_yhat, average = None)
montage_rec = recall_score(y_test, testing_yhat, average = None)

Montage Averaging Training
              precision    recall  f1-score   support

         GPD       0.75      0.92      0.82      1794
        GRDA       0.73      0.83      0.78      1819
         LPD       0.73      0.84      0.78      1800
        LRDA       0.78      0.86      0.81      1795
       Other       0.75      0.53      0.62      1808
     Seizure       0.84      0.58      0.69      1784

    accuracy                           0.76     10800
   macro avg       0.76      0.76      0.75     10800
weighted avg       0.76      0.76      0.75     10800

Montage Averaging Testing
              precision    recall  f1-score   support

         GPD       0.70      0.83      0.76       206
        GRDA       0.58      0.65      0.61       181
         LPD       0.62      0.72      0.66       200
        LRDA       0.67      0.77      0.72       205
       Other       0.51      0.35      0.42       192
     Seizure       0.73      0.50      0.59       216

    accuracy            

### Hjorth  
  
Before running the code below, I need to check the utils functions to be sure that everything has been correctly adjusted for the new EEGs which have 5 columns after signal averaging instead of 20 columns with 19 of them being EEG and the 20th being EKG.

In [None]:
montage_tcomp = full_complexity_df(activity_df_list, index_lists, n, mc, 1, None, 
                                bandpass = True, notch = False, reref = True, spectral = False)
montage_tcomp.to_csv('montage_tcomp.csv', index = None)