# Experimenting with EEG data

In [1]:
import sklearn

In [2]:
import os

import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

In [32]:
# Folder holding the recorded CSV files, relative to the notebook's working
# directory.
RECORDINGS_FOLDER = os.path.join('..', '..', 'recordings')

# Recordings used for training, keyed by class label. Every file listed under
# a label is read and its FFT windows are stacked into that class.
#
# NOTE: gen_data_matrix() derives the numeric class id from the position of
# each key, so RECORDINGS_TRAIN and RECORDINGS_VALID must list their labels in
# the same order; otherwise the same label maps to different ids in the two
# sets.
RECORDINGS_TRAIN = {
    # Label sets from earlier experiments, currently disabled:
    #     'left-02.csv',
    #     'left': ['left-01.csv', 'left-02.csv'],
    #     'right': ['right-01.csv', 'right-02.csv'], #, 'right-03.csv'],
    #     'lr': ['left-01.csv', 'left-02.csv', 'right-01.csv', 'right-03.csv'],
    #     'forward': ['forward-01.csv'],  # TODO(andrei): Maybe 'run-01-andrei.csv'.
    #     'helicopter': ['helicopter-andrei.csv'],
    'relaxed': ['closed-relax-01.csv', 'closed-relax-02.csv'],
    'baseline': ['baseline-andrei.csv', 'internet-browsing-01.csv'],
    'tense': ['typingtest-01.csv', 'typingtest-03.csv'],
}

# Recordings held out for validation. Same keys, in the same order, as
# RECORDINGS_TRAIN (see the note above); an empty list means "no validation
# data for this label".
RECORDINGS_VALID = {
    # Disabled alternatives:
    #     'lr': ['right-03.csv', 'left-03.csv'],
    #     'right': [],
    #     'forward': [],
    #     'helicopter': [],
    #     'baseline': ['internet-browsing-01.csv']
    'relaxed': [],
    'baseline': [],
    'tense': ['typingtest-02.csv'],
}


# Every message label that can appear in the second column of a recording.
MUSE_LABELS = {
    # Device / stream level messages.
    "/muse/acc",
    "/muse/batt",
    "/muse/config",
    "/muse/drlref",
    "/muse/eeg",
    "/muse/eeg/quantization",
    "/muse/version",
    # Per-band values.
    "/muse/elements/alpha_absolute",
    "/muse/elements/alpha_relative",
    "/muse/elements/alpha_session_score",
    "/muse/elements/beta_absolute",
    "/muse/elements/beta_relative",
    "/muse/elements/beta_session_score",
    "/muse/elements/delta_absolute",
    "/muse/elements/delta_relative",
    "/muse/elements/delta_session_score",
    "/muse/elements/gamma_absolute",
    "/muse/elements/gamma_relative",
    "/muse/elements/gamma_session_score",
    "/muse/elements/theta_absolute",
    "/muse/elements/theta_relative",
    "/muse/elements/theta_session_score",
    "/muse/elements/low_freqs_absolute",
    # Raw FFT coefficients, one message per electrode.
    "/muse/elements/raw_fft0",
    "/muse/elements/raw_fft1",
    "/muse/elements/raw_fft2",
    "/muse/elements/raw_fft3",
    # Remaining element messages.
    "/muse/elements/blink",
    "/muse/elements/jaw_clench",
    "/muse/elements/horseshoe",
    "/muse/elements/is_good",
    "/muse/elements/touching_forehead",
    "/muse/elements/experimental/concentration",
    "/muse/elements/experimental/mellow",
}

# 4 electrodes, 4 sets of FFT coefficients per window!
RAW_FFT0 = "/muse/elements/raw_fft0"
RAW_FFT1 = "/muse/elements/raw_fft1"
RAW_FFT2 = "/muse/elements/raw_fft2"
RAW_FFT3 = "/muse/elements/raw_fft3"
# Label of the messages read_rec() uses to build its "good window" mask.
IS_GOOD = "/muse/elements/is_good"
# interesting_feats = ['/muse/elements/raw_fft0', 'alpha_absolute']

In [33]:
def read_rec(fpath: str):
    """Parse one recording file into per-window FFT feature vectors.

    Every non-blank line is expected to look like
    ``<timestamp>, <label>, <value>, <value>, ...`` (fields separated by
    ``', '``). Two kinds of line are used, all other labels are ignored:

    * ``/muse/elements/raw_fft<k>`` lines. They must arrive in the repeating
      order 0, 1, 2, 3. The values of four consecutive lines are concatenated
      into one feature vector, stamped with the timestamp of the ``raw_fft3``
      line that completes the window.
    * ``IS_GOOD`` lines. The most recent one decides whether the next
      completed window is flagged as good (all of its values non-zero).
      Windows completed before the first ``IS_GOOD`` line are flagged bad.

    A trailing, incomplete window (fewer than four FFT lines) is dropped.
    The number of windows and the shape of the feature array are printed.

    Args:
        fpath: Path of the recording file to read.

    Returns:
        A tuple ``(good_mask, timestamps, features)`` of numpy arrays with one
        entry/row per completed window. ``good_mask`` is always boolean, so it
        can be used to index ``features`` even when the file has no windows.

    Raises:
        ValueError: If an FFT line arrives out of the expected 0-1-2-3 order,
            or if a timestamp/value cannot be parsed as a float.
    """
    # Largest tolerated gap between the latest IS_GOOD message and the end of
    # an FFT window before a sync warning is printed.
    # NOTE(review): presumably in the same unit as the timestamp column
    # (seconds?) -- confirm against the recorder.
    max_sync_gap = 0.005

    all_ts = []
    all_fft = []
    all_good_masks = []

    # TODO(andrei): the IS_GOOD may contain too little info, maybe. Should try
    # horseshoe and a thresh of like <1.25 or so.
    last_good = [0, 0, 0, 0]
    last_good_time = -1

    # FFT chunks collected so far for the window currently being assembled.
    window_chunks = []

    # We expect fft indices always in 0-1-2-3 order. This variable keeps track
    # of this.
    expecting = 0

    with open(fpath, 'r') as f:
        # Iterate lazily instead of loading the whole recording with readlines().
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.rstrip('\r\n')
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no data.
                continue

            parts = line.split(', ')
            ts = float(parts[0])
            label = parts[1]

            if label.startswith('/muse/elements/raw_fft'):
                idx = int(label[-1])
                if idx != expecting:
                    raise ValueError(
                        "{0}:{1}: expected raw_fft{2} but found {3}".format(
                            fpath, line_no, expecting, label))

                window_chunks.append(
                    np.array([float(part) for part in parts[2:]]))
                expecting = (idx + 1) % 4

                if expecting == 0:
                    # raw_fft3 just arrived: the window is complete.
                    if last_good_time != -1 and (ts - last_good_time) > max_sync_gap:
                        print("WARNING: Bad data sync.")

                    all_ts.append(ts)
                    all_fft.append(np.concatenate(window_chunks))
                    all_good_masks.append(bool(np.all(last_good)))
                    window_chunks = []
            elif label == IS_GOOD:
                last_good_time = ts
                last_good = np.array([float(part) for part in parts[2:]])

    fft_matrix = np.array(all_fft)
    print(len(all_fft))
    print(fft_matrix.shape)
    # dtype=bool keeps the mask usable as a boolean index for empty recordings.
    return np.array(all_good_masks, dtype=bool), np.array(all_ts), fft_matrix

In [34]:
def spec2data(recording_map):
    """Load every recording of every label and keep only the good windows.

    Args:
        recording_map: Mapping from label to a list of file names, each
            resolved relative to RECORDINGS_FOLDER.

    Returns:
        A dict mapping each label to the row-wise stack of the good feature
        windows of all its recordings. A label without any file maps to an
        empty array.
    """
    data_map = {}
    for label, fnames in recording_map.items():
        per_file = []
        for fname in fnames:
            rec_path = os.path.join(RECORDINGS_FOLDER, fname)
            # The timestamps are read but not needed here.
            good_mask, all_ts, feats = read_rec(rec_path)
            print(feats.shape, good_mask.shape)
            # Drop the windows that were not flagged as good.
            per_file.append(feats[good_mask])

        n_files = len(per_file)
        if n_files == 0:
            data_map[label] = np.array([])
        elif n_files == 1:
            data_map[label] = per_file[0]
        else:
            data_map[label] = np.vstack(per_file)

    # Summary: one line per label with the shape of its stacked features.
    for label in data_map:
        print("{0}: {1}".format(label, data_map[label].shape))

    return data_map

# Build the per-label feature arrays for both recording sets.
# NOTE: the validation recordings (RECORDINGS_VALID) end up in the variable
# named `data_map_test`; later cells refer to it under that name.
print("Processing train...")
data_map_train = spec2data(RECORDINGS_TRAIN)
print("Processing validation...")
data_map_test = spec2data(RECORDINGS_VALID)

Processing train...
1368
(1368, 516)
(1368, 516) (1368,)
1881
(1881, 516)
(1881, 516) (1881,)
1235
(1235, 516)
(1235, 516) (1235,)
1079
(1079, 516)
(1079, 516) (1079,)
1806
(1806, 516)
(1806, 516) (1806,)
1996
(1996, 516)
(1996, 516) (1996,)
tense: (2878, 516)
relaxed: (2304, 516)
baseline: (2764, 516)
Processing validation...
1895
(1895, 516)
(1895, 516) (1895,)
tense: (45, 516)
relaxed: (0,)
baseline: (0,)


In [51]:
def gen_data_matrix(data_map, label_order=None):
    """Stack per-label feature arrays into one matrix plus a target vector.

    The class id of a label is its position in ``label_order`` (or, when that
    is not given, in ``data_map``'s own iteration order). Labels with no data
    are skipped but still occupy their position, so the id printed for a label
    is always the id stored in ``y``. To get matching ids for two maps (e.g.
    train and validation), either give both maps the same key order or pass
    the same ``label_order`` to both calls.

    Args:
        data_map: Mapping from label to a 2-D array with one row per sample
            (or an empty array when the label has no data).
        label_order: Optional sequence of labels fixing the label -> id
            assignment. Labels missing from ``data_map`` are treated as empty.

    Returns:
        ``(X, y)`` where ``X`` holds the rows of all non-empty labels in id
        order and ``y`` is a float64 vector with the class id of each row.
        ``(None, None)`` when no label has any data.
    """
    labels = list(data_map.keys()) if label_order is None else list(label_order)

    feature_chunks = []
    target_chunks = []
    for idx, label in enumerate(labels):
        data = data_map.get(label)
        if data is None or len(data) == 0:
            continue

        print(idx, label)

        feature_chunks.append(data)
        # Always label with the enumerated index, also for the first non-empty
        # label, so ids do not shift when leading labels are empty.
        target_chunks.append(np.full(data.shape[0], idx, dtype=np.float64))

    if not feature_chunks:
        return None, None

    if len(feature_chunks) == 1:
        return feature_chunks[0], target_chunks[0]

    # Stack once at the end instead of re-copying the matrix per label.
    return np.vstack(feature_chunks), np.hstack(target_chunks)
    
# Flatten both label maps into (features, targets) pairs. The class ids are
# derived separately for each map from its own key order, so the two maps must
# list their labels in the same order for train and validation ids to agree.
X_train, y_train = gen_data_matrix(data_map_train)
X_test, y_test = gen_data_matrix(data_map_test)

0 tense
1 relaxed
2 baseline
0 tense


In [87]:
# Show every feature value of the first training window on a single line.
print(", ".join(map(str, X_train[0])))

# all_ts_run = np.array(all_ts_run)
# deltas = all_ts_run[1:] - all_ts_run[:-1]
# print(deltas.mean())
# print(np.median(deltas))
# print(deltas.std())

11.9511175, 18.291094, 20.788092, 22.603186, 23.209288, 22.306587, 19.845474, 15.335599, 13.440422, 14.134462, 13.811703, 12.711906, 8.919536, 0.3454846, 0.6849613, 6.1878805, 6.5826874, 7.902959, 5.829215, 0.29376704, -4.789386, -5.8287697, -3.3021586, -2.1122236, -1.4895154, -0.8462546, -10.879423, -6.6052256, 0.17633395, -0.9099651, -3.7232182, -8.046315, -7.6615515, -1.0993654, -2.224989, -5.712921, -4.9520135, -3.4098516, -7.8462725, -4.8222146, -6.8093023, -14.508396, -10.934134, -8.291086, -7.6863613, -4.1332884, -5.0380807, -12.109217, -9.899872, -4.034348, -7.2110343, -3.2156935, -2.554939, -7.32338, -4.051792, -2.977265, -5.8236527, -9.683343, -12.700991, -12.114233, -9.5633545, -10.19207, -11.097839, -11.437096, -13.15785, -10.935534, -9.01671, -14.331649, -25.754673, -29.391386, -27.281206, -20.154917, -21.09897, -22.342802, -17.841003, -12.930537, -14.874968, -11.776426, -10.409021, -9.799373, -11.961889, -12.736766, -13.440732, -10.562811, -10.9712, -6.95038, -9.346109, -

In [37]:
# X_train = np.vstack((all_fft_base, all_fft_run))
# y_train = np.hstack((np.zeros(all_fft_base.shape[0]), np.ones(all_fft_run.shape[0])))
# X_train = np.vstack((all_fft_heli, all_fft_run, all_fft_base))
# y_train = np.hstack((np.zeros(all_fft_heli.shape[0]), np.ones(all_fft_run.shape[0]), 2 * np.ones(all_fft_base.shape[0])))

# print(all_fft_base.shape, all_fft_run.shape, all_fft_heli.shape)
# Sanity check: X and y must have the same number of rows.
print(X_train.shape, y_train.shape)
print(X_train.dtype, y_train.dtype)

(7946, 516) (7946,)
float64 float64


In [38]:
def report(grid_scores, n_top=3):
    """Print the best-scoring grid-search candidates, best first.

    For each of the top candidates two lines are printed: rank, mean
    validation score and its standard deviation, then the parameter setting.

    Args:
        grid_scores: Either the legacy ``GridSearchCV.grid_scores_`` sequence
            (entries exposing ``mean_validation_score``,
            ``cv_validation_scores`` and ``parameters``) or a
            ``GridSearchCV.cv_results_`` mapping providing the keys
            ``'mean_test_score'``, ``'std_test_score'`` and ``'params'``.
        n_top: Maximum number of candidates to print.
    """
    if hasattr(grid_scores, 'keys'):
        # cv_results_ style: parallel columns, one entry per candidate.
        candidates = list(zip(grid_scores['mean_test_score'],
                              grid_scores['std_test_score'],
                              grid_scores['params']))
    else:
        # Legacy grid_scores_ style: one record per candidate.
        candidates = [(score.mean_validation_score,
                       np.std(score.cv_validation_scores),
                       score.parameters)
                      for score in grid_scores]

    # sorted() is stable, so ties keep their original relative order.
    ranked = sorted(candidates, key=lambda cand: cand[0], reverse=True)[:n_top]
    for rank, (mean, std, params) in enumerate(ranked, start=1):
        print("{2}: Mean validation score: {0:.3f} (std: {1:.3f})".format(
              mean, std, rank))
        print("Parameters: {0}".format(params))

In [81]:
from sklearn.svm import SVC
from sklearn.model_selection import *
from sklearn.linear_model import *
from sklearn.metrics import *
from sklearn.ensemble import *
from sklearn.tree import *
from sklearn.neighbors import *
from sklearn.pipeline import *
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import *

# Model selection cell. Several alternative setups are kept here side by side;
# because later assignments overwrite earlier ones, only the LAST assignment
# of `clf` and of `tuned_parameters` takes effect (currently the
# LogisticRegression setup).

# clf = DummyClassifier()
# This RandomForestClassifier instance is discarded: `clf` is rebound below.
clf = RandomForestClassifier()
# clf = SVC()
clf = LogisticRegression()

# Candidate C values; only referenced by the (overwritten) SVC grid below.
Cs = [0.01, 0.1, 1.0, 2.0, 10, 25]
# tuned_parameters = [{'kernel': ['rbf'], 'gamma': [0.1, 0.01, 0.001, 0.0001],
#                      'C': [0.001, 0.01, 0.1, 0.5, 1, 5, 10, 25, 50]},
#                     {'kernel': ['linear'], 'C': [0.001, 0.01, 0.05, 0.1, 0.5, 10, 50, 100]}]
# cws = ['balanced', {0: 3, 1: 1}, {0: 1, 1: 1}]
# Grid for an SVC classifier -- overwritten below, not used.
tuned_parameters = [ # SVC
#     {
#         'clf__kernel': ['linear'],
#         'clf__C': Cs,
#     },
    {
        'clf__kernel': ['rbf'],
        'clf__gamma': [0.1, 0.01, 0.001], #, 0.0001],
        'clf__C': Cs,
#         'epsilon': []
    },
#     {
# #         'kernel': ['poly']
#     }
]
# Grid for the random forest -- overwritten below, not used.
tuned_parameters = {
    'clf__n_estimators': [2, 3, 5, 10], #, 15, 20], #, 25, 50], #, 75, 100, 125]#, 150],
    'clf__class_weight': ['balanced'],
#     'scaler__with_mean': [True, False],
#     'scaler__with_std': [True, False],
}

# Effective grid: this is the one passed to GridSearchCV. The `clf__` prefix
# addresses the 'clf' step of the pipeline defined below.
tuned_parameters = { # LogReg
    'clf__C': [0.01, 0.1, 0.25, 1.0, 2.5, 5.0],
#     'scaler__with_mean': [True, False],
#     'scaler__with_std': [True, False],
}
# tuned_parameters = { }

# Standardise the features, then classify.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', clf)
])

# Rough view of the class balance: counts of y_train over np.histogram's
# default bins (so most entries are 0 when there are only a few class ids).
print(np.histogram(y_train)[0])
grid = GridSearchCV(pipeline, tuned_parameters, cv=10, scoring='f1_macro', n_jobs=-2, verbose=1)

print(X_train.shape)
print("Kicking off grid search...")

# Hack to suppress tons of output when doing feature selection (doesn't impact the actual feature selection)
# np.seterr(all='ignore')
# `res` is what later cells use for prediction and for saving the best model.
res = grid.fit(X_train, y_train)

[2878    0    0    0    0 2304    0    0    0 2764]
(7946, 516)
Kicking off grid search...
Fitting 10 folds for each of 6 candidates, totalling 60 fits


[Parallel(n_jobs=-2)]: Done  36 tasks      | elapsed:   59.5s
[Parallel(n_jobs=-2)]: Done  60 out of  60 | elapsed:  1.6min finished


In [78]:
# Print the ranked grid-search candidates.
# NOTE(review): `grid_scores_` appears to have been removed from GridSearchCV
# in scikit-learn 0.20 in favour of `cv_results_` -- confirm against the
# installed version. The recorded output of this cell lists parameters
# (e.g. scaler__with_std) that are not in the grid defined above, so it is
# presumably left over from an earlier run.
report(res.grid_scores_, n_top=50)

1: Mean validation score: 0.881 (std: 0.050)
Parameters: {'scaler__with_std': True, 'clf__C': 0.001, 'scaler__with_mean': True}
2: Mean validation score: 0.880 (std: 0.040)
Parameters: {'scaler__with_std': True, 'clf__C': 0.005, 'scaler__with_mean': True}
3: Mean validation score: 0.880 (std: 0.046)
Parameters: {'scaler__with_std': True, 'clf__C': 0.001, 'scaler__with_mean': False}
4: Mean validation score: 0.878 (std: 0.039)
Parameters: {'scaler__with_std': True, 'clf__C': 0.01, 'scaler__with_mean': True}
5: Mean validation score: 0.872 (std: 0.044)
Parameters: {'scaler__with_std': True, 'clf__C': 0.005, 'scaler__with_mean': False}
6: Mean validation score: 0.868 (std: 0.036)
Parameters: {'scaler__with_std': True, 'clf__C': 0.1, 'scaler__with_mean': True}
7: Mean validation score: 0.867 (std: 0.041)
Parameters: {'scaler__with_std': True, 'clf__C': 0.01, 'scaler__with_mean': False}
8: Mean validation score: 0.866 (std: 0.037)
Parameters: {'scaler__with_std': False, 'clf__C': 0.001, 'sc



In [79]:
# Confusion matrix on the TRAINING data, i.e. the same data the grid search
# was fitted on -- this shows how well the model fits, not how it generalises.
y_pred_train = grid.predict(X_train)
confusion_matrix(y_train, y_pred_train)

array([[2803,   48,   27],
       [  26, 2278,    0],
       [   0,   24, 2740]])

In [82]:
# Predict a class id for every validation window, then print the id that was
# predicted most often (a majority vote over the windows).
yy=res.predict(X_test)
print(yy)
# The predictions are cast to int64 before being passed to np.bincount.
counts = np.bincount(yy.astype(np.int64))
print(np.argmax(counts))
# print(f1_score(y_test, yy, average='macro'))


[ 0.  0.  0.  2.  2.  2.  2.  2.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  1.  1.
  1.  0.  0.  0.  0.  0.  2.  2.  2.]
0


In [47]:
# print(res)

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-2,
       param_grid={'clf__n_estimators': [10, 15, 20, 25, 50], 'clf__class_weight': ['balanced']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='f1_micro', verbose=1)


# Saving the model for later use

In [84]:
import pickle

# Persist the best pipeline found by the grid search. The path is relative to
# the notebook's working directory.
# NOTE: unpickling executes arbitrary code, so only load this file from a
# trusted location.
with open('../model-lr.pkl', 'wb') as f:
    pickle.dump(res.best_estimator_, f)
    print("Successfully dumped model pickle.")

Successfully dumped model pickle.
