In [1]:
from utils import *
from dataset import *
from constants import *
from models.BaseModels import *

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

from dataset import Participant

# Seed the `random` generator with the shared RANDOM_STATE constant.
# NOTE(review): `random` and RANDOM_STATE are not imported explicitly here, so
# they presumably arrive through one of the star imports above - confirm.
# This call only seeds `random`; it passes no seed to any other library.
random.seed(RANDOM_STATE)

In [2]:
# Cache switch read by the loading cells of this notebook: when True they read
# the files already stored under `saved/`; when False they recompute the
# objects and overwrite those files.
use_saved = True

In [3]:
# Obtain the participant object: rebuild it from the raw data and refresh the
# pickle cache when caching is disabled, otherwise read the cached pickle.
# NOTE(review): unpickling executes arbitrary code, so the cache file should
# only ever come from a trusted source.
if not use_saved:
    participant = Participant('s6', data_path=DATA_PATH_NOTEBOOK, alpha=0.05)
    with open('saved/s6.pkl', 'wb') as cache_file:
        pickle.dump(participant, cache_file, protocol=pickle.HIGHEST_PROTOCOL)
else:
    participant = Participant.load_from_pickle('saved/s6.pkl')

In [4]:
# Value passed as `test_size` to the train_test_split calls in this notebook.
test_size = 0.3
# Value passed as `n_components` to PCA in the observation analysis.
pca_expl_var = 0.95

## Execution

In [5]:
# Feature table for movement type 'E': recompute it and rewrite the HDF5 cache
# when caching is disabled, otherwise read the cached table.
if not use_saved:
    ex_features = participant.get_features_all_sessions_mvt('E')
    ex_features.to_hdf('saved/ex_features.h5', key='df', mode='w', data_columns=True)
else:
    ex_features = pd.read_hdf('saved/ex_features.h5', key='df')

In [7]:
# Report the table dimensions (row count, column count).
n_samples, n_columns = ex_features.shape
print(f'The dataset contains {n_samples} samples and {n_columns} features.')

# Report how many channels were kept and the distinct locations they map to.
print(f'The {len(participant.relevant_channels_ex)} relevant channels are located in the following locations:')
relevant_locations = {
    participant.channels_locations[channel.idx]
    for channel in participant.relevant_channels_ex
}
print(sorted(relevant_locations))

The dataset contains 128 samples and 4897 features.
The 68 relevant channels are located in the following locations:
['WM_insula', 'WM_paracentral', 'WM_precentral', 'caudalmiddlefrontal', 'insula', 'paracentral', 'postcentral', 'precentral', 'superiorfrontal', 'supramarginal']


Let's create a baseline by taking the same number of channels, but without checking whether they are responsive:

In [None]:
# Request as many channels for the baseline as there are relevant channels,
# so both feature tables are built from the same number of channels.
n_baseline_channels = len(participant.relevant_channels_ex)
baseline_features = participant.get_features_all_sessions_rnd(n_baseline_channels, movtype='E')

100%|██████████| 68/68 [00:04<00:00, 15.13it/s]
100%|██████████| 68/68 [00:03<00:00, 22.53it/s]
100%|██████████| 68/68 [00:02<00:00, 26.11it/s]
100%|██████████| 68/68 [00:03<00:00, 18.76it/s]
100%|██████████| 68/68 [00:03<00:00, 19.44it/s]
100%|██████████| 68/68 [00:02<00:00, 25.40it/s]]
100%|██████████| 68/68 [00:03<00:00, 21.37it/s]]
100%|██████████| 68/68 [00:02<00:00, 24.64it/s]]
100%|██████████| 68/68 [00:04<00:00, 15.41it/s]]
100%|██████████| 68/68 [00:03<00:00, 17.18it/s]]
100%|██████████| 68/68 [00:02<00:00, 28.25it/s]]
100%|██████████| 68/68 [00:02<00:00, 24.87it/s]]
100%|██████████| 68/68 [00:03<00:00, 20.08it/s]]
100%|██████████| 68/68 [00:02<00:00, 23.87it/s]]
100%|██████████| 68/68 [00:04<00:00, 14.42it/s]]
100%|██████████| 68/68 [00:02<00:00, 22.89it/s]]
100%|██████████| 68/68 [00:03<00:00, 21.01it/s]]
100%|██████████| 68/68 [00:02<00:00, 25.05it/s]]
100%|██████████| 68/68 [00:04<00:00, 16.35it/s]]
100%|██████████| 68/68 [00:02<00:00, 25.97it/s]]
100%|██████████| 68/68 [0

In [None]:
# Report the baseline table dimensions (row count, column count).
n_baseline_samples, n_baseline_columns = baseline_features.shape
print(f'The baseline dataset contains {n_baseline_samples} samples and {n_baseline_columns} features.')

The baseline dataset contains 128 samples and 4897 features.


### Train a model on the baseline features (Logistic Regression)

In [None]:
# Separate the target column from the predictor columns of the baseline table.
y = baseline_features['label']
X = baseline_features.drop(columns='label')

# Hold out `test_size` of the samples, using the shared seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=RANDOM_STATE,
)

# Fit on the training split and report accuracy on the held-out split.
logreg = LogisticRegressionModel()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.85


## Let's now do the analysis for the responsive channels

In [9]:
# Separate the target column from the predictor columns of the feature table.
y = ex_features['label']
X = ex_features.drop(columns='label')

# Hold out `test_size` of the samples; the model cells that follow reuse this split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=RANDOM_STATE,
)

### Train a model (Logistic Regression)

In [10]:
# Fit the logistic-regression wrapper with its default settings on the current
# training split and report accuracy on the current test split.
# NOTE(review): LogisticRegressionModel is not imported by name; it presumably
# comes from `models.BaseModels` - confirm what preprocessing it applies.
logreg = LogisticRegressionModel()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.77


#### With PCA

In [11]:
# Logistic regression with the wrapper's PCA option enabled. `expl_var` is
# taken from the notebook-level `pca_expl_var` setting rather than a repeated
# literal, so changing that setting affects every PCA experiment consistently.
logreg = LogisticRegressionModel(use_pca=True, expl_var=pca_expl_var)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.69


### Train a model (SVM)

In [12]:
# Fit the SVM wrapper with its default settings on the current training split
# and report accuracy on the current test split.
# NOTE(review): SVMModel is not imported by name; it presumably comes from
# `models.BaseModels` - confirm what preprocessing/tuning it applies.
svm = SVMModel()
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.62


#### With PCA

In [14]:
# SVM with the wrapper's PCA option enabled. `expl_var` is taken from the
# notebook-level `pca_expl_var` setting rather than a repeated literal, so
# changing that setting affects every PCA experiment consistently.
svm = SVMModel(use_pca=True, expl_var=pca_expl_var)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.51


### Train a model (Random Forest)

In [15]:
# Fit the random-forest wrapper with its default settings on the current
# training split and report accuracy on the current test split.
# The model gets its own name instead of reusing `svm`, so the variable name
# matches the estimator it holds and the SVM object is not overwritten.
random_forest = RandomForestModel()
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.54


## Observation

In [None]:
# Feature table for movement type 'O': recompute it and rewrite the HDF5 cache
# when caching is disabled, otherwise read the cached table.
if not use_saved:
    obs_features = participant.get_features_all_sessions_mvt('O')
    obs_features.to_hdf('saved/obs_features.h5', key='df', mode='w', data_columns=True)
else:
    obs_features = pd.read_hdf('saved/obs_features.h5', key='df')

100%|██████████| 51/51 [00:03<00:00, 14.79it/s]
100%|██████████| 51/51 [00:02<00:00, 19.41it/s]
100%|██████████| 51/51 [00:04<00:00, 12.15it/s]
100%|██████████| 51/51 [00:03<00:00, 16.93it/s]
100%|██████████| 51/51 [00:03<00:00, 16.81it/s]
100%|██████████| 51/51 [00:02<00:00, 20.05it/s]
100%|██████████| 51/51 [00:02<00:00, 23.71it/s]]
100%|██████████| 51/51 [00:02<00:00, 22.41it/s]]
100%|██████████| 51/51 [00:03<00:00, 13.51it/s]]
100%|██████████| 51/51 [00:02<00:00, 17.20it/s]]
100%|██████████| 51/51 [00:03<00:00, 14.76it/s]]
100%|██████████| 51/51 [00:02<00:00, 22.37it/s]]
100%|██████████| 51/51 [00:02<00:00, 23.64it/s]]
100%|██████████| 51/51 [00:02<00:00, 22.34it/s]]
100%|██████████| 51/51 [00:03<00:00, 13.45it/s]]
100%|██████████| 51/51 [00:02<00:00, 20.16it/s]]
100%|██████████| 51/51 [00:02<00:00, 18.51it/s]]
100%|██████████| 51/51 [00:02<00:00, 22.31it/s]]
100%|██████████| 51/51 [00:02<00:00, 17.97it/s]]
100%|██████████| 51/51 [00:02<00:00, 20.70it/s]]
100%|██████████| 51/51 [00

In [None]:
# Report the observation table dimensions (row count, column count).
n_obs_samples, n_obs_columns = obs_features.shape
print(f'The dataset contains {n_obs_samples} samples and {n_obs_columns} features.')

The dataset contains 128 samples and 2881 features.


### Train a model (SVM)

In [None]:
# Separate the target column from the predictor columns of the observation table.
y = obs_features['label']
X = obs_features.drop(columns='label')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=RANDOM_STATE,
)

# Standardise using statistics learned from the training split only.
# NOTE(review): the scaler is fitted before the grid search, so the
# cross-validation folds inside GridSearchCV share these scaling statistics.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Grid-search the SVC regularisation strength and kernel with cross-validation.
parameters = {
    'C': [0.1, 1, 10, 100, 1000],
    'kernel': ['linear', 'rbf', 'sigmoid'],
}
svm = SVC()
clf = GridSearchCV(estimator=svm, param_grid=parameters)
clf.fit(X_train, y_train)
print(clf.best_params_)

# Score the selected configuration on the held-out split.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

{'C': 1, 'kernel': 'rbf'}
Accuracy: 0.67


#### With PCA

In [None]:
# Observation features: standardise, reduce with PCA, then grid-search an SVC.
# The dangling `svm = ` statement that made this cell a SyntaxError has been
# removed; `svm` is assigned once, right before the grid search.
X = obs_features.drop('label', axis=1)
y = obs_features['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=RANDOM_STATE)

# Normalize features (statistics are learned from the training split only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit PCA on the training split only and apply the same projection to the
# test split. A float `n_components` in (0, 1) makes PCA keep as many
# components as needed to explain that fraction of the variance.
pca = PCA(n_components=pca_expl_var)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Train SVM: cross-validated search over regularisation strength and kernel
parameters = {'C': [0.1, 1, 10, 100, 1000], 'kernel': ['linear', 'rbf', 'sigmoid']}
svm = SVC()
clf = GridSearchCV(svm, parameters)
clf.fit(X_train_pca, y_train)
print(clf.best_params_)

# Test SVM on the held-out, PCA-projected split
y_pred = clf.predict(X_test_pca)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

{'C': 0.1, 'kernel': 'sigmoid'}
Accuracy: 0.59


### Train a model (Random Forest)

In [None]:
# Observation features: standardise, then grid-search a random forest.
X = obs_features.drop('label', axis=1)
y = obs_features['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=RANDOM_STATE)

# Normalize features (statistics are learned from the training split only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train Random Forest: cross-validated search over forest size and tree depth
n_estimators = [10, 50, 90, 130]
max_depth = [10, 25, 50]
param_grid = {'n_estimators': n_estimators, 'max_depth': max_depth}

# Seed the forest explicitly: the notebook only seeds `random`, which does not
# reach this estimator, so without `random_state` the selected parameters and
# the reported accuracy change from run to run.
rf = RandomForestClassifier(random_state=RANDOM_STATE)
clf = GridSearchCV(rf, param_grid)
clf.fit(X_train, y_train)
print(clf.best_params_)

# Test Random Forest on the held-out split
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


{'max_depth': 50, 'n_estimators': 130}
Accuracy: 0.64
