In [1]:
import pandas as pd
from pathlib import Path
import os
from Step_4_MLmodels.PrepareDatasetForLearning import PrepareDatasetForLearning
from Step_4_MLmodels.LearningAlgorithms import ClassificationAlgorithms
from Step_4_MLmodels.Evaluation import ClassificationEvaluation
from Step_4_MLmodels.FeatureSelection import FeatureSelectionClassification
from util import util
from util.VisualizeDataset import VisualizeDataset
from sklearn.ensemble import RandomForestClassifier

# Set up file names and locations.
FOLDER_PATH = Path('C:\\Users\\DUC_AN\\Documents\\GitHub\\EEG-ICSSE\\intermediate_datafiles\\motor_imagery\\step3_result\\all')
RESULT_PATH = Path('C:\\Users\\DUC_AN\\Documents\\GitHub\\EEG-ICSSE\\intermediate_datafiles\\motor_imagery\\step4_result\\all')

In [2]:
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
import pandas as pd
import numpy as np
import os
import pickle
import inspect

In [3]:
RESULT_PATH.mkdir(exist_ok=True, parents=True)
# for this script, we want to first load in all datasets
# since the Prepare dataset function accepts a list of pd dataframes
prepare = PrepareDatasetForLearning()
all_datasets = []

for instance in os.scandir(FOLDER_PATH): # go through all instances of experiments
    instance_path = instance.path
    dataset = pd.read_csv(instance_path, index_col=0)
    dataset.index = pd.to_datetime(dataset.index)
    all_datasets.append(dataset)

#now all dataframes are added to the list all_datasets
#print(all_datasets)

# Let's create our visualization class again.

'''
the classification of the motor imagery can be seen as a non-temporal task, as we want to predict imagery based on a window of e.g. 2 sec,
without taking into account previous windows.
We first create 1 column representing our classes, and then create a train val test split of 60 20 20
In order to do this, we first create a train test split of 80 20, and then for the train set we split again in 75 25
For each dataset instance. we split trainvaltest split individually.
Then later we add all train data together, all val data together, and all test data together.
This way we sample randomly across all users to get a result for the whole 'population' of subjects.
'''
# we set filter is false so also the data besides left and right are taken with us
train_X, val_X, test_X, train_y, val_y, test_y = prepare.split_multiple_datasets_classification(
    all_datasets, ['0', '1'], 'like', [0.2, 0.25],filter=False, temporal=False)
print('Training set length is: ', len(train_X.index))
print('Validation set length is: ', len(val_X.index))
print('Test set length is: ', len(test_X.index))   

# select subsets of features which we will consider:
pca_features = ['pca_1','pca_2','pca_3','pca_4']
ica_features = ['FastICA_1','FastICA_2','FastICA_3','FastICA_4','FastICA_5','FastICA_6','FastICA_7','FastICA_8','FastICA_9','FastICA_10',
'FastICA_11','FastICA_12','FastICA_13','FastICA_14','FastICA_15','FastICA_16','FastICA_17','FastICA_18','FastICA_19','FastICA_20']
time_features = [name for name in dataset.columns if '_temp_' in name]
freq_features = [name for name in dataset.columns if (('_freq' in name) or ('_pse' in name))]


# feature selection below we will use as input for our models:
basic_features = ['Delta_TP9','Delta_AF7','Delta_AF8','Delta_TP10','Theta_TP9','Theta_AF7','Theta_AF8','Theta_TP10','Alpha_TP9','Alpha_AF7',
'Alpha_AF8','Alpha_TP10','Beta_TP9','Beta_AF7','Beta_AF8','Beta_TP10','Gamma_TP9','Gamma_AF7','Gamma_AF8','Gamma_TP10']
basic_w_PCA = list(set().union(basic_features, pca_features))
basic_w_ICA = list(set().union(basic_features, ica_features))
all_features = list(set().union(basic_features, ica_features, time_features, freq_features))

fs = FeatureSelectionClassification()
num_features = 20

# we will select the top 20 features based on an experiment with a deciscion tree which we will use as input for our models as well
# this is already been run, see below

'''
selected_features, ordered_features, ordered_scores = fs.forward_selection(num_features,
                                                              train_X[all_features],
                                                              test_X[all_features],
                                                              train_y,
                                                              test_y,
                                                              gridsearch=False)
print(selected_features)
'''

# the best feature are right now:
selected_features = ['Delta_AF7_temp_max_ws_10', 'Alpha_TP9_temp_mean_ws_10', 'Delta_AF7_temp_slope_ws_30', 'FastICA_2', 
'Alpha_TP9_temp_median_ws_20', 'Delta_AF8_temp_max_ws_10', 'Beta_TP10_freq_30.0_Hz_ws_10', 
'Beta_TP9_temp_std_ws_20', 'Theta_TP10_temp_max_ws_20', 'Gamma_TP9_temp_median_ws_20',
 'Gamma_TP10_freq_30.0_Hz_ws_10', 'Alpha_TP10_temp_std_ws_20', 'Gamma_AF7_freq_30.0_Hz_ws_10', 'Delta_TP10', 
 'Beta_TP9_temp_median_ws_20', 'Delta_TP10_temp_min_ws_20', 'Theta_TP9_temp_median_ws_30', 
 'Delta_AF8_temp_min_ws_20', 'Delta_AF8_temp_mean_ws_10', 'Beta_TP9_freq_0.0_Hz_ws_10']


possible_feature_sets = [basic_features, basic_w_PCA, basic_w_ICA, all_features, selected_features]
feature_names = ['initial set', 'basic_w_PCA', 'basic_w_ICA', 'all_features', 'Selected features']
N_KCV_REPEATS = 10 # some non deterministic models we will run a couple of times as their inits are random to get average results


# then here, we run each model
learner = ClassificationAlgorithms()
eval = ClassificationEvaluation()
scores_over_all_algs = []

Training set length is:  3132
Validation set length is:  1051
Test set length is:  1057


In [4]:
print(len(train_X))

3132


In [16]:
# Apply a random forest approach for classification upon the training data (with the specified value for
# the minimum samples in the leaf, the number of trees, and if we should print some of the details of the
# model print_model_details=True) and use the created model to predict the outcome for both the
# test and training set. It returns the categorical predictions for the training and test set as well as the
# probabilities associated with each class, each class being represented as a column in the data frame.
def random_forest(train_X, train_y, test_X, n_estimators=10, min_samples_leaf=5, criterion='gini', print_model_details=False, gridsearch=True, save_model=False):

    if gridsearch:
        tuned_parameters = [{'min_samples_leaf': [2, 10, 50, 100, 200],
                                'n_estimators':[10, 50, 100],
                                'criterion':['gini', 'entropy']}]
        rf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=5, scoring='accuracy', error_score= 'raise')
    else:
        rf = RandomForestClassifier(n_estimators=n_estimators, min_samples_leaf=min_samples_leaf, criterion=criterion)

    # Fit the model

    rf.fit(train_X, train_y)

    if gridsearch and print_model_details:
        print(rf.best_params_)

    if gridsearch:
        rf = rf.best_estimator_

    pred_prob_training_y = rf.predict_proba(train_X)
    pred_prob_test_y = rf.predict_proba(test_X)
    pred_training_y = rf.predict(train_X)
    pred_test_y = rf.predict(test_X)
    frame_prob_training_y = pd.DataFrame(pred_prob_training_y, columns=rf.classes_)
    frame_prob_test_y = pd.DataFrame(pred_prob_test_y, columns=rf.classes_)

    if print_model_details:
        ordered_indices = [i[0] for i in sorted(enumerate(rf.feature_importances_), key=lambda x:x[1], reverse=True)]
        print('Top 20 feature importances random forest:')
        for i in range(0, 20):
            print(train_X.columns[ordered_indices[i]], end='')
            print(' & ', end='')
            print(rf.feature_importances_[ordered_indices[i]])
    
    if save_model:
        # save the model to disk
        filename = 'final_' + str(inspect.stack()[0][3]) + '_model_BCI.sav'
        pickle.dump(rf, open(filename, 'wb'))

    return pred_training_y, pred_test_y, frame_prob_training_y, frame_prob_test_y

In [29]:
train_y.nunique()

check    2
dtype: int64

In [28]:
train_y.check.unique()

array([1., 0.])

In [31]:
train_X.count()

Delta_TP9                        3132
Delta_AF7                        3132
Delta_AF8                        3132
Delta_TP10                       3132
Theta_TP9                        3132
                                 ... 
Gamma_TP10_freq_20.0_Hz_ws_10    3132
Gamma_TP10_freq_30.0_Hz_ws_10    3132
Gamma_TP10_freq_40.0_Hz_ws_10    3132
Gamma_TP10_freq_50.0_Hz_ws_10    3132
class                            3132
Length: 585, dtype: int64

In [18]:
class_train_y, class_test_y, class_train_prob_y, class_test_prob_y = random_forest(
            train_X, train_y, test_X, gridsearch=True, print_model_details=True, save_model=True)
performance_training_rf_final = eval.f1(train_y, class_train_y)
performance_test_rf_final = eval.f1(test_y, class_test_y)
confusionmatrix_rf_final = eval.confusion_matrix(test_y, class_test_y, ['check'])
print(performance_test_rf_final) #test performance is reasonable!
print(confusionmatrix_rf_final)

ValueError: could not convert string to float: 'undefined'