In [1]:
import os
import scipy.io
import numpy as np
import pandas as pd
from sklearn.metrics import normalized_mutual_info_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.manifold import TSNE

parent_dir=os.getcwd()

# Data preproccessing

1)Find the lowest number of rows among all subjects(child's EEG data).

In [2]:
def number_of_rows(directory):
    rows = []
    for file in sorted(os.listdir(directory)):
        data = scipy.io.loadmat(directory + file)
        keys = list(data.keys())
        rows.append(len(data[keys[3]]))
    return rows

In [3]:
row_sizes = []
num_rows = []
path = parent_dir + "\\DATAFORCOURSE\\"
types = ["adhd\\", "control\\"]
for type in types:
    directory = path + type
    num_rows = number_of_rows(directory)
    row_sizes.append(num_rows)
row_sizes = list(np.concatenate(row_sizes).flat)
min_number_of_rows = min(row_sizes)
print("Minimum rows number: ", min_number_of_rows)

Minimum rows number:  7983


2)Load data from mat files into matrix

In [4]:
def load_from_mat_file_into_matrix(directory):
    matrices = []
    for file in sorted(os.listdir(directory)):
        data = scipy.io.loadmat(directory + file)
        keys = list(data.keys())
        data_array = np.array(data[keys[3]][0:min_number_of_rows])
        matrices.append(data_array)
    return matrices

In [5]:
adhd = load_from_mat_file_into_matrix(path + types[0])
control = load_from_mat_file_into_matrix(path + types[1])

4)Calculating mutual information between 19 channels (features) of each child.

Done by sklearn.metrics.normalized_mutual_info_score.

This is useful to measure the agreement of two independent label assignments strategies on the same dataset when the real ground truth is not known.

In this function, mutual information is normalized by some generalized mean of H(labels_true) and H(labels_pred)), defined by the average_method (default=’arithmetic’).

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.normalized_mutual_info_score.html#sklearn.metrics.normalized_mutual_info_score

In [6]:
def calc_mutual_information(df, features, current_col):
    rows = []
    for i in features:
        rows.append(normalized_mutual_info_score(df[current_col], df[i]))
    return rows

In [7]:
mutual_adhd = []
mutual_control = []

features = ["Fz" , "Cz", "Pz", "C3", "T3", "C4", "T4", "Fp1", "Fp2", "F3", "F4", "F7", "F8", "P3", "P4", "T5", "T6", "O1", "O2"]

for i in range(len(adhd)):
    child = adhd[i]
    df = pd.DataFrame(child, columns=features)  
    matrix = []
    for i in features:
        row = calc_mutual_information(df, features, i)
        matrix.append(row)
    final = pd.DataFrame(matrix, columns=features)
    mutual_adhd.append(final)
    
for i in range(len(control)):
    child = control[i]
    df = pd.DataFrame(child, columns=features)   
    matrix = []
    for i in features:
        row = calc_mutual_information(df, features, i)
        matrix.append(row)
    final = pd.DataFrame(matrix, columns=features)
    mutual_control.append(final)


5)Calculating average mutual information for each column

The average mutual information I(X; Y) is a measure of the amount of “information” that the random variables X and Y provide about one another. It denotes the average mutual information for each feature from every other features.

Output : for every child 19-dimensional vector with average mutual information of each feature (19 features).

In [8]:
def average_mutual(group, features):
    subjects = []
    for i in range(len(group)):
        child = group[i]
        col_average = []
        for column in features:
            average = child[column].mean()
            col_average.append(average)
        subjects.append(col_average)
    return subjects

In [9]:
subjects = []
subjects.append(average_mutual(mutual_adhd,features))
subjects.append(average_mutual(mutual_control,features))

6)Build the data set.

In [10]:
adhd = pd.DataFrame(subjects[0], columns = features)
adhd['Class'] = 0
control = pd.DataFrame(subjects[1], columns = features)
control['Class'] = 1
EEG_data_set = pd.concat([adhd,control])
EEG_data_set = EEG_data_set.sample(frac=1).reset_index(drop=True) #Shuffles the dataframe
EEG_data_set = EEG_data_set.drop(['Cz', 'Pz','C3','T3','C4','T4','P3','P4','T5','T6','O1','O2'], axis=1)
EEG_data_set.head()

Unnamed: 0,Fz,Fp1,Fp2,F3,F4,F7,F8,Class
0,0.254842,0.121455,0.22814,0.119689,0.247716,0.138682,0.253258,1
1,0.190639,0.168243,0.127769,0.148559,0.170156,0.190432,0.169368,0
2,0.167687,0.154063,0.167344,0.131414,0.160936,0.148533,0.16902,0
3,0.161793,0.1784,0.168914,0.163307,0.172735,0.171937,0.186835,1
4,0.107716,0.118172,0.132474,0.088213,0.104596,0.103627,0.123782,0


# Classification

2 models were tested: SVM and KNN.

For both models, we are testing a vatious of parameters with sklearn.model_selection.GridSearchCV.
GridSearchCV is exhaustive search over specified parameter values for an estimator.
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [65]:
features = ['Fz', 'Fp1','Fp2','F3','F4','F7','F8']
data = EEG_data_set[features]
labels = EEG_data_set['Class']
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2)

In [66]:
parameters = {
    "C": [0.01, 0.1, 1.],
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
    "gamma": ["scale", "auto"],
}
model_svc = SVC(class_weight="balanced",probability=True,)
model_svc = GridSearchCV(model_svc, parameters, scoring='accuracy')
model_svc.fit(X_train, y_train)
print(f'Parameters {model_svc.best_params_}')

Parameters {'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}


In [67]:
svm_model = SVC(**model_svc.best_params_)
svm_model.fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)

In [68]:
print("Score: ", svm_model.score(X_test, y_test))

Score:  0.8


In [69]:
gridsearch_knn = {
    'n_neighbors': [1, 2, 3, 4, 5, 6],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}
knn = GridSearchCV(KNeighborsClassifier(),gridsearch_knn, )
knn_result = knn.fit(X_train, y_train)
print(f'Parameters {knn_result.best_params_}')

Parameters {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}


In [70]:
knn_model = KNeighborsClassifier(**knn_result.best_params_)
knn_model.fit(X_train, y_train)
knn_predictions = knn_model.predict(X_test)
print("Score: ", knn_model.score(X_test, y_test))

Score:  0.64
