In [1]:
import numpy as np
import scipy.io
import pandas as pd
import json
from typing import List, Callable
import run_classifiers
import matplotlib.pyplot as plt

# Directories that read_correlation / read_A_matrix load per-subject .mat files from.
corrs_location = '../corrs'
A_mats_location = '../A_mats'

# Participant table; rows containing any missing value are discarded right after reading.
participants = pd.read_table('../SRPBS_OPEN/participants.tsv').dropna()

# Site codes that load_dataset uses to assign participants to the train and test splits.
train_sites = ['COI', 'KUT', 'SWA', 'UTO']
test_sites = ['ATT', 'ATV', 'CIN', 'HKH', 'HRC', 'HUH', 'KTT']

In [2]:
def participant_id_to_number(participant_id: str) -> int:
    """Return the integer found in the second '-'-separated field of an id.

    The id is split on '-' and the field at index 1 is parsed with ``int``,
    so ``'sub-0012'`` yields ``12``.
    """
    fields = participant_id.split('-')
    return int(fields[1])

def extract_participants_id(participants: pd.DataFrame, id: int) -> pd.DataFrame:
	"""Return the rows of ``participants`` whose numeric participant id equals ``id``.

	Each value of the ``participant_id`` column is converted with
	``participant_id_to_number`` and compared against ``id``; the resulting
	list of booleans is used as a row mask.
	"""
	keep_row = []
	for name in participants.participant_id.to_numpy():
		keep_row.append(participant_id_to_number(name) == id)
	return participants[keep_row]

def unpair(pairs: List[List[int]]) -> List[int]:
	"""Flatten a list of id pairs into one flat list, keeping the original order."""
	flat_ids = []
	for pair in pairs:
		flat_ids.extend(pair)
	return flat_ids

* 2 types of data (A matrices or correlations)
* 2 outlier choices (remove outliers or keep them)
* 3 dataset subsets (all data, all pairs, perfect pairs)

In total, we have 2 × 2 × 3 = 12 datasets to feed into the classifiers.

In [3]:
def _read_id_list(path: str) -> np.ndarray:
	"""Parse the JSON document stored at ``path`` and return it as a NumPy array."""
	with open(path, 'r') as f:
		return np.array(json.load(f))

# One array of participant ids per (outlier choice, subset) combination.
ids_all_all = _read_id_list('../Dataset Analysis/dataset_all.txt')
ids_inliers_all = _read_id_list('../Dataset Analysis/dataset_inliers.txt')
ids_all_pairs = _read_id_list('../Dataset Analysis/all_pairs.txt')
ids_inliers_pairs = _read_id_list('../Dataset Analysis/all_inliers_pairs.txt')
ids_all_perfectpairs = _read_id_list('../Dataset Analysis/all_perfect_pairs.txt')
ids_inliers_perfectpairs = _read_id_list('../Dataset Analysis/all_inliers_perfect_pairs.txt')

# Subsets that are actually fed to the classifiers; the paired variants are
# loaded above but currently disabled here.
dataset_types = {
	'all': ids_all_all,
	'inliers': ids_inliers_all,
	# 'all_pairs': ids_all_pairs,
	# 'inliers_pairs': ids_inliers_pairs,
	# 'all_perfectpairs': ids_all_perfectpairs,
	# 'inliers_perfectpairs': ids_inliers_perfectpairs
}

In [4]:
def read_correlation(subject: int) -> np.ndarray:
	"""Load one subject's correlation components as a single feature row.

	Reads the ``corr_components`` variable from
	``correlation_components_<subject, zero-padded to 4 digits>.mat`` under
	``corrs_location``, reshapes it to ``(1, -1)`` and passes it through
	``np.nan_to_num`` so the returned row contains no NaN values.
	"""
	mat_path = f'{corrs_location}/correlation_components_{subject:04d}.mat'
	components = scipy.io.loadmat(mat_path)['corr_components']
	feature_row = components.reshape(1,-1)
	return np.nan_to_num(feature_row)

def read_A_matrix(subject: int) -> np.ndarray:
	"""Load one subject's ``A`` matrix as a single feature row.

	Reads the ``A`` variable from ``dcm_A_<subject, zero-padded to 4 digits>.mat``
	under ``A_mats_location`` and reshapes it to ``(1, -1)``. Unlike
	``read_correlation``, no NaN replacement is applied here.
	"""
	mat_path = f'{A_mats_location}/dcm_A_{subject:04d}.mat'
	a_matrix = scipy.io.loadmat(mat_path)['A']
	return a_matrix.reshape(1,-1)

# Feature loaders keyed by the short name used as the suffix of each dataset label.
load_data_functions = dict(corr=read_correlation, Amat=read_A_matrix)

In [5]:
def load_dataset(ids: np.ndarray, load_data_function: Callable[[int], np.ndarray]):
	"""Build train/test feature matrices and labels for a set of participant ids.

	Parameters
	----------
	ids
		Participant numbers, given either as a 1-D array or as an array of id
		pairs. Pairs are flattened row by row, so both layouts are processed
		the same way.
	load_data_function
		Called once for every id that lands in the train or test split; each
		call must return a row that ``np.vstack`` can stack (the loaders in
		this notebook return shape ``(1, n_features)``).

	Returns
	-------
	X_train, X_test, Y_train, Y_test
		Stacked feature rows and integer labels for participants whose ``site``
		is listed in ``train_sites`` / ``test_sites`` respectively. Participants
		whose site is in neither list are left out of both splits.

	Raises
	------
	ValueError
		If an id does not match exactly one row of the module-level
		``participants`` table (for example because the row was removed by
		``dropna``), or (raised by ``np.vstack``) if a split ends up empty.
	"""
	ids = np.asarray(ids).reshape(-1)

	# Parse every participant_id string once instead of once per requested id.
	participant_numbers = participants.participant_id.map(participant_id_to_number).to_numpy()

	sites = []
	labels = []
	for subject in ids:
		rows = participants[participant_numbers == subject]
		if len(rows) != 1:
			raise ValueError(
				f'expected exactly one participants row for id {subject}, found {len(rows)}'
			)
		# Pull out scalars explicitly: truth-testing one-element arrays and
		# calling int() on a one-element Series are both deprecated.
		sites.append(rows['site'].iloc[0])
		# Label is the diagnosis code integer-divided by 2.
		# NOTE(review): presumably maps the diag coding onto 0/1 classes - confirm against the dataset docs.
		labels.append(int(rows['diag'].iloc[0] // 2))

	in_train = np.array([site in train_sites for site in sites], dtype=bool)
	in_test = np.array([site in test_sites for site in sites], dtype=bool)
	labels = np.array(labels)

	X_train = np.vstack([load_data_function(subject) for subject in ids[in_train]])
	X_test = np.vstack([load_data_function(subject) for subject in ids[in_test]])
	Y_train = labels[in_train]
	Y_test = labels[in_test]
	return X_train, X_test, Y_train, Y_test

In [6]:
# Each entry is [label, X_train, X_test, Y_train, Y_test], one per
# (id subset, feature loader) combination.
datasets = []

for dataset_name, ids in dataset_types.items():
	for function_name, load_data_function in load_data_functions.items():
		label = f'{dataset_name}_{function_name}'
		X_train, X_test, Y_train, Y_test = load_dataset(ids, load_data_function)
		datasets.append([label, X_train, X_test, Y_train, Y_test])

In [7]:
# Report the size and train/test proportions of every assembled dataset.
for dataset in datasets:
	name, X_train, X_test, Y_train, Y_test = dataset
	n_train, n_test = Y_train.shape[0], Y_test.shape[0]
	n_total = n_train + n_test
	train_proportion = 100*n_train/n_total
	test_proportion = 100*n_test/n_total
	print(
		f'{name}: (total - {n_total})\n'
		f'\ttrain - [X: {X_train.shape}, Y: {Y_train.shape}] ({train_proportion:.1f}%)\n'
		f'\ttest -  [X: {X_test.shape}, Y: {Y_test.shape}] ({test_proportion:.1f}%)'
	)
# Number of ids in each subset before the site-based split.
for dataset_name, ids in dataset_types.items():
	print(f'{dataset_name} has {ids.size} participants')

all_corr: (total - 1043)
	train - [X: (626, 70876), Y: (626,)] (60.0%)
	test -  [X: (417, 70876), Y: (417,)] (40.0%)
all_Amat: (total - 1043)
	train - [X: (626, 142129), Y: (626,)] (60.0%)
	test -  [X: (417, 142129), Y: (417,)] (40.0%)
inliers_corr: (total - 994)
	train - [X: (589, 70876), Y: (589,)] (59.3%)
	test -  [X: (405, 70876), Y: (405,)] (40.7%)
inliers_Amat: (total - 994)
	train - [X: (589, 142129), Y: (589,)] (59.3%)
	test -  [X: (405, 142129), Y: (405,)] (40.7%)
all has 1043 participants
inliers has 994 participants


In [8]:
# Only the first assembled dataset is passed to the classifiers here.
# NOTE(review): the recorded output of this cell ends in
# "AttributeError: 'list' object has no attribute 'update'" - confirm that
# run_classifiers completes before relying on `scores` in later cells.
selected_datasets = [datasets[0]]
scores = run_classifiers.run_classifiers(selected_datasets)
# np.save appends the '.npy' extension to the given file name.
np.save('100_classifier_scores', scores)

Processing dataset all_corr
... with Lasso_cv classifier
Running outer loop 0
Running subsample 0
alpha validation scores = [0.7139601139601138, 0.7142450142450143, 0.7061253561253562, 0.6676638176638175], best alpha = 0.01
Running subsample 1
alpha validation scores = [0.7250712250712251, 0.7401709401709401, 0.7403133903133903, 0.7478632478632479], best alpha = 1.0
Running subsample 2
alpha validation scores = [0.717094017094017, 0.7245014245014245, 0.7283475783475784, 0.7015669515669516], best alpha = 0.1
Running subsample 3
alpha validation scores = [0.7556980056980057, 0.7670940170940171, 0.7403133903133903, 0.7098290598290599], best alpha = 0.01
Running subsample 4
alpha validation scores = [0.7249287749287749, 0.7170940170940171, 0.7249287749287749, 0.7098290598290597], best alpha = 0.001
Running subsample 5
alpha validation scores = [0.7703703703703704, 0.7437321937321938, 0.7437321937321938, 0.754985754985755], best alpha = 0.001
Running subsample 6
alpha validation scores = [0

AttributeError: 'list' object has no attribute 'update'

In [14]:
# Hard-coded training-set prediction scores, stored as the text of a bracketed,
# whitespace-separated array. `[1:-1]` drops the surrounding brackets and
# `.split()` yields one string token per value.
# NOTE(review): presumably pasted from a classifier run on datasets[0] - confirm
# the source and that the order matches datasets[0][3], which it is compared to below.
preds_train = '[0.   0.   0.01 0.   0.   0.   0.   0.   0.29 0.04 0.04 0.   0.11 0.01 \
 0.   0.07 0.   0.06 0.02 0.43 0.04 0.   0.02 0.   0.   0.   0.15 0.03 \
 0.02 0.   0.08 0.   0.01 0.07 0.   0.07 0.13 0.02 0.11 0.   0.21 0. \
 0.   0.   0.   0.   0.09 0.07 0.08 0.   0.08 0.01 0.03 0.02 0.03 0.03 \
 0.01 0.   0.17 0.   0.33 0.   0.01 0.04 0.   0.   0.   0.4  0.   0.02 \
 0.02 0.   0.07 0.   0.02 0.   0.   0.   0.01 0.08 0.05 0.   0.01 0. \
 0.19 0.   0.08 0.   0.31 0.   0.   0.   0.   0.   0.12 0.   0.01 0. \
 0.2  0.08 0.   0.47 0.97 0.73 0.99 0.73 0.95 0.71 0.94 0.63 0.91 0.31 \
 1.   0.55 1.   0.02 0.97 0.54 0.53 0.98 0.24 1.   0.06 0.96 0.62 0.98 \
 0.47 1.   0.41 1.   0.63 1.   0.78 0.92 0.8  1.   0.73 0.96 0.29 0.95 \
 0.74 0.97 0.69 0.98 0.71 1.   0.48 0.95 0.78 1.   0.66 1.   0.76 0.97 \
 0.32 1.   0.56 0.99 0.58 0.99 0.7  0.94 0.61 1.   0.64 0.96 0.39 1. \
 0.52 0.99 0.19 0.99 0.64 1.   0.55 0.98 0.66 0.97 0.28 0.98 0.68 1. \
 0.1  1.   0.64 0.92 0.73 0.99 0.36 1.   0.23 0.98 0.6  1.   0.13 1. \
 0.66 0.94 0.32 0.99 0.47 0.99 0.59 1.   0.04 1.   0.37 0.93 0.27 0.99 \
 0.75 0.09 1.   0.53 1.   0.76 1.   0.25 0.96 0.57 0.99 0.23 0.47 0.11 \
 0.28 0.57 0.49 0.29 0.6  0.52 0.56 0.45 0.59 0.66 0.29 0.53 0.22 0.2 \
 0.77 0.33 0.15 0.3  0.54 0.49 0.55 0.58 0.29 0.63 0.74 0.22 0.67 0.69 \
 0.54 0.55 0.62 0.26 0.08 0.33 0.38 0.4  0.58 0.47 0.73 0.78 0.16 0.64 \
 0.73 0.53 0.73 0.09 0.11 0.59 0.53 0.33 0.58 0.08 0.59 0.43 0.52 0.19 \
 0.2  0.45 0.35 0.22 0.   0.43 0.21 0.08 0.12 0.   0.01 0.3  0.76 0.08 \
 0.06 0.   0.09 0.39 0.38 0.05 0.26 0.01 0.   0.   0.04 0.01 0.   0.2 \
 0.   0.33 0.04 0.01 0.   0.   0.02 0.58 0.3  0.   0.17 0.48 0.08 0.09 \
 0.02 0.08 0.   0.13 0.02 0.04 0.05 0.25 0.   0.   0.   0.17 0.09 0.01 \
 0.04 0.01 0.05 0.04 0.67 0.   0.09 0.07 0.   0.   0.01 0.11 0.05 0.48 \
 0.   0.   0.   0.32 0.07 0.21 0.31 0.13 0.01 0.02 0.15 0.04 0.   0. \
 0.14 0.2  0.   0.03 0.01 0.01 0.25 0.03 0.19 0.04 0.   0.   0.02 0.5 \
 0.18 0.04 0.   0.06 0.37 0.   0.11 0.   0.17 0.08 0.   0.06 0.01 0.01 \
 0.57 0.   0.01 0.1  0.1  0.   0.16 0.   0.   0.25 0.12 0.11 0.19 0.14 \
 0.   0.12 0.   0.06 0.   0.07 0.55 0.14 0.12 0.14 0.4  0.11 0.4  0. \
 0.17 0.02 0.   0.03 0.06 0.09 0.1  0.   0.2  0.   0.01 0.03 0.03 0.66 \
 0.05 0.   0.07 0.25 0.   0.1  0.07 0.01 0.35 1.   0.95 1.   0.92 0.99 \
 0.9  0.91 0.98 0.91 0.9  0.95 0.9  0.9  0.9  0.94 0.93 1.   1.   0.99 \
 0.99 1.   0.99 1.   1.   1.   0.93 0.9  0.61 0.64 0.47 0.78 0.73 0.76 \
 0.76 0.54 0.36 0.15 0.43 0.61 0.31 0.72 0.51 0.29 0.18 0.14 0.18 0.38 \
 0.57 0.68 0.68 0.73 0.6  0.67 0.42 0.64 0.4  0.62 0.42 0.44 0.32 0.73 \
 0.19 0.41 0.36 0.32 0.39 0.62 0.77 0.65 0.34 0.48 0.62 0.29 0.72 0.59 \
 0.35 0.56 0.12 0.09 0.79 0.67 0.76 0.49 0.76 0.75 0.66 0.77 0.32 0.72 \
 0.63 0.62 0.51 0.67 0.44 0.72 0.04 0.79 0.05 0.81 0.01 0.05 0.37 0.57 \
 0.6  0.65 0.58 0.7  0.32 0.66 0.1  0.19 0.71 0.43 0.46 0.56 0.12 0.62 \
 0.73 0.26 0.2  0.36 0.78 0.74 0.99 0.97 0.98 0.98 1.   1.   0.91 1. \
 1.   0.99 1.   0.95 0.95 1.   1.   0.99 1.   1.   1.   1.   1.   0.98 \
 1.   0.99 1.   1.   1.   1.   0.99 0.99 0.97 1.   0.99 1.   1.   1. \
 1.   0.96 1.   0.94 1.   1.   1.   1.   0.94 1.   1.   0.95 1.   1. \
 1.   0.99 1.   0.94 0.99 1.   0.97 0.99 1.   0.98]'[1:-1].split()
# Convert the string tokens to Python floats.
preds_train = [float(f) for f in preds_train]
# Hard-coded test-set prediction scores, in the same bracketed text format:
# `[1:-1]` drops the brackets and `.split()` yields one string token per value.
# NOTE(review): presumably pasted from the same classifier run - confirm the
# source and that the order matches datasets[0][4], which it is compared to below.
preds_test = '[0.44 0.98 0.17 0.23 0.68 0.34 0.45 0.49 0.33 0.51 0.25 0.41 0.22 0.58 \
 0.51 0.49 0.86 0.84 0.24 0.29 0.04 0.89 0.55 0.39 0.21 0.55 0.11 0.78 \
 0.82 0.33 0.53 0.17 0.55 0.81 0.16 0.33 0.92 0.82 0.01 0.66 0.06 0.11 \
 0.76 0.75 0.26 0.75 0.1  0.6  0.34 0.73 0.02 0.73 0.09 0.07 0.39 0.15 \
 0.14 0.93 0.06 0.37 0.02 0.55 0.1  0.76 0.1  0.43 0.39 0.45 0.34 0.25 \
 0.36 0.28 0.13 0.29 0.76 0.85 0.84 0.75 0.84 0.65 0.85 0.42 0.31 0.05 \
 0.38 0.4  0.5  0.07 0.38 0.21 0.36 0.72 0.53 0.66 0.23 0.81 0.91 0.13 \
 0.5  0.97 0.84 0.47 0.28 0.77 0.39 0.67 0.02 0.39 0.81 0.42 0.08 0.85 \
 0.48 0.61 0.28 0.22 0.84 0.25 0.81 0.46 0.87 0.12 0.77 0.16 0.18 0.68 \
 0.7  0.57 0.75 0.73 0.37 1.   0.92 0.86 0.12 0.82 0.66 0.76 0.25 0.48 \
 0.28 0.31 0.35 0.26 0.15 0.31 0.1  0.93 0.67 0.81 0.32 0.88 0.96 0.78 \
 0.34 0.53 0.08 0.75 0.71 0.15 0.98 0.15 0.36 0.74 0.79 0.56 0.35 0.82 \
 0.   0.09 0.82 0.98 0.63 0.36 0.51 0.9  0.85 0.6  0.54 0.22 0.81 0.27 \
 0.88 0.93 0.57 0.1  0.92 0.6  0.29 0.38 0.82 0.85 0.86 0.24 0.3  0.02 \
 0.92 0.21 0.32 0.38 0.24 0.53 0.81 0.89 0.54 0.85 0.77 0.54 0.45 0.57 \
 0.64 0.59 0.82 0.7  0.46 0.49 0.72 0.32 0.12 0.34 0.32 0.13 0.75 0.7 \
 0.23 0.1  0.23 0.76 0.6  0.4  0.12 0.32 0.85 0.19 0.36 0.2  0.8  0.08 \
 0.59 0.56 0.15 0.51 0.96 0.25 0.34 0.36 0.69 0.48 0.16 0.93 0.72 0.97 \
 0.34 0.61 0.98 0.78 0.88 0.23 0.78 0.08 0.17 0.19 0.8  0.9  0.57 0.89 \
 0.95 0.69 0.77 0.71 0.47 0.56 0.5  0.82 0.72 0.83 0.53 0.75 0.02 0.89 \
 0.86 0.26 0.43 0.24 0.99 0.99 0.4  0.84 0.91 0.4  0.57 0.56 0.96 0.62 \
 0.56 0.73 0.91 0.25 0.51 0.94 0.82 0.67 0.26 0.07 0.9  0.6  0.97 0.59 \
 0.47 0.15 0.56 0.69 0.77 0.01 0.06 0.07 0.45 0.06 0.94 0.64 0.99 0.99 \
 1.   0.97 0.86 0.91 0.73 0.23 0.04 0.   0.42 0.17 0.01 0.02 0.   0.44 \
 0.06 0.02 0.   0.63 0.19 0.02 0.22 0.44 0.19 0.   0.   0.1  0.01 0.88 \
 0.09 0.01 0.69 0.   0.03 0.12 0.44 0.29 0.11 0.03 0.   0.02 0.03 0.05 \
 0.41 0.27 0.28 0.73 0.   0.5  0.36 0.03 0.   0.   0.41 0.04 0.   0. \
 0.23 0.09 0.09 0.37 0.61 0.03 0.08 0.22 0.13 0.07 0.13 0.79 0.06 0.66 \
 0.89 0.02 0.39 0.09 0.26 0.1  0.92 0.85 0.02 0.84 0.37 0.16 0.94 0.6 \
 0.83 0.14 0.5  0.78 0.   0.58 0.01 0.88 0.85 0.56 0.04]'[1:-1].split()
# Convert the string tokens to Python floats.
preds_test = [float(f) for f in preds_test]

In [17]:
# True labels of the first dataset: positions 3 and 4 of each entry hold Y_train / Y_test.
first_dataset = datasets[0]
y_train, y_test = first_dataset[3], first_dataset[4]
# Turn the prediction scores into boolean class predictions using a 0.5 cut-off.
decision_threshold = 0.5
y_train_pred = np.array(preds_train) >= decision_threshold
y_test_pred = np.array(preds_test) >= decision_threshold

In [20]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score
# Same three metrics for both splits, one printed line per split.
for split_name, y_true, y_pred in (('train', y_train, y_train_pred), ('test', y_test, y_test_pred)):
	print(f'{split_name}: accuracy = {accuracy_score(y_true, y_pred)}, balanced = {balanced_accuracy_score(y_true, y_pred)}, f1 = {f1_score(y_true, y_pred)}')

train: accuracy = 0.7971246006389776, balanced = 0.8677083333333333, f1 = 0.6968973747016707
test: accuracy = 0.5587529976019184, balanced = 0.5580446520657647, f1 = 0.39072847682119205


In [None]:
# Tabulate the classifier scores; the bare last expression renders the frame as the cell output.
scores_df = pd.DataFrame(data=scores)
scores_df

In [None]:
# Keep only the rows whose index label matches 'Testing Balanced'.
scores_df.filter(axis='index', regex='.*Testing Balanced.*')

In [None]:
# Rows whose index label matches 'Testing Balanced', plotted twice: first
# transposed, then in the original orientation.
testing_balanced = scores_df.filter(regex='.*Testing Balanced.*', axis='index')
views_and_titles = (
	(testing_balanced.T, 'Not really clear which classifier is the best'),
	(testing_balanced, 'Using matched pairs seems a little bit better than using unbalanced data'),
)

for view, title in views_and_titles:
	view.plot(kind='bar')
	plt.title(title)
	# Anchor the legend just outside the right edge of the axes.
	plt.legend(bbox_to_anchor=(1.05,1))
	plt.show()

In [None]:
# Persist the scores; np.save appends the '.npy' extension to the file name.
np.save(file='classifier_scores', arr=scores)