In [4]:
import os
import math
import numpy as np
import pandas as pd
import argparse
from tqdm import tqdm
import pyperclip

from skfda import FDataGrid
from skfda.representation.basis import BSpline
from skfda.preprocessing.smoothing import BasisSmoother
from skfda.preprocessing.registration import landmark_registration
from skfda.ml.classification import KNeighborsClassifier, NearestCentroid

from sklearn.metrics import precision_score, recall_score, f1_score, balanced_accuracy_score, roc_curve, confusion_matrix


agg_columns = ['patient_id', 'slice_id', 'img_type']

n_basis=18
order=4

prc_rm=0.05
n_points =111

basis = BSpline(domain_range=(0, 1), n_basis=n_basis, order=order)
smoother = BasisSmoother(basis=basis, return_basis=True, method='svd')


def cut_ends(bsplined, order=0, prc_rm_start=prc_rm, prc_rm_end=prc_rm, n_points=n_points):
	bsplined_grid = bsplined.derivative(order=order).to_grid(np.linspace(0, 1, n_points))
	return FDataGrid(
		data_matrix=bsplined_grid.data_matrix[
			..., int(n_points * prc_rm_start): int(n_points * (1 - prc_rm_end)), 0
		],
		grid_points=bsplined_grid.grid_points[0][
			int(n_points * prc_rm_start): int(n_points * (1 - prc_rm_end))
		]
	)


def get_landmark_registration(bsplined, order=0):
	bsplined_grid = cut_ends(bsplined, order)
	landmark_indexes = cut_ends(bsplined, order, prc_rm_end=0.5).data_matrix.argmax(axis=1)
	grid_points = bsplined_grid.grid_points[0]
	landmarks = [grid_points[index] for index in np.concatenate(landmark_indexes)]
	return landmark_registration(bsplined_grid, landmarks)


def specificity(y_true, y_pred, zero_division=0):
	tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
	if tn+fp == 0 and zero_division:
		return zero_division
	return tn / (tn+fp)


def to_fd(segmentation_id, prefix='fixed', is_registered=False):
	file = f'./segmentations/{prefix}-{segmentation_id}.csv'
	dataset = (
	pd.read_csv(
		file,
		dtype={
			'img_type': int,
			'patient_id': int,
			'cycle_id': int,
			'slice_id': int,
			'label': bool,
			'mask_int_mean': float,
			'segment': int,
		},
	)
	.drop_duplicates()
	.sort_values(agg_columns + ['cycle_id'])
	)
	dataset = dataset.merge(dataset.query('label').patient_id.drop_duplicates())
	dataset = dataset[dataset.patient_id.apply(lambda x: x in [2, 15, 28, 32, 35, 39, 40, 41, 45, 50, 52, 64, 66])]
	ts = (
		dataset[['patient_id', 'cycle_id']].drop_duplicates()
			.groupby('patient_id').cycle_id.count()
			.apply(lambda x: np.linspace(0, 1, int(x)))
			.reset_index()
	)

	dataset = dataset.groupby(agg_columns + ['label']).mask_int_mean.apply(list).reset_index()
	bsplined = dataset.groupby('patient_id').mask_int_mean.apply(list).reset_index().merge(ts)
	bsplined = bsplined.apply(
		lambda x: smoother.fit_transform(
			FDataGrid(data_matrix=x['mask_int_mean'], grid_points=x['cycle_id'])
		),
		axis='columns',
	)
	dataset = dataset[['patient_id', 'label']].groupby('patient_id').label.apply(list).reset_index()
	if is_registered:
		dataset['fd_smooth'] = [get_landmark_registration(fd_smooth, 1) for fd_smooth in bsplined]
	else:
		dataset['fd_smooth'] = [cut_ends(fd_smooth, 1) for fd_smooth in bsplined]
	return dataset


def train_model(patient_data, segmentation_id):
	labels = patient_data['label']
	model = NearestCentroid()
	model = model.fit(patient_data['fd_smooth'], labels)
	pred = model.predict(patient_data['fd_smooth'])
	return {
		'patient_id': patient_data['patient_id'],
		'segmentation_id': segmentation_id,
		'precision': precision_score(labels, pred, zero_division=0),
		'recall': recall_score(labels, pred, zero_division=0),
		'f1': f1_score(labels, pred, zero_division=0),
		'balanced_accuracy': balanced_accuracy_score(labels, pred),
		'specificity': specificity(labels, pred, zero_division=0),
	}



## Training

In [None]:
dataset = np.concatenate(
	[
		to_fd(segmentation_id, 'proportionate', True).apply(lambda x: train_model(x, segmentation_id), axis='columns')
		for segmentation_id in tqdm(range(0, 25))
	]
)
# pd.DataFrame(list(dataset)).to_csv(f'./show/functional_data/nearestCentroid/proportionate_unregistered.csv', index=False)
pd.DataFrame(list(dataset)).to_csv(f'./show/functional_data/nearestCentroid/proportionate_registered.csv', index=False)
# pd.DataFrame(list(dataset)).to_csv(f'./show/functional_data/nearestCentroid/fixed_unregistered.csv', index=False)
# pd.DataFrame(list(dataset)).to_csv(f'./show/functional_data/nearestCentroid/fixed_registered.csv', index=False)

In [6]:
def metrics(file):
	return (
		pd.read_csv(file)[['precision', 'recall', 'f1', 'balanced_accuracy', 'specificity']]
			.median()
			.to_frame()
			.transpose()
	)


tables = [
	metrics(file).assign(registered='_unregistered.' not in file, fixed_number_of_slic_regions='fixed_' in file)
	for file in [
		'./show/functional_data/nearestCentroid/fixed_registered.csv',
		'./show/functional_data/nearestCentroid/fixed_unregistered.csv',
		'./show/functional_data/nearestCentroid/proportionate_registered.csv',
		'./show/functional_data/nearestCentroid/proportionate_unregistered.csv',
	]
]
# label = 'xgb'
# description = (
# 	'Classification accuracy metrics of validation set for XGboost classification model. '
# )
# label = 'nn_params'
label = 'nn'
# description = (
# 	'Classification accuracy metrics of validation set for flat neural network model. '
# )
tables = pd.concat(tables).sort_values(['registered', 'fixed_number_of_slic_regions'])
col_format = '|'.join(['c'] * (tables.shape[1] - 1))
tables.columns = [col.replace("_", " ") for col in tables.columns]
tables = tables.set_index(['registered', 'fixed number of slic regions'])
# pyperclip.copy(
# 	'\\begin{table*}[tbp]\n' +
# 	f'\\caption{{{description}}}\n' +
# 	f'\\label{{table:{label}}}\n' +
# 	'\\centering\n' +
# 	tables.to_latex(float_format='%0.3f', column_format=col_format, escape=False, multicolumn=True, multirow=True, na_rep='') +
# 	'\n\\end{table*}\n'
# )
tables

Unnamed: 0_level_0,Unnamed: 1_level_0,precision,recall,f1,balanced accuracy,specificity
registered,fixed number of slic regions,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
False,False,0.126984,0.717647,0.225,0.77246,0.816248
False,True,0.135593,0.747664,0.238806,0.801357,0.833744
True,False,0.121622,0.714286,0.214286,0.760784,0.800391
True,True,0.127273,0.727273,0.225352,0.78386,0.82801


In [10]:
def process(data, experiment, experiment_true, experiment_false):
	return	(
		data.groupby(experiment).median().reset_index()
			.assign(experiments_done_with=lambda y: np.where(y[experiment], experiment_true, experiment_false))
			.drop(['fixed_number_of_slic_regions', 'registered'], axis='columns')
	)


results = pd.concat([
	pd.read_csv(file)
		.filter(regex='^(precision|recall|balanced_accuracy|f1|specificity)$', axis='columns')
		.assign(registered='_unregistered.' not in file, fixed_number_of_slic_regions='/fixed_' in file)
	for file in [
		'./show/functional_data/nearestCentroid/fixed_registered.csv',
		'./show/functional_data/nearestCentroid/fixed_unregistered.csv',
		'./show/functional_data/nearestCentroid/proportionate_registered.csv',
		'./show/functional_data/nearestCentroid/proportionate_unregistered.csv',
	]
])

result = pd.concat(
	[
		process(results, 'registered', 'registered', 'unregistered'),
		process(results, 'fixed_number_of_slic_regions', 'fixed number of SLIC regions', 'proportionate number of SLIC regions'),
	]
)

columns = result.columns.tolist()
columns.remove('experiments_done_with')

col_format = '|'.join(['c'] * (result.shape[1] - 1))
pyperclip.copy(result[['experiments_done_with'] + columns].to_latex(float_format='%0.3f', column_format=col_format, index=False))
result

Unnamed: 0,precision,recall,f1,balanced_accuracy,specificity,experiments_done_with
0,0.132075,0.735632,0.233333,0.787496,0.826629,unregistered
1,0.123288,0.714286,0.218466,0.770493,0.825078,registered
0,0.123288,0.714286,0.218182,0.764621,0.80909,proportionate number of SLIC regions
1,0.132075,0.741338,0.231033,0.795769,0.832047,fixed number of SLIC regions


In [8]:
from scipy import stats
import re

files = [
		'./show/functional_data/nearestCentroid/fixed_registered.csv',
		'./show/functional_data/nearestCentroid/fixed_unregistered.csv',
		'./show/functional_data/nearestCentroid/proportionate_registered.csv',
		'./show/functional_data/nearestCentroid/proportionate_unregistered.csv',
]

def extract_configuration(file):
	result = { 'file': file }
	result['registered'] = 'unregistered' if '_unregistered.' in file else 'registered'
	result['slic_zones'] = 'fixed number of SLIC regions' if '/fixed_' in file else 'proportionate number of SLIC regions'
	return result


def get_balanced_acc(file):
	return pd.read_csv(file).sort_values('patient_id')['balanced_accuracy'].tolist()


def get_p_value(file_pair):
	balanced_acc = [get_balanced_acc(file) for file in file_pair]
	return stats.wilcoxon(*balanced_acc)[1]


def get_configuration_p_values(configs, config_col):
	config_types = configs[config_col].drop_duplicates().tolist()
	columns = configs.columns.tolist()
	columns.remove('file')
	type0 = configs.query(f'{config_col} == "{config_types[0]}"').sort_values(columns)
	type1 = configs.query(f'{config_col} == "{config_types[1]}"').sort_values(columns)
	type0 = np.concatenate([get_balanced_acc(file) for file in type0.file])
	type1 = np.concatenate([get_balanced_acc(file) for file in type1.file])
	return stats.wilcoxon(type0, type1)[1]

configs = pd.DataFrame([extract_configuration(file) for file in files])
pd.DataFrame(
	{
		'registerd vs unregistered': get_configuration_p_values(configs, 'registered'),
		'fixed vs proportionate number of SLIC regions': get_configuration_p_values(configs, 'slic_zones'),
	},
	index=[0],
)



Unnamed: 0,registerd vs unregistered,fixed vs proportionate number of SLIC regions
0,8.760412999999999e-36,7.896253e-14
