In [None]:
import os
import math
import numpy as np
import pandas as pd
import argparse

from skfda import FDataGrid
from skfda.representation.basis import BSpline
from skfda.preprocessing.smoothing import BasisSmoother
from skfda.preprocessing.registration import ElasticRegistration, landmark_registration

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from skfda.exploratory.depth import IntegratedDepth, ModifiedBandDepth

from cycler import cycler
from matplotlib import pyplot as plt

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

from sklearn.metrics import (
	precision_score, recall_score, f1_score, balanced_accuracy_score, roc_curve, confusion_matrix
)


# Key columns used when sorting and grouping the rows of a segmentation table.
agg_columns = ['patient_id', 'slice_id', 'img_type']

# B-spline basis settings used by the smoother below.
n_basis = 18
order = 4

# Defaults of cut_ends: fraction trimmed from each end and size of the evaluation grid.
prc_rm = 0.05
n_points = 111

basis = BSpline(
	domain_range=(0, 1),
	n_basis=n_basis,
	order=order,
)
smoother = BasisSmoother(
	basis=basis,
	return_basis=True,
	method='svd',
)

# Shared scikit-fda helper objects (registration and the two depth measures).
registration = ElasticRegistration()
ID = IntegratedDepth()
MBD = ModifiedBandDepth()


def get_model(model=None):
	"""Build a fresh, unfitted classifier for the requested model key.

	Parameters
	----------
	model : str, optional
		One of ``'svm'``, ``'rf'`` or ``'xgb'``. When omitted, the module-level
		``MODEL`` variable is used, which is how the function behaved before
		the parameter existed.

	Returns
	-------
	The newly constructed estimator for the selected key.

	Raises
	------
	KeyError
		If the key is not one of the supported names.
	NameError
		If ``model`` is omitted and ``MODEL`` has not been defined.
	"""
	# Factories instead of instances: only the selected estimator is built, so
	# asking for 'svm' no longer constructs the GPU XGBoost model as well.
	builders = {
		'svm': lambda: SVC(class_weight='balanced', probability=True),
		'rf': lambda: RandomForestClassifier(class_weight='balanced', n_jobs=4),
		'xgb': lambda: xgb.XGBClassifier(
			tree_method='gpu_hist', eval_metric='logloss', use_label_encoder=False,
		),
	}
	key = MODEL if model is None else model
	return builders[key]()


def cut_ends(bsplined, order=0, prc_rm_start=prc_rm, prc_rm_end=prc_rm, n_points=n_points):
	"""Evaluate the ``order``-th derivative on a uniform grid and trim both ends.

	The derivative of ``bsplined`` is evaluated on ``n_points`` equally spaced
	points of [0, 1]. Grid positions before ``int(n_points * prc_rm_start)``
	and from ``int(n_points * (1 - prc_rm_end))`` onwards are dropped.

	Returns a new ``FDataGrid`` built from component 0 of the last axis of the
	evaluated data matrix, together with the matching trimmed grid points.
	"""
	evaluated = bsplined.derivative(order=order).to_grid(np.linspace(0, 1, n_points))
	first_kept = int(n_points * prc_rm_start)
	end_kept = int(n_points * (1 - prc_rm_end))
	# The same window is applied to the values and to the grid so they stay aligned.
	window = slice(first_kept, end_kept)
	return FDataGrid(
		data_matrix=evaluated.data_matrix[..., window, 0],
		grid_points=evaluated.grid_points[0][window],
	)


def get_landmark_registration(bsplined, order=0):
	"""Landmark-register the trimmed curves on their early maximum.

	For every curve the landmark is the grid position of the maximum found by
	``argmax`` along axis 1 of the data matrix of a second trimmed copy whose
	right end is cut at ``prc_rm_end=0.5``. Both copies use the same left trim,
	so the indexes found there address the grid of the fully trimmed curves.

	Returns the result of ``landmark_registration`` applied to the trimmed
	curves with those landmarks.
	"""
	# Restrict the search for the maximum to the copy trimmed with prc_rm_end=0.5.
	peak_indexes = cut_ends(bsplined, order, prc_rm_end=0.5).data_matrix.argmax(axis=1)
	trimmed = cut_ends(bsplined, order)
	abscissae = trimmed.grid_points[0]
	landmarks = [abscissae[position] for position in np.concatenate(peak_indexes)]
	return landmark_registration(trimmed, landmarks)


def specificity(y_true, y_pred, zero_division=0):
	"""True-negative rate ``TN / (TN + FP)`` for binary 0/1 (or boolean) labels.

	Parameters
	----------
	y_true, y_pred : array-like
		Ground-truth and predicted labels; 0/False is the negative class and
		1/True the positive class.
	zero_division : number, default 0
		Value returned when ``y_true`` contains no negative samples, so the
		ratio is undefined.

	Returns
	-------
	float, or ``zero_division`` when there are no actual negatives.
	"""
	y_true = np.asarray(y_true)
	y_pred = np.asarray(y_pred)
	# Counting directly keeps the result well defined even when only one class
	# is present, where a confusion matrix would not be 2x2.
	actual_negative = y_true == 0
	tn = int(np.count_nonzero(actual_negative & (y_pred == 0)))
	fp = int(np.count_nonzero(actual_negative & (y_pred == 1)))
	# Return the fallback for every zero_division value, including the default 0.
	if tn + fp == 0:
		return zero_division
	return tn / (tn + fp)


## Generate dataset

In [None]:
# CSV file that the dataset-generation cell writes and the training cell reads back.
FILE = '50-slic-zones-features.csv'

In [None]:
# Number of evenly spaced samples taken from every curve as model features.
train_n_points = 100


def get_descrete_points(fd_smooth):
	"""Sample each curve at ``train_n_points`` evenly spaced grid indexes.

	The indexes run from the first to the last position of axis 1 of
	``fd_smooth.data_matrix`` (both included) and are truncated to integers.
	Returns component 0 of the last axis at those positions, as a 2-D array
	with one row per curve.
	"""
	values = fd_smooth.data_matrix
	last_index = values.shape[1] - 1
	sample_indexes = np.linspace(0, last_index, train_n_points, endpoint=True, dtype=int)
	return values[:, sample_indexes, 0]


# Per-segmentation feature tables; concatenated once after the loop.
segmentations = []

# Patients kept for this experiment.
# Alternative cohort (exclusion list): [3, 6, 29, 86, 108, 119, 140]
included_patients = [2, 15, 28, 32, 35, 39, 40, 41, 45, 50, 52, 64, 66]

# Column names follow train_n_points so they always match the width of the
# arrays returned by get_descrete_points (previously hard-coded to 100).
registered_columns = [f'registered_discrete_{i}' for i in range(train_n_points)]
unregistered_columns = [f'unregistered_discrete_{i}' for i in range(train_n_points)]

# Debug alternative: for segmentation_id in [5]:
for segmentation_id in range(0, 25):
	print(segmentation_id)
	file = f'./segmentations/proportionate-{segmentation_id}.csv'
	dataset = (
		pd.read_csv(
			file,
			dtype={
				'img_type': int,
				'patient_id': int,
				'cycle_id': int,
				'slice_id': int,
				'label': bool,
				'mask_int_mean': float,
				'segment': int,
			},
		)
		.drop_duplicates()
		.sort_values(agg_columns + ['cycle_id'])
	)
	# Inner merge on patient_id: keeps only patients with at least one row whose label is True.
	dataset = dataset.merge(dataset.query('label').patient_id.drop_duplicates())
	dataset = dataset[dataset.patient_id.isin(included_patients)]

	# One uniform grid on [0, 1] per patient, with one point per distinct cycle of that patient.
	ts = (
		dataset[['patient_id', 'cycle_id']].drop_duplicates()
			.groupby('patient_id').cycle_id.count()
			.apply(lambda x: np.linspace(0, 1, int(x)))
			.reset_index()
	)

	# One row per curve: the mask_int_mean values of a group collected into a list.
	dataset = dataset.groupby(agg_columns + ['label']).mask_int_mean.apply(list).reset_index()
	# One row per patient: all of that patient's curves plus the patient's grid, smoothed together.
	bsplined = dataset.groupby('patient_id').mask_int_mean.apply(list).reset_index().merge(ts)
	bsplined = bsplined.apply(
		lambda x: smoother.fit_transform(
			FDataGrid(data_matrix=x['mask_int_mean'], grid_points=x['cycle_id'])
		),
		axis='columns',
	)
	print('extracting features ...')
	# Features are computed on the first derivative of the smoothed curves.
	registered = [get_landmark_registration(fd_smooth, 1) for fd_smooth in bsplined]
	unregistered = [cut_ends(fd_smooth, 1) for fd_smooth in bsplined]

	# Depths and maxima are computed per patient and stacked in patient order,
	# which is the row order of `dataset` after the groupby above.
	for prefix, curves in (('unregistered', unregistered), ('registered', registered)):
		dataset[f'{prefix}_ids'] = np.concatenate(
			[ID(fd_smooth).reshape(-1, 1) for fd_smooth in curves]
		)
		dataset[f'{prefix}_mbds'] = np.concatenate(
			[MBD(fd_smooth).reshape(-1, 1) for fd_smooth in curves]
		)
		dataset[f'{prefix}_max_values'] = np.concatenate(
			[fd_smooth.data_matrix.max(axis=1).reshape(-1, 1) for fd_smooth in curves]
		)

	registered = [get_descrete_points(fd_smooth) for fd_smooth in registered]
	unregistered = [get_descrete_points(fd_smooth) for fd_smooth in unregistered]

	# The joins align on the positional index left by reset_index above.
	segmentations.append(
		dataset.drop('mask_int_mean', axis='columns')
			.assign(segmentation_id=segmentation_id)
			.join(pd.DataFrame(np.concatenate(registered), columns=registered_columns))
			.join(pd.DataFrame(np.concatenate(unregistered), columns=unregistered_columns))
	)

dataset = pd.concat(segmentations)
dataset.to_csv(FILE, index=False)

In [None]:
import xgboost as xgb
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.model_selection import train_test_split
import re


print(FILE)
dataset = pd.read_csv(FILE)


def f1_loss(y_true, y_pred):
	"""Negated F1 score of the predictions thresholded at 0.5 (lower is better)."""
	return -f1_score(y_true, y_pred > 0.5)


def build_classifier(params):
	"""Create the XGBoost classifier used for both the search and the final fit.

	``params`` must provide 'n_estimators', 'max_depth', 'gamma', 'reg_alpha',
	'min_child_weight', 'colsample_bytree' and 'seed'. ``max_depth`` is cast
	to int because hyperopt's quniform yields floats.
	"""
	return xgb.XGBClassifier(
		use_label_encoder=False,
		eval_metric=f1_loss,
		early_stopping_rounds=10,
		n_estimators=params['n_estimators'],
		max_depth=int(params['max_depth']),
		gamma=params['gamma'],
		reg_alpha=params['reg_alpha'],
		min_child_weight=params['min_child_weight'],
		colsample_bytree=params['colsample_bytree'],
		# The search space always carried a seed; it is now actually passed on.
		random_state=params['seed'],
	)


# hyperopt reads this variable to seed fmin, which makes the search repeatable.
os.environ['HYPEROPT_FMIN_SEED'] = "1"

space = {
	'max_depth': hp.quniform("max_depth", 3, 18, 1),
	'gamma': hp.uniform('gamma', 0, 1),
	'reg_alpha' : hp.uniform('reg_alpha', 0, 1),
	'colsample_bytree' : hp.uniform('colsample_bytree', .4, .8),
	'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
	'n_estimators': 180,
	'seed': 0
}

result = []
# features = r'^unregistered_(discrete_\d+|ids|mbds|max_values)'
features = r'^unregistered_(discrete_\d+)'
result_file = 'unregistered_discrete'

# The selected feature columns do not depend on the group, so resolve them once.
feature_columns = [name for name in dataset.columns if re.match(features, name)]

# One independent search and model per (patient, segmentation) pair.
for patient, segmentation in dataset[['patient_id', 'segmentation_id']].drop_duplicates().to_records(index=False):
	print(f'patient - {patient}, segmentation - {segmentation}')
	dataset_group = dataset.query(f'segmentation_id == {segmentation} and patient_id == {patient}')
	y = dataset_group.label.astype(int).to_numpy()
	X = dataset_group[feature_columns].to_numpy()

	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0, stratify=y)

	# Inverse-frequency weights: negatives are weighted by the share of
	# positives in the training split and positives by the complement.
	neg_class_weight = (y_train == 1).sum() / len(y_train)
	weights = np.where(y_train == 0, neg_class_weight, 1 - neg_class_weight)

	# The scaler is fitted on the training split only and then applied to both splits.
	scaler = MinMaxScaler(feature_range=(0, 1), copy=False)
	X_train = scaler.fit_transform(X_train)
	X_test = scaler.transform(X_test)

	# NOTE(review): the test split is part of eval_set (used with early
	# stopping), drives the hyperopt loss and is also what the reported
	# metrics are computed on, so the scores below are optimistic. Consider
	# carving a separate validation split out of the training data.
	eval_set = [(X_train, y_train), (X_test, y_test)]


	def objective(params):
		"""hyperopt objective: negated F1 on the test split for one sampled configuration."""
		model = build_classifier(params)
		model.fit(X_train, y_train, sample_weight=weights, eval_set=eval_set, verbose=False)
		pred = model.predict(X_test)
		return {'loss': -f1_score(y_test, pred, zero_division=0), 'status': STATUS_OK }


	trials = Trials()
	best_hyperparams = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=400, trials=trials)

	# Fixed entries (n_estimators, seed) come from `space`; every tuned entry
	# is overridden by the best value found.
	model = build_classifier({**space, **best_hyperparams})
	model.fit(X_train, y_train, sample_weight=weights, eval_set=eval_set, verbose=False)
	pred = model.predict(X_test)
	metrics = {
		'patient_id': patient,
		'segmentation_id': segmentation,
		'precision': precision_score(y_test, pred, zero_division=0),
		'recall': recall_score(y_test, pred, zero_division=0),
		'f1': f1_score(y_test, pred, zero_division=0),
		'balanced_accuracy': balanced_accuracy_score(y_test, pred),
		'specificity': specificity(y_test, pred, zero_division=0),
	}
	result.append({**metrics, **best_hyperparams})

# The features come from the 'proportionate' segmentations and the statistics
# cell reads from ./show/proportionate/, so the results are stored there
# (previously they were written under ./show/fixed/).
result_path = f'./show/proportionate/xgboost/{result_file}_no_cv.csv'
# Create the output directory up front so a long run cannot fail on the final write.
os.makedirs(os.path.dirname(result_path), exist_ok=True)
pd.DataFrame(result).to_csv(result_path, index=False)

In [None]:
from scipy import stats

data_file = f'./show/proportionate/xgboost/{result_file}_no_cv.csv'

# One list of balanced accuracies per segmentation; sorting first keeps the
# patients in the same order inside every list.
scores = pd.read_csv(data_file).sort_values(['segmentation_id', 'patient_id'])
transformed = [
	list(values)
	for _, values in scores.groupby('segmentation_id').balanced_accuracy
]

# Friedman test across segmentations, one sample per segmentation.
stats.friedmanchisquare(*transformed)