In [2]:
import os
import math
import numpy as np
import pandas as pd
import argparse

from skfda import FDataGrid
from skfda.representation.basis import BSpline
from skfda.preprocessing.smoothing import BasisSmoother
from skfda.preprocessing.registration import ElasticRegistration, landmark_registration

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from skfda.exploratory.depth import IntegratedDepth, ModifiedBandDepth

from cycler import cycler
from matplotlib import pyplot as plt

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

from sklearn.metrics import (
	precision_score, recall_score, f1_score, balanced_accuracy_score, roc_curve, confusion_matrix
)


agg_columns = ['patient_id', 'slice_id', 'img_type']

n_basis=18
order=4

prc_rm=0.05
n_points=100

basis = BSpline(domain_range=(0, 1), n_basis=n_basis, order=order)
smoother = BasisSmoother(basis=basis, return_basis=True, method='svd')

registration = ElasticRegistration()
ID = IntegratedDepth()
MBD = ModifiedBandDepth()


def get_model():
	return {
		'svm': SVC(class_weight='balanced', probability=True),
		'rf': RandomForestClassifier(class_weight='balanced', n_jobs=4),
		'xgb': xgb.XGBClassifier(tree_method='gpu_hist', eval_metric='logloss', use_label_encoder=False),
	}[MODEL]


def cut_ends(bsplined, order=0, prc_rm_start=prc_rm, prc_rm_end=prc_rm, n_points=n_points):
	bsplined_grid = bsplined.derivative(order=order).to_grid(np.linspace(0, 1, n_points))
	return FDataGrid(
		data_matrix=bsplined_grid.data_matrix[
			..., int(n_points * prc_rm_start): int(n_points * (1 - prc_rm_end)), 0
		],
		grid_points=bsplined_grid.grid_points[0][
			int(n_points * prc_rm_start): int(n_points * (1 - prc_rm_end))
		]
	)


def get_landmark_registration(bsplined, order=0):
	bsplined_grid = cut_ends(bsplined, order)
	landmark_indexes = cut_ends(bsplined, order, prc_rm_end=0.5).data_matrix.argmax(axis=1)
	grid_points = bsplined_grid.grid_points[0]
	landmarks = [grid_points[index] for index in np.concatenate(landmark_indexes)]
	return landmark_registration(bsplined_grid, landmarks)
		


In [2]:
segmentation_id = 4
patients = [3, 6, 7, 29, 66, 76, 108, 119, 133, 140]
bspline_levels = ['patient_id']
min_n_curves = 5

dataset = (
	pd.read_csv(
		f'./segmentations/segmentation-{segmentation_id}.csv',
		dtype={
			'img_type': int,
			'patient_id': int,
			'cycle_id': int,
			'slice_id': int,
			'label': bool,
			'mask_int_mean': float,
			'segment': int,
		},
	)
		.drop_duplicates()
		.sort_values(agg_columns + ['cycle_id'])
)
# dataset = dataset[dataset.patient_id.apply(lambda x: x in patients)]
dataset = dataset.merge(dataset.query('label').patient_id.drop_duplicates())
ts = (
	dataset[bspline_levels + ['cycle_id']].drop_duplicates()
		.groupby(bspline_levels).cycle_id.count()
		.apply(lambda x: np.linspace(0, 1, int(x)))
		.reset_index()
)

dataset = dataset.groupby(agg_columns + ['label']).mask_int_mean.apply(list).reset_index()
labels = dataset.groupby(bspline_levels).label.apply(list).reset_index()
dataset = dataset.groupby(bspline_levels).mask_int_mean.apply(list).reset_index().merge(ts)
dataset = dataset[dataset.mask_int_mean.apply(lambda x: len(x) >= min_n_curves)]
dataset['bsplined'] = dataset.apply(
	lambda x: smoother.fit_transform(
		FDataGrid(data_matrix=x['mask_int_mean'], grid_points=x['cycle_id'])
	),
	axis='columns',
)
dataset['bsplined'] = dataset.bsplined.apply(get_landmark_registration, order=1)
dataset = dataset[bspline_levels + ['bsplined']]
dataset = dataset.merge(labels)


In [None]:
def print_patient(row, segmentation_id):
	colors = ['red' if x else 'green' for x in row['label']]
	alphas = [1 if x else 0.1 for x in row['label']]
	color_cycle = cycler(color=colors, alpha=alphas)

	fig = plt.figure(figsize=(11, 7))
	ax = plt.axes(xlabel='t')
	ax.set_prop_cycle(color_cycle)
	row['bsplined'].plot(fig=fig, ax=ax)
	name = f'./patient_plots/patient-{row["patient_id"]}-segment-{segmentation_id}.png'
	if 'slice_id' in dataset.columns:
		name = f'./patient_slice_plots/patient-{row["patient_id"]}-slice{row["slice_id"]}-segment-{segmentation_id}.png'
	plt.savefig(name, dpi=150)
	plt.close()


x = dataset.apply(print_patient, segmentation_id=segmentation_id, axis='columns')

In [10]:
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb


k_folds = StratifiedKFold(n_splits=5)
train_n_points = 100


def get_descrete_points(fd_smooth):
	t_cut = np.linspace(
		start=0,
		stop=fd_smooth.data_matrix.shape[1] - 1,
		num=train_n_points, endpoint=True, dtype=int,
	)
	return fd_smooth.data_matrix[:, t_cut, 0]


def train_kfolds(Y, X, segmentation_id):
	ax = None
	roc_scores = []

	for train_index, test_index in k_folds.split(X, Y):
		print(train_index, test_index)
		X_train, X_test = X[train_index], X[test_index]
		y_train, y_test = y[train_index], y[test_index]

		model =  xgb.XGBClassifier(
			tree_method='gpu_hist', eval_metric='logloss', use_label_encoder=False,
		)
		model.fit(X_train, y_train)
		preds = model.predict_proba(X_test)[:,1]

		fpr, tpr, thresholds = roc_curve(y_test, preds)
		roc_scores = pd.DataFrame(
			{
				'fpr': fpr,
				'tpr': tpr,
				'threshold': thresholds,
				'fold': i,
				'segmentation_id': segmentation_id,
			}
		)
		if not os.path.exists(f'./roc/{MODEL}.csv'):
			roc_scores.to_csv(f'./roc/{MODEL}.csv', index=False)
		else:
			roc_scores.to_csv(f'./roc/{MODEL}.csv', index=False, mode='a', header=False)
		ax = sns.scatterplot(x=fpr, y=tpr, ax=ax)
		preds = np.reshape(preds >= 0.5, (-1)).astype(int)

		f1s += [f1_score(y_test, preds, zero_division=0)]
		precisions += [precision_score(y_test, preds, zero_division=0)]
		recalls += [recall_score(y_test, preds, zero_division=0)]
		accuracies += [balanced_accuracy_score(y_test, preds)]
		specificities += [specificity(y_test, preds)]

		preds = model.predict(X_train)
		preds = np.reshape(preds >= 0.5, (-1)).astype(int)
		train_f1s += [f1_score(y_train, preds, zero_division=0)]
		train_precisions += [precision_score(y_train, preds, zero_division=0)]
		train_recalls += [recall_score(y_train, preds, zero_division=0)]
		train_accuracies += [balanced_accuracy_score(y_train, preds)]
		train_specificities += [specificity(y_train, preds)]

	ax.set_ylabel('False positive rate')
	ax.set_xlabel('True positive rate')
	plt.legend(labels=['TPR x FPR', 'threshold'] * 5)
	plt.savefig(f'./roc/{MODEL}_segmentation-{segmentation_id}.png')
	plt.close()


def process_patient(patient):
	print('getting depths ...')
	ids = ID(patient['fd_smooth']).reshape(-1, 1)
	mbds = MBD(patient['fd_smooth']).reshape(-1, 1)
	print('getting other features ...')
	max_values = patient['fd_smooth'].data_matrix.max(axis=1).reshape(-1, 1)
	descret_points = get_descrete_points(patient['fd_smooth'])

	train_set = np.hstack([ids, mbds, max_values, descret_points])
	labels = patient['label'].astype(int).to_numpy()
	train_kfolds(train_set, labels, segmentation_id)


In [4]:
# for segmentation_id in range(0, 25):
for segmentation_id in [4]:
	print(segmentation_id)
	file = f'./segmentations/segmentation-{segmentation_id}.csv'
	dataset = (
	pd.read_csv(
		file,
		dtype={
			'img_type': int,
			'patient_id': int,
			'cycle_id': int,
			'slice_id': int,
			'label': bool,
			'mask_int_mean': float,
			'segment': int,
		},
	)
	.drop_duplicates()
	.sort_values(agg_columns + ['cycle_id'])
	)
	dataset = dataset.merge(dataset.query('label').patient_id.drop_duplicates())
	dataset = dataset[dataset.patient_id.apply(lambda x: x not in [3, 6, 29, 86, 108, 119, 140])]
	ts = (
		dataset[['patient_id', 'cycle_id']].drop_duplicates()
			.groupby('patient_id').cycle_id.count()
			.apply(lambda x: np.linspace(0, 1, int(x)))
			.reset_index()
	)

	dataset = dataset.groupby(agg_columns + ['label']).mask_int_mean.apply(list).reset_index()
	bsplined = dataset.groupby('patient_id').mask_int_mean.apply(list).reset_index().merge(ts)
	bsplined = bsplined.apply(
		lambda x: smoother.fit_transform(
			FDataGrid(data_matrix=x['mask_int_mean'], grid_points=x['cycle_id'])
		),
		axis='columns',
	)
	print('registering ...')
	bsplined = [get_landmark_registration(fd_smooth, 1) for fd_smooth in bsplined]

	patients = dataset.groupby('patient_id').label.apply(list).reset_index()
	patients['fd_smooth'] = bsplined
	patients.apply(
		process_patient,
		segmentation_id=segmentation_id,
		axis='columns',
	)


4
registering ...


In [6]:
len(bsplined)

28