In [None]:
import os
import math
import cv2 as cv
import csv
import numpy as np
import pandas as pd
import seaborn as sns
import itertools

# from ftplib import FTP_TLS
# from cycler import cycler

# from matplotlib.cm import cool

# from skimage.segmentation import slic, mark_boundaries
# from skimage.color import label2rgb

from skfda.exploratory.visualization import Boxplot
from skfda.ml.classification import KNeighborsClassifier

# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.linear_model import LogisticRegression





In [None]:
from matplotlib import pyplot as plt
%matplotlib ipympl


# Draw every figure with matplotlib's dark theme; the commented-out lines below
# are alternative styles kept for quick switching.
plt.style.use('dark_background')
# plt.style.use('default')
# plt.style.use('seaborn-dark')

In [1]:
import os
import numpy as np
import pandas as pd
import math

from cycler import cycler
from matplotlib import pyplot as plt

from skfda import FDataGrid
from skfda.representation.basis import BSpline
from skfda.preprocessing.smoothing import BasisSmoother
from skfda.preprocessing.registration import ElasticRegistration, landmark_registration

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from skfda.exploratory.depth import IntegratedDepth, ModifiedBandDepth, BandDepth
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.metrics import (
	precision_score, recall_score, f1_score, balanced_accuracy_score, confusion_matrix
)

## FOLDER SETUP

In [None]:
# Folder layout used by the helpers below. Every value ends with '/' because the
# helpers build file names by plain string concatenation (e.g. f'{path_norm}{patient}').
path_original =  './data/'                        # per-patient source data (biopsy masks are read from here)
path_processed = './processed/'                   # root of all derived data
path_norm =      './processed/full_normalized/'   # '<cycle>-<slice>.png' images per patient
path_segments =  './processed/segmented/'         # SLIC overlays are written here

# Per-patient mask folders, one '<slice>.png' per slice.
path_maskP =     './processed/mask_prostate/'
path_maskC =     './processed/mask_cancer/'
path_maskB =     './processed/mask_biopsy/'       # NOTE(review): not referenced in the visible part of the notebook


## SLIC

### General

In [4]:
def get_cycles(patient, slice_no, path_norm):
	"""List the cycle ids that have an image for the given slice.

	Files in ``{path_norm}{patient}`` are expected to be named
	``<cycle>-<slice>.<ext>``; files without a '-' are ignored.

	Args:
		patient: Patient folder name (appended to ``path_norm``).
		slice_no: Slice number to look for (compared as a string).
		path_norm: Folder prefix containing the per-patient folders.

	Returns:
		list[int]: Cycle ids in directory-listing order (not sorted).
	"""
	wanted_slice = str(slice_no)
	cycle_ids = []
	for entry in os.listdir(f'{path_norm}{patient}'):
		if '-' not in entry:
			continue
		stem_parts = os.path.splitext(entry)[0].split('-')
		if stem_parts[1] == wanted_slice:
			cycle_ids.append(int(stem_parts[0]))
	return cycle_ids



### SLIC functions

In [5]:

def slic_slice(
	patient, slice_no, n_segments, to_number=False
):
	"""Run SLIC segmentation on every cycle image of one patient slice.

	Args:
		patient: Integer patient id; used as-is for folder names and zero-padded
			to three digits for the biopsy files.
		slice_no: Slice number within the patient.
		n_segments: Number of SLIC segments requested.
		to_number: When True, segment ids are drawn onto the overlay images.

	Returns:
		None when the prostate mask of the slice is completely black, otherwise a
		list with one ``slic_cycle`` result per cycle, ordered by cycle id.
	"""
	# Biopsy data is keyed by the zero-padded patient id ('007', '042', '123').
	patient_str = str(patient).zfill(3)
	prostate_mask = cv.imread(f'{path_maskP}/{patient}/{slice_no}.png', cv.IMREAD_GRAYSCALE)
	if np.all(prostate_mask == 0):
		return None
	cancer_mask = cv.imread(f'{path_maskC}/{patient}/{slice_no}.png', cv.IMREAD_GRAYSCALE)
	benign, malignant, undetermined = get_biopsy_masks(patient_str, slice_no)
	palette = sns.color_palette(None, n_segments)

	# Create the output folders individually so that a partially created tree
	# (parent present, 'contour' or 'filled' missing) is repaired as well.
	output_dir = f'{path_segments}{patient}-{n_segments}'
	for sub_dir in ('contour', 'filled'):
		os.makedirs(f'{output_dir}/{sub_dir}', exist_ok=True)

	cycle_ids = get_cycles(patient, slice_no, path_norm)
	return [
		slic_cycle(
			patient, slice_no, cycle_id, n_segments, palette,
			prostate_mask, cancer_mask, benign, malignant, undetermined, to_number,
		)
		for cycle_id in sorted(cycle_ids)
	]


def get_biopsy_masks(patient, slice_no):
	"""Build binary (0/255) biopsy masks for one slice, split by biopsy type.

	The biopsy mask image stores a biopsy id per pixel (0 = no biopsy) and
	``./biopsies/{patient}.csv`` maps each id to its type. The type values in
	the queries carry a leading space (' Malignant', ' Benign').

	Args:
		patient: Patient id as used in the data folder and CSV file name.
		slice_no: Slice number.

	Returns:
		tuple: ``(benign, malignant, undetermined)`` masks with the shape and
		dtype of the biopsy mask image. ``undetermined`` contains every biopsy
		that is neither malignant nor benign. Pixel ids that do not appear in
		the CSV stay set in all three masks.
	"""
	biopsy_mask = cv.imread(
		f'{path_original}{patient}/biopsyMask/{slice_no}.png', cv.IMREAD_ANYDEPTH
	)
	biopsies = pd.read_csv(
		f'./biopsies/{patient}.csv',
		header=None,
		names=['biopsy_id', 'type', 'a', 'b'],
	)

	def mask_without(excluded_query):
		# Blank out the biopsies selected by the query, then binarise the rest.
		mask = biopsy_mask.copy()
		for biopsy in biopsies.query(excluded_query)['biopsy_id'].to_list():
			mask[mask == biopsy] = 0
		mask[mask != 0] = 255
		return mask

	malignant = mask_without("type != ' Malignant'")
	benign = mask_without("type != ' Benign'")
	# Previously this repeated the benign filter, which made the undetermined
	# mask an exact copy of the benign one.
	undetermined = mask_without("type in [' Malignant', ' Benign']")
	return benign, malignant, undetermined


def slic_cycle(
	patient, slice_no, cycle_id, n_segments, palette,
	prostate_mask, cancer_mask, benign, malignant, undetermined,
	to_number=False,
):
	"""Segment one cycle image with SLIC, classify the segments and save an overlay.

	Segments are classified by the share of their pixels covered by the cancer
	mask and by the biopsy masks (see ``color_segments``):

	* confirmed cancer: cancer share > 0.5 and some malignant biopsy pixels (red)
	* unconfirmed cancer: cancer share > 0.5 and no malignant biopsy pixels (orange)
	* cancer share <= 0.5: outlined in white, ids not returned
	* benign zones: no cancer pixels and no malignant/undetermined biopsy pixels (green)

	NOTE(review): ``slic`` and ``mark_boundaries`` come from
	``skimage.segmentation``; that import is commented out in the import cell,
	so it must be restored before this function can run on a fresh kernel.

	Args:
		patient, slice_no, cycle_id: Identify the image
			``{path_norm}{patient}/{cycle_id}-{slice_no}.png``.
		n_segments: Number of SLIC segments requested.
		palette: Unused; only needed by the disabled "filled" rendering.
		prostate_mask: Mask restricting the segmentation.
		cancer_mask: 0/255 cancer mask.
		benign, malignant, undetermined: 0/255 biopsy masks.
		to_number: When True, segment ids are written onto the overlay.

	Returns:
		tuple: ``(segments, cycle_id, confirmed_cancer, unconfirmed_cancer,
		benign_zones)`` where the last three are lists of segment ids.
	"""
	cycle_slice_i = f'{cycle_id}-{slice_no}.png'
	slice_i = cv.imread(f'{path_norm}{patient}/{cycle_slice_i}')
	segments = slic(
		slice_i,
		n_segments=n_segments,
		compactness=7,
		start_label=1,
		mask=prostate_mask,
	)

	# Each call draws on the image returned by the previous one, so the order matters.
	slice_i, confirmed_cancer = color_segments(
		cycle_id, n_segments, cancer_mask, malignant, slice_i, segments,
		(0.5, 1), (0, 1), (1, 0, 0), to_number,
	)
	slice_i, unconfirmed_cancer = color_segments(
		cycle_id, n_segments, cancer_mask, malignant, slice_i, segments,
		(0.5, 1), (-1, 0), (1, 0.5, 0), to_number,
	)
	slice_i, _ = color_segments(
		cycle_id, n_segments, cancer_mask, malignant, slice_i, segments,
		(-1, 0.5), (-1, 1), (1, 1, 1), to_number,
	)
	# Union of the two 0/255 masks. Adding them would give 510 (or wrap around to
	# 254 for uint8) where they overlap, and color_segments would then no longer
	# count those pixels as covered.
	slice_i, benign_zones = color_segments(
		cycle_id, n_segments, cancer_mask, np.maximum(malignant, undetermined),
		slice_i, segments,
		(-1, 0), (-1, 0), (0, 1, 0), to_number,
	)
	slice_i = mark_boundaries(slice_i, cancer_mask, color=(0, 1, 1))
	slice_i = mark_boundaries(slice_i, malignant, color=(1, 0, 0))
	slice_i = mark_boundaries(slice_i, benign, color=(0, 1, 0))
	slice_i = mark_boundaries(slice_i, undetermined, color=(0, 0, 1))
	save_image(
		mark_boundaries(slice_i, cancer_mask, color=(1, 0, 1)),
		f'{path_segments}{patient}-{n_segments}/contour/{cycle_id}-{slice_no}.png',
	)
	return (segments, cycle_id, confirmed_cancer, unconfirmed_cancer, benign_zones)


def color_segments(
	cycle_id, n_segments, cancer_mask, biopsy, image, segments,
	cancer_coverage=(0.8, 1), biopsy_coverage=(0, 1), to_color=(1, 0, 0),
	to_number=False,
):
	"""Outline the segments whose cancer and biopsy coverage fall in given ranges.

	For every segment id in ``1..n_segments`` the share of its pixels that lie
	in ``cancer_mask`` and in ``biopsy`` (both 0/255 masks) is computed. A
	segment is selected when both shares are inside their half-open interval
	``(low, high]``.

	Args:
		cycle_id: Unused.
		n_segments: Highest segment id to inspect.
		cancer_mask: 0/255 cancer mask.
		biopsy: 0/255 biopsy mask.
		image: Image to draw on.
		segments: Label image (0 = background).
		cancer_coverage: ``(low, high]`` bounds for the cancer share.
		biopsy_coverage: ``(low, high]`` bounds for the biopsy share.
		to_color: Outline colour passed to ``mark_boundaries``.
		to_number: When True, the segment id is written at the median pixel
			position of the segment.

	Returns:
		tuple: ``(annotated image, list of selected segment ids)``. The input
		image object itself is returned when no segment is selected.
	"""
	annotated = image
	selected_ids = []
	for segment_id in range(1, n_segments + 1):
		# Only the pixel counts are needed; the image is indexed to obtain them.
		segment_size = image[np.where(segments == segment_id)].shape[0]
		cancer_size = image[np.where(cancer_mask * segments / 255 == segment_id)].shape[0]
		biopsy_size = image[np.where(biopsy * segments / 255 == segment_id)].shape[0]
		if segment_size == 0:
			continue
		if not (cancer_coverage[0] < cancer_size / segment_size <= cancer_coverage[1]):
			continue
		if not (biopsy_coverage[0] < biopsy_size / segment_size <= biopsy_coverage[1]):
			continue

		selected_ids.append(segment_id)
		annotated = mark_boundaries(annotated, segments == segment_id, color=to_color)
		if not to_number:
			continue
		rows, cols = zip(*np.argwhere(segments == segment_id))
		x = np.quantile(rows, 0.5)
		y = np.quantile(cols, 0.5)
		# OpenCV expects the text origin as (column, row).
		cv.putText(
			annotated, str(segment_id), (int(y), int(x)),
			fontFace=cv.FONT_HERSHEY_SCRIPT_SIMPLEX,
			fontScale=0.35,
			color=(0, 1, 1),
		)

	return annotated, selected_ids


def save_image(image, path):
	"""Write an image to ``path`` as a borderless matplotlib figure at 150 dpi.

	Args:
		image: Array accepted by ``Axes.imshow``.
		path: Output file path.
	"""
	figure = plt.figure(frameon=False)
	# One axes object covering the whole figure, with ticks and frame hidden.
	canvas = plt.Axes(figure, [0., 0., 1., 1.])
	figure.add_axes(canvas)
	canvas.set_axis_off()
	canvas.imshow(image)
	figure.savefig(path, dpi=150)
	plt.close()



### Functions for SLIC region means

In [6]:
def segments_means_foreach_slice(
	patient, slice_no, segments, n_segments,
	confirmed_cancer, unconfirmed_cancer, benign_zones,
	agg_func=np.median, normalize_to=None, drop_start_by=0,drop_end_by=0,
	segmented_in_cycle=None
):
	"""Aggregate the intensity of every SLIC segment in every cycle of a slice.

	Args:
		patient, slice_no: Identify the images
			``{path_norm}{patient}/<cycle>-{slice_no}.png``.
		segments: Label image used for all cycles.
		n_segments: Highest segment id to evaluate.
		confirmed_cancer, unconfirmed_cancer, benign_zones: Collections of
			segment ids used to flag each row.
		agg_func: Aggregation applied to the pixels of a segment (see
			``get_segments_mean``).
		normalize_to: When truthy, the values of each segment are divided by
			that segment's maximum over all cycles and multiplied by this
			number. Requires ``agg_func`` to return numeric scalars.
		drop_start_by, drop_end_by: Currently unused.
		segmented_in_cycle: Stored unchanged in the last field of every row.

	Returns:
		list[tuple]: One tuple per (segment, cycle) with the fields
		``(img_type, patient_id, cycle_id, slice_id, mask_int_mean,
		confirmed_cancer, unconfirmed_cancer, benign, segment)`` where
		``img_type`` is the segment id and ``segment`` is ``segmented_in_cycle``.
	"""
	cycles = get_cycles(patient, slice_no, path_norm)
	slices = {
		cycle_id: cv.imread(f'{path_norm}{patient}/{cycle_id}-{slice_no}.png')
		for cycle_id in sorted(cycles)
	}
	segment_means = [
		(
			segment_id, patient, cycle_id, slice_no,
			get_segments_mean(slices[cycle_id], segment_id, segments, cycle_id, agg_func),
			segment_id in confirmed_cancer,
			segment_id in unconfirmed_cancer,
			segment_id in benign_zones,
			segmented_in_cycle,
		)
		for segment_id, cycle_id  in itertools.product(
			range(1, n_segments + 1),
			cycles
		)
	]
	if normalize_to:
		# The rows are plain tuples (not a DataFrame), so the per-segment
		# normalisation is done directly on them. Missing values (None / NaN)
		# are left untouched and ignored when looking for the maximum.
		value_pos = 4
		peaks = {}
		for row in segment_means:
			value = row[value_pos]
			if value is None or value != value:
				continue
			if row[0] not in peaks or value > peaks[row[0]]:
				peaks[row[0]] = value
		segment_means = [
			row
			if row[value_pos] is None or row[value_pos] != row[value_pos]
			else (
				row[:value_pos]
				+ (row[value_pos] / peaks[row[0]] * normalize_to,)
				+ row[value_pos + 1:]
			)
			for row in segment_means
		]
	return segment_means


def get_segments_mean(slice_i, seg_i, segments, cycle_id, agg_func=np.median):
	"""Aggregate the pixels of one segment of an image.

	Args:
		slice_i: Image array.
		seg_i: Segment id to select.
		segments: Label image aligned with ``slice_i``.
		cycle_id: Unused.
		agg_func: Callable applied to the selected pixels. When falsy, the raw
			pixel values are returned as a comma-separated string instead.

	Returns:
		None when the segment has no pixels, otherwise the aggregated value or
		the comma-separated pixel string.
	"""
	pixels = slice_i[np.where(segments == seg_i)]
	if not pixels.shape[0]:
		return None
	if not agg_func:
		return ','.join(pixels.reshape(-1).astype('str'))
	return agg_func(pixels)


def add_differentials(segment_means, src_col='mask_int_mean', dest_col='differential'):
	"""Add the cycle-to-cycle change of ``src_col`` for every intensity series.

	A series is identified by (patient_id, slice_id, segment, img_type) and is
	ordered by ``cycle_id``. The first cycle of each series has no predecessor
	and gets a missing value.

	Args:
		segment_means: DataFrame with the key columns above, ``cycle_id`` and
			``src_col``.
		src_col: Column to differentiate.
		dest_col: Name of the column that receives the differences.

	Returns:
		A sorted copy of ``segment_means`` with ``dest_col`` added.
	"""
	series_keys = ['patient_id', 'slice_id', 'segment', 'img_type']
	ordered = segment_means.sort_values(series_keys + ['cycle_id'])
	# Temporarily store the previous cycle's value in the destination column.
	ordered[dest_col] = ordered.groupby(series_keys)[src_col].shift(1)

	def step_change(row):
		previous = row[dest_col]
		# NaN is the only value that differs from itself: no previous cycle.
		if previous != previous:
			return None
		return row[src_col] - previous

	ordered[dest_col] = ordered.apply(step_change, axis='columns')
	return ordered




## B-spline

In [None]:

# Share of the [0, 1] domain trimmed from each end of a curve (see cut_ends)
# and the number of grid points used when curves are evaluated.
prc_rm=0.05
n_points=100
# Evaluation grid that already excludes the trimmed ends (used by add_registrations).
descret_points = np.linspace(0 + prc_rm, 1 - prc_rm, n_points)

# Shared scikit-fda helpers: elastic registration and the two depth measures.
registration = ElasticRegistration()
ID = IntegratedDepth()
MBD = ModifiedBandDepth()

# Shared scikit-learn helpers. They are module-level objects that get re-fitted
# on every fit_transform call made by the functions below.
scaler = StandardScaler()
l_enc = LabelEncoder()

def cut_ends(bsplined, order=0, prc_rm_start=prc_rm, prc_rm_end=prc_rm, n_points=n_points):
	"""Evaluate a derivative of the curves on a grid and trim both ends.

	Args:
		bsplined: Functional data object providing ``derivative`` and ``to_grid``.
		order: Derivative order (0 = the curves themselves).
		prc_rm_start: Share of grid points dropped at the start.
		prc_rm_end: Share of grid points dropped at the end.
		n_points: Number of points of the evaluation grid on [0, 1].

	Returns:
		FDataGrid: The trimmed curves on the remaining grid points.
	"""
	sampled = bsplined.derivative(order=order).to_grid(np.linspace(0, 1, n_points))
	first = int(n_points * prc_rm_start)
	last = int(n_points * (1 - prc_rm_end))
	return FDataGrid(
		data_matrix=sampled.data_matrix[..., first:last, 0],
		grid_points=sampled.grid_points[0][first:last],
	)


def agg_fdatas(
	bsplined, agg_to=['patient_id', 'slice_id', 'segment'], add_segment_ids=False,
):
	"""Collect the smoothed curves of all regions into one object per group.

	Rows are grouped by ``segment`` (and by ``slice_id`` when it is listed in
	``agg_to``). For every group the curves are concatenated and the colour
	and region id of every curve are stored as parallel lists.

	Side effect: a ``color`` column is added to the ``bsplined`` frame passed
	in ('red' = confirmed cancer, 'orange' = unconfirmed cancer,
	'green' = benign, 'white' = none of these).

	Args:
		bsplined: DataFrame with one row per region, including ``fd_smooth``,
			``img_type`` and the three boolean flag columns.
		agg_to: Columns that identify an aggregated row.
		add_segment_ids: When True, ``segment_ids`` and ``slice_ids`` columns
			are added with one list per distinct segment.

	Returns:
		DataFrame with the ``agg_to`` columns plus ``fd_smooth``, ``color`` and
		``segment_region`` (and the optional id columns).
	"""
	per_slice = 'slice_id' in agg_to
	color_rules = (
		('confirmed_cancer', 'red'),
		('unconfirmed_cancer', 'orange'),
		('benign', 'green'),
	)

	def group_filter(group):
		# Query expression selecting the source rows that belong to one group.
		expression = f'segment == {group["segment"]}'
		if per_slice:
			expression += f' and slice_id == {group["slice_id"]}'
		return expression

	def pick_color(row):
		# First matching flag wins; rows without any flag are white.
		hits = [color for flag, color in color_rules if row[flag]]
		return hits[0] if hits else 'white'

	def smoothed_values(group, column):
		# Values of one column for the group's rows that have a smoothed curve.
		with_curve = bsplined[~bsplined.fd_smooth.isnull()]
		return with_curve.query(group_filter(group))[column].tolist()

	bsplined_agg = bsplined[agg_to].drop_duplicates()
	bsplined_agg['fd_smooth'] = bsplined_agg.apply(
		lambda group: concat_fdatas(
			bsplined.query(group_filter(group)).fd_smooth.tolist()
		),
		axis='columns',
	)
	bsplined['color'] = bsplined.apply(pick_color, axis='columns')
	bsplined_agg['color'] = bsplined_agg.apply(
		lambda group: smoothed_values(group, 'color'), axis='columns',
	)
	bsplined_agg['segment_region'] = bsplined_agg.apply(
		lambda group: smoothed_values(group, 'img_type'), axis='columns',
	)
	if add_segment_ids:
		def per_segment(frame, column):
			return [
				bsplined[~bsplined.fd_smooth.isnull()]
					.query(f'segment == {segment}')[column].tolist()
				for segment in frame.segment.drop_duplicates()
			]

		bsplined_agg = bsplined_agg.assign(
			segment_ids=lambda frame: per_segment(frame, 'img_type'),
			slice_ids=lambda frame: per_segment(frame, 'slice_id'),
		)
	return bsplined_agg


def concat_fdatas(fd_smooths):
	"""Concatenate functional data objects, skipping falsy entries.

	Args:
		fd_smooths: Iterable of objects providing ``concatenate``; falsy
			entries (e.g. None) are ignored.

	Returns:
		The concatenation of all truthy entries in order, or None when there
		are none.
	"""
	merged = None
	for piece in fd_smooths:
		if not piece:
			continue
		merged = piece if not merged else merged.concatenate(piece)

	return merged


def add_registrations(fdatas, order=0):
	"""Add an elastically registered version of every curve set.

	Args:
		fdatas: DataFrame with an ``fd_smooth`` column; modified in place.
		order: Derivative order that is registered.

	Returns:
		The same DataFrame with the column ``registration_order_<order>`` added.
	"""
	def register(curves):
		on_grid = curves.derivative(order=order).to_grid(descret_points)
		return registration.fit_transform(on_grid)

	column = 'registration_order_' + str(order)
	fdatas[column] = fdatas.fd_smooth.apply(register)
	return fdatas


def add_depths(fdatas, order=0):
	"""Add standardised integrated and modified band depths of every curve set.

	The depths are computed on the landmark-registered derivative of the given
	order and standardised per curve set with the shared ``scaler``.

	Args:
		fdatas: DataFrame with an ``fd_smooth`` column; modified in place.
		order: Derivative order used for the depth computation. It was
			previously only used in the column names while the computation was
			fixed to the first derivative.

	Returns:
		The same DataFrame with the columns ``ID_order<order>`` and
		``MBD_order<order>`` added (each value is an ``(n_curves, 1)`` array).
	"""
	def scaled_depth(depth, curves):
		registered = get_landmark_registration(curves, order)
		return scaler.fit_transform(depth(registered).reshape(-1, 1))

	fdatas['ID_order' + str(order)] = fdatas.fd_smooth.apply(
		lambda y: scaled_depth(ID, y)
	)
	fdatas['MBD_order' + str(order)] = fdatas.fd_smooth.apply(
		lambda y: scaled_depth(MBD, y)
	)
	return fdatas

def get_landmark_registration(bsplined, order=0):
	"""Align curves on their maximum within the first part of the domain.

	The landmark of every curve is the grid point of its maximum, searched on
	the trimmed grid with the last 50 % of the grid points removed.

	Args:
		bsplined: Functional data object accepted by ``cut_ends``.
		order: Derivative order to register.

	Returns:
		The result of ``landmark_registration`` applied to the trimmed curves.
	"""
	trimmed = cut_ends(bsplined, order)
	# Both grids start at the same point, so an index found on the shorter grid
	# is valid on the full trimmed grid as well.
	peak_positions = cut_ends(bsplined, order, prc_rm_end=0.5).data_matrix.argmax(axis=1)
	time_axis = trimmed.grid_points[0]
	peak_times = [time_axis[position] for position in np.concatenate(peak_positions)]
	return landmark_registration(trimmed, peak_times)



In [4]:
# B-spline basis used for smoothing: number of basis functions and spline order.
n_basis=18
order=4
# RGB values for the colour names assigned to curves (see agg_fdatas).
palette = {'red': (1, 0, 0), 'orange': (1, 0.5, 0), 'green': (0, 1, 0), 'white': (1, 1, 1)}

basis = BSpline(domain_range=(0, 1), n_basis=n_basis, order=order)
smoother = BasisSmoother(basis=basis, return_basis=True, method='svd')

# Default plots of one row of the overview image. Each entry is a pair
# (plot callable, title suffix); the callable is invoked with the keyword
# arguments x, fig, ax, color_labels and color (see create_bsplined_plot), which
# is why every lambda accepts **kwargs.
DEFAULT_ROW_CONF = [
	# smoothed curves as they are
	(lambda x, fig, ax, **kwargs: x.plot(fig=fig, ax=ax), ''),
	# first derivative with trimmed ends
	(lambda x, fig, ax, **kwargs: cut_ends(x, 1).plot(fig=fig, ax=ax), 'derivative=1'),
	# first derivative after landmark registration
	(
		lambda x, fig, ax, **kwargs: get_landmark_registration(x, 1).plot(fig=fig, ax=ax),
		'derivative=1, registered'
	),
	# functional boxplot of the first derivative
	(
		lambda x, fig, ax, **kwargs: plot_func_boxplot(cut_ends(x, 1), fig, ax),
		'derivative=1, depth',
	),
	# functional boxplot of the registered first derivative
	(
		lambda x, fig, ax, **kwargs: plot_func_boxplot(
			get_landmark_registration(x, 1), fig, ax
		),
		'derivative=1, depth registered',
	),
]


def bspline_patient(
	patient, slice_no, segments, n_segments,
	confirmed_cancer, unconfirmed_cancer, benign_zones,
):
	"""Smooth the mean-intensity curve of every SLIC region with B-splines.

	Args:
		patient, slice_no: Identify the slice.
		segments: Label image of the slice.
		n_segments: Highest segment id to evaluate.
		confirmed_cancer, unconfirmed_cancer, benign_zones: Collections of
			segment ids used to flag the regions.

	Returns:
		DataFrame with one row per region holding its metadata and the smoothed
		curve in ``fd_smooth`` (None when all intensities are missing or zero).
	"""
	patient_data = segments_means_foreach_slice(
		patient, slice_no, segments, n_segments,
		confirmed_cancer, unconfirmed_cancer, benign_zones,
		lambda x: x.mean(),
	)
	# segments_means_foreach_slice returns plain tuples; the column-based code
	# below needs a DataFrame.
	if not isinstance(patient_data, pd.DataFrame):
		patient_data = pd.DataFrame(
			patient_data,
			columns=[
				'img_type', 'patient_id', 'cycle_id', 'slice_id', 'mask_int_mean',
				'confirmed_cancer', 'unconfirmed_cancer', 'benign', 'segment',
			],
		)
	regions = patient_data['img_type'].drop_duplicates().tolist()
	processed_patient_data = []
	for region in regions:
		region_data = (
			patient_data.query(f'img_type == {region}')
				.sort_values('cycle_id').explode('mask_int_mean').reset_index()
		)
		# Rescale the cycle ids so that the last cycle is at t = 1.
		t = (region_data.cycle_id / region_data.cycle_id.max()).tolist()
		intensities = region_data['mask_int_mean'].tolist()
		fd_smooth = None
		if np.any(intensities):
			fd = FDataGrid(data_matrix=intensities, grid_points=t)
			fd_smooth = smoother.fit_transform(fd)
		# After dropping the per-cycle columns a single metadata row is left.
		processed_patient_data.append(
			region_data
				.drop(['cycle_id', 'mask_int_mean', 'index'], axis='columns')
				.drop_duplicates()
				.assign(fd_smooth=[fd_smooth])
		)

	return pd.concat(processed_patient_data)


def plot_bsplined(
	bsplined, n_segments, segmentations=None,
	temp_file_name='./temp.png',
	row_config=DEFAULT_ROW_CONF,
	filename_suff='',
):
	"""Render one row of plots per segmentation and save them as a single image.

	Args:
		bsplined: Aggregated DataFrame (one row per segmentation).
		n_segments: Number of SLIC segments; part of the file names.
		segmentations: Optional collection of segmentation ids to plot; None or
			an empty collection plots all of them.
		temp_file_name: Scratch file used while rendering the single plots.
		row_config: List of (plot callable, title suffix) pairs.
		filename_suff: Suffix appended to the output file name.
	"""
	plot_everything = segmentations is None or len(segmentations) == 0
	segmentation_ids = [
		candidate
		for candidate in bsplined.segment.drop_duplicates().tolist()
		if plot_everything or candidate in segmentations
	]
	print(f'total segmentations = {len(segmentation_ids)}')
	rows = []
	for segmentation in segmentation_ids:
		rows.append(
			create_bsplined_row(segmentation, bsplined, n_segments, temp_file_name, row_config)
		)
	final_image = np.concatenate(rows, axis=0)
	patient, slice_id = get_patient_slice(bsplined)
	output_path = f'./plots_v3/segmented-{patient}-{slice_id}-{n_segments}{filename_suff}.png'
	cv.imwrite(output_path, final_image)


def create_bsplined_row(
	segmentation, bsplined, n_segments,
	temp_file_name='./temp.png', row_config=DEFAULT_ROW_CONF
):
	"""Build one image row: all configured plots followed by the SLIC contour.

	Args:
		segmentation: Segmentation id to plot.
		bsplined: Aggregated DataFrame.
		n_segments: Number of SLIC segments; part of the contour folder name.
		temp_file_name: Scratch file used while rendering the plots.
		row_config: List of (plot callable, title suffix) pairs.

	Returns:
		Image array with the plots and the contour image placed side by side.
	"""
	print(f'processing {segmentation} ...')
	tiles = []
	for plot_func, title in row_config:
		tiles.append(
			create_bsplined_plot(segmentation, bsplined, plot_func, temp_file_name, title)
		)
	patient, slice_id = get_patient_slice(bsplined)
	contour_path = (
		f'{path_segments}{patient}-{n_segments}/contour/{segmentation}-{slice_id}.png'
	)
	# Resize the contour so that it can be concatenated with the plot tiles.
	contour = cv.resize(cv.imread(contour_path), (1575, 1050))
	tiles.append(contour)
	return np.concatenate(tiles, axis=1)


def create_bsplined_plot(
	segmentation,
	bsplined,
	plot_func=lambda x, fig, ax, **kwargs: x.plot(fig=fig, ax=ax),
	temp_file_name='./temp.png',
	title='',
):
	"""Render one plot for one segmentation and return it as an image array.

	Args:
		segmentation: Segmentation id to plot.
		bsplined: Aggregated DataFrame with ``fd_smooth`` and ``color`` columns.
		plot_func: Callable invoked with the keyword arguments ``x`` (curves),
			``fig``, ``ax``, ``color_labels`` and ``color``. The default accepts
			and ignores the extra keywords; previously it rejected them with a
			TypeError.
		temp_file_name: File the figure is written to before it is read back.
		title: Text appended to the plot title.

	Returns:
		The rendered figure as read back by ``cv.imread``.
	"""
	bsplined = bsplined.query(f'segment == {segmentation}')
	fd_smooth = bsplined['fd_smooth'].tolist()[0]
	color = bsplined['color'].tolist()[0]
	# One colour per curve, applied in the order the curves are drawn.
	color_cycle = cycler(color=color)
	color_labels = [name + ' ' + str(num + 1) for num, name in enumerate(color)]
	patient, slice_id = get_patient_slice(bsplined)
	fig = plt.figure(figsize=(11, 7))
	ax = plt.axes(
		title=(
			f'patient_id={patient}, '
			f'slice={slice_id}, '
			f'segmented_t={segmentation} bspline, '
			f'{title}'
		),
		xlabel='t',
	)
	ax.set_prop_cycle(color_cycle)
	fig.subplots_adjust(hspace=0.3, wspace=0.3)
	plot_func(x=fd_smooth, fig=fig, ax=ax, color_labels=color_labels, color=color)
	plt.savefig(temp_file_name, dpi=150)
	plt.close()
	return cv.imread(temp_file_name)


def get_patient_slice(bsplined):
	"""Return the patient id and slice id of the first distinct pair in a frame.

	Args:
		bsplined: DataFrame with ``patient_id`` and ``slice_id`` columns.

	Returns:
		tuple: ``(patient_id, slice_id)`` of the first distinct combination.
	"""
	unique_pairs = bsplined[['patient_id', 'slice_id']].drop_duplicates().reset_index()
	patient, slice_id = (
		unique_pairs[column].tolist()[0] for column in ('patient_id', 'slice_id')
	)
	return patient, slice_id



In [5]:
# Classifiers trained by train_models as (factory, display name) pairs. A factory
# is used so that every call gets a fresh, unfitted estimator.
MODELS = [
	(
		lambda: KNeighborsClassifier(n_neighbors=7),  # , weights='distance'
		'K nearest neighbors'
	),
	# (
	# 	lambda: LogisticRegression(max_iter=9999999999999999, class_weight='balanced'),
	# 	'logistic regression'
	# ),
	(lambda: SVC(probability=True), 'c-support vector'),
	# (lambda: SVC(class_weight='balanced', probability=True), 'c-support vector'),
]
# Feature columns used in addition to the point columns (see prep_x_data).
OTHER_X_COLS = ['ID_order1', 'MBD_order1', 'max_values', 'max_points']
# Colour labels of the two classes.
LABEL_ORDERING = ['green', 'red']
# Models that are predicted/evaluated, as (column prefix, display name) pairs;
# the display name must match an entry of MODELS.
MODEL_NAMES = [
	# ('knn', 'K nearest neighbors'),
	# ('log_reg', 'logistic regression'),
	('svc', 'c-support vector'),
]
# Metrics computed by eval_all_metrics, as (column suffix, scoring function) pairs.
METRICS = [
	('f1', f1_score),
	('prec', precision_score),
	('rec', recall_score),
	('acc', balanced_accuracy_score),
	# ('spec', specificity),
]


def add_max_values_points(bsplined, order=1):
	"""Add the standardised peak height and peak position of every curve.

	Both features are taken from the landmark-registered derivative of the
	given order and standardised per curve set with the shared ``scaler``.

	Args:
		bsplined: DataFrame with an ``fd_smooth`` column.
		order: Derivative order.

	Returns:
		A new DataFrame with the columns ``max_values`` (curve maxima) and
		``max_points`` (index of the maximum divided by the global ``n_points``).
	"""
	def scaled_peak_heights(curves):
		heights = get_landmark_registration(curves, order).data_matrix.max(axis=1)
		return scaler.fit_transform(heights.reshape(-1, 1))

	def scaled_peak_positions(curves):
		positions = (
			get_landmark_registration(curves, order).data_matrix.argmax(axis=1)
			/
			n_points
		)
		return scaler.fit_transform(positions.reshape(-1, 1))

	return bsplined.assign(
		max_values=lambda frame: frame.fd_smooth.apply(scaled_peak_heights),
		max_points=lambda frame: frame.fd_smooth.apply(scaled_peak_positions),
	)


def get_point_values(fd_smooth, order, n_points):
	"""Sample the registered curves at evenly spaced grid indices and standardise.

	Args:
		fd_smooth: Functional data object accepted by ``get_landmark_registration``.
		order: Derivative order.
		n_points: Number of sample positions, spread from the first to the last
			grid index (both included).

	Returns:
		Array of shape ``(n_curves, n_points)`` standardised with the shared
		``scaler``.
	"""
	registered = get_landmark_registration(fd_smooth, order)
	last_index = registered.data_matrix.shape[1] - 1
	sample_indexes = np.linspace(
		start=0, stop=last_index, num=n_points, endpoint=True, dtype=int,
	)
	sampled = registered.data_matrix[:, sample_indexes, 0]
	return scaler.fit_transform(sampled)


def add_point_values(bsplined, n_points=10, order=1):
	"""Add the ``all_points`` column with sampled curve values (see get_point_values).

	Args:
		bsplined: DataFrame with an ``fd_smooth`` column; modified in place.
		n_points: Number of sample positions per curve.
		order: Derivative order.

	Returns:
		The same DataFrame.
	"""
	bsplined['all_points'] = bsplined.fd_smooth.apply(
		lambda curves: get_point_values(curves, order=order, n_points=n_points)
	)
	return bsplined


def prep_x_data(
	bsplined, point_prefix='point_', other_x_cols=OTHER_X_COLS,
	y_col='color', validation=False, fit_labels=True,
):
	"""Assemble the feature matrix and the binary labels for the classifiers.

	Args:
		bsplined: Mapping (e.g. a DataFrame row) holding the feature arrays and
			the colour labels.
		point_prefix: Every key starting with this prefix is used as a feature.
		other_x_cols: Additional feature keys, appended after the point keys.
		y_col: Key of the colour labels; 'red' becomes 1, everything else 0.
		validation: Unused.
		fit_labels: When True, the labels are passed through the shared
			``l_enc`` label encoder.

	Returns:
		tuple: ``(X, Y)`` where X is the horizontal stack of all feature arrays.
	"""
	feature_cols = [column for column in bsplined.keys() if column.startswith(point_prefix)]
	feature_cols.extend(other_x_cols)
	feature_blocks = [bsplined[name] for name in feature_cols]
	labels = []
	for label in bsplined[y_col]:
		labels.append(1 if label == 'red' else 0)
	if fit_labels:
		labels = l_enc.fit_transform(labels)
	return np.hstack(feature_blocks), labels


def train_models(
	bsplined,
	# y_col='color', point_prefix='point_', other_x_cols=OTHER_X_COLS,
	y_col='color', point_prefix='qwe', other_x_cols=OTHER_X_COLS + ['all_points'],
	models=MODELS,
):
	"""Fit every configured classifier on one row of curve data.

	The model named 'K nearest neighbors' is fitted on the landmark-registered
	first-derivative curves; all other models are fitted on the feature matrix
	built by ``prep_x_data``.

	Args:
		bsplined: Row with ``fd_smooth``, the feature columns and ``y_col``.
		y_col: Key of the colour labels.
		point_prefix: Prefix of the point feature columns.
		other_x_cols: Additional feature columns.
		models: List of (factory, display name) pairs.

	Returns:
		list: ``(fitted model, display name)`` pairs in the order of ``models``.
	"""
	X, Y = prep_x_data(bsplined, point_prefix, other_x_cols, y_col)
	fX = get_landmark_registration(bsplined['fd_smooth'], 1)
	fY = [1 if label == 'red' else 0 for label in bsplined[y_col]]
	fitted = []
	for make_model, name in models:
		if name == 'K nearest neighbors':
			estimator = make_model().fit(fX, fY)
		else:
			estimator = make_model().fit(X, Y)
		fitted.append((estimator, name))
	return fitted


def predict_model(
	bsplined, model_name='K nearest neighbors',
	# point_prefix='point_', other_x_cols=OTHER_X_COLS,
	point_prefix='qwe', other_x_cols=OTHER_X_COLS + ['all_points'],
	label_ordering=LABEL_ORDERING, thresholds=(0.5, 0.5), validation=False,
):
	"""Predict the class of every curve of one row with a previously fitted model.

	Args:
		bsplined: Row with ``models`` (list of (fitted model, name) pairs),
			``fd_smooth``, ``segment`` and the feature columns.
		model_name: Display name of the model to use.
		point_prefix: Prefix of the point feature columns.
		other_x_cols: Additional feature columns.
		label_ordering: Unused.
		thresholds: Unused.
		validation: Forwarded to ``prep_x_data``.

	Returns:
		The output of the model's ``predict``.
	"""
	candidates = [fitted for fitted, name in bsplined['models'] if name == model_name]
	model = candidates[0]
	if model_name != 'K nearest neighbors':
		X, _ = prep_x_data(
			bsplined, point_prefix, other_x_cols, validation=validation, fit_labels=False,
		)
	else:
		# The functional kNN works on the registered curves directly.
		X = get_landmark_registration(bsplined['fd_smooth'], 1)
	print(f'predicting {bsplined["segment"]}')
	return model.predict(X)


def specificity(y_true, y_pred, zero_division=0):
	"""Compute the specificity TN / (TN + FP) of a binary prediction.

	Args:
		y_true: True labels.
		y_pred: Predicted labels.
		zero_division: Value returned when there are no negative samples
			(TN + FP == 0). Previously the default of 0 disabled the guard and
			the 0 / 0 division was carried out.

	Returns:
		The specificity, or ``zero_division`` when it is undefined.
	"""
	tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
	negatives = tn + fp
	if negatives == 0:
		return zero_division
	return tn / negatives


def evaluate_model(
	bsplined, func=f1_score, model_name='K nearest neighbors',
	y_col='color', point_prefix='point_', other_x_cols=OTHER_X_COLS,
	label_ordering=LABEL_ORDERING, thresholds=(0.5, 0.5), validation=False,
):
	"""Score the stored predictions of one model against the colour labels.

	The predictions are read from the ``<prefix>_preds`` entry of the row,
	where the prefix is looked up in ``MODEL_NAMES`` by display name.

	Args:
		bsplined: Row with ``y_col`` and the prediction column.
		func: Scoring function called as ``func(y_true, y_pred)``; every
			function except ``balanced_accuracy_score`` also receives
			``zero_division=0``.
		model_name: Display name of the model.
		y_col: Key of the colour labels; 'red' becomes 1, everything else 0.
		point_prefix, other_x_cols, label_ordering, thresholds, validation:
			Unused.

	Returns:
		The value returned by ``func``.
	"""
	truth = [1 if label == 'red' else 0 for label in bsplined[y_col]]
	pred_columns = [
		col_model + '_preds' for col_model, model in MODEL_NAMES if model == model_name
	]
	preds = bsplined[pred_columns[0]]
	# balanced_accuracy_score has no zero_division parameter.
	extra = {} if func == balanced_accuracy_score else {'zero_division': 0}
	return func(truth, preds, **extra)


def eval_all_metrics(
	bsplined, models=MODEL_NAMES, metrics=METRICS,
	# y_col='color', point_prefix='point_', other_x_cols=OTHER_X_COLS,
	y_col='color', point_prefix='qwe', other_x_cols=OTHER_X_COLS + ['all_points'],
	label_ordering=LABEL_ORDERING, thresholds=(0.5, 0.5), validation=False,
):
	"""Predict with every model and add one column per (model, metric) pair.

	The frame is modified in place: ``<model>_preds`` columns are written
	first, then ``<model>_<metric>`` columns (with a ``_val`` suffix when
	``validation`` is True).

	Args:
		bsplined: DataFrame with one row per segmentation.
		models: List of (column prefix, display name) pairs.
		metrics: List of (column suffix, scoring function) pairs.
		y_col, point_prefix, other_x_cols, label_ordering, thresholds,
			validation: Forwarded to ``predict_model`` / ``evaluate_model``.
	"""
	for col_model, model in models:
		def predict_row(row, model=model):
			return predict_model(
				row, model, point_prefix, other_x_cols, label_ordering, thresholds,
			)

		bsplined[col_model + '_preds'] = bsplined.apply(predict_row, axis='columns')

	suff = '_val' if validation else ''
	for (col_model, model), (col_metric, metric) in itertools.product(models, metrics):
		print(f'{col_model} - {col_metric}')

		def score_row(row, model=model, metric=metric):
			return evaluate_model(
				row, metric, model,
				y_col, point_prefix, other_x_cols,
				label_ordering, thresholds, validation
			)

		bsplined[col_model + '_' + col_metric + suff] = bsplined.apply(
			score_row, axis='columns',
		)


def predict_all(
	bsplined, models=MODEL_NAMES,
	point_prefix='point_', other_x_cols=OTHER_X_COLS,
	label_ordering=LABEL_ORDERING, thresholds=(0.5, 0.5),
):
	"""Add a ``<model>_preds`` column for every model (frame modified in place).

	Args:
		bsplined: DataFrame with one row per segmentation.
		models: List of (column prefix, display name) pairs.
		point_prefix, other_x_cols, label_ordering, thresholds: Forwarded to
			``predict_model``.
	"""
	for col_model, model in models:
		def predict_row(row, model=model):
			return predict_model(
				row, model, point_prefix, other_x_cols, label_ordering, thresholds,
			)

		bsplined[col_model + '_preds'] = bsplined.apply(predict_row, axis='columns')



In [12]:


def plot_rated_with_depth(x, fig, ax, func=ID, order=1):
	"""Plot trimmed derivative curves coloured with the 'cool' colormap by depth.

	Args:
		x: Functional data object accepted by ``cut_ends``.
		fig: Figure to draw on.
		ax: Axes to draw on.
		func: Depth function applied to the trimmed curves.
		order: Derivative order.
	"""
	x = cut_ends(x, order)
	id_val = func(x)
	# Rescale the depths to [0.1, 1]; this does not change their ordering.
	id_val = (1 - 0.1) * ((id_val - np.min(id_val)) / (np.max(id_val) - np.min(id_val))) + 0.1
	# The module-level import of `cool` is commented out, so the colormap is
	# looked up through pyplot instead.
	colors = plt.get_cmap('cool')(np.linspace(0, 1, len(id_val)))
	colors = [colors[index] for index in np.argsort(id_val)]
	color_cycle = cycler(color=colors)
	ax.set_prop_cycle(color_cycle)
	x.plot(fig=fig, ax=ax)


def plot_landmark_registration_box_plots(bsplined, color, order=1):
	"""Compare integrated depths before and after landmark registration.

	Draws a box plot with an overlaid swarm plot on the current axes, one box
	for the trimmed derivative and one for its registered version.

	Args:
		bsplined: Functional data object.
		color: List with one colour name per curve (keys of ``palette``).
		order: Derivative order.
	"""
	raw_depths = ID(cut_ends(bsplined, order))
	registered_depths = ID(get_landmark_registration(bsplined, order))
	graph_types = (
		['derivative 1'] * len(raw_depths)
		+ ['registered derivative 1'] * len(registered_depths)
	)
	plot_data = pd.DataFrame({
		'graph_type': graph_types,
		'ID': np.concatenate([raw_depths, registered_depths]),
		# The same curves appear in both groups, so the colours are repeated.
		'color': color * 2,
	})
	sns.boxplot(x='graph_type', y='ID', data=plot_data, color='orange')
	sns.swarmplot(x='graph_type', y='ID', data=plot_data, hue='color', palette=palette)


def plot_corr_matrix(bsplined, color_labels, order=1):
	"""Draw a heat map comparing the peak heights of non-green curves with all curves.

	The similarity of two curves is ``min(peak_a, peak_b) / max(peak_a, peak_b)``.
	The "uncertainty interval" (UI) spans from the smallest red-red similarity
	to the largest red-green similarity. Cells are recoded to 2 (above the UI),
	0 (below the UI) or 1 (inside the UI) before plotting.

	Args:
		bsplined: Functional data object accepted by ``cut_ends``.
		color_labels: One label per curve containing its colour name
			(e.g. 'red 1', 'green 2').
		order: Derivative order.
	"""
	max_values = cut_ends(bsplined, order).data_matrix.max(axis=1).reshape(-1)
	correlation_matrix = [[min(x, y) / max(x, y) for y in max_values] for x in max_values]
	# np.char is the public home of the string helpers (np.core is deprecated).
	red_labels = np.flatnonzero(np.char.find(color_labels, 'red') != -1)
	green_labels = np.flatnonzero(np.char.find(color_labels, 'green') != -1)
	non_green_labels = np.setdiff1d(range(0, len(correlation_matrix)), green_labels)
	non_green_all_correlations = np.array(
		[
			row
			for row_index, row in enumerate(correlation_matrix)
			if row_index in non_green_labels
		]
	)
	red_green_correlations = [
		[value for col_index, value in enumerate(row) if col_index in green_labels]
		for row_index, row in enumerate(correlation_matrix) if row_index in red_labels
	]
	red_red_correlations = [
		[value for col_index, value in enumerate(row) if col_index in red_labels]
		for row_index, row in enumerate(correlation_matrix) if row_index in red_labels
	]
	intersection_interval = (np.min(red_red_correlations), np.max(red_green_correlations))
	non_green_all_correlations[
		non_green_all_correlations > max(intersection_interval)
	] = 2
	non_green_all_correlations[
		non_green_all_correlations < min(intersection_interval)
	] = 0
	# Whatever is still fractional lies inside the interval.
	non_green_all_correlations[
		(non_green_all_correlations + 1) % 1 != 0
	] = 1

	sns.heatmap(
		non_green_all_correlations,
		xticklabels=color_labels,
		yticklabels=[
			label for index, label in enumerate(color_labels) if index in non_green_labels
		],
		ax=plt.axes(
			title=(
				'white - above UI, '
				'black - below UI, '
				'red - in UI, '
				f' uncertainty interval (UI)={intersection_interval}'
			)
		),
	)

	
def plot_func_boxplot(fgrid, fig, ax):
	"""Draw a functional boxplot (modified band depth, factor 1) of the curves.

	Args:
		fgrid: Curves to plot.
		fig: Figure to draw on.
		ax: Axes that is added to ``fig`` before plotting.
	"""
	fig.add_axes(ax)
	functional_boxplot = Boxplot(fgrid, depth_method=MBD, fig=fig, factor=1)
	# Draw outlying curves in full.
	functional_boxplot.show_full_outliers = True
	functional_boxplot.plot()



In [13]:
def plot_metric(
	bsplined, value_vars=['log_reg_acc', 'knn_acc', 'svc_acc'], y='balanced accuracy',
	model_name_rm='_acc', temp_file_name='./temp.png',
):
	"""Plot one metric per segmentation for several models and return the image.

	Args:
		bsplined: DataFrame with a ``segment`` column and the metric columns.
		value_vars: Metric columns to plot, one per model.
		y: Axis label and name of the melted value column. Any value other than
			'balanced accuracy' is treated as a per-class metric.
		model_name_rm: Text removed from the column names to obtain model names.
		temp_file_name: File the figure is written to before it is read back.

	Returns:
		The rendered figure as read back by ``cv.imread``.

	NOTE(review): the title reads the name ``patient``, which is not defined in
	this function and therefore has to exist as a notebook global.
	"""
	# patient, slice_id = get_patient_slice(bsplined)
	# Long format: one row per (segment, model); list-valued metrics are
	# expanded to one row per element by explode.
	bsplined = (
		bsplined
			.melt(
				id_vars='segment',
				value_vars=value_vars,
				var_name='model_name',
				value_name=y,
			)
			.assign(model_name=lambda x: x.model_name.str.replace(model_name_rm, ''))
			.explode(y)
			.reset_index(drop=True)
	)

	spec_palette = None
	hue = None
	if y != 'balanced accuracy':
		hue = 'model_name_label'
		# The class labels are attached purely by row position, repeating
		# LABEL_ORDERING over the exploded rows.
		bsplined['model_name_label'] = (
			bsplined.model_name.str.replace(model_name_rm, '') + ' '
				+ LABEL_ORDERING * int(bsplined.shape[0] / len(LABEL_ORDERING))
		)
		spec_palette = {
			model.replace(model_name_rm, '') + ' ' + label: palette[label]
			for model, label in itertools.product(value_vars, LABEL_ORDERING)
		}

	fig = plt.figure(figsize=(11, 7))
	fig.subplots_adjust(hspace=0.3, wspace=0.3)
	ax = plt.axes(
		# title=f'patient_id={patient}, slice={slice_id}', xlabel='SLIC\'ed in t',
		title=f'patient_id={patient}', xlabel='SLIC\'ed in t',
	)
	sns.lineplot(
		x='segment', y=y, hue=hue, style='model_name', markers=True,
		data=bsplined, palette=spec_palette, ax=ax,
	)

	plt.tight_layout()
	plt.savefig(temp_file_name, dpi=150)
	plt.close()
	return cv.imread(temp_file_name)


def plot_all_metrics(bsplined, suff, col_suff=''):
	"""Save a 2x2 overview image: accuracy and F1 on top, precision and recall below.

	The output file name uses the names ``patient`` and ``n_segments``, which
	are not parameters and have to exist as notebook globals.

	Args:
		bsplined: DataFrame with the metric columns of all three models.
		suff: Suffix of the output file name.
		col_suff: Suffix of the metric columns (e.g. '_val').
	"""
	def metric_columns(metric):
		return [
			prefix + '_' + metric + col_suff for prefix in ('log_reg', 'knn', 'svc')
		]

	top_plots = [
		plot_metric(bsplined, metric_columns('acc'), model_name_rm='_acc' + col_suff),
		plot_metric(bsplined, metric_columns('f1'), 'f1', '_f1' + col_suff),
	]
	bottom_plots = [
		plot_metric(bsplined, metric_columns('prec'), 'precision', '_prec' + col_suff),
		plot_metric(bsplined, metric_columns('rec'), 'recall', '_rec' + col_suff),
	]
	final_plot = np.concatenate(
		[np.concatenate(top_plots, axis=1), np.concatenate(bottom_plots, axis=1)],
		axis=0,
	)
	cv.imwrite(
		f'./plots_v4/modeling_metrics-{patient}-{n_segments}-{suff}.png',
		final_plot,
	)


def experiment(bsplined, thresholds, other_x_cols=OTHER_X_COLS + ['all_points']):
	"""Evaluate all models once per threshold and collect the results.

	The combined results are also written to ``./overall_metrics.csv``.
	The plotting code that used to follow the ``return`` statement could never
	run and has been removed; ``create_metric_plot`` can still be called on the
	returned frame.

	Args:
		bsplined: DataFrame with fitted models and features; prediction and
			metric columns are added to it in place.
		thresholds: Iterable of threshold settings, forwarded to
			``eval_all_metrics`` and stored in the ``threshold`` column.
		other_x_cols: Feature columns forwarded to ``eval_all_metrics``.

	Returns:
		DataFrame with one copy of ``bsplined`` per threshold.
	"""
	overall_metrics = []
	for threshold in thresholds:
		print(threshold)
		eval_all_metrics(bsplined, thresholds=threshold, other_x_cols=other_x_cols)
		overall_metrics.append(
			bsplined.copy().assign(threshold=[threshold] * bsplined.shape[0])
		)
	overall_metrics = pd.concat(overall_metrics)
	overall_metrics.to_csv('./overall_metrics.csv', index=False)
	return overall_metrics


def predict_experiment(bsplined, thresholds):
	"""Run ``predict_all`` once per threshold and stack the resulting frames.

	Args:
		bsplined: DataFrame with fitted models; prediction columns are added to
			it in place.
		thresholds: Iterable of threshold settings.

	Returns:
		DataFrame with one copy of ``bsplined`` per threshold, labelled in the
		``threshold`` column.
	"""
	snapshots = []
	for threshold in thresholds:
		print(threshold)
		predict_all(bsplined, thresholds=threshold)
		snapshot = bsplined.copy().assign(threshold=[threshold] * bsplined.shape[0])
		snapshots.append(snapshot)
	return pd.concat(snapshots)


def create_metric_plot(
	segmentation,
	metrics,
	models=['knn', 'log_reg', 'svc'],
	metric='acc',
	temp_file_name='./temp.png',
):
	"""Box plot of one metric per model for one segmentation, returned as an image.

	The title uses the name ``patient``, which is not a parameter and has to
	exist as a notebook global.

	Args:
		segmentation: Segmentation id to plot.
		metrics: DataFrame with a ``segment`` column and ``<model>_<metric>``
			columns.
		models: Column prefixes of the models to compare.
		metric: Column suffix of the metric; also used as x-axis label.
		temp_file_name: File the figure is written to before it is read back.

	Returns:
		The rendered figure as read back by ``cv.imread``.
	"""
	metrics = metrics.query(f'segment == {segmentation}')
	fig = plt.figure(figsize=(11, 7))
	ax = plt.axes(
		title=f'patient_id={patient}, segmented_t={segmentation} bspline, ',
		xlabel=metric,
	)
	fig.subplots_adjust(hspace=0.3, wspace=0.3)

	metric_columns = [model + '_' + metric for model in models]
	long_form = metrics.melt(
		value_vars=metric_columns,
		var_name='model_name',
		value_name='metric',
	)
	long_form = long_form.assign(
		# Per-class scores are averaged into one number per row.
		metric=lambda x: x.metric.apply(np.mean),
		model_name=lambda x: x.model_name.str.replace('_' + metric, ''),
	)
	sns.boxplot(x='model_name', y='metric', ax=ax, data=long_form)
	plt.savefig(temp_file_name, dpi=150)
	plt.close()
	return cv.imread(temp_file_name)



In [14]:
def boxplot_outlier_classifier(fdata, factor=1):
	"""Label curves 'red' when the functional boxplot flags them as outliers.

	Args:
		fdata: Functional data object accepted by ``cut_ends``.
		factor: Factor passed to the functional boxplot.

	Returns:
		list[str]: 'red' for outliers and 'green' otherwise, one per curve.
	"""
	fgrid = cut_ends(fdata, 1)
	outlier_flags = Boxplot(fgrid, depth_method=MBD, factor=factor).outliers
	labels = []
	for is_outlier in outlier_flags:
		labels.append('red' if is_outlier else 'green')
	return labels


def evaluate_boxplot(bsplined, func=f1_score, y_col='color'):
	"""Score the boxplot predictions of all rows against the colour labels.

	Args:
		bsplined: DataFrame whose ``preds`` and ``y_col`` cells hold lists of
			labels; the lists of all rows are concatenated.
		func: Scoring function. ``balanced_accuracy_score`` is called with the
			labels only; every other function is asked for per-class scores in
			the order of ``LABEL_ORDERING``.
		y_col: Column with the true colour labels.

	Returns:
		The value returned by ``func``.
	"""
	preds = np.concatenate(bsplined.preds.to_list())
	truth = np.concatenate(bsplined[y_col].to_list())
	if func == balanced_accuracy_score:
		options = {}
	else:
		options = {'average': None, 'labels': LABEL_ORDERING, 'zero_division': 0}
	return func(truth, preds, **options)


def eval_all_boxplot_metrics(bsplined, metrics=METRICS, y_col='color'):
	"""Compute every metric in `metrics` for each distinct (patient_id, segment).

	Args:
		bsplined: DataFrame with `patient_id`, `segment`, `preds` and `y_col`.
		metrics: iterable of (column name, metric function) pairs.
		y_col: column holding the reference labels.

	Returns:
		DataFrame with one row per distinct (patient_id, segment) pair and one
		column per metric. Rows are selected by `segment` only.
	"""
	metric_results = bsplined[['patient_id', 'segment']].drop_duplicates()
	for col_metric, metric_func in metrics:

		def score_segment(segment_id):
			rows_of_segment = bsplined.query(f'segment == {segment_id}')
			return evaluate_boxplot(rows_of_segment, metric_func, y_col)

		metric_results[col_metric] = metric_results.segment.apply(score_segment)
	return metric_results


def experiment_boxplots(bsplined, factors, suffix=''):
	"""Evaluate the boxplot outlier classifier for several `factor` values.

	For every factor the predictions are written to `bsplined['preds']`
	(and kept as `preds_<factor>`), scored with `eval_all_boxplot_metrics`,
	and tagged with the factor. `bsplined` is modified in place.

	Side effects:
		Writes `./boxplot_metrics{suffix}.csv` (all metrics) and
		`./boxplot_preds{suffix}.csv` (per-curve predictions, exploded).
	"""
	per_factor_metrics = []
	for factor in factors:
		print(factor)
		bsplined['preds'] = bsplined.fd_smooth.apply(
			boxplot_outlier_classifier, factor=factor,
		)
		factor_metrics = eval_all_boxplot_metrics(bsplined)
		per_factor_metrics.append(
			factor_metrics.assign(factor=[factor] * factor_metrics.shape[0])
		)
		bsplined['preds_' + str(factor)] = bsplined['preds']

	pd.concat(per_factor_metrics).to_csv(f'./boxplot_metrics{suffix}.csv', index=False)

	# List-valued columns are exploded together so every output row is one curve.
	exploded_columns = ['preds_' + str(factor) for factor in factors]
	exploded_columns = exploded_columns + ['color', 'segment_region']
	flat_predictions = (
		bsplined
			.drop(['fd_smooth', 'preds'], axis='columns')
			.explode(exploded_columns)
	)
	flat_predictions.to_csv(f'./boxplot_preds{suffix}.csv', index=False)


### Plot functions

In [None]:
def create_row(
	patient, slice_no, n_segments, segmentation, segment_means, path_segments,
	pallete=None, temp_file_name='./temp.png',
	differentials=False, differentials_lvl2=False, countour=True, filled=True,
	group_lineplots=False,
):
	"""Build one horizontal strip of images for a single segmentation.

	The strip always starts with a line plot of `mask_int_mean` over
	`cycle_id`, optionally followed by plots of the `differential` and
	`differential_lvl2` columns and by the stored `filled` / `contour`
	segment images.

	Args:
		patient, slice_no, n_segments: used for titles and image paths.
		segmentation: value matched against the `segment` column.
		segment_means: DataFrame with the columns plotted above plus `img_type`.
		path_segments: root folder of the stored segment images.
		pallete: seaborn palette; replaced when `group_lineplots` is set.
		temp_file_name: scratch file every plot is saved to and read back from.
		differentials, differentials_lvl2: add the respective line plots.
		countour, filled: add the stored contour / filled images.
		group_lineplots: colour each `img_type` by its first matching flag
			(confirmed cancer: red, unconfirmed: orange, benign: green, else grey).

	Returns:
		The panels concatenated along axis 1.

	Raises:
		FileNotFoundError: a requested segment image could not be read.
	"""
	order = None
	plot_data = segment_means.query(f'segment == {segmentation}')
	if group_lineplots:
		# Ordered rules: the first condition that holds decides the colour.
		groups = [
			(lambda x: x['confirmed_cancer'], 'red'),
			(lambda x: x['unconfirmed_cancer'], 'orange'),
			(lambda x: x['benign'], 'green'),
			(lambda x: True, 'grey'),
		]
		order = plot_data[
			['img_type', 'confirmed_cancer', 'unconfirmed_cancer', 'benign']
		].drop_duplicates()
		pallete = order.apply(
			lambda x: next(color for condition, color in groups if condition(x)),
			axis='columns',
		).tolist()
		order = order['img_type'].tolist()
	print(f'processing {segmentation} ...')

	def render_lineplot(y_column, title_suffix, primary=False):
		"""Plot `y_column` over cycle_id, save it and return the image."""
		fig = plt.figure(figsize=(11, 7))
		fig.subplots_adjust(hspace=0.3, wspace=0.3)
		extra = {}
		if primary:
			# Only the first panel gets explicit axes and hides its legend.
			extra = dict(
				legend=False,
				ax=plt.axes(ylabel='intensity', xlabel='t'),
			)
		sns.lineplot(
			data=plot_data,
			x="cycle_id",
			y=y_column,
			hue="img_type",
			palette=pallete,
			hue_order=order,
			**extra,
		).set_title(f'patient_id={patient}, segmented_t={segmentation} {title_suffix}')
		plt.savefig(temp_file_name, dpi=150)
		plt.close()
		return cv.imread(temp_file_name)

	def load_segment_image(kind):
		"""Read the stored `kind` image and resize it to the panel size."""
		image_path = f'{path_segments}{patient}-{n_segments}/{kind}/{segmentation}-{slice_no}.png'
		image = cv.imread(image_path)
		if image is None:
			# cv.imread signals failure by returning None.
			raise FileNotFoundError(f'could not read segment image: {image_path}')
		return cv.resize(image, (1575, 1050))

	panels = [render_lineplot('mask_int_mean', 'bspline', primary=True)]
	if differentials:
		panels.append(render_lineplot('differential', 'differentials'))
	if differentials_lvl2:
		panels.append(render_lineplot('differential_lvl2', 'differentials lvl2'))
	if filled:
		panels.append(load_segment_image('filled'))
	if countour:
		panels.append(load_segment_image('contour'))
	return np.concatenate(panels, axis=1)
	

def print_graph_with_segments(
	patient, slice_no, n_segments, path_segments, page_size,
	segments=None, segmentations=None, pallete=None, temp_file_name='./temp.png',
	differentials=False, differentials_lvl2=False, countour=True, filled=True,
	group_lineplots=False,
):
	"""Render one `create_row` strip per segmentation and save them in pages.

	Data comes from `./segmented-{patient}-{slice_no}-{n_segments}-processed.csv`.
	`segments` (matched against `img_type`) and `segmentations` (matched
	against `segment`) act as optional filters; None or empty keeps everything.
	Every `page_size` strips are stacked vertically and written to
	`./plots_v2/segmented-{patient}-{n_segments}-{page_no}.png`.
	"""
	if segments and len(segments) != 0 and pallete:
		# Keep only the palette entries of the selected (1-based) segments.
		pallete = [pallete[segment - 1] for segment in segments]

	source_csv = f'./segmented-{patient}-{slice_no}-{n_segments}-processed.csv'
	segment_means = pd.read_csv(source_csv).query(f'patient_id == {patient}')

	def is_selected_img_type(img_type):
		return segments is None or len(segments) == 0 or img_type in segments

	segment_means = segment_means[segment_means['img_type'].apply(is_selected_img_type)]

	segmentation_ids = []
	for segmentation_id in segment_means.segment.drop_duplicates().tolist():
		if segmentations is None or len(segmentations) == 0 or segmentation_id in segmentations:
			segmentation_ids.append(segmentation_id)
	print(f'total segmentations = {len(segmentation_ids)}')

	rows = []
	for segmentation in segmentation_ids:
		rows.append(
			create_row(
				patient, slice_no, n_segments, segmentation, segment_means, path_segments,
				pallete, temp_file_name,
				differentials, differentials_lvl2, countour, filled,
				group_lineplots,
			)
		)

	n_pages = math.ceil(len(rows) / page_size)
	for page_no in range(n_pages):
		first_row = page_no * page_size
		final_image = np.concatenate(rows[first_row:first_row + page_size], axis=0)
		cv.imwrite(f'./plots_v2/segmented-{patient}-{n_segments}-{page_no}.png', final_image)
		

def print_graph(patient, slice_no, n_segments, cycles=None, pallete=None):
	"""Plot `mask_int_mean` over `cycle_id` for every segmentation in a grid.

	Data comes from `./segmented-{patient}-{slice_no}-{n_segments}.csv`. The
	grid has at most three columns; the figure is saved to
	`./plots/segmented-{patient}-{n_segments}.png`.

	Args:
		patient, slice_no, n_segments: select the CSV and name the output.
		cycles: optional filter on `img_type`; None or empty keeps everything.
		pallete: optional seaborn palette, indexed 1-based by `cycles`.

	Raises:
		ValueError: nothing is left to plot after filtering.
	"""
	if cycles and len(cycles) != 0 and pallete:
		pallete = [pallete[cycle - 1] for cycle in cycles]

	segment_means = (
		pd.read_csv(f'./segmented-{patient}-{slice_no}-{n_segments}.csv')
			.query(f'patient_id == {patient}')
	)
	segment_means = segment_means[
		segment_means['img_type'].apply(lambda x: cycles is None or len(cycles) == 0 or x in cycles)
	]
	segmentation_ids = segment_means.segment.drop_duplicates().tolist()
	if len(segmentation_ids) == 0:
		raise ValueError(
			f'no segmentations to plot for patient {patient}, slice {slice_no}'
		)

	max_columns = 3
	x_axis = min(max_columns, len(segmentation_ids))
	y_axis = math.ceil(len(segmentation_ids) / max_columns)

	# squeeze=False keeps `axs` two-dimensional even for a single row or a
	# single plot, so the [row][column] indexing below is always valid.
	fig, axs = plt.subplots(y_axis, x_axis, figsize=(40, y_axis * 11), squeeze=False)
	fig.subplots_adjust(hspace=0.3, wspace=0.3)

	for position, segmentation in enumerate(segmentation_ids):
		y_position, x_position = divmod(position, max_columns)
		sns.lineplot(
			data=segment_means.query(f'segment == {segmentation}'),
			x="cycle_id",
			y="mask_int_mean",
			hue="img_type",
			palette=pallete,
			ax=axs[y_position][x_position],
		).set_title(f'patient_id={patient}, segmented_t={segmentation} int_mean')
	plt.savefig(f'./plots/segmented-{patient}-{n_segments}.png')



## Check patients

In [None]:
# Walk every file below ./data and print the path of each file for which
# cv.imread (with IMREAD_ANYDEPTH) returns None.
for path, subdirs, files in os.walk('./data'):
	for name in files:
		image = cv.imread(os.path.join(path, name), cv.IMREAD_ANYDEPTH)
		if image is None:
			print(os.path.join(path, name))

In [None]:
def auth():
	"""Open an FTPS session positioned in the image root directory.

	Connection settings are read from the environment so that no secret
	lives in the notebook:

	- FTP_HOST (defaults to the original server address)
	- FTP_USER (defaults to the original account name)
	- FTP_PASSWORD (if unset, the password is asked for interactively)

	Returns:
		A logged-in `ftplib.FTP_TLS` object with a protected data channel.

	NOTE(review): the password previously committed here should be treated
	as leaked and rotated.
	"""
	# Imported locally: the notebook's top-level ftplib import is commented out.
	import getpass
	from ftplib import FTP_TLS

	host = os.environ.get('FTP_HOST', '158.129.140.191')
	user = os.environ.get('FTP_USER', 'a.vaitulevicius')
	password = os.environ.get('FTP_PASSWORD')
	if not password:
		password = getpass.getpass(f'FTP password for {user}@{host}: ')

	ftp = FTP_TLS(host)
	ftp.login(user, password)
	ftp.cwd('/NVIdata/Mpmrt Gb/images/full')
	# Switch the data connection to TLS before any listing or transfer.
	ftp.prot_p()
	return ftp


def quit_auth(ftp, verbal=True):
	"""Close `ftp` via `quit()`, never letting a failure propagate.

	Any exception raised by `quit()` is swallowed; it is printed only when
	`verbal` is true.
	"""
	try:
		ftp.quit()
	except Exception as error:
		if not verbal:
			return
		print(error)


def save_file(slice_file, directory, content, folder):
	"""Write `content` to `./data/{directory}/{folder}/{slice_file}`.

	Args:
		slice_file: file name to create (overwritten if it exists).
		directory: first-level folder under ./data (the patient folder).
		content: bytes to write.
		folder: second-level folder under `directory`.

	Missing folders, including ./data itself, are created. The target path is
	built from the `directory` argument; the previous version used the
	notebook-level variable `patient` for the file path instead.
	"""
	target_dir = f'./data/{directory}/{folder}'
	os.makedirs(target_dir, exist_ok=True)
	with open(f'{target_dir}/{slice_file}', 'wb') as f:
		f.write(content)


def get_filenames(patient, ftp, folder=None, ret=5):
	"""List the entries of a patient folder (or one of its sub-folders).

	Args:
		patient: patient folder name, relative to the FTP working directory.
		ftp: open FTP connection providing `mlsd` and `quit`.
		folder: optional sub-folder of the patient folder.
		ret: remaining attempts.

	Returns:
		Entry names reported by `ftp.mlsd`, without '.' and '..'.

	Raises:
		RuntimeError: no attempts are left. (The previous version tried to
			raise a plain string, which is itself a TypeError in Python 3.)

	On any error the connection is closed, a new one is opened with `auth()`
	and the listing is retried. NOTE(review): the new connection is local to
	this call; the caller keeps its original `ftp` object.
	"""
	if ret == 0:
		try:
			ftp.quit()
		except Exception as e:
			# The connection may already be dead; still report the real problem.
			print(e)
		raise RuntimeError('Exceded maximum retries')
	path = f'/{patient}/'
	if folder:
		path = f'/{patient}/{folder}/'
	try:
		return [
			name
			for name, _ in ftp.mlsd(f'.{path}')
			if name not in ['.', '..']
		]
	except Exception as e:
		print(e)
		quit_auth(ftp)
		ftp = auth()
		return get_filenames(patient, ftp, folder, ret=ret-1)


def get_slice(patient, folder, slice_file, ftp, ret=5):
	"""Download one file and store it under ./data via `save_file`.

	Args:
		patient: patient folder on the server (also the local folder name).
		folder: sub-folder of the patient folder.
		slice_file: file name to download.
		ftp: open FTP connection providing `retrbinary` and `quit`.
		ret: remaining attempts.

	Raises:
		RuntimeError: no attempts are left. (The previous version tried to
			raise a plain string, which is itself a TypeError in Python 3.)

	`retrbinary` invokes its callback once per received block, so the blocks
	are collected first and written in a single `save_file` call; writing
	from the callback would truncate the file on every block and keep only
	the last one. On any error the connection is replaced via `auth()` and
	the download is retried from scratch.
	"""
	if ret == 0:
		try:
			ftp.quit()
		except Exception as e:
			# The connection may already be dead; still report the real problem.
			print(e)
		raise RuntimeError('Exceded maximum retries')
	chunks = []
	try:
		ftp.retrbinary(
			f'RETR ./{patient}/{folder}/{slice_file}',
			chunks.append,
		)
		save_file(slice_file, patient, b''.join(chunks), folder)
	except Exception:
		quit_auth(ftp, False)
		ftp = auth()
		return get_slice(patient, folder, slice_file, ftp, ret=ret-1)


# Download driver: for every patient folder on the server, fetch all files of
# the folders whose name starts with 'contrast-t' or ends with 'Mask'.
ftp = auth()
	
# Top-level entries of the FTP working directory are treated as patients.
patients = [name for name, _ in ftp.mlsd() if name not in ['.', '..']]
for patient in patients:
	print(patient)
	folders = [
		folder
		for folder in get_filenames(patient, ftp)
		if folder.startswith('contrast-t') or folder.endswith('Mask')
	]
	for folder in folders:
		slices = get_filenames(patient, ftp, folder)
		# Only folders that actually contain files are reported.
		if len(slices) != 0:
			print(folder)
		for slice_file in slices:
			get_slice(patient, folder, slice_file, ftp)
		
# NOTE(review): the helpers re-authenticate on a local variable, so this `ftp`
# may already be closed here if any retry happened -- confirm.
ftp.quit()

In [None]:
def inspect_slice(patient, image, biopsies):
	"""Compare the biopsy mask of one slice with its cancer mask.

	Args:
		patient: patient folder name (converted with int() for the cancer-mask path).
		image: slice file name, identical in both mask folders.
		biopsies: DataFrame with `biopsy_id` and `type` columns; the type
			values carry a leading space (' Malignant', ' Benign').

	Returns:
		A 10-tuple: (patient, image,
		all malignant biopsy pixels lie inside the cancer mask,
		at least one malignant biopsy pixel lies inside the cancer mask,
		no non-malignant biopsy pixel lies inside the cancer mask,
		no benign biopsy pixel lies inside the cancer mask,
		the slice has a malignant biopsy,
		the slice has any biopsy,
		the cancer mask is non-empty,
		the two masks have different shapes).
		When the shapes differ, all flags except the last one are None.
	"""
	biopsy_mask = cv.imread(f'{path_original}{patient}/biopsyMask/{image}', cv.IMREAD_ANYDEPTH)
	cancer_mask = cv.imread(f'{path_maskC}{int(patient)}/{image}', cv.IMREAD_GRAYSCALE)
	if biopsy_mask.shape != cancer_mask.shape:
		return (patient, image, None, None, None, None, None, None, None, True)

	def binary_mask_without(excluded_query):
		"""Binary mask of all biopsies except those matching `excluded_query`."""
		mask = biopsy_mask.copy()
		for biopsy in biopsies.query(excluded_query)['biopsy_id'].to_list():
			mask[mask == biopsy] = 0
		mask[mask != 0] = 1
		return mask

	malignant_biopsies = binary_mask_without("type != ' Malignant'")
	# Fixed: the benign mask used `!= biopsy` here, which wiped every pixel
	# except the last excluded biopsy instead of removing the excluded ones.
	benign_biopsies = binary_mask_without("type != ' Benign'")
	non_malignant_biopsies = binary_mask_without("type == ' Malignant'")

	# The cancer mask is scaled by 1/255, so intersections are 0/1 valued.
	malignant_diff = malignant_biopsies - (cancer_mask * malignant_biopsies / 255)
	malignant_intersection = cancer_mask * malignant_biopsies / 255
	non_malignant_intersection = cancer_mask * non_malignant_biopsies / 255
	benign_intersection = cancer_mask * benign_biopsies / 255
	return (
		patient, image,
		list(np.unique(malignant_diff.reshape(-1))) == [0],
		list(np.unique(malignant_intersection.reshape(-1))) != [0],
		list(np.unique(non_malignant_intersection.reshape(-1))) == [0],
		list(np.unique(benign_intersection.reshape(-1))) == [0],
		list(np.unique(malignant_biopsies.reshape(-1))) != [0],
		list(np.unique(biopsy_mask.reshape(-1))) != [0],
		list(np.unique(cancer_mask.reshape(-1))) != [0],
		False,
	)


# Build ./report.csv: one row of mask-consistency flags per biopsy slice of
# every patient folder under ./data (patient '050' is skipped).
patient_slices = []
for patient in os.listdir('./data'):
	if patient == '050':
		continue
	# os.listdir returns names in arbitrary order, so the two listings are
	# sorted before they are compared.
	biopsy_slices = sorted(os.listdir(f'{path_original}{patient}/biopsyMask/'))
	cancer_slices = sorted(os.listdir(f'{path_maskC}{int(patient)}/'))
	if biopsy_slices != cancer_slices:
		print(f'patient {patient}: n biopsy slices != n cancer slices')
		continue
	if not os.path.isfile(f'./biopsies/{patient}.csv'):
		print(f'patient {patient}: does not have biopsies csv file')
		continue
	biopsies = pd.read_csv(
		f'./biopsies/{patient}.csv',
		header=None,
		names=['biopsy_id', 'type', 'a', 'b'],
	)
	patient_slices += [
		inspect_slice(patient, image, biopsies)
		for image in biopsy_slices
	]
pd.DataFrame(
	patient_slices,
	columns=[
		'patient', 'slice',
		'all_malignants_are_marked', 'at_least_1_malignant_is_marked',
		'all_other_in_unmarked', 'all_benign_in_unmarked',
		'has_malignant', 'at_least_1_biopsy', 'is_marked', 'differing_shapes'
	]
).to_csv('./report.csv', index=False)
	

In [None]:
# Load the slice report, keep slices with matching mask shapes and at least
# one biopsy, and flag each slice as usable (`use_it`).
report = (
	pd.read_csv('./report.csv')
		.query('not differing_shapes and at_least_1_biopsy')
		.assign(
			use_it=lambda x: x.apply(
				# Usable when: a marked slice has all (and at least one) of its
				# malignant biopsies inside the marking, and no benign biopsy
				# falls inside the marking. Unmarked slices pass the first two.
				lambda y: (
					(y['all_malignants_are_marked'] or not y['is_marked'])
					and (y['at_least_1_malignant_is_marked'] or not y['is_marked'])
					and y['all_benign_in_unmarked']
				),
				axis='columns',
			)
		)
)

# Displayed result: the patients for which every remaining slice is usable.
(
	report.groupby('patient')['use_it'].all().reset_index().query('use_it')['patient']
		# .to_csv('./good_bad_patients.csv', index=False)
)


In [None]:
def mark_biopsies_and_cancer(patient, image, biopsies):
	"""Save the cancer mask of one slice with biopsy outlines drawn on it.

	Outlines are drawn with `mark_boundaries` in three colours:
	(1, 0, 0) for malignant, (0, 1, 0) for benign and (0, 0, 1) for
	undetermined biopsies. The result is written to
	`./inspect_single_patient/{patient}/{image}` via `save_image`.

	Args:
		patient: patient folder name (converted with int() for the cancer-mask path).
		image: slice file name, identical in both mask folders.
		biopsies: DataFrame with `biopsy_id` and `type` columns; the type
			values carry a leading space (' Malignant', ' Benign', ' Undetermined').
	"""
	biopsy_mask = cv.imread(f'{path_original}{patient}/biopsyMask/{image}', cv.IMREAD_ANYDEPTH)
	cancer_mask = cv.imread(f'{path_maskC}{int(patient)}/{image}', cv.IMREAD_GRAYSCALE)

	def binary_mask_without(excluded_query):
		"""Binary mask of all biopsies except those matching `excluded_query`."""
		mask = biopsy_mask.copy()
		for biopsy in biopsies.query(excluded_query)['biopsy_id'].to_list():
			mask[mask == biopsy] = 0
		mask[mask != 0] = 1
		return mask

	malignant_biopsies = binary_mask_without("type != ' Malignant'")
	# Fixed: the benign mask used `!= biopsy` here, which wiped every pixel
	# except the last excluded biopsy instead of removing the excluded ones.
	benign_biopsies = binary_mask_without("type != ' Benign'")
	undetermined_biopsies = binary_mask_without("type != ' Undetermined'")

	cancer_mask = mark_boundaries(cancer_mask, malignant_biopsies, color=(1, 0, 0))
	cancer_mask = mark_boundaries(cancer_mask, benign_biopsies, color=(0, 1, 0))
	cancer_mask = mark_boundaries(cancer_mask, undetermined_biopsies, color=(0, 0, 1))
	# Creates missing parent folders too and tolerates an existing folder.
	os.makedirs(f'./inspect_single_patient/{patient}/', exist_ok=True)
	save_image(cancer_mask, f'./inspect_single_patient/{patient}/{image}')


# Draw the biopsy outlines for every biopsy-mask slice of patient '050'
# (the patient skipped when the report is built).
patient = '050'
biopsies = pd.read_csv(
	f'./biopsies/{patient}.csv',
	header=None,
	names=['biopsy_id', 'type', 'a', 'b'],
)
# mark_biopsies_and_cancer returns nothing; the list only drives the loop.
q = [
	mark_biopsies_and_cancer(patient, image, biopsies)
	for image in os.listdir(f'{path_original}{patient}/biopsyMask/')
]

## Calculations

In [None]:
# Segment a single slice of one patient.
patient = 66
slice_no = 13
n_segments = 50


# NOTE(review): other cells call slic_slice with 3 or 4 positional arguments;
# confirm that this 7-argument form matches the current signature.
segments_list = slic_slice(
	patient, slice_no, n_segments, path_norm, path_maskP, path_segments, path_original,
)


In [None]:
# For each selected (patient, slice): segment the slice, compute per-segment
# means with np.mean and render the grouped line plot of segmentation 11.
n_segments = 50

for patient, slice_no in [
	# (15, 7),
	# (28, 23),
	# (40, 20),
	(50, 13),
	# (63, 11),
	# (64, 17),
	# (69, 17),
	# (75, 14),
]:
	print(f'patient {patient}, slice={slice_no}:')
	print('slicing ...')
	segments_list = slic_slice(patient, slice_no, n_segments)
	print('bsplining ...')
	
	# One frame per entry of segments_list, tagged with that entry's id in
	# the `segment` column.
	segment_means = [
		segments_means_foreach_slice(
			patient, slice_no, segments, n_segments,
			confirmed_cancer, unconfirmed_cancer, benign_zones,
			agg_func= np.mean
		).assign(segment=cycle_id)
		for segments, cycle_id, confirmed_cancer, unconfirmed_cancer, benign_zones in segments_list
	]
	segment_means = pd.concat(segment_means)
	# Only the line plot is produced: contour and filled images are disabled.
	res = create_row(
		patient, slice_no, n_segments, 11, segment_means, path_segments,
		pallete=None, temp_file_name='./temp.png',
		differentials=False, differentials_lvl2=False, countour=False, filled=False,
		group_lineplots=True,
	)



## Boxplot all

In [None]:
# Functional boxplot over all smoothed curves of one slice (patient 64,
# slice 17): smooth every segmentation, colour the curves by label, flag
# outliers and plot.
n_segments = 25
patient = 64
slice_no = 17
print(f'patient {patient}, slice={slice_no}:')
print('slicing ...')
segments_list = slic_slice(patient, slice_no, n_segments, True)
print('bsplining ...')
bsplined_data = [
	bspline_patient(
		patient, slice_no, segments, n_segments,
		confirmed_cancer, unconfirmed_cancer, benign_zones,
	).assign(segment=cycle_id)
	for segments, cycle_id, confirmed_cancer, unconfirmed_cancer, benign_zones in segments_list
]
bsplined_data = pd.concat(bsplined_data)
# Rows without a smoothed curve are dropped.
bsplined_data = bsplined_data[~bsplined_data.fd_smooth.isnull()]

bsplined = concat_fdatas(bsplined_data.fd_smooth.tolist())
# Ordered rules: the first condition that holds decides the colour
# (dicts keep insertion order).
groups = {
	lambda x: x['confirmed_cancer']: 'red',
	lambda x: x['unconfirmed_cancer']: 'orange',
	lambda x: x['benign']: 'green',
	lambda x: True: 'white',
}
bsplined_data['color'] = bsplined_data.apply(
	lambda x: [color for condition, color in groups.items() if condition(x)][0],
	axis='columns',
)

fdata = cut_ends(bsplined, 1)
boxplot = Boxplot(fdata, depth_method=MBD, factor=0.58, prob=(0.5, 0.25, 0.1, 0.01))
boxplot.show_full_outliers = True
# assumes boxplot.outliers has one entry per row of bsplined_data, in the same
# order -- TODO confirm against concat_fdatas / cut_ends
bsplined_data['is_outlier'] = boxplot.outliers
bsplined_data.to_csv('outliers.csv', index=False)

plt.close()
res = boxplot.plot()

In [None]:

# Landmark-register the concatenated curves from the previous cell.
# NOTE(review): get_landmark_registration is defined in a later cell of this
# notebook, so this line fails on a fresh top-to-bottom run.
fdata = get_landmark_registration(bsplined, 1)

In [None]:
# Same functional boxplot as above, now on the registered curves and with
# factor=0.5; the outlier flags overwrite `is_outlier`.
boxplot = Boxplot(fdata, depth_method=MBD, factor=0.5, prob=(0.5, 0.25, 0.1, 0.01))
boxplot.show_full_outliers = True
bsplined_data['is_outlier'] = boxplot.outliers

plt.close()
res = boxplot.plot()

## Boxplotting model

In [None]:
# Boxplot-based outlier classification for patient 64 over slices 15-17,
# evaluated for a range of boxplot factors.
n_segments = 50
# patient = 50
patient = 64
bsplined = []
# for slice_no in range(9, 16):
for slice_no in range(15, 18):
	print(f'patient {patient}, slice={slice_no}:')
	print('slicing ...')
	segments_list = slic_slice(patient, slice_no, n_segments)
	print('bsplining ...')
	bsplined_data = [
		bspline_patient(
			patient, slice_no, segments, n_segments,
			confirmed_cancer, unconfirmed_cancer, benign_zones,
		).assign(segment=cycle_id)
		for segments, cycle_id, confirmed_cancer, unconfirmed_cancer, benign_zones in segments_list
	]
	bsplined += [pd.concat(bsplined_data)]
bsplined_data = pd.concat(bsplined)
# bsplined_data = agg_fdatas(bsplined_data)
# Only rows flagged confirmed_cancer or benign are aggregated and evaluated.
bsplined_data = agg_fdatas(bsplined_data.query('confirmed_cancer or benign').reset_index())

print('modeling ...')

# Writes boxplot_metrics-{patient}.csv and boxplot_preds-{patient}.csv.
experiment_boxplots(bsplined_data, [0.7, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4], f'-{patient}')

## Workflow

In [None]:
# Per-slice workflow: segment, smooth, plot the curves, plot the correlation
# matrices and run the boxplot classifier with factor 1.
n_segments = 25

for patient, slice_no in [
	# (15, 7),
	# (28, 23),
	# (40, 20),
	(50, 13),
	# (63, 11),
	(64, 17),
	# (69, 17),
	# (75, 14),
]:
	print(f'patient {patient}, slice={slice_no}:')
	print('slicing ...')
	# segments_list = slic_slice(patient, slice_no, n_segments)
	segments_list = slic_slice(patient, slice_no, n_segments, True)
	print('bsplining ...')
	bsplined_data = [
		bspline_patient(
			patient, slice_no, segments, n_segments,
			confirmed_cancer, unconfirmed_cancer, benign_zones,
		).assign(segment=cycle_id)
		for segments, cycle_id, confirmed_cancer, unconfirmed_cancer, benign_zones in segments_list
	]
	bsplined_data = pd.concat(bsplined_data)
	bsplined_data = agg_fdatas(bsplined_data)
	print('plotting ...')
	plot_bsplined(
		bsplined_data, n_segments, range(5, 15),
	)

	print('correlations ...')
	# Second pass with a single row type that draws the correlation matrix;
	# the output file name gets the '-corr' suffix.
	plot_bsplined(
		bsplined_data, n_segments, range(5, 15),
		filename_suff='-corr',
		row_config=[
			(
				lambda x, fig, ax, color_labels, **kwargs: plot_corr_matrix(
					x, color_labels,
				),
				'',
			)
		],
	)
	experiment_boxplots(bsplined_data, [1], f'-{patient}-{slice_no}-{n_segments}')
# 	print('modeling ...')
	
# 	bsplined_data = add_depths(bsplined_data, 1)
# 	bsplined_data = add_max_values_points(bsplined_data, 1)
# 	bsplined_data = add_point_values(bsplined_data, 10, 1)
# 	bsplined_data['models'] = bsplined_data.apply(train_models, axis='columns')
# 	eval_all_metrics(bsplined_data)
# 	plot_all_metrics(bsplined_data, 0)
# 	experiment(bsplined_data, 10)


In [None]:
# Train per-segmentation models for patient 50 on slices 7-32, using only
# curves labelled confirmed cancer or benign; the remaining curves are kept
# in `other_data` for the (currently disabled) prediction step below.
n_segments = 50
patient = 50
# patient = 64
bsplined = []
# thresholds = [(green_thresh, 1 - green_thresh) for green_thresh in np.linspace(0.5, 0.9, 5)]
thresholds = [(0.5, 0.5)]
# for slice_no in range(9, 16):
for slice_no in range(7, 33):
# for slice_no in range(15, 18):
	print(f'patient {patient}, slice={slice_no}:')
	print('slicing ...')
	segments_list = slic_slice(patient, slice_no, n_segments)
	print('bsplining ...')
	bsplined_data = [
		bspline_patient(
			patient, slice_no, segments, n_segments,
			confirmed_cancer, unconfirmed_cancer, benign_zones,
		).assign(segment=cycle_id)
		for segments, cycle_id, confirmed_cancer, unconfirmed_cancer, benign_zones in segments_list
	]
	bsplined += [pd.concat(bsplined_data)]
bsplined_data = pd.concat(bsplined)
# Labelled curves, aggregated per (patient_id, segment).
train_data = agg_fdatas(
	bsplined_data.query('confirmed_cancer or benign').reset_index().copy(),
	['patient_id', 'segment'],
)
# Curves that are neither benign nor confirmed cancer, plus the unconfirmed
# ones. NOTE(review): the meaning of the trailing `True` depends on
# agg_fdatas, which is defined elsewhere.
other_data = agg_fdatas(
	bsplined_data.query('(not benign and not confirmed_cancer) or unconfirmed_cancer')
		.reset_index()
		.copy(),
	['patient_id', 'segment'],
	True,
)

print('modeling ...')

# Feature columns are added step by step, then one set of models is trained
# per row.
train_data = add_depths(train_data, 1)
train_data = add_max_values_points(train_data, 1)
train_data = add_point_values(train_data, 10, 1)
train_data['models'] = train_data.apply(train_models, axis='columns')
experiment(train_data, thresholds=thresholds)

# preds = (
# 	other_data[['patient_id', 'segment', 'fd_smooth', 'segment_ids', 'slice_ids', 'color']]
# 		.merge(train_data[['segment', 'models']])
# )
# preds = add_depths(preds, 1)
# preds = add_max_values_points(preds, 1)
# preds = add_point_values(preds, 10, 1)
# preds = predict_experiment(preds, thresholds=thresholds)
# (
# 	preds[[
# 		'segment', 'segment_ids', 'slice_ids', 'patient_id', 'color',
# 		'knn_preds', 'log_reg_preds', 'svc_preds', 'threshold'
# 	]]
# 		.explode(
# 			[
# 				'slice_ids', 'segment_ids', 'color',
# 				'knn_preds', 'log_reg_preds', 'svc_preds',
# 			]
# 		)
# 		.to_csv(f'./predictions-{patient}-{n_segments}.csv', index=False)
# )

In [None]:
# 5-fold stratified cross-validation of a default SVC on the features of
# segmentation 7; prints mean and standard deviation of each metric over the
# folds, for the test and the training part.
# NOTE(review): this rebinds the name `kfolds`, which another cell of this
# notebook defines as a function.
kfolds = StratifiedKFold(n_splits=5)
sample = train_data.query("segment == 7").iloc[0]
X, Y = prep_x_data(sample)
f1s = []
precisions = []
recalls = []
accuracies = []
specificities = []

train_f1s = []
train_precisions = []
train_recalls = []
train_accuracies = []
train_specificities = []

for train_index, test_index in kfolds.split(X, Y):
	print("TRAIN:", len(train_index), "TEST:", len(test_index))
	X_train, X_test = X[train_index], X[test_index]
	y_train, y_test = Y[train_index], Y[test_index]
	model = SVC()
	model.fit(X_train, y_train)
	# Held-out fold metrics.
	preds = model.predict(X_test)
	f1s += [f1_score(y_test, preds, zero_division=0)]
	precisions += [precision_score(y_test, preds, zero_division=0)]
	recalls += [recall_score(y_test, preds, zero_division=0)]
	# "accuracy" in the printout below is the balanced accuracy.
	accuracies += [balanced_accuracy_score(y_test, preds)]
	specificities += [specificity(y_test, preds)]
	
	# Same metrics on the data the model was fitted on.
	preds = model.predict(X_train)
	train_f1s += [f1_score(y_train, preds, zero_division=0)]
	train_precisions += [precision_score(y_train, preds, zero_division=0)]
	train_recalls += [recall_score(y_train, preds, zero_division=0)]
	train_accuracies += [balanced_accuracy_score(y_train, preds)]
	train_specificities += [specificity(y_train, preds)]

print('means:')
print('test metrics:')
print(f'F1 = {np.mean(f1s)}')
print(f'precision = {np.mean(precisions)}')
print(f'recall = {np.mean(recalls)}')
print(f'accuracy = {np.mean(accuracies)}')
print(f'specificity = {np.mean(specificities)}')

print('train metrics:')
print(f'F1 = {np.mean(train_f1s)}')
print(f'precision = {np.mean(train_precisions)}')
print(f'recall = {np.mean(train_recalls)}')
print(f'accuracy = {np.mean(train_accuracies)}')
print(f'specificity = {np.mean(train_specificities)}')
print('stds:')

print('test metrics:')
print(f'F1 = {np.std(f1s)}')
print(f'precision = {np.std(precisions)}')
print(f'recall = {np.std(recalls)}')
print(f'accuracy = {np.std(accuracies)}')
print(f'specificity = {np.std(specificities)}')

print('train metrics:')
print(f'F1 = {np.std(train_f1s)}')
print(f'precision = {np.std(train_precisions)}')
print(f'recall = {np.std(train_recalls)}')
print(f'accuracy = {np.std(train_accuracies)}')
print(f'specificity = {np.std(train_specificities)}')

In [None]:
# Specificity of the trained model on segmentation 9, with 'red' as the
# positive class. The label column is 'color' (the previous 'colorr' was a
# typo; every other cell of this notebook reads 'color').
preds = predict_model(train_data.query("segment == 9").iloc[0])
Y = [1 if label == 'red' else 0 for label in train_data.query("segment == 9")['color'].iloc[0]]
specificity(Y, preds)

In [None]:
# Train and evaluate per-segmentation models for patient 50 on slices 7-32,
# using all curves (no label filter).
n_segments = 50
patient = 50
bsplined = []
for slice_no in range(7, 33):
	print(f'patient {patient}, slice={slice_no}:')
	print('slicing ...')
	segments_list = slic_slice(patient, slice_no, n_segments)
	print('bsplining ...')
	bsplined_data = [
		bspline_patient(
			patient, slice_no, segments, n_segments,
			confirmed_cancer, unconfirmed_cancer, benign_zones,
		).assign(segment=cycle_id)
		# Fixed: a stray second comma after `cycle_id` made this cell a SyntaxError.
		for segments, cycle_id, confirmed_cancer, unconfirmed_cancer, benign_zones in segments_list
	]
	bsplined += [pd.concat(bsplined_data)]
bsplined_data = pd.concat(bsplined)
bsplined_data = agg_fdatas(bsplined_data, ['patient_id', 'segment'])

print('modeling ...')

# Feature columns are added step by step, then one set of models is trained
# per row.
bsplined_data = add_depths(bsplined_data, 1)
bsplined_data = add_max_values_points(bsplined_data, 1)
bsplined_data = add_point_values(bsplined_data, 10, 1)
bsplined_data['models'] = bsplined_data.apply(train_models, axis='columns')
eval_all_metrics(bsplined_data)
plot_all_metrics(bsplined_data, 0)

## All patients modeled

In [10]:
from sklearn.preprocessing import MinMaxScaler

# Shared scaler instance; train_kfolds re-fits it on every patient group.
scaler = MinMaxScaler()


def kfolds(dataset, nfolds=5):
	"""Split patients into `nfolds` folds balanced by their share of positives.

	Patients with at least one positive row are ranked by the fraction of
	their rows with a true `label` (highest first; ties are broken by
	`patient_id` so the result is deterministic) and dealt to the folds in a
	snake pattern 0..nfolds-1, nfolds-1..0, and so on.

	Args:
		dataset: DataFrame with `patient_id` and a boolean `label` column,
			one row per region.
		nfolds: number of folds.

	Returns:
		A NumPy object array with one list of patient ids per non-empty fold,
		ordered by fold id.

	The per-patient counts are computed with a groupby rather than with
	`value_counts().reset_index()`, whose column names differ between pandas
	1.x and 2.x.
	"""
	per_patient = (
		dataset.label.astype(int)
			.groupby(dataset.patient_id)
			.agg(cancerous_regions='sum', n='size')
			.reset_index()
			# Patients without any positive region are not assigned to a fold.
			.query('cancerous_regions > 0')
			.assign(patient_ratio=lambda x: x.cancerous_regions / x.n)
			.sort_values(['patient_ratio', 'patient_id'], ascending=[False, True])
	)
	n_patients = per_patient.shape[0]
	snake = list(range(0, nfolds)) + list(reversed(range(0, nfolds)))
	fold_ids = (snake * math.ceil(n_patients / (nfolds * 2)))[:n_patients]
	class_ratios = per_patient.assign(fold_id=fold_ids)
	return class_ratios.groupby('fold_id').patient_id.apply(list).to_numpy()


def get_landmark_registration(bsplined, order=0):
	"""Landmark-register the curves on the position of their maximum.

	The landmark of each curve is the grid point at which
	`cut_ends(bsplined, order, prc_rm_end=0.5)` attains its maximum; the
	registration itself is applied to `cut_ends(bsplined, order)`.
	"""
	trimmed = cut_ends(bsplined, order)
	peak_search_grid = cut_ends(bsplined, order, prc_rm_end=0.5)
	peak_indexes = peak_search_grid.data_matrix.argmax(axis=1)
	time_points = trimmed.grid_points[0]
	landmarks = []
	for peak_index in np.concatenate(peak_indexes):
		landmarks.append(time_points[peak_index])
	return landmark_registration(trimmed, landmarks)


def get_descrete_points(fd_smooth):
	"""Sample every curve at `train_n_points` evenly spaced grid indexes.

	The indexes run from the first to the last position of axis 1 of
	`fd_smooth.data_matrix` (both included, truncated to int); the values are
	taken at index 0 of the last axis. `train_n_points` is a notebook-level
	variable.
	"""
	last_index = fd_smooth.data_matrix.shape[1] - 1
	sample_indexes = np.linspace(
		start=0, stop=last_index, num=train_n_points, endpoint=True, dtype=int,
	)
	return fd_smooth.data_matrix[:, sample_indexes, 0]


In [22]:
# Build the feature matrix `train_set` and the label vector `labels` from one
# segmentation CSV. `smoother`, `ID`, `MBD` and `l_enc` are defined elsewhere
# in the notebook.
agg_columns = ['patient_id', 'slice_id', 'img_type']
n_points = 100
train_n_points = 10
segmentation_id = 4


file = f'./segmentations/segmentation-{segmentation_id}.csv'


dataset = (
	pd.read_csv(
		file,
		dtype={
			'img_type': int,
			'patient_id': int,
			'cycle_id': int,
			'slice_id': int,
			'label': str,
			'mask_int_mean': float,
			'segment': int,
		},
	)
		# `label` is read as text and turned into a real boolean here.
		.assign(label=lambda x: x.label.astype(str) == 'True')
		.drop_duplicates()
		.sort_values(agg_columns + ['cycle_id'])
)
# Per patient: an evenly spaced grid on [0, 1] with one point per distinct
# cycle_id. After reset_index the grid is stored in the `cycle_id` column.
ts = (
	dataset[['patient_id', 'cycle_id']].drop_duplicates()
		.groupby('patient_id').cycle_id.count()
		.apply(lambda x: np.linspace(0, 1, int(x)))
		.reset_index()
)

# Two hard-coded exclusion lists of patient ids.
# NOTE(review): the reason for each list is not recorded here.
dataset = dataset[dataset.patient_id.apply(lambda x: x not in [3, 6, 7, 8, 10, 11, 14, 19, 27, 29, 37, 60, 66, 76, 86, 108, 119, 122, 123, 128, 133, 140])]
dataset = dataset[dataset.patient_id.apply(lambda x: x not in [2, 32, 35, 40, 41, 45, 52, 92, 107, 110, 114, 116, 139])]
# Keep only patients that have at least one positive row.
dataset = dataset.merge(dataset.query('label == True').patient_id.drop_duplicates())
# One row per (patient, slice, img_type, label) holding its intensity series.
dataset = dataset.groupby(agg_columns + ['label']).mask_int_mean.apply(list).reset_index()
# One functional data object per patient, smoothed with `smoother`.
bsplined = dataset.groupby('patient_id').mask_int_mean.apply(list).reset_index().merge(ts)
bsplined = bsplined.apply(
	lambda x: smoother.fit_transform(
		FDataGrid(data_matrix=x['mask_int_mean'], grid_points=x['cycle_id'])
	),
	axis='columns',
)

print('registering ...')
bsplined = [get_landmark_registration(fd_smooth, 1) for fd_smooth in bsplined]
print('getting integral depths ...')
ids = np.vstack([ID(fd_smooth).reshape(-1, 1) for fd_smooth in bsplined])
print('getting modified bandth depths ...')
mbds = np.vstack([MBD(fd_smooth).reshape(-1, 1) for fd_smooth in bsplined])
print('getting other features ...')
max_values = np.vstack(
	[fd_smooth.data_matrix.max(axis=1).reshape(-1, 1) for fd_smooth in bsplined]
)
patients = dataset.patient_id.to_numpy().reshape(-1, 1)

descret_points = np.vstack([get_descrete_points(fd_smooth) for fd_smooth in bsplined])
# Column layout: 2 depth columns, 1 maximum, train_n_points sampled values,
# and the patient id as the last column.
# assumes the per-patient blocks stack in the same row order as `dataset`
# -- TODO confirm
train_set = np.hstack([ids, mbds, max_values, descret_points, patients])
labels = l_enc.fit_transform(dataset.label.astype(int).tolist())

registering ...
getting integral depths ...
getting modified bandth depths ...
getting other features ...


In [14]:
def train_kfolds(Y, X, folds):
	"""Cross-validate a class-balanced SVC with patient-wise folds.

	Args:
		Y: labels, one per row of `X`.
		X: feature matrix whose column 13 holds the patient id.
		folds: iterable of patient-id collections; each one is the test set
			of one fold.

	Returns:
		A 10-tuple of fold means: F1, precision, recall, balanced accuracy
		and specificity on the test part, then the same five on the training
		part.

	NOTE(review): the scaled rows are stacked in group order, so `Y` and the
	patient ids only line up if `X` is already sorted by patient id.
	"""
	raw_features = pd.DataFrame(X)
	patient_ids = raw_features[13]
	# Min-max scale every patient separately, then drop the last (patient) column.
	scaled_blocks = raw_features.groupby(13).apply(scaler.fit_transform).to_numpy()
	table = pd.DataFrame(np.vstack(scaled_blocks)[:, 0:-1]).assign(
		Y=Y, patient=patient_ids,
	)

	def all_scores(y_true, y_pred):
		return (
			f1_score(y_true, y_pred, zero_division=0),
			precision_score(y_true, y_pred, zero_division=0),
			recall_score(y_true, y_pred, zero_division=0),
			balanced_accuracy_score(y_true, y_pred),
			specificity(y_true, y_pred),
		)

	test_scores = []
	train_scores = []
	for fold_no, test_patients in enumerate(folds):
		print(fold_no)
		in_test = table.patient.apply(lambda x: x in test_patients)
		train_part = table[~in_test]
		test_part = table[in_test]

		y_train = train_part.Y
		y_test = test_part.Y
		X_train = train_part.drop(['Y', 'patient'], axis='columns')
		X_test = test_part.drop(['Y', 'patient'], axis='columns')

		model = SVC(class_weight='balanced')
		model.fit(X_train, y_train)

		test_scores.append(all_scores(y_test, model.predict(X_test)))
		train_scores.append(all_scores(y_train, model.predict(X_train)))

	return tuple(
		np.mean([fold_scores[position] for fold_scores in scores])
		for scores in (test_scores, train_scores)
		for position in range(5)
	)

# One-row summary of the patient-wise cross-validation. train_kfolds returns
# ten values (five test metrics, then five train metrics), so ten column
# names are needed; the two specificity columns were missing.
pd.DataFrame(
	[train_kfolds(labels, train_set, kfolds(dataset, 5))],
	columns=[
		'F1', 'precision', 'recall', 'accuracy', 'specificity',
		'train_F1', 'train_precision', 'train_recall', 'train_accuracy',
		'train_specificity',
	]
)

In [21]:
# Displayed result: number of rows per remaining patient, after applying the
# second exclusion list once more.
(
	dataset[dataset.patient_id.apply(lambda x: x not in [2, 32, 35, 40, 41, 45, 52, 92, 107, 110, 114, 116, 139])]
		.patient_id.value_counts().reset_index()
		# ['index'].apply(lambda x: f'patient-{x}-segment-4.png').tolist()
)

Unnamed: 0,index,patient_id
0,50,1176
1,15,925
2,75,779
3,80,779
4,28,764
5,101,726
6,69,723
7,39,710
8,63,700
9,64,690


In [None]:
import keras_tuner as kt


class NNmodel():
    """Fully connected binary classifier with a fit/predict interface.

    The network takes 13 input features, narrows through ReLU layers of
    128, 64, 32, 16, 8, 4 and 2 units, and ends in one sigmoid unit. It is
    compiled with binary cross-entropy, the adam optimizer and accuracy.
    `models` and `layers` are expected to be in scope in the notebook.
    """

    def __init__(self):
        network = models.Sequential()
        network.add(layers.Dense(128, input_dim=13, activation='relu'))
        for width in (64, 32, 16, 8, 4, 2):
            network.add(layers.Dense(width, activation='relu'))
        network.add(layers.Dense(1, activation='sigmoid'))
        network.compile(
            loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']
        )
        self.nn_model = network


    def fit(self, X_train, y_train):
        """Fit the network, weighting each class by 1 minus its share of `y_train`."""
        class_share = pd.Series(y_train).value_counts() / len(y_train)
        self.nn_model.fit(
            X_train, y_train,
            class_weight=(1 - class_share).to_dict(),
        )


    def predict(self, X_test):
        """Return 0/1 predictions: 1 where the network output exceeds 0.5."""
        probabilities = self.nn_model.predict(X_test)
        return (probabilities > 0.5).reshape(-1).astype(int)


In [31]:
# Settings for the per-patient experiment below.
agg_columns = ['patient_id', 'slice_id', 'img_type']
n_points = 100
train_n_points = 10
# NOTE(review): this rebinds `kfolds`, which an earlier cell defines as the
# patient-wise fold function; after this cell that function is shadowed.
kfolds = StratifiedKFold(n_splits=5)

# from sklearn.ensemble import RandomForestClassifier
# import xgboost as xgb


def train_kfolds(Y, X):
	"""Cross-validate ``NNmodel`` on one feature matrix and average the fold scores.

	A fresh model is fitted on every split produced by the module-level
	``kfolds`` splitter and scored on the held-out part first, then on the
	part it was trained on.

	Args:
		Y: Label array, indexable with the index arrays the splitter yields.
		X: Feature array whose rows are aligned with ``Y``.

	Returns:
		Tuple of ten fold means: F1, precision, recall, balanced accuracy and
		specificity on the held-out folds, followed by the same five measured
		on the training folds.
	"""
	def score(y_true, y_pred):
		# The five metrics, in the order in which they are reported.
		return (
			f1_score(y_true, y_pred, zero_division=0),
			precision_score(y_true, y_pred, zero_division=0),
			recall_score(y_true, y_pred, zero_division=0),
			balanced_accuracy_score(y_true, y_pred),
			specificity(y_true, y_pred),
		)

	held_out_rows = []
	fitted_rows = []
	for train_index, test_index in kfolds.split(X, Y):
		X_train, y_train = X[train_index], Y[train_index]
		X_test, y_test = X[test_index], Y[test_index]

		# Other estimators tried at this spot: SVC(class_weight='balanced'),
		# RandomForestClassifier(class_weight='balanced', n_jobs=4) and
		# xgb.XGBClassifier(tree_method='gpu_hist', eval_metric='logloss', use_label_encoder=False).
		model = NNmodel()
		model.fit(X_train, y_train)

		held_out_rows.append(score(y_test, model.predict(X_test)))
		fitted_rows.append(score(y_train, model.predict(X_train)))

	# Transpose fold rows into per-metric columns, held-out metrics first.
	per_metric = list(zip(*held_out_rows)) + list(zip(*fitted_rows))
	return tuple(np.mean(values) for values in per_metric)


def process_patient(patient, dataset, ts):
	"""Build one patient's feature matrix and cross-validate a classifier on it.

	Args:
		patient: Patient id substituted into the ``patient_id == ...`` query.
		dataset: Frame with ``patient_id``, ``mask_int_mean`` and ``label``
			columns; each row's ``mask_int_mean`` becomes one row of the
			FDataGrid data matrix.
		ts: Frame merged onto the patient's curves; it has to supply the
			``cycle_id`` value that is used as ``grid_points``.

	Returns:
		One-row DataFrame holding the patient id followed by the ten averaged
		metrics returned by ``train_kfolds``.
	"""
	sample = dataset.query(f'patient_id == {patient}')

	def smooth_row(row):
		# One functional data object per merged row, smoothed with the shared smoother.
		grid = FDataGrid(data_matrix=row['mask_int_mean'], grid_points=row['cycle_id'])
		return smoother.fit_transform(grid)

	per_patient = sample.groupby('patient_id').mask_int_mean.apply(list).reset_index()
	smoothed = per_patient.merge(ts).apply(smooth_row, axis='columns')
	registered = [get_landmark_registration(fd_smooth, 1) for fd_smooth in smoothed]

	def scaled_column(extract):
		# Standardise one scalar feature per curve and stack it as a column.
		# NOTE(review): the scaler is fitted on all of the patient's rows before
		# train_kfolds splits them into folds — confirm this is intended.
		return np.vstack(
			[
				scaler.fit_transform(extract(fd_smooth).reshape(-1, 1))
				for fd_smooth in registered
			]
		)

	ids = scaled_column(ID)
	mbds = scaled_column(MBD)
	max_values = scaled_column(lambda fd_smooth: fd_smooth.data_matrix.max(axis=1))
	max_points = scaled_column(
		lambda fd_smooth: fd_smooth.data_matrix.argmax(axis=1) / n_points
	)
	descret_points = np.vstack(
		[get_descrete_points(fd_smooth) for fd_smooth in registered]
	)

	train_set = np.hstack([ids, mbds, max_values, max_points, descret_points])
	labels = sample.label.astype(int).to_numpy()

	metric_columns = [
		'patient', 'F1', 'precision', 'recall', 'accuracy',
		'specificity',
		'train_F1', 'train_precision', 'train_recall', 'train_accuracy',
		'train_specificity',
	]
	row = (patient,) + train_kfolds(labels, train_set)
	return pd.DataFrame([row], columns=metric_columns)


def print_patient(segmentation_id, patient, dataset, ts, slice_id=None):
	"""Save a plot of a patient's smoothed, registered curves coloured by label.

	Curves whose label is truthy are drawn in red, all others in green.

	Args:
		segmentation_id: Segmentation number; only used in the output file name.
		patient: Patient id to select from ``dataset`` and ``ts``.
		dataset: Frame with ``patient_id``, ``slice_id``, ``label`` and
			``mask_int_mean`` columns.
		ts: Frame with ``patient_id`` and ``cycle_id`` columns; the patient's
			``cycle_id`` values are used as grid points.
		slice_id: Optional slice to restrict the plot to. ``None`` plots all of
			the patient's slices; any other value, including ``0``, selects
			that single slice.

	Writes ``./patient_plots/...png`` or, when a slice is given,
	``./patient_slice_plots/...png``; the directory is created if missing.
	"""
	# Compare against None explicitly: slice 0 is a valid id but is falsy.
	has_slice = slice_id is not None

	query = f'patient_id == {patient}'
	if has_slice:
		query += f' and slice_id == {slice_id}'

	# Select and sort once so curves and colours are guaranteed to share one order.
	selected = dataset.query(query).sort_values('label')
	curves = selected.mask_int_mean.tolist()
	colors = [
		'red' if label else 'green'
		for label in selected.label.astype(bool).tolist()
	]
	grid_points = ts.query(f'patient_id == {patient}').cycle_id.to_list()

	fd_smooth = smoother.fit_transform(FDataGrid(data_matrix=curves, grid_points=grid_points))
	fd_smooth = get_landmark_registration(fd_smooth, 1)

	fig = plt.figure(figsize=(11, 7))
	ax = plt.axes(xlabel='t')
	# One colour per curve, in the same order as the curves themselves.
	ax.set_prop_cycle(cycler(color=colors))
	fd_smooth.plot(fig=fig, ax=ax)

	if has_slice:
		name = f'./patient_slice_plots/patient-{patient}-slice{slice_id}-segment-{segmentation_id}.png'
	else:
		name = f'./patient_plots/patient-{patient}-segment-{segmentation_id}.png'
	os.makedirs(os.path.dirname(name), exist_ok=True)
	fig.savefig(name, dpi=150)
	# Close explicitly so figures do not pile up when this is called in a loop.
	plt.close(fig)


# Per-patient metrics are appended to this file, one batch of rows per segmentation.
# NOTE: rows accumulate across runs (mode='a'); delete the file for a clean re-run.
metrics_path = './patient_metrics/patient_metrics_xgb.csv'
os.makedirs(os.path.dirname(metrics_path), exist_ok=True)

# One frame of per-patient positive counts per segmentation; they are joined
# once after the loop instead of re-concatenating a growing frame every pass.
label_frames = []

for segmentation_id in range(0, 25):
	print(segmentation_id)
	file = f'./segmentations/segmentation-{segmentation_id}.csv'
	dataset = (
		pd.read_csv(
			file,
			dtype={
				'img_type': int,
				'patient_id': int,
				'cycle_id': int,
				'slice_id': int,
				'label': str,
				'mask_int_mean': float,
				'segment': int,
			},
		)
			# The label is read as text; only the literal 'True' counts as positive.
			.assign(label=lambda x: x.label.astype(str) == 'True')
			.drop_duplicates()
			.sort_values(agg_columns + ['cycle_id'])
	)
	# Keep only patients that have at least one positive row.
	dataset = dataset.merge(
		dataset.query('label == True')[['patient_id']].drop_duplicates()
	)
	# Per patient: an evenly spaced grid on [0, 1] with one point per distinct cycle.
	ts = (
		dataset[['patient_id', 'cycle_id']].drop_duplicates()
			.groupby('patient_id').cycle_id.count()
			.apply(lambda x: np.linspace(0, 1, int(x)))
			.reset_index()
	)

	# Collapse every (agg_columns + label) group into one list of mask_int_mean values.
	dataset = dataset.groupby(agg_columns + ['label']).mask_int_mean.apply(list).reset_index()

	# Patients with at least five positive groups in this segmentation.
	positive_counts = (
		dataset
			.query('label')
			.groupby('patient_id').label.count().reset_index()
			.query('label >= 5')
	)
	label_frames.append(
		positive_counts
			.rename({'patient_id': 'patient'}, axis='columns')
			.assign(segmentation=segmentation_id)
	)
	patients = positive_counts.patient_id.tolist()

	# patients = [2, 32, 35, 40, 41, 45, 52, 92, 107, 110, 114, 116, 139]
	# (
	# 	dataset[dataset.patient_id.apply(lambda x: x in patients)]
	# 		.apply(
	# 			lambda x: print_patient(
	# 				segmentation_id, x['patient_id'], dataset, ts, x['slice_id'],
	# 			),
	# 			axis='columns',
	# 		)
	# )

	if not patients:
		# pd.concat raises on an empty list; there is nothing to evaluate here.
		continue

	(
		pd.concat([process_patient(patient, dataset, ts) for patient in patients])
			.assign(segmentation=segmentation_id)
			.to_csv(
				metrics_path,
				index=False,
				mode='a',
				# Write the column names only when starting a new file, so that a
				# later pd.read_csv does not consume a data row as the header.
				header=not os.path.exists(metrics_path),
			)
	)

# Stays None when the loop body never ran, as before.
patient_labels = pd.concat(label_frames) if label_frames else None


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24


In [32]:
# Attach the per-patient positive counts to the stored metrics and rewrite the
# file in place. read_csv takes the first line of the file as column names, so
# the file has to start with a header row for the shared merge keys to be found.
(
	pd.read_csv('./patient_metrics/patient_metrics_xgb.csv')
		.merge(patient_labels)
		.to_csv('./patient_metrics/patient_metrics_xgb.csv', index=False)
)

## V1

Kreives nudazomos:

* 80% SLIC zonos - raudonas
* Kitu atveju - baltas
* T = 1 - 10
* Augimai
* sulaukt pacientu nr

paskui:
* raudona spalva kur biopsija (+3, +4)
* geltona - kur nepiktas vezys
* zalia - kur nera vezio

## V2

* Normalizuoti i 0-1
* Mediana
* patient ids - 59, 64

## V3

* 1, 2 lygio isvestines
* Nulines atskaitos ismetimas
* SLIC'u = 50
* Raudona >= 80%
* Melyna > 0%

## V4

* functional depth (boxplots)
* Kreiviu registravimas (registration) pagal 1 taska (maksimumas) be x_0 ir x_n. (landmarkai - gali ir nebuti susije su registravimu)
* Correlation between plots
* AUC - plotas po suglodinta kreive (dar nedaryti)

## V5

* functional depth - boxplots of derivative 1 registered and derivative 1
* numeric correlation of derivative 1 max points
* no 2nd derivative
* Modelis? Klasifikavimas (K-means, funkcine logistine regresija, SVM) vieno paciento lygmenyje. Parametrai:
    * depth
    * max taskas, max value
    * diskretizuoti duomenys (isvestines ir gal originaliu)

## V6

* pasinagrineti boxplot funkcijos rezultata (outliergram: https://astamm.github.io/roahd/reference/outliergram.html)
* dek i boxplota visais laiko momentais suSLICuotus regionus
* diskretizuoti corr matrica


## V7

* see how outliers correspond to classes

## V8

* Stratified Kfold for SVM https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html#sklearn.model_selection.StratifiedKFold - n = 5
* No class balancing for SVM


## Plots

In [None]:

n_segments = 50

# Keyword arguments that are the same for every figure drawn in this cell.
plot_options = dict(
	n_segments=n_segments,
	path_segments=path_segments,
	differentials=True,
	differentials_lvl2=True,
	filled=False,
	page_size=12,
	group_lineplots=True,
)

# (patient, slice) pairs to plot; a fresh list of segmentations 0-9 is passed on every call.
for patient, slice_no in [(28, 23), (40, 20), (50, 13), (63, 11), (69, 17), (75, 14)]:
	print_graph_with_segments(
		patient=patient,
		slice_no=slice_no,
		segmentations=list(range(10)),
		**plot_options,
	)

