In [None]:
import os
import math
import numpy as np
import pandas as pd
import argparse
from tqdm import tqdm
import pyperclip
import re

from skfda import FDataGrid
from skfda.representation.basis import BSpline
from skfda.preprocessing.smoothing import BasisSmoother
from skfda.exploratory.depth import IntegratedDepth, ModifiedBandDepth
import xgboost as xgb

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_score, recall_score, f1_score, balanced_accuracy_score, roc_curve, confusion_matrix


# Key columns used for sorting and grouping the raw rows in extract_features.
agg_columns = ['patient_id', 'slice_id', 'img_type']
# Regex matched against column names in train_model to select the model inputs.
features = r'^(discrete_\d+|ids|mbds|max_values)'

# Settings handed to the BSpline basis constructed below.
n_basis = 18
order = 4

# Defaults of cut_ends: fraction trimmed at each end and size of the evaluation grid.
prc_rm = 0.05
n_points = 111
# Number of values get_descrete_points samples from every curve.
train_n_points = 100

# Shared, module-level helpers reused by the functions below.
basis = BSpline(
	domain_range=(0, 1),
	n_basis=n_basis,
	order=order,
)
smoother = BasisSmoother(
	basis=basis,
	return_basis=True,
	method='svd',
)
ID = IntegratedDepth()
MBD = ModifiedBandDepth()
k_folds = StratifiedKFold(n_splits=5)


def cut_ends(bsplined, order=0, prc_rm_start=prc_rm, prc_rm_end=prc_rm, n_points=n_points):
	"""Evaluate a derivative of `bsplined` on a uniform grid and trim both ends.

	Args:
		bsplined: object exposing `.derivative(order=...)` whose result exposes
			`.to_grid(...)` (in this file: the output of `smoother.fit_transform`).
		order: derivative order forwarded to `bsplined.derivative`.
		prc_rm_start: fraction of the `n_points` grid dropped at the start.
		prc_rm_end: fraction of the `n_points` grid dropped at the end.
		n_points: number of evenly spaced evaluation points on [0, 1].

	Returns:
		FDataGrid built from the kept grid points and the matching values
		(index 0 of the last data_matrix axis).
	"""
	eval_points = np.linspace(0, 1, n_points)
	derivative_grid = bsplined.derivative(order=order).to_grid(eval_points)

	# One slice shared by values and grid points keeps the two aligned.
	first_kept = int(n_points * prc_rm_start)
	end_kept = int(n_points * (1 - prc_rm_end))
	kept = slice(first_kept, end_kept)

	return FDataGrid(
		data_matrix=derivative_grid.data_matrix[..., kept, 0],
		grid_points=derivative_grid.grid_points[0][kept],
	)


def specificity(y_true, y_pred, zero_division=0):
	"""Return the true-negative rate TN / (TN + FP) for binary 0/1 labels.

	Args:
		y_true: ground-truth labels (0 = negative, 1 = positive).
		y_pred: predicted labels, same length as `y_true`.
		zero_division: value returned when `y_true` contains no negatives,
			i.e. when TN + FP == 0 and the rate is undefined.

	Returns:
		The specificity as a float, or `zero_division` when it is undefined.
	"""
	# labels=[0, 1] pins the matrix to 2x2 even if a class is missing from the
	# inputs; without it the four-way unpacking fails on single-class folds.
	tn, fp, _, _ = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
	negatives = tn + fp
	if negatives == 0:
		# Honour the fallback for every value, including the default 0.
		return zero_division
	return tn / negatives


def get_descrete_points(fd_smooth):
	"""Sample `train_n_points` evenly spaced values from every curve.

	Args:
		fd_smooth: object with a `data_matrix` array indexed as
			[sample, point, dimension].

	Returns:
		2-D array [sample, train_n_points] taken from dimension 0; the first
		and last point of each curve are always included.
	"""
	last_index = fd_smooth.data_matrix.shape[1] - 1
	# dtype=int truncates the evenly spaced positions to valid integer indices.
	sample_idx = np.linspace(0, last_index, num=train_n_points, endpoint=True, dtype=int)
	return fd_smooth.data_matrix[:, sample_idx, 0]


def extract_features(prefix='peripheral', n_segmentations=25):
	"""Build the per-curve feature table for every segmentation file of a zone.

	For each `./segmentations/{prefix}-{segmentation_id}.csv` the rows are
	grouped into one intensity curve per (patient_id, slice_id, img_type, label),
	the curves of each patient are smoothed with the module-level `smoother`,
	and the trimmed first derivative (`cut_ends(..., 1)`) is summarised by
	integrated depth (`ids`), modified band depth (`mbds`), its maximum
	(`max_values`) and evenly spaced samples (`discrete_<i>`).

	Args:
		prefix: file-name prefix of the segmentation CSVs to read.
		n_segmentations: number of files to read, with ids 0 .. n_segmentations - 1.

	Returns:
		DataFrame concatenating all segmentations, with the grouping columns,
		`label`, `ids`, `mbds`, `max_values`, `segmentation_id` and the
		`discrete_<i>` columns.
	"""
	segmentations = []

	for segmentation_id in tqdm(range(0, n_segmentations)):
		file = f'./segmentations/{prefix}-{segmentation_id}.csv'
		dataset = (
			pd.read_csv(
				file,
				dtype={
					'img_type': int,
					'patient_id': int,
					'cycle_id': int,
					'slice_id': int,
					'label': bool,
					'mask_int_mean': float,
					'segment': int,
				},
			)
			.drop_duplicates()
			# Sorting by cycle_id last keeps each curve's values in cycle order.
			.sort_values(agg_columns + ['cycle_id'])
		)
		# One uniform grid on [0, 1] per patient, sized by that patient's number
		# of distinct cycles; the column keeps the name `cycle_id`.
		# NOTE(review): assumes every curve of a patient has exactly that many
		# values, otherwise FDataGrid below gets mismatched shapes - confirm.
		ts = (
			dataset[['patient_id', 'cycle_id']].drop_duplicates()
				.groupby('patient_id').cycle_id.count()
				.apply(lambda x: np.linspace(0, 1, int(x)))
				.reset_index()
		)

		# One row per curve, holding the list of its intensity values.
		dataset = dataset.groupby(agg_columns + ['label']).mask_int_mean.apply(list).reset_index()
		# One row per patient: all of that patient's curves plus the patient grid.
		bsplined = dataset.groupby('patient_id').mask_int_mean.apply(list).reset_index().merge(ts)
		bsplined = bsplined.apply(
			lambda x: smoother.fit_transform(
				FDataGrid(data_matrix=x['mask_int_mean'], grid_points=x['cycle_id'])
			),
			axis='columns',
		)
		unregistered = [cut_ends(fd_smooth, 1) for fd_smooth in bsplined]

		# Per-patient results are concatenated in patient order, which matches
		# the row order of `dataset` (sorted by patient_id first).
		dataset['ids'] = np.concatenate(
			[ID(fd_smooth).reshape(-1, 1) for fd_smooth in unregistered]
		)
		dataset['mbds'] = np.concatenate(
			[MBD(fd_smooth).reshape(-1, 1) for fd_smooth in unregistered]
		)
		dataset['max_values'] = np.concatenate(
			[fd_smooth.data_matrix.max(axis=1).reshape(-1, 1) for fd_smooth in unregistered]
		)

		discrete = np.concatenate(
			[get_descrete_points(fd_smooth) for fd_smooth in unregistered]
		)
		# Column count follows the sampled array instead of a hard-coded 100, so
		# changing the number of sampled points cannot break the frame.
		discrete_columns = [f'discrete_{i}' for i in range(discrete.shape[1])]

		segmentations.append(
			dataset.drop('mask_int_mean', axis='columns')
				.assign(segmentation_id=segmentation_id)
				.join(pd.DataFrame(discrete, columns=discrete_columns))
		)
	return pd.concat(segmentations)


def train_fold(segmentation_id, X, Y, indices):
	"""Train and evaluate one cross-validation fold.

	Args:
		segmentation_id: identifier copied into the returned record.
		X: 2-D numpy feature array, one row per sample.
		Y: 1-D numpy array of 0/1 labels aligned with `X`.
		indices: `(train_indices, test_indices)` pair selecting rows of `X`/`Y`.

	Returns:
		dict with keys `segmentation_id`, `precision`, `recall`, `f1`,
		`balanced_accuracy` and `specificity`, computed on the test split.
	"""
	train_indices, test_indices = indices
	X_train, X_test = X[train_indices], X[test_indices]
	y_train, y_test = Y[train_indices], Y[test_indices]

	# The scaler is fitted once, on the training split only, and then applied
	# to both splits.
	scaler = MinMaxScaler(feature_range=(0, 1), copy=False).fit(X_train)
	X_train = scaler.transform(X_train)
	X_test = scaler.transform(X_test)

	# Each class is weighted by the frequency of the other one: negatives get
	# the positive fraction, positives get the negative fraction.
	positive_fraction = (y_train == 1).sum() / len(y_train)
	sample_weight = np.where(y_train == 0, positive_fraction, 1 - positive_fraction)

	# NOTE(review): `tree_method='gpu_hist'` and `use_label_encoder` look tied to
	# a specific XGBoost version and a GPU build - confirm against the installed one.
	model = xgb.XGBClassifier(tree_method='gpu_hist', eval_metric='logloss', use_label_encoder=False)
	model.fit(
		X_train, y_train,
		sample_weight=sample_weight,
		eval_set=[(X_train, y_train), (X_test, y_test)],
		verbose=False,
	)

	pred = model.predict(X_test)
	return {
		'segmentation_id': segmentation_id,
		'precision': precision_score(y_test, pred, zero_division=0),
		'recall': recall_score(y_test, pred, zero_division=0),
		'f1': f1_score(y_test, pred, zero_division=0),
		'balanced_accuracy': balanced_accuracy_score(y_test, pred),
		'specificity': specificity(y_test, pred, zero_division=0),
	}



def train_model(dataset):
	"""Run `train_fold` on every `k_folds` split of one segmentation's rows.

	Args:
		dataset: DataFrame with `label`, `segmentation_id` and the feature
			columns whose names match the module-level `features` regex.

	Returns:
		List with one metrics record (the return value of `train_fold`) per fold.
	"""
	feature_pattern = re.compile(features)
	feature_columns = [column for column in dataset.columns if feature_pattern.match(column)]

	X = dataset[feature_columns].to_numpy()
	Y = dataset['label'].astype(int).to_numpy()
	# The id of the first row is reported for the whole frame.
	segmentation_id = dataset['segmentation_id'].tolist()[0]

	fold_records = []
	for indices in k_folds.split(X, Y):
		fold_records.append(train_fold(segmentation_id, X, Y, indices))
	return fold_records



## Generate dataset

In [None]:
# Build the feature table of each zone and store it next to the notebook.
peripheral_features = extract_features('peripheral')
peripheral_features.to_csv('peripheral.csv')

transitional_features = extract_features('transitional')
transitional_features.to_csv('transitional.csv')

## Training

In [None]:
dataset = pd.read_csv('transitional.csv')

# One row per segmentation: the fold metrics returned by train_model, averaged.
mean_metrics = []
for segmentation_id in tqdm(range(0, 25), position=0, leave=True):
	segmentation_rows = dataset.query('segmentation_id == @segmentation_id')
	fold_metrics = pd.DataFrame.from_records(train_model(segmentation_rows))
	mean_metrics.append(fold_metrics.mean().to_frame().transpose())

result = pd.concat(mean_metrics)

In [None]:
# Store the averaged metrics without the DataFrame index.
metrics_path = './show/peripheral_transitional/transitional.csv'
result.to_csv(metrics_path, index=False)

In [None]:
# Prefix every metric column with its zone; `segmentation_id` stays unprefixed
# so it is the only column shared by the two frames.
tz_metrics = pd.read_csv('./show/peripheral_transitional/transitional.csv').rename(
	columns=lambda col: col if col == 'segmentation_id' else 'tz_' + col
)

# NOTE(review): no visible cell writes this peripheral metrics file; presumably
# the training cells were re-run on the peripheral dataset - confirm.
pz_metrics = pd.read_csv('./show/peripheral_transitional/peripheral.csv').rename(
	columns=lambda col: col if col == 'segmentation_id' else 'pz_' + col
)

zone_metrics = tz_metrics.merge(pz_metrics)
# Column 0 first, then columns 1-5 alternated with columns 6-10.
# Assumes each input has `segmentation_id` followed by five metric columns,
# which puts the tz and pz value of each metric side by side - TODO confirm.
interleaved_positions = [0, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10]
zone_metrics.iloc[:, interleaved_positions].to_csv('./show/peripheral_transitional/peripheral_transitional.csv')