In [1]:
import os
import math
import cv2 as cv
import numpy as np
import pandas as pd


## FOLDER SETUP

In [2]:
# Raw input: one sub-folder per patient.
path_original = './data/'

# Root folder for everything this notebook writes.
path_processed = './processed/'

# Outputs derived from the contrast images.
path_norm = path_processed + 'full_normalized/'
path_segments = path_processed + 'segmented/'

# Rescaled masks, one folder per mask type (P = prostate, C = cancer, B = biopsy).
path_maskP = path_processed + 'mask_prostate/'
path_maskC = path_processed + 'mask_cancer/'
path_maskB = path_processed + 'mask_biopsy/'


## GET METADATA

In [None]:
# Scan every contrast image of every patient and collect one metadata row per image:
# relative path, patient / cycle / slice numbers, image size and intensity statistics.
# The table is written to `<path_processed>metadata.csv`.
metadata_columns = ['path', 'patient', 'cycle', 'slice', 'height', 'width', 'intmin', 'intavg', 'intmax']
metadata_records = []

# sorted() keeps the row order of metadata.csv reproducible; os.listdir order is arbitrary.
for patient_i in sorted(os.listdir(path_original)):
	patient_dir = f'{path_original}{patient_i}'
	if not os.path.isdir(patient_dir): # stray files next to the patient folders are not patients
		continue

	print(f'>> PATIENT {patient_i}')

	for cycle_i in sorted(os.listdir(patient_dir)):
		if not cycle_i.startswith("contrast-t"):
			continue

		for slice_i in sorted(os.listdir(f'{patient_dir}/{cycle_i}')):
			image_path = f'{patient_i}/{cycle_i}/{slice_i}'
			image = cv.imread(f'{path_original}{image_path}', cv.IMREAD_ANYDEPTH)

			if image is None: # cv.imread returns None (it does not raise) for unreadable files
				print(f'   skipped unreadable file: {image_path}')
				continue

			metadata_records.append({
				'path':    image_path,
				# int() parses zero-padded names directly ('059' -> 59) and also accepts '000'
				'patient': int(patient_i),
				'cycle':   int(cycle_i.replace('contrast-t', '')),
				'slice':   int(os.path.splitext(slice_i)[0]),
				'height':  image.shape[0],
				'width':   image.shape[1],
				'intmin':  np.min(image).item(),
				'intavg':  float(np.round(np.mean(image), decimals = 2)),
				'intmax':  np.max(image).item(),
			})

# Passing `columns` keeps the header stable even when no image was found.
metadata = pd.DataFrame(metadata_records, columns=metadata_columns)

os.makedirs(path_processed, exist_ok=True)
metadata.to_csv(f'{path_processed}metadata.csv', index=False)



## RESCALE PROSTATE MASKS

In [None]:
# Rewrite every prostate mask as an 8-bit 0/255 PNG under `path_maskP` and flag, in the
# `has_prostate` column of metadata.csv, every (patient, slice) whose mask is not empty.
metadata = pd.read_csv(f'{path_processed}metadata.csv')
metadata['has_prostate'] = False

for patient_i in sorted(os.listdir(path_original)):
	patient_dir = f'{path_original}{patient_i}'
	if not os.path.isdir(patient_dir): # ignore stray files next to the patient folders
		continue

	# Folder names are zero-padded ('059') while metadata stores the patient as an integer.
	# int() handles the padding itself and, unlike lstrip('0'), also accepts '000'.
	patient_i_rename = int(patient_i)
	output_dir = f'{path_maskP}{patient_i_rename}'

	for cycle_i in sorted(os.listdir(patient_dir)):
		if not cycle_i.startswith("prostateMask"):
			continue

		for slice_i in sorted(os.listdir(f'{patient_dir}/{cycle_i}')):
			image = cv.imread(f'{patient_dir}/{cycle_i}/{slice_i}', cv.IMREAD_ANYDEPTH)
			if image is None: # cv.imread returns None (it does not raise) for unreadable files
				print(f'skipped unreadable file: {patient_i}/{cycle_i}/{slice_i}')
				continue

			foreground = image != 0

			if foreground.any():
				slice_no = int(os.path.splitext(slice_i)[0])
				metadata.loc[
					(metadata['patient'] == patient_i_rename) & (metadata['slice'] == slice_no),
					'has_prostate'
				] = True

			# Explicit 0/255 uint8 mask: `image * 255` wraps around for 8-bit values above 1
			# and would keep a 16-bit input 16-bit.
			image_normalized = np.where(foreground, 255, 0).astype(np.uint8)

			os.makedirs(output_dir, exist_ok=True)
			cv.imwrite(f'{output_dir}/{slice_i}', image_normalized)

metadata.to_csv(f'{path_processed}metadata.csv', index=False)

## RESCALE CANCER MASKS

In [4]:
# Rewrite every cancer region mask as an 8-bit 0/255 PNG under `path_maskC`.
for patient_i in sorted(os.listdir(path_original)):
	patient_dir = f'{path_original}{patient_i}'
	if not os.path.isdir(patient_dir): # ignore stray files next to the patient folders
		continue

	# Output folders use the integer patient number; int() parses zero-padded names
	# directly and, unlike lstrip('0'), also accepts '000'.
	patient_i_rename = int(patient_i)
	output_dir = f'{path_maskC}{patient_i_rename}'

	for cycle_i in sorted(os.listdir(patient_dir)):
		# NOTE(review): this prefix test matches any folder whose name starts with
		# "regionMask"; if several such folders exist they write to the same output - confirm.
		if not cycle_i.startswith("regionMask"):
			continue

		for slice_i in sorted(os.listdir(f'{patient_dir}/{cycle_i}')):
			image = cv.imread(f'{patient_dir}/{cycle_i}/{slice_i}', cv.IMREAD_ANYDEPTH)
			if image is None: # cv.imread returns None (it does not raise) for unreadable files
				print(f'skipped unreadable file: {patient_i}/{cycle_i}/{slice_i}')
				continue

			# Explicit 0/255 uint8 mask: `image * 255` wraps around for 8-bit values above 1
			# and would keep a 16-bit input 16-bit.
			image_normalized = np.where(image != 0, 255, 0).astype(np.uint8)

			os.makedirs(output_dir, exist_ok=True)
			cv.imwrite(f'{output_dir}/{slice_i}', image_normalized)



## RESCALE BIOPSY MASKS

In [None]:
# Rewrite every biopsy mask as an 8-bit 0/255 PNG under `path_maskB`.
for patient_i in sorted(os.listdir(path_original)):
	patient_dir = f'{path_original}{patient_i}'
	if not os.path.isdir(patient_dir): # ignore stray files next to the patient folders
		continue

	# Output folders use the integer patient number; int() parses zero-padded names
	# directly and, unlike lstrip('0'), also accepts '000'.
	patient_i_rename = int(patient_i)

	# NOTE(review): patients numbered below 3 are skipped; the reason is not visible here - confirm.
	if patient_i_rename < 3:
		continue

	output_dir = f'{path_maskB}{patient_i_rename}'

	for cycle_i in sorted(os.listdir(patient_dir)):
		if not cycle_i.startswith("biopsyMask"):
			continue

		for slice_i in sorted(os.listdir(f'{patient_dir}/{cycle_i}')):
			image = cv.imread(f'{patient_dir}/{cycle_i}/{slice_i}', cv.IMREAD_ANYDEPTH)
			if image is None: # cv.imread returns None (it does not raise) for unreadable files
				print(f'skipped unreadable file: {patient_i}/{cycle_i}/{slice_i}')
				continue

			# Any non-zero pixel becomes 255, background stays 0. The former
			# `image / image * 255` evaluated 0/0 = NaN on every background pixel and
			# handed a float64 array containing NaNs to cv.imwrite.
			image_normalized = np.where(image != 0, 255, 0).astype(np.uint8)

			os.makedirs(output_dir, exist_ok=True)
			cv.imwrite(f'{output_dir}/{slice_i}', image_normalized)



## NORMALIZE IMAGES

In [None]:
# Scale every contrast image by the maximum intensity found for its patient so that the
# brightest pixel of a patient maps to 255, and save the result as
# `<path_norm><patient>/<cycle>-<slice>.png`.
metadata = pd.read_csv(f'{path_processed}metadata.csv')
assert 'has_prostate' in metadata.columns, 'metadata.csv has no has_prostate column - run the prostate mask cell first'

total_patients = metadata.query('has_prostate')['patient'].nunique()
print(f'Total patients: {total_patients}')

# Mean / standard deviation of each normalized image, aligned with the rows of `metadata`.
norm_intavg = np.zeros(shape = [len(metadata)])
norm_intstd = np.zeros(shape = [len(metadata)])

for patient_i in np.unique(metadata['patient']):
	print(patient_i)

	# Positions of this patient's rows, computed once instead of rescanning all rows.
	patient_rows = np.flatnonzero((metadata['patient'] == patient_i).to_numpy())
	intmax_patient = metadata['intmax'].iloc[patient_rows].max() # max intensity for given patient

	output_dir = f'{path_norm}{patient_i}'
	os.makedirs(output_dir, exist_ok=True)

	for row_index in patient_rows:
		image_path = metadata['path'].iat[row_index]
		image_cycle = metadata['cycle'].iat[row_index]
		image_slice = metadata['slice'].iat[row_index]

		image = cv.imread(f'{path_original}{image_path}', cv.IMREAD_ANYDEPTH)
		if image is None: # cv.imread returns None (it does not raise) for unreadable files
			print(f'skipped unreadable file: {image_path}')
			continue

		if intmax_patient > 0:
			image_normalized = image / intmax_patient * 255
		else:
			# every image of this patient is black; avoid dividing by zero
			image_normalized = np.zeros(image.shape, dtype=np.float64)

		norm_intavg[row_index] = np.mean(image_normalized)
		norm_intstd[row_index] = np.std(image_normalized)

		# PNG stores integer samples, so convert the float image explicitly (round to
		# nearest, clamp to 0..255) instead of leaving the conversion to cv.imwrite.
		image_8bit = np.clip(np.rint(image_normalized), 0, 255).astype(np.uint8)

		# SAVE ALL SLICES:
		cv.imwrite(f'{output_dir}/{image_cycle}-{image_slice}.png', image_8bit)



# Calculate SLIC region min sizes

In [9]:
from PIL import Image
import numpy as np
import pandas as pd
import os

# Base SLIC size. The per-slice `slic_size` column computed further down in this cell is
# this value scaled by (prostate mask size of the slice) / (largest prostate mask size
# of the same patient), truncated to an integer.
slic_size = 50


def calculate_sizes(row):
	"""Return the prostate mask area, in pixels, for one (patient, slice) row.

	Parameters
	----------
	row : mapping with 'patient' and 'slice' entries (e.g. a DataFrame row); they
		select the file `./processed/mask_prostate/<patient>/<slice>.png`.

	Returns
	-------
	float
		Number of non-zero mask pixels, or 0.0 when the mask file does not exist
		(the same convention `has_cancer` uses for a missing file).
	"""
	mask_file = f'./processed/mask_prostate/{row["patient"]}/{row["slice"]}.png'
	if not os.path.exists(mask_file):
		return 0.0

	with Image.open(mask_file) as img:
		matrix = np.array(img)

	# Count foreground pixels directly; `sum / 255` is only an area if every
	# foreground pixel is exactly 255.
	return float(np.count_nonzero(matrix))


def has_cancer(row):
	"""Tell whether the region-type mask of one (patient, slice) row contains the value 2.

	`row` must provide 'patient' (formatted zero-padded to three digits) and 'slice'.
	Returns False when the mask file does not exist; otherwise the result of
	`np.any(mask == 2)`.
	"""
	mask_path = f'./filtered_cancer_masks/{row["patient"]:03}/regionMaskByType/{row["slice"]}.png'
	if os.path.exists(mask_path):
		with Image.open(mask_path) as handle:
			pixels = np.array(handle)
		return np.any(pixels == 2)
	return False


# Add two per-slice columns to metadata.csv:
#   has_cancer - result of has_cancer() for the (patient, slice)
#   slic_size  - `slic_size` scaled by the slice's prostate mask size relative to the
#                largest mask size of the same patient, truncated to an integer
slice_key = ['patient', 'slice']

# Drop the columns written by a previous run of this cell, so that re-running it merges
# on the (patient, slice) key only instead of also joining on stale results.
metadata_raw = (
	pd.read_csv('./processed/metadata.csv')
		.drop(columns=['has_cancer', 'slic_size'], errors='ignore')
)

slice_features = (
	metadata_raw[slice_key].drop_duplicates()
		.assign(
			has_cancer=lambda x: x.apply(has_cancer, axis='columns'),
			slic_size=lambda x: x.apply(calculate_sizes, axis='columns'),
		)
		.assign(max_size=lambda x: x.groupby('patient').slic_size.transform('max'))
		.query('max_size != 0') # patients without any prostate mask pixels get no features
		.assign(slic_size=lambda x: slic_size * x.slic_size / x.max_size)
		.assign(slic_size=lambda x: x.slic_size.astype(int))
		.drop('max_size', axis='columns')
)

# Right merge keeps every metadata row; rows without features get NaN in the new columns.
# `metadata` is bound to the merged frame: to_csv() returns None when given a path, so
# it must not be the last call of the assigned expression.
metadata = slice_features.merge(metadata_raw, how='right', on=slice_key)
metadata.to_csv('./processed/metadata.csv', index=False)
