In [1]:
import os
import shutil
from time import gmtime, strftime

from breastcancer.inference import check_subsetting, predict_mitoses

## Predict the mitosis number & locations for each ROI
1. generate the ROIs for each slide
2. generate the tiles for each ROI
3. load the model to predict the mitosis number for each ROI
4. parallelize the workflow using PySpark and run it on GPUs

In [2]:
# Re-package the local `breastcancer` directory and hand it, together with the
# stand-alone helper modules, to the Spark workers.
# The archive has to contain the `breastcancer` folder itself (not only the
# files inside it) for `addPyFile` to work correctly -- the same layout that
# `zip -r breastcancer.zip breastcancer` produces.
dirname = "breastcancer"
zipname = f"{dirname}.zip"
shutil.make_archive(dirname, 'zip', root_dir=dirname + "/..", base_dir=dirname)

for py_file in (zipname, "train_mitoses.py", "preprocess_mitoses.py", "resnet50.py"):
    sc.addPyFile(py_file)

In [None]:
# --- Model selection and inference options ----------------------------------
model_name = 'vgg'
#model_name = 'resnet'
skipROI = True
marginalization = False
batch_size = 128
tile_overlap = 48
suffix = '*.tif'

# Choose the checkpoint and the input directory that belong to the selected
# model. `input_dir` is used instead of `dir` so the builtin is not shadowed.
if model_name == 'vgg':
    model_file = '/8tb/deep_histopath/pred/model/0.67899_f1_3.5448_loss_3_epoch_model.hdf5'
    input_dir = '/8tb/deep_histopath/pred/data/vgg/val'

    #model_file = '/8tb/deep_histopath/pred/model/0.74172_f1_1.7319_loss_8_epoch_model.hdf5'
    #input_dir = '/8tb/deep_histopath/pred/data/vgg/train'
elif model_name == 'resnet':
    model_file = '/8tb/deep_histopath/pred/model/0.72938_f1_0.067459_loss_7_epoch_model.hdf5'
    input_dir = '/8tb/deep_histopath/pred/data/resnet/val'
else:
    # Fail right here instead of with a NameError on `model_file` further down.
    raise ValueError(f"Unsupported model_name {model_name!r}; expected 'vgg' or 'resnet'")

#input_dir = "/8tb/deep_histopath/data/mitoses/patches_aug_strat_sampled_fp_oversampling_png_improved_gen_dense3/val/mitosis/"
#suffix = '*.png'

# --- Cluster layout ----------------------------------------------------------
# The number of Spark partitions equals nodes x GPUs per node.
node_num = 2
gpu_per_node = 4
partition_num = gpu_per_node * node_num

# --- ROI / tile geometry and output switches ---------------------------------
ROI_size = 6000
ROI_overlap = 16
ROI_channel = 3
tile_size = 64
tile_channel = 3
threshold = 0
isGPU = True
isDebug = True
save_mitosis_locations = True
save_mask = True

# Build the prediction RDD and mark it for caching; the bare last expression
# displays the RDD in the notebook.
predict_result_rdd = predict_mitoses(sc, model_path=model_file, model_name=model_name, input_dir=input_dir,
                                     file_suffix=suffix, partition_num=partition_num,
                                     ROI_size=ROI_size, ROI_overlap=ROI_overlap, ROI_channel=ROI_channel,
                                     skipROI=skipROI,
                                     marginalization=marginalization,
                                     tile_size=tile_size, tile_overlap=tile_overlap, tile_channel=tile_channel,
                                     threshold=threshold, isGPU=isGPU,
                                     save_mitosis_locations=save_mitosis_locations,
                                     save_mask=save_mask,
                                     batch_size=batch_size, isDebug=isDebug)
predict_result_rdd.cache()

Counter({'rr-ram4.softlayer.com': 0, 'rr-ram3.softlayer.com': 0})
[(0, 'rr-ram4.softlayer.com'), (1, 'rr-ram3.softlayer.com'), (2, 'rr-ram4.softlayer.com'), (3, 'rr-ram3.softlayer.com'), (4, 'rr-ram4.softlayer.com'), (5, 'rr-ram3.softlayer.com'), (6, 'rr-ram4.softlayer.com'), (7, 'rr-ram3.softlayer.com')]
{0: 3, 1: 3, 2: 2, 3: 2, 4: 1, 5: 1, 6: 0, 7: 0}


PythonRDD[2] at RDD at PythonRDD.scala:48

In [None]:
# Collect every prediction on the driver and print a UTC timestamp before and
# after, so the duration of the run can be read off the output.
TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S"

start_time = strftime(TIMESTAMP_FORMAT, gmtime())
print(start_time)

result = predict_result_rdd.collect()
print(len(result))
print(result)

end_time = strftime(TIMESTAMP_FORMAT, gmtime())
print(end_time)

2017-12-04 21:50:01


## Save the prediction results into CSV

In [None]:
from pyspark.sql import SparkSession
import re

def flat_result_2_row(predictions, reg_exp=r"\d+_\d+"):
  """Flatten one prediction tuple into one row per predicted location.

  Args:
    predictions: a 4-tuple `(slide_id, ROI, mitosis_num,
      mitosis_location_scores)`, where `mitosis_location_scores` is an
      iterable of `(r, c, score)` triples. Must not be None.
    reg_exp: regular expression applied to `slide_id` with `re.findall`;
      the first match replaces `slide_id` in the emitted rows.

  Returns:
    A list of `(slide_id, ROI, mitosis_num, r, c, score)` tuples, one per
    entry of `mitosis_location_scores`. Empty when there are no locations.

  Raises:
    AssertionError: if `predictions` is None.
    IndexError: if there is at least one location and `reg_exp` does not
      match `slide_id`.
  """
  assert predictions is not None
  slide_id, ROI, mitosis_num, mitosis_location_scores = predictions
  rows = []
  short_id = None
  for r, c, score in mitosis_location_scores:
    if short_id is None:
      # Extract the id once, from the ORIGINAL slide_id. Re-running the regex
      # on its own previous result (as the per-iteration version did) fails
      # for patterns with groups or surrounding context. Doing it lazily keeps
      # the "no locations -> no regex lookup" behavior.
      short_id = re.findall(reg_exp, slide_id)[0]
    rows.append((short_id, ROI, mitosis_num, r, c, score))
  return rows

# Destination of the flattened prediction table.
csv_path = "/8tb/deep_histopath/pred/result/vgg/val_48_vgg2_nm_128batch_threshold0.csv"
spark = SparkSession.builder.appName("aggreate_predictions_2_csv").getOrCreate()

# Drop missing predictions, then expand every remaining prediction tuple into
# one row per predicted location.
pred_rows = (predict_result_rdd
             .filter(lambda t: t is not None)
             .flatMap(flat_result_2_row)
             .cache())

df = spark.createDataFrame(
    pred_rows,
    ['slide_id', 'ROI_id', 'mitosis_num_per_ROI', 'row_num', 'col_num', 'score'])

# Make sure the output folder exists before writing. The folder is kept in
# `csv_dir` (not `dir`) so the builtin `dir` is not shadowed.
csv_dir = os.path.dirname(csv_path)
os.makedirs(csv_dir, exist_ok=True)
df.toPandas().to_csv(csv_path, header=True, index=False)
df.show()

## Add the ground truth locations into the predicted result

In [None]:
from breastcancer.evaluation import add_ground_truth_mark
from breastcancer.visualization import Shape
from pathlib import Path
import re

# Images to annotate.
im_dir = "/home/fei/result/mitoses_train_image_result_test/48_val/"
im_suffix = "*mark.tif"

# Ground-truth CSV files; they are read with hasHeader=False
# (presumably because they carry no header row -- confirm against the data).
ground_truth_dir = "/home/fei/data/mitoses_ground_truth_test/val/"
ground_truth_file_suffix = "*.csv"
hasHeader = False

partition_num = 4
mark_color = (255, 255, 0, 200)

# Add the ground-truth marks as circles.
add_ground_truth_mark(
    sc, partition_num, im_dir, im_suffix,
    ground_truth_dir, ground_truth_file_suffix,
    shape=Shape.CIRCLE, mark_color=mark_color, hasHeader=hasHeader)


# Add the prediction result
#ground_truth_dir = "/home/fei/result/mitoses_train_image_result_test/32-val_vis/cluster"
#mark_color=(255, 0, 0, 200)
#hasHeader = True
#add_ground_truth_mark(sc, partition_num, im_dir, im_suffix, ground_truth_dir, ground_truth_file_suffix, 
#                      shape=Shape.CROSS, mark_color=mark_color, hasHeader=hasHeader)

## Cluster the prediction result

In [1]:
from breastcancer.evaluation import cluster_prediction_result

# Cluster the prediction results stored under the VGG validation result folder.
cluster_prediction_result(
    pred_dir="/8tb/deep_histopath/pred/result/vgg/val/",
    eps=32,
    min_samples=1,
    hasHeader=True,
    isWeightedAvg=False,
    prob_threshold=0.45,
)

## Compute F1 score

In [2]:
from breastcancer.evaluation import evaluate_global_f1

# Compare the clustered predictions against the ground truth.
pred_dir = "/8tb/deep_histopath/pred/result/vgg/val/cluster"
ground_true_dir = "/home/fei/data/mitoses_ground_truth_test/val/"
# NOTE: this reuses the global name `threshold` that the prediction cell also
# sets; here it is the third argument of `evaluate_global_f1`.
threshold = 30

(f1_list, over_detected, non_detected,
 FP, TP, FN) = evaluate_global_f1(pred_dir, ground_true_dir, threshold)

for report in (f1_list, over_detected, non_detected):
    print(report)


Point (1558 , 1633) has multiple points in the circle
Point (1506 , 1653) has multiple points in the circle
TP: 189; FP: 128; FN: 157
precision: 0.5962145110410094; recall: 0.546242774566474
0.5701357466063348
['/8tb/deep_histopath/pred/result/vgg/val/cluster/clustered_12/09.csv']
['/home/fei/data/mitoses_ground_truth_test/val/22/26.csv', '/home/fei/data/mitoses_ground_truth_test/val/22/22.csv', '/home/fei/data/mitoses_ground_truth_test/val/12/08.csv', '/home/fei/data/mitoses_ground_truth_test/val/11/08.csv', '/home/fei/data/mitoses_ground_truth_test/val/22/11.csv', '/home/fei/data/mitoses_ground_truth_test/val/21/37.csv', '/home/fei/data/mitoses_ground_truth_test/val/22/14.csv', '/home/fei/data/mitoses_ground_truth_test/val/22/05.csv', '/home/fei/data/mitoses_ground_truth_test/val/11/03.csv', '/home/fei/data/mitoses_ground_truth_test/val/22/21.csv', '/home/fei/data/mitoses_ground_truth_test/val/22/25.csv', '/home/fei/data/mitoses_ground_truth_test/val/22/09.csv', '/home/fei/data/mitos

## Export FP to CSV File

In [None]:
from breastcancer.evaluation import export_F1_inputs_TP_FP_FN, export_single_F1_input
# Write the combined FP/TP/FN table, then the FP and FN sets individually,
# all below the VGG result directory.
result_dir = '/8tb/deep_histopath/pred/result/vgg/'
export_F1_inputs_TP_FP_FN(result_dir + 'train-60-FP-TP_FN-label.csv', FP, TP, FN)
export_single_F1_input(result_dir, FP, "FP")
export_single_F1_input(result_dir, FN, "FN")

## Experiment Test
1. Generate a ROI
2. Generate the tiles from the ROI
3. Check if the tiles match the ROI
4. Predict the mitosis number for each tile and sum the prediction result
5. This could be used to test the performance of a single ROI prediction

In [None]:
import openslide
import numpy as np
import keras
from keras.models import load_model
from breastcancer.preprocessing import create_tile_generator, get_20x_zoom_level
from skimage.util.shape import view_as_windows

# Inputs and geometry for the single-ROI experiment.
model_file = 'model/0.95114_acc_0.58515_loss_530_epoch_model.hdf5'
filepath = 'data/training_image_data/TUPAC-TR-500.svs'
ROI_size, ROI_overlap = 6000, 0
tile_size, tile_overlap = 64, 10
threshold = 0.5

# Wrap the stored model so that a sigmoid is applied to its output.
base_model = load_model(model_file)
probs = keras.layers.Activation('sigmoid')(base_model.output)
model = keras.models.Model(inputs=base_model.input, outputs=probs)

# Open the slide and enumerate every (zoom level, column, row) ROI index at
# the zoom level returned by `get_20x_zoom_level`.
slide = openslide.open_slide(str(filepath))
generator = create_tile_generator(slide, ROI_size, ROI_overlap)
zoom_level = get_20x_zoom_level(slide, generator)
cols, rows = generator.level_tiles[zoom_level]
ROI_indices = [
    (zoom_level, col, row)
    for col in range(cols)
    for row in range(rows)
]

# Pick one ROI (index 15 of the enumeration) and load it as an array.
zl, col, row = ROI_index = ROI_indices[15]
ROI = np.asarray(generator.get_tile(zl, (col, row)))

In [None]:
# Slide a (tile_size, tile_size, 3) window over the ROI with a stride of
# tile_size - tile_overlap, then flatten the window grid into a batch of tiles.
window_shape = (tile_size, tile_size, 3)
stride = tile_size - tile_overlap
tiles = view_as_windows(ROI, window_shape, step=stride).reshape(-1, *window_shape)
print(f"The shape of tiles {tiles.shape};\nThe shape of ROI {ROI.shape}")

In [None]:
# Run `check_subsetting` on the ROI and the tiles cut from it, using the same
# ROI size, tile size and tile overlap as the tiling cell, and report the
# outcome. (Presumably it verifies that the tiles match the ROI content --
# confirm in `breastcancer.inference`.)
isSame = check_subsetting(ROI, ROI_size, tiles, tile_size, tile_overlap)
print(f"Is the ROI subsetting right? {isSame}")

In [None]:
# Score every tile, flag the ones whose output exceeds `threshold`, and count
# the flagged tiles.
tile_scores = model.predict(tiles, batch_size=128, verbose=True)
prediction = tile_scores > threshold
result = np.sum(prediction, dtype=np.int32)
print(f"The number of mitoses is {result}")

In [None]:
from breastcancer.inference import predict_mitoses_help
# Run the per-partition prediction helper directly on a single slide.
model_file = '/home/fei/deep-histopath/deep-histopath/model/0.74172_f1_1.7319_loss_8_epoch_model.hdf5'
model_name = "vgg"
index = 1
file_partition = ['/home/fei/data/training_image_data/TUPAC-TR-085.svs']

ROI_size, ROI_overlap = 6000, 0

result = predict_mitoses_help(
    model_file, model_name, index, file_partition,
    ROI_size, ROI_overlap,
    ROI_channel=3, skipROI=False,
    tile_size=64, tile_overlap=0, tile_channel=3,
    threshold=0.5, isGPU=True, batch_size=32,
    save_mitosis_locations=True,
    save_mask=True,
    isDebug=True,
)

# Print each returned element followed by two blank lines.
for element in result:
    print(element, end="\n\n\n")